1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
|
/* main
This file is part of the Public Domain C Library (PDCLib).
Permission is granted to use, modify, and / or redistribute at will.
*/
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef TEST
#include <wctype.h>
#endif
#include "uctype.h"
/* RLE Compressed Output
<wctype.h> requires *11* flags:
iswupper, iswlower, iswalpha, iswdigit, iswblank, iswspace,
iswcntrl, iswxdigit, iswgraph, iswprint.
iswalnum (the 12th classification function) is *defined* as
iswalpha || iswdigit. And iswdigit and iswxdigit are defined
in a rather restrictive way that can be expressed by simple
ranges instead of lookup tables. And isgraph is defined as
isprint && ! isspace (which is trivial to check that it holds
true for all the records provided by get-unicode-ctype, at
least up to Unicode 11.0).
So we have only 8 flags we actually need in a lookup... nicely
reducing the storage requirement to an unsigned char.
Another trick is to express toupper / tolower as offsets
instead of absolute values, which will allow run-time-length
compression of the data.
*/
struct output_record_t
{
size_t codepoint;
int toupper_diff;
int tolower_diff;
unsigned char flags;
};
#ifdef TEST
static void print_codepoint_age( size_t codepoint, struct derived_properties_t * age )
{
size_t index = age->count;
while ( index )
{
--index;
if ( lookup_property( age, age->name[ index ], codepoint ) )
{
printf( "%s", age->name[ index ] );
return;
}
}
}
static void print_additional_codepoint_info( size_t codepoint, struct unicode_record_t * ur )
{
printf( " - %s", ur->name );
printf( " - %s", ur->general_category );
printf( " - %d", ur->canonical_combining_class );
printf( " - %s", ur->bidi_class );
printf( " - %s", ( ur->decomposition ? ur->decomposition : "NULL" ) );
printf( " - %d", ur->numeric_type );
printf( " - %d", ur->numeric_digit );
printf( " - %s", ( ur->numeric_value ? ur->numeric_value : "NULL" ) );
printf( " - %c", ur->bidi_mirrored );
printf( " - U+%06zx", ur->simple_uppercase_mapping );
printf( " - U+%06zx", ur->simple_lowercase_mapping );
printf( " - U+%06zx", ur->simple_titlecase_mapping );
printf( " - " );
/* Implementations are at liberty to return non-zero values other
than 1 for "true".
*/
printf( "%d", ( iswupper( codepoint ) ) ? 1 : 0 );
printf( "%d", ( iswlower( codepoint ) ) ? 1 : 0 );
printf( "%d", ( iswalpha( codepoint ) ) ? 1 : 0 );
printf( "%d", ( iswdigit( codepoint ) ) ? 1 : 0 );
printf( "%d", ( iswblank( codepoint ) ) ? 1 : 0 );
printf( "%d", ( iswspace( codepoint ) ) ? 1 : 0 );
printf( "%d", ( iswcntrl( codepoint ) ) ? 1 : 0 );
printf( "%d", ( iswxdigit( codepoint ) ) ? 1 : 0 );
printf( "%d", ( iswgraph( codepoint ) ) ? 1 : 0 );
printf( "%d", ( iswprint( codepoint ) ) ? 1 : 0 );
printf( "%d", ( iswpunct( codepoint ) ) ? 1 : 0 );
}
static void print_codepoint_info( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core, struct derived_properties_t * age )
{
int rc;
int equal = 1;
if ( codepoint % 20 == 0 )
{
printf( " cp up low UlA0_WCXGP.\n" );
}
printf( "U+%06zX ", codepoint );
rc = get_towupper( codepoint, ur ); equal &= ( (unsigned)rc == towupper( codepoint ) ); printf( "U+%06X ", rc );
rc = get_towlower( codepoint, ur ); equal &= ( (unsigned)rc == towlower( codepoint ) ); printf( "U+%06X ", rc );
rc = get_iswupper( codepoint, ur, core ); equal &= ( iswupper( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
rc = get_iswlower( codepoint, ur, core ); equal &= ( iswlower( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
rc = get_iswalpha( codepoint, ur, core ); equal &= ( iswalpha( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
rc = get_iswdigit( codepoint ); equal &= ( iswdigit( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
rc = get_iswblank( codepoint, ur ); equal &= ( iswblank( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
rc = get_iswspace( codepoint, ur ); equal &= ( iswspace( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
rc = get_iswcntrl( codepoint, ur ); equal &= ( iswcntrl( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
rc = get_iswxdigit( codepoint ); equal &= ( iswxdigit( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
rc = get_iswgraph( codepoint, ur ); equal &= ( iswgraph( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
rc = get_iswprint( codepoint, ur ); equal &= ( iswprint( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
rc = get_iswpunct( codepoint, ur, core ); equal &= ( iswpunct( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 );
if ( codepoint != ur->code_point )
{
/* These two may only differ for codepoint "ranges", which are
signified by "..., First>" / "..., Last>" pairs in UnicodeData.
If they differ and it's NOT a range, that is an error of some
kind.
*/
if ( ! strstr( ur->name, ", Last>" ) || codepoint < ( ur - 1 )->code_point )
{
printf( " ERROR: U+%06zX != U+%06zX outside of First, Last codepoint range. ", codepoint, ur->code_point );
}
}
if ( ! equal )
{
printf( " ERROR: Deviation from SysLib: " );
print_codepoint_age( codepoint, age );
print_additional_codepoint_info( codepoint, ur );
}
printf( "\n" );
}
#else
static struct output_record_t get_output_record( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core )
{
struct output_record_t rc;
char buffer[ 9 ];
rc.codepoint = codepoint;
rc.toupper_diff = get_towupper( codepoint, ur ) - codepoint;
rc.tolower_diff = get_towlower( codepoint, ur ) - codepoint;
sprintf( buffer, "%zu%zu%zu%zu%zu%zu%zu%zu",
get_iswupper( codepoint, ur, core ),
get_iswlower( codepoint, ur, core ),
get_iswalpha( codepoint, ur, core ),
get_iswblank( codepoint, ur ),
get_iswspace( codepoint, ur ),
get_iswcntrl( codepoint, ur ),
get_iswprint( codepoint, ur ),
get_iswpunct( codepoint, ur, core ) );
rc.flags = strtoul( buffer, NULL, 2 );
return rc;
}
#endif
int main( int argc, char * argv[] )
{
struct unicode_data_t * ud;
struct derived_properties_t * core;
#ifdef TEST
struct derived_properties_t * age;
#endif
char * locale = setlocale( LC_CTYPE, "" );
if ( ! strstr( locale, "UTF-8" ) || strstr( locale, "TR" ) || strstr( locale, "tr" ) )
{
fprintf( stderr, "Need non-turkish locale to work correctly.\n'%s' will not do.\n", locale );
return EXIT_FAILURE;
}
if ( argc != 4 )
{
printf( "\n"
"Usage: get-uctypes <UnicodeData.txt> <DerivedCoreProperties.txt>"
#ifdef TEST
" <DerivedAge.txt>"
#endif
"\n\n"
"Generates lookup tables for <wctype.h> from files available from\n"
"the Unicode Consortium.\n"
"\n"
"The required files can be retrieved from the following URL:\n"
"\n"
"http://www.unicode.org/Public/UCD/latest/ucd/\n"
"\n" );
return EXIT_FAILURE;
}
if ( ( ud = read_unicode_data( argv[ 1 ] ) ) != NULL )
{
if ( ( core = read_derived_properties( argv[ 2 ] ) ) != NULL )
{
#ifndef TEST
/* Print (to file) RLE compressed data */
FILE * fh = fopen( "ctype.dat", "wb" );
if ( fh )
{
size_t codepoint = 0;
size_t i = 0;
struct unicode_record_t * ur = &(ud->records[i]);
/* Name substring indicating a code point _range_ */
const char * last = ", Last>";
struct output_record_t previous = get_output_record( codepoint, ur, core );
fprintf( fh, "%zx ", previous.codepoint );
for ( codepoint = 1; codepoint < 0x10fffe; ++codepoint )
{
struct output_record_t current;
while ( codepoint > ur->code_point )
{
ur = &(ud->records[++i]);
}
if ( codepoint != ur->code_point && ( ur->name && ( strstr( ur->name, last ) != ( ur->name + strlen( ur->name ) - strlen( last ) ) ) ) )
{
/* Unregistered Code Point */
continue;
}
current = get_output_record( codepoint, ur, core );
/* RLE */
if ( current.codepoint != previous.codepoint + 1 ||
current.toupper_diff != previous.toupper_diff ||
current.tolower_diff != previous.tolower_diff ||
current.flags != previous.flags )
{
fprintf( fh, "%zx %d %d %hhx\n", previous.codepoint, previous.toupper_diff, previous.tolower_diff, previous.flags );
fprintf( fh, "%zx ", current.codepoint );
}
previous = current;
}
fprintf( fh, "%zx %d %d %hhx\n", previous.codepoint, previous.toupper_diff, previous.tolower_diff, previous.flags );
fclose( fh );
}
else
{
fprintf( stderr, "Could not open 'ctype.dat' for writing.\n" );
}
#else
if ( ( age = read_derived_properties( argv[ 3 ] ) ) != NULL )
{
/* Print (to screen) raw data comparing our results
to the system library.
Differences are often because the system library
uses older data, which is why we add the age to
the output.
*/
size_t codepoint = 0;
size_t i = 0;
struct unicode_record_t * ur = &(ud->records[i]);
/* Name substring indicating a code point _range_ */
const char * last = ", Last>";
for ( codepoint = 0; codepoint < 0x10fffe; ++codepoint )
{
while ( codepoint > ur->code_point )
{
ur = &(ud->records[++i]);
}
if ( codepoint != ur->code_point && ! name_ends_with( ur, last ) )
{
/* Unregistered Code Point */
continue;
}
print_codepoint_info( codepoint, ur, core, age );
}
release_derived_properties( age );
}
#endif
release_derived_properties( core );
}
release_unicode_data( ud );
}
return EXIT_SUCCESS;
}
|