/* main This file is part of the Public Domain C Library (PDCLib). Permission is granted to use, modify, and / or redistribute at will. */ #include #include #include #include #ifdef TEST #include #endif #include "uctype.h" /* RLE Compressed Output requires *11* flags: iswupper, iswlower, iswalpha, iswdigit, iswblank, iswspace, iswcntrl, iswxdigit, iswgraph, iswprint. iswalnum (the 12th classification function) is *defined* as iswalpha || iswdigit. And iswdigit and iswxdigit are defined in a rather restrictive way that can be expressed by simple ranges instead of lookup tables. And isgraph is defined as isprint && ! isspace (which is trivial to check that it holds true for all the records provided by get-unicode-ctype, at least up to Unicode 11.0). So we have only 8 flags we actually need in a lookup... nicely reducing the storage requirement to an unsigned char. Another trick is to express toupper / tolower as offsets instead of absolute values, which will allow run-time-length compression of the data. */ struct output_record_t { size_t codepoint; int toupper_diff; int tolower_diff; unsigned char flags; }; #ifdef TEST static void print_codepoint_age( size_t codepoint, struct derived_properties_t * age ) { size_t index = age->count; while ( index ) { --index; if ( lookup_property( age, age->name[ index ], codepoint ) ) { printf( "%s", age->name[ index ] ); return; } } } static void print_additional_codepoint_info( size_t codepoint, struct unicode_record_t * ur ) { printf( " - %s", ur->name ); printf( " - %s", ur->general_category ); printf( " - %d", ur->canonical_combining_class ); printf( " - %s", ur->bidi_class ); printf( " - %s", ( ur->decomposition ? ur->decomposition : "NULL" ) ); printf( " - %d", ur->numeric_type ); printf( " - %d", ur->numeric_digit ); printf( " - %s", ( ur->numeric_value ? ur->numeric_value : "NULL" ) ); printf( " - %c", ur->bidi_mirrored ); printf( " - U+%06zx", ur->simple_uppercase_mapping ); printf( " - U+%06zx", ur->simple_lowercase_mapping ); printf( " - U+%06zx", ur->simple_titlecase_mapping ); printf( " - " ); /* Implementations are at liberty to return non-zero values other than 1 for "true". */ printf( "%d", ( iswupper( codepoint ) ) ? 1 : 0 ); printf( "%d", ( iswlower( codepoint ) ) ? 1 : 0 ); printf( "%d", ( iswalpha( codepoint ) ) ? 1 : 0 ); printf( "%d", ( iswdigit( codepoint ) ) ? 1 : 0 ); printf( "%d", ( iswblank( codepoint ) ) ? 1 : 0 ); printf( "%d", ( iswspace( codepoint ) ) ? 1 : 0 ); printf( "%d", ( iswcntrl( codepoint ) ) ? 1 : 0 ); printf( "%d", ( iswxdigit( codepoint ) ) ? 1 : 0 ); printf( "%d", ( iswgraph( codepoint ) ) ? 1 : 0 ); printf( "%d", ( iswprint( codepoint ) ) ? 1 : 0 ); printf( "%d", ( iswpunct( codepoint ) ) ? 1 : 0 ); } static void print_codepoint_info( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core, struct derived_properties_t * age ) { int rc; int equal = 1; if ( codepoint % 20 == 0 ) { printf( " cp up low UlA0_WCXGP.\n" ); } printf( "U+%06zX ", codepoint ); rc = get_towupper( codepoint, ur ); equal &= ( (unsigned)rc == towupper( codepoint ) ); printf( "U+%06X ", rc ); rc = get_towlower( codepoint, ur ); equal &= ( (unsigned)rc == towlower( codepoint ) ); printf( "U+%06X ", rc ); rc = get_iswupper( codepoint, ur, core ); equal &= ( iswupper( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); rc = get_iswlower( codepoint, ur, core ); equal &= ( iswlower( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); rc = get_iswalpha( codepoint, ur, core ); equal &= ( iswalpha( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); rc = get_iswdigit( codepoint ); equal &= ( iswdigit( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); rc = get_iswblank( codepoint, ur ); equal &= ( iswblank( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); rc = get_iswspace( codepoint, ur ); equal &= ( iswspace( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); rc = get_iswcntrl( codepoint, ur ); equal &= ( iswcntrl( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); rc = get_iswxdigit( codepoint ); equal &= ( iswxdigit( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); rc = get_iswgraph( codepoint, ur ); equal &= ( iswgraph( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); rc = get_iswprint( codepoint, ur ); equal &= ( iswprint( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); rc = get_iswpunct( codepoint, ur, core ); equal &= ( iswpunct( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); if ( codepoint != ur->code_point ) { /* These two may only differ for codepoint "ranges", which are signified by "..., First>" / "..., Last>" pairs in UnicodeData. If they differ and it's NOT a range, that is an error of some kind. */ if ( ! strstr( ur->name, ", Last>" ) || codepoint < ( ur - 1 )->code_point ) { printf( " ERROR: U+%06zX != U+%06zX outside of First, Last codepoint range. ", codepoint, ur->code_point ); } } if ( ! equal ) { printf( " ERROR: Deviation from SysLib: " ); print_codepoint_age( codepoint, age ); print_additional_codepoint_info( codepoint, ur ); } printf( "\n" ); } #else static struct output_record_t get_output_record( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core ) { struct output_record_t rc; char buffer[ 9 ]; rc.codepoint = codepoint; rc.toupper_diff = get_towupper( codepoint, ur ) - codepoint; rc.tolower_diff = get_towlower( codepoint, ur ) - codepoint; sprintf( buffer, "%zu%zu%zu%zu%zu%zu%zu%zu", get_iswupper( codepoint, ur, core ), get_iswlower( codepoint, ur, core ), get_iswalpha( codepoint, ur, core ), get_iswblank( codepoint, ur ), get_iswspace( codepoint, ur ), get_iswcntrl( codepoint, ur ), get_iswprint( codepoint, ur ), get_iswpunct( codepoint, ur, core ) ); rc.flags = strtoul( buffer, NULL, 2 ); return rc; } #endif int main( int argc, char * argv[] ) { struct unicode_data_t * ud; struct derived_properties_t * core; #ifdef TEST struct derived_properties_t * age; #endif char * locale = setlocale( LC_CTYPE, "" ); if ( ! strstr( locale, "UTF-8" ) || strstr( locale, "TR" ) || strstr( locale, "tr" ) ) { fprintf( stderr, "Need non-turkish locale to work correctly.\n'%s' will not do.\n", locale ); return EXIT_FAILURE; } if ( argc != 4 ) { printf( "\n" "Usage: get-uctypes " #ifdef TEST " " #endif "\n\n" "Generates lookup tables for from files available from\n" "the Unicode Consortium.\n" "\n" "The required files can be retrieved from the following URL:\n" "\n" "http://www.unicode.org/Public/UCD/latest/ucd/\n" "\n" ); return EXIT_FAILURE; } if ( ( ud = read_unicode_data( argv[ 1 ] ) ) != NULL ) { if ( ( core = read_derived_properties( argv[ 2 ] ) ) != NULL ) { #ifndef TEST /* Print (to file) RLE compressed data */ FILE * fh = fopen( "ctype.dat", "wb" ); if ( fh ) { size_t codepoint = 0; size_t i = 0; struct unicode_record_t * ur = &(ud->records[i]); /* Name substring indicating a code point _range_ */ const char * last = ", Last>"; struct output_record_t previous = get_output_record( codepoint, ur, core ); fprintf( fh, "%zx ", previous.codepoint ); for ( codepoint = 1; codepoint < 0x10fffe; ++codepoint ) { struct output_record_t current; while ( codepoint > ur->code_point ) { ur = &(ud->records[++i]); } if ( codepoint != ur->code_point && ( ur->name && ( strstr( ur->name, last ) != ( ur->name + strlen( ur->name ) - strlen( last ) ) ) ) ) { /* Unregistered Code Point */ continue; } current = get_output_record( codepoint, ur, core ); /* RLE */ if ( current.codepoint != previous.codepoint + 1 || current.toupper_diff != previous.toupper_diff || current.tolower_diff != previous.tolower_diff || current.flags != previous.flags ) { fprintf( fh, "%zx %d %d %hhx\n", previous.codepoint, previous.toupper_diff, previous.tolower_diff, previous.flags ); fprintf( fh, "%zx ", current.codepoint ); } previous = current; } fprintf( fh, "%zx %d %d %hhx\n", previous.codepoint, previous.toupper_diff, previous.tolower_diff, previous.flags ); fclose( fh ); } else { fprintf( stderr, "Could not open 'ctype.dat' for writing.\n" ); } #else if ( ( age = read_derived_properties( argv[ 3 ] ) ) != NULL ) { /* Print (to screen) raw data comparing our results to the system library. Differences are often because the system library uses older data, which is why we add the age to the output. */ size_t codepoint = 0; size_t i = 0; struct unicode_record_t * ur = &(ud->records[i]); /* Name substring indicating a code point _range_ */ const char * last = ", Last>"; for ( codepoint = 0; codepoint < 0x10fffe; ++codepoint ) { while ( codepoint > ur->code_point ) { ur = &(ud->records[++i]); } if ( codepoint != ur->code_point && ! name_ends_with( ur, last ) ) { /* Unregistered Code Point */ continue; } print_codepoint_info( codepoint, ur, core, age ); } release_derived_properties( age ); } #endif release_derived_properties( core ); } release_unicode_data( ud ); } return EXIT_SUCCESS; }