diff options
Diffstat (limited to 'src/pdclib/auxiliary')
-rw-r--r-- | src/pdclib/auxiliary/uctype/Makefile | 48 | ||||
-rw-r--r-- | src/pdclib/auxiliary/uctype/derived_properties.c | 300 | ||||
-rw-r--r-- | src/pdclib/auxiliary/uctype/derived_properties.h | 34 | ||||
-rw-r--r-- | src/pdclib/auxiliary/uctype/main.c | 300 | ||||
-rw-r--r-- | src/pdclib/auxiliary/uctype/test.h | 19 | ||||
-rw-r--r-- | src/pdclib/auxiliary/uctype/text_utilities.c | 206 | ||||
-rw-r--r-- | src/pdclib/auxiliary/uctype/text_utilities.h | 59 | ||||
-rw-r--r-- | src/pdclib/auxiliary/uctype/uctype.c | 85 | ||||
-rw-r--r-- | src/pdclib/auxiliary/uctype/uctype.h | 29 | ||||
-rw-r--r-- | src/pdclib/auxiliary/uctype/unicode_data.c | 224 | ||||
-rw-r--r-- | src/pdclib/auxiliary/uctype/unicode_data.h | 77 |
11 files changed, 1381 insertions, 0 deletions
diff --git a/src/pdclib/auxiliary/uctype/Makefile b/src/pdclib/auxiliary/uctype/Makefile new file mode 100644 index 0000000..0d34b98 --- /dev/null +++ b/src/pdclib/auxiliary/uctype/Makefile @@ -0,0 +1,48 @@ +TARGET := get-uctypes +# All source files of the project +SRCFILES := $(wildcard *.c) +# All header files of the project +HDRFILES := $(wildcard *.h) +# All object files in the project +OBJFILES := $(patsubst %.c,%.o,$(SRCFILES)) +# All test drivers (_t) +TSTFILES := $(patsubst %.c,%_t,$(SRCFILES)) +# All dependency files (.d) +DEPFILES := $(patsubst %.c,%.d,$(SRCFILES)) +# All test driver dependency files (_t.d) +TSTDEPFILES := $(patsubst %,%.d,$(TSTFILES)) +# All test driver dependency files (_t.d) + +WARNINGS := -Wall -Wextra -pedantic -Wno-unused-parameter -Wshadow -Wpointer-arith -Wcast-align -Wwrite-strings -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls -Wnested-externs -Winline -Wno-long-long -Wuninitialized -Wstrict-prototypes -Wdeclaration-after-statement +CFLAGS := -g -std=c99 $(WARNINGS) $(USERFLAGS) -I. 
+ +.PHONY: all clean tests + +all: $(TARGET) + +$(TARGET): $(OBJFILES) + @echo " CC $@" + @$(CC) $^ -o $@ + @echo + +tests: testdrivers + -@rc=0; count=0; failed=""; for file in $(TSTFILES); do echo " TST $$file"; ./$$file; test=$$?; if [ $$test != 0 ]; then rc=`expr $$rc + $$test`; failed="$$failed $$file"; fi; count=`expr $$count + 1`; done; echo; echo "Tests executed: $$count Tests failed: $$rc"; echo; for file in $$failed; do echo "Failed: $$file"; done; echo + +testdrivers: $(TSTFILES) + @echo + +-include $(DEPFILES) $(TSTDEPFILES) + +clean: + -@$(RM) $(wildcard $(OBJFILES) $(DEPFILES) $(TSTFILES) $(TSTDEPFILES) $(TARGET) aux.a) + +%.o: %.c Makefile + @echo " CC $@" + @$(CC) $(CFLAGS) -MMD -MP -c $< -o $@ + +%_t: %.c Makefile aux.a + @echo " CC $@" + @$(CC) $(CFLAGS) -MMD -MP -DTEST $< aux.a -o $@ + +aux.a: $(OBJFILES) + @ar rc $@ $^ diff --git a/src/pdclib/auxiliary/uctype/derived_properties.c b/src/pdclib/auxiliary/uctype/derived_properties.c new file mode 100644 index 0000000..c024efe --- /dev/null +++ b/src/pdclib/auxiliary/uctype/derived_properties.c @@ -0,0 +1,300 @@ +/* derived properties + + This file is part of the Public Domain C Library (PDCLib). + Permission is granted to use, modify, and / or redistribute at will. 
+*/ + +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "text_utilities.h" + +#include "derived_properties.h" + +#define LINE_BUFFER_SIZE 500u + +struct derived_properties_t * read_derived_properties( const char * filename ) +{ + FILE * fh; + char buffer[ LINE_BUFFER_SIZE ]; + struct derived_properties_t * dp = NULL; + size_t code_points = 0; + size_t properties = 0; + const char * code_point_count = "# Total code points: "; + + if ( ( fh = fopen( filename, "r" ) ) == NULL ) + { + fprintf( stderr, "Could not open '%s' for reading.\n", filename ); + return NULL; + } + + if ( ( check_file( fh, LINE_BUFFER_SIZE, ';', sizeof( derived_properties_fields ) / sizeof( int ), derived_properties_fields ) ) != (size_t)-1 ) + { + while ( fgets( buffer, LINE_BUFFER_SIZE, fh ) ) + { + if ( strstr( buffer, code_point_count ) != NULL ) + { + size_t count = strtoul( buffer + strlen( code_point_count ), NULL, 10 ); + + if ( ( SIZE_MAX - count ) < code_points ) + { + fprintf( stderr, "Summing up total code points in '%s' would overflow.\n", filename ); + fclose( fh ); + return NULL; + } + + code_points += count; + ++properties; + } + } + + rewind( fh ); + + if ( ( dp = malloc( sizeof( struct derived_properties_t ) ) ) ) + { + dp->count = properties; + + if ( ( dp->name = calloc( properties, sizeof( char * ) ) ) ) + { + if ( ( dp->begin = calloc( properties, sizeof( size_t ) ) ) ) + { + if ( ( dp->end = calloc( properties, sizeof( size_t ) ) ) ) + { + if ( ( dp->code_points = malloc( code_points * sizeof( size_t ) ) ) ) + { + char * p; + char * range; + properties = 0; /* Re-using the variable */ + code_points = 0; /* Re-using the variable */ + + while ( fgets( buffer, LINE_BUFFER_SIZE, fh ) ) + { + /* Remove comments */ + if ( ( p = strchr( buffer, '#' ) ) != NULL ) + { + *p = '\0'; + } + + /* > 0 because of newline */ + if ( strlen( buffer ) > 1 ) + { + size_t first; + size_t last; + + range = next_token( buffer, ';' ); + p = next_token( 
NULL, ';' ); + + if ( ! range || ! p ) + { + size_t i; + + fprintf( stderr, "Parse error, malformed input.\n" ); + + for ( i = 0; i < properties; ++i ) + { + free( dp->name[ i ] ); + } + + free( dp->name ); + free( dp->begin ); + free( dp->end ); + free( dp->code_points ); + free( dp ); + return NULL; + } + + /* If we got to a new property (except the first) */ + if ( dp->name[ properties ] && strcmp( p, dp->name[ properties ] ) ) + { + /* Index into ->code_points where the previous property ends */ + dp->end[ properties ] = code_points; + ++properties; + } + + /* If we got to a new property, even the first */ + if ( dp->name[ properties ] == NULL ) + { + dp->name[ properties ] = malloc( strlen( p ) + 1 ); + strcpy( dp->name[ properties ], p ); + + /* Index into ->code_points where this property begins */ + dp->begin[ properties ] = code_points; + } + + /* Re-using p, as we have done everything related to the property + name at this point. + */ + first = strtoul( range, &p, 16 ); + + if ( *p == '\0' ) + { + last = first; + } + else + { + while ( *p && ! 
isxdigit( *p ) ) + { + ++p; + } + + last = strtoul( p, NULL, 16 ); + + if ( last <= first ) + { + size_t i; + + fprintf( stderr, "Parse error, malformed input.\n" ); + + for ( i = 0; i < properties; ++i ) + { + free( dp->name[ i ] ); + } + + free( dp->name ); + free( dp->begin ); + free( dp->end ); + free( dp->code_points ); + free( dp ); + return NULL; + } + } + + for ( ; first <= last; ++first ) + { + dp->code_points[ code_points++ ] = first; + } + } + } + + /* Have to end the last property as well */ + dp->end[ properties ] = code_points; + } + else + { + fprintf( stderr, "Memory allocation failure.\n" ); + free( dp->name ); + free( dp->begin ); + free( dp->end ); + free( dp ); + dp = NULL; + } + } + else + { + fprintf( stderr, "Memory allocation failure.\n" ); + free( dp->name ); + free( dp->begin ); + free( dp ); + dp = NULL; + } + } + else + { + fprintf( stderr, "Memory allocation failure.\n" ); + free( dp->name ); + free( dp ); + dp = NULL; + } + } + else + { + fprintf( stderr, "Memory allocation failure.\n" ); + free( dp ); + dp = NULL; + } + } + else + { + fprintf( stderr, "Memory allocation failure.\n" ); + } + } + + fclose( fh ); + return dp; +} + +static int comp( const void * l, const void * r ) +{ + const size_t * lhs = l; + const size_t * rhs = r; + + return ( *lhs < *rhs ) ? -1 : ( *lhs > *rhs ) ? 
1 : 0; +} + +int lookup_property( struct derived_properties_t * dp, const char * property, size_t codepoint ) +{ + size_t i; + + for ( i = 0; i < dp->count; ++i ) + { + /* Look for the requested property */ + if ( strcmp( dp->name[ i ], property ) == 0 ) + { + size_t cp = dp->begin[ i ]; + + return bsearch( &codepoint, dp->code_points + cp, dp->end[ i ] - cp, sizeof( size_t ), comp ) != NULL; + } + } + + return 0; +} + +void release_derived_properties( struct derived_properties_t * dp ) +{ + size_t i; + + for ( i = 0; i < dp->count; ++i ) + { + free( dp->name[ i ] ); + } + + free( dp->name ); + free( dp->begin ); + free( dp->end ); + free( dp->code_points ); + free( dp ); +} + +#ifdef TEST + +#include "test.h" + +int main( void ) +{ + FILE * fh = fopen( "test.txt", "wb+" ); + struct derived_properties_t * dp; + + TESTCASE( fh != NULL ); + TESTCASE( fprintf( fh, "0000..0006 ; Test1 \n" ) == 20 ); + TESTCASE( fprintf( fh, "# Total code points: 7\n" ) == 23 ); + TESTCASE( fprintf( fh, "0001;Test2\n" ) == 11 ); + TESTCASE( fprintf( fh, "# Total code points: 1\n" ) == 23 ); + + fclose( fh ); + dp = read_derived_properties( "test.txt" ); + + TESTCASE( dp != NULL ); + TESTCASE( dp->count == 2 ); + TESTCASE( ! strcmp( dp->name[0], "Test1" ) ); + TESTCASE( ! strcmp( dp->name[1], "Test2" ) ); + + TESTCASE( lookup_property( dp, "Test1", 0 ) ); + TESTCASE( lookup_property( dp, "Test1", 6 ) ); + TESTCASE( ! lookup_property( dp, "Test1", 7 ) ); + + TESTCASE( ! lookup_property( dp, "Test2", 0 ) ); + TESTCASE( lookup_property( dp, "Test2", 1 ) ); + TESTCASE( ! lookup_property( dp, "Test2", 2 ) ); + + TESTCASE( ! lookup_property( dp, "Test", 0 ) ); + TESTCASE( ! 
lookup_property( dp, "Test3", 0 ) ); + + release_derived_properties( dp ); + remove( "test.txt" ); + + return TEST_RESULTS; +} + +#endif diff --git a/src/pdclib/auxiliary/uctype/derived_properties.h b/src/pdclib/auxiliary/uctype/derived_properties.h new file mode 100644 index 0000000..d06ac84 --- /dev/null +++ b/src/pdclib/auxiliary/uctype/derived_properties.h @@ -0,0 +1,34 @@ +/* derived properties + + This file is part of the Public Domain C Library (PDCLib). + Permission is granted to use, modify, and / or redistribute at will. +*/ + +#ifndef DERIVED_PROPERTIES +#define DERIVED_PROPERTIES DERIVED_PROPERTIES + +#include <stddef.h> + +/* https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt */ + +struct derived_properties_t +{ + size_t count; + char * * name; + size_t * begin; + size_t * end; + size_t * code_points; +}; + +static const int derived_properties_fields[] = { + -1, /* code point or code point range */ + -1 /* property name */ +}; + +struct derived_properties_t * read_derived_properties( const char * filename ); + +int lookup_property( struct derived_properties_t * dp, const char * property, size_t codepoint ); + +void release_derived_properties( struct derived_properties_t * dp ); + +#endif diff --git a/src/pdclib/auxiliary/uctype/main.c b/src/pdclib/auxiliary/uctype/main.c new file mode 100644 index 0000000..ef60bb4 --- /dev/null +++ b/src/pdclib/auxiliary/uctype/main.c @@ -0,0 +1,300 @@ +/* main + + This file is part of the Public Domain C Library (PDCLib). + Permission is granted to use, modify, and / or redistribute at will. +*/ + +#include <locale.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#ifdef TEST +#include <wctype.h> +#endif + +#include "uctype.h" + +/* RLE Compressed Output + + <wctype.h> requires *11* flags: + iswupper, iswlower, iswalpha, iswdigit, iswblank, iswspace, + iswcntrl, iswxdigit, iswgraph, iswprint. + iswalnum (the 12th classification function) is *defined* as + iswalpha || iswdigit. 
And iswdigit and iswxdigit are defined + in a rather restrictive way that can be expressed by simple + ranges instead of lookup tables. And isgraph is defined as + isprint && ! isspace (which is trivial to check that it holds + true for all the records provided by get-unicode-ctype, at + least up to Unicode 11.0). + So we have only 8 flags we actually need in a lookup... nicely + reducing the storage requirement to an unsigned char. + + Another trick is to express toupper / tolower as offsets + instead of absolute values, which will allow run-time-length + compression of the data. +*/ + +struct output_record_t +{ + size_t codepoint; + int toupper_diff; + int tolower_diff; + unsigned char flags; +}; + +#ifdef TEST +static void print_codepoint_age( size_t codepoint, struct derived_properties_t * age ) +{ + size_t index = age->count; + + while ( index ) + { + --index; + + if ( lookup_property( age, age->name[ index ], codepoint ) ) + { + printf( "%s", age->name[ index ] ); + return; + } + } +} + +static void print_additional_codepoint_info( size_t codepoint, struct unicode_record_t * ur ) +{ + printf( " - %s", ur->name ); + printf( " - %s", ur->general_category ); + printf( " - %d", ur->canonical_combining_class ); + printf( " - %s", ur->bidi_class ); + printf( " - %s", ( ur->decomposition ? ur->decomposition : "NULL" ) ); + printf( " - %d", ur->numeric_type ); + printf( " - %d", ur->numeric_digit ); + printf( " - %s", ( ur->numeric_value ? ur->numeric_value : "NULL" ) ); + printf( " - %c", ur->bidi_mirrored ); + printf( " - U+%06zx", ur->simple_uppercase_mapping ); + printf( " - U+%06zx", ur->simple_lowercase_mapping ); + printf( " - U+%06zx", ur->simple_titlecase_mapping ); + printf( " - " ); + + /* Implementations are at liberty to return non-zero values other + than 1 for "true". + */ + printf( "%d", ( iswupper( codepoint ) ) ? 1 : 0 ); + printf( "%d", ( iswlower( codepoint ) ) ? 1 : 0 ); + printf( "%d", ( iswalpha( codepoint ) ) ? 
1 : 0 ); + printf( "%d", ( iswdigit( codepoint ) ) ? 1 : 0 ); + printf( "%d", ( iswblank( codepoint ) ) ? 1 : 0 ); + printf( "%d", ( iswspace( codepoint ) ) ? 1 : 0 ); + printf( "%d", ( iswcntrl( codepoint ) ) ? 1 : 0 ); + printf( "%d", ( iswxdigit( codepoint ) ) ? 1 : 0 ); + printf( "%d", ( iswgraph( codepoint ) ) ? 1 : 0 ); + printf( "%d", ( iswprint( codepoint ) ) ? 1 : 0 ); + printf( "%d", ( iswpunct( codepoint ) ) ? 1 : 0 ); +} + +static void print_codepoint_info( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core, struct derived_properties_t * age ) +{ + int rc; + int equal = 1; + + if ( codepoint % 20 == 0 ) + { + printf( " cp up low UlA0_WCXGP.\n" ); + } + + printf( "U+%06zX ", codepoint ); + rc = get_towupper( codepoint, ur ); equal &= ( (unsigned)rc == towupper( codepoint ) ); printf( "U+%06X ", rc ); + rc = get_towlower( codepoint, ur ); equal &= ( (unsigned)rc == towlower( codepoint ) ); printf( "U+%06X ", rc ); + rc = get_iswupper( codepoint, ur, core ); equal &= ( iswupper( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + rc = get_iswlower( codepoint, ur, core ); equal &= ( iswlower( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + rc = get_iswalpha( codepoint, ur, core ); equal &= ( iswalpha( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + rc = get_iswdigit( codepoint ); equal &= ( iswdigit( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + rc = get_iswblank( codepoint, ur ); equal &= ( iswblank( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + rc = get_iswspace( codepoint, ur ); equal &= ( iswspace( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + rc = get_iswcntrl( codepoint, ur ); equal &= ( iswcntrl( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + rc = get_iswxdigit( codepoint ); equal &= ( iswxdigit( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + rc = get_iswgraph( codepoint, ur ); equal &= ( iswgraph( codepoint ) ? 
1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + rc = get_iswprint( codepoint, ur ); equal &= ( iswprint( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + rc = get_iswpunct( codepoint, ur, core ); equal &= ( iswpunct( codepoint ) ? 1 : 0 == rc ); printf( "%d", rc ? 1 : 0 ); + + if ( codepoint != ur->code_point ) + { + /* These two may only differ for codepoint "ranges", which are + signified by "..., First>" / "..., Last>" pairs in UnicodeData. + If they differ and it's NOT a range, that is an error of some + kind. + */ + if ( ! strstr( ur->name, ", Last>" ) || codepoint < ( ur - 1 )->code_point ) + { + printf( " ERROR: U+%06zX != U+%06zX outside of First, Last codepoint range. ", codepoint, ur->code_point ); + } + } + + if ( ! equal ) + { + printf( " ERROR: Deviation from SysLib: " ); + print_codepoint_age( codepoint, age ); + print_additional_codepoint_info( codepoint, ur ); + } + + printf( "\n" ); +} +#else +static struct output_record_t get_output_record( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core ) +{ + struct output_record_t rc; + char buffer[ 9 ]; + + rc.codepoint = codepoint; + rc.toupper_diff = get_towupper( codepoint, ur ) - codepoint; + rc.tolower_diff = get_towlower( codepoint, ur ) - codepoint; + + sprintf( buffer, "%zu%zu%zu%zu%zu%zu%zu%zu", + get_iswupper( codepoint, ur, core ), + get_iswlower( codepoint, ur, core ), + get_iswalpha( codepoint, ur, core ), + get_iswblank( codepoint, ur ), + get_iswspace( codepoint, ur ), + get_iswcntrl( codepoint, ur ), + get_iswprint( codepoint, ur ), + get_iswpunct( codepoint, ur, core ) ); + + rc.flags = strtoul( buffer, NULL, 2 ); + + return rc; +} +#endif + +int main( int argc, char * argv[] ) +{ + struct unicode_data_t * ud; + struct derived_properties_t * core; +#ifdef TEST + struct derived_properties_t * age; +#endif + + char * locale = setlocale( LC_CTYPE, "" ); + + if ( ! 
strstr( locale, "UTF-8" ) || strstr( locale, "TR" ) || strstr( locale, "tr" ) ) + { + fprintf( stderr, "Need non-turkish locale to work correctly.\n'%s' will not do.\n", locale ); + return EXIT_FAILURE; + } + + if ( argc != 4 ) + { + printf( "\n" + "Usage: get-uctypes <UnicodeData.txt> <DerivedCoreProperties.txt>" +#ifdef TEST + " <DerivedAge.txt>" +#endif + "\n\n" + "Generates lookup tables for <wctype.h> from files available from\n" + "the Unicode Consortium.\n" + "\n" + "The required files can be retrieved from the following URL:\n" + "\n" + "http://www.unicode.org/Public/UCD/latest/ucd/\n" + "\n" ); + return EXIT_FAILURE; + } + + if ( ( ud = read_unicode_data( argv[ 1 ] ) ) != NULL ) + { + if ( ( core = read_derived_properties( argv[ 2 ] ) ) != NULL ) + { +#ifndef TEST + /* Print (to file) RLE compressed data */ + FILE * fh = fopen( "ctype.dat", "wb" ); + + if ( fh ) + { + size_t codepoint = 0; + size_t i = 0; + struct unicode_record_t * ur = &(ud->records[i]); + /* Name substring indicating a code point _range_ */ + const char * last = ", Last>"; + + struct output_record_t previous = get_output_record( codepoint, ur, core ); + + fprintf( fh, "%zx ", previous.codepoint ); + + for ( codepoint = 1; codepoint < 0x10fffe; ++codepoint ) + { + struct output_record_t current; + + while ( codepoint > ur->code_point ) + { + ur = &(ud->records[++i]); + } + + if ( codepoint != ur->code_point && ( ur->name && ( strstr( ur->name, last ) != ( ur->name + strlen( ur->name ) - strlen( last ) ) ) ) ) + { + /* Unregistered Code Point */ + continue; + } + + current = get_output_record( codepoint, ur, core ); + + /* RLE */ + if ( current.codepoint != previous.codepoint + 1 || + current.toupper_diff != previous.toupper_diff || + current.tolower_diff != previous.tolower_diff || + current.flags != previous.flags ) + { + fprintf( fh, "%zx %d %d %hhx\n", previous.codepoint, previous.toupper_diff, previous.tolower_diff, previous.flags ); + fprintf( fh, "%zx ", current.codepoint ); + } + 
+ previous = current; + } + + fprintf( fh, "%zx %d %d %hhx\n", previous.codepoint, previous.toupper_diff, previous.tolower_diff, previous.flags ); + fclose( fh ); + } + else + { + fprintf( stderr, "Could not open 'ctype.dat' for writing.\n" ); + } +#else + if ( ( age = read_derived_properties( argv[ 3 ] ) ) != NULL ) + { + /* Print (to screen) raw data comparing our results + to the system library. + Differences are often because the system library + uses older data, which is why we add the age to + the output. + */ + size_t codepoint = 0; + size_t i = 0; + struct unicode_record_t * ur = &(ud->records[i]); + /* Name substring indicating a code point _range_ */ + const char * last = ", Last>"; + + for ( codepoint = 0; codepoint < 0x10fffe; ++codepoint ) + { + while ( codepoint > ur->code_point ) + { + ur = &(ud->records[++i]); + } + + if ( codepoint != ur->code_point && ! name_ends_with( ur, last ) ) + { + /* Unregistered Code Point */ + continue; + } + + print_codepoint_info( codepoint, ur, core, age ); + } + + release_derived_properties( age ); + } +#endif + + release_derived_properties( core ); + } + + release_unicode_data( ud ); + } + + return EXIT_SUCCESS; +} diff --git a/src/pdclib/auxiliary/uctype/test.h b/src/pdclib/auxiliary/uctype/test.h new file mode 100644 index 0000000..3cd33a8 --- /dev/null +++ b/src/pdclib/auxiliary/uctype/test.h @@ -0,0 +1,19 @@ +/* test + + This file is part of the Public Domain C Library (PDCLib). + Permission is granted to use, modify, and / or redistribute at will. 
/* Trim leading and trailing whitespace from a string, in place.
   Returns a pointer past the leading whitespace; trailing whitespace is
   overwritten with null bytes. Memory in front of the returned pointer
   is never modified, and nothing in front of 's' is ever read -- so
   empty and all-whitespace strings are safe (the result is "").
*/
char * trim( char * s )
{
    char * end;

    /* Skip over leading whitespace. The cast is required, as passing
       a negative char value to isspace() is undefined.
    */
    while ( *s && isspace( (unsigned char)*s ) )
    {
        ++s;
    }

    /* Trim trailing whitespace, but never walk back beyond s */
    end = s + strlen( s );

    while ( end > s && isspace( (unsigned char)end[ -1 ] ) )
    {
        *--end = '\0';
    }

    return s;
}
/* Validate a delimited-values file before it is parsed.
   Reads the file line by line into a temporary buffer of buffer_size
   bytes and checks that
   - every line was read in full (i.e. ends with a newline inside the
     buffer),
   - every line that is not empty after removing '#' comments consists of
     exactly 'fields' tokens separated by 'delim',
   - every field i with widths[ i ] >= 0 is shorter than widths[ i ]
     characters (so that it fits a char array of that size).
   Returns the number of lines read, or (size_t)-1 if a check failed or
   the buffer could not be allocated; the reason is written to stderr.
   The file is rewound before checking and again before returning, on
   success and on failure alike.
*/
size_t check_file( FILE * fh, size_t buffer_size, char delim, size_t fields, int const * widths )
{
    char * buffer;
    size_t lines = 0;
    size_t rc = (size_t)-1;

    rewind( fh );

    if ( buffer_size == 0 )
    {
        fprintf( stderr, "Line buffer size must not be zero.\n" );
        return rc;
    }

    /* Dynamically allocated buffer */
    if ( ( buffer = malloc( buffer_size ) ) == NULL )
    {
        fprintf( stderr, "Memory allocation failure.\n" );
        return rc;
    }

    while ( fgets( buffer, (int)buffer_size, fh ) )
    {
        size_t i;
        size_t length;
        char * p;

        ++lines;
        length = strlen( buffer );

        /* Check line for complete read. A zero length (line starting
           with a null byte) cannot be checked for a newline and is
           rejected as well.
        */
        if ( length == 0 || buffer[ length - 1 ] != '\n' )
        {
            fprintf( stderr, "Line %zu will not fit into a %zu-character buffer.\n", lines, buffer_size );
            goto done;
        }

        /* Remove comments */
        if ( ( p = strchr( buffer, '#' ) ) != NULL )
        {
            *p = '\0';
        }

        /* <= 1 because of newline */
        if ( strlen( buffer ) <= 1 )
        {
            continue;
        }

        /* Check field count and field widths */
        p = next_token( buffer, delim );

        for ( i = 0; i < fields; ++i )
        {
            if ( ! p )
            {
                fprintf( stderr, "Line %zu contains less than %zu fields.\n", lines, fields );
                goto done;
            }

            if ( widths[ i ] >= 0 && strlen( p ) >= (unsigned)widths[ i ] )
            {
                fprintf( stderr, "Line %zu: Field %zu '%s' will not fit in a %d character string.\n", lines, i + 1, p, widths[ i ] );
                goto done;
            }

            p = next_token( NULL, delim );
        }

        if ( p )
        {
            fprintf( stderr, "Line %zu contains more than %zu fields.\n", lines, fields );
            goto done;
        }
    }

    /* All checks passed; report the number of lines */
    rc = lines;

done:

    /* Rewind and free the buffer, whatever the outcome */
    rewind( fh );
    free( buffer );
    return rc;
}
) == 2 ); + /* Field 1 too long */ + TESTCASE( fprintf( fh, "%s;%s;%s\n", "", "1234", "1" ) == 8 ); + TESTCASE( check_file( fh, 10, ';', 3, widths ) == (size_t)-1 ); + /* Too few fields */ + TESTCASE( fprintf( fh, "%s;%s\n", "123", "123" ) == 8 ); + TESTCASE( check_file( fh, 10, ';', 3, widths )== (size_t)-1 ); + /* Too many fields */ + TESTCASE( fprintf( fh, "%s;%s;%s;%s\n", "1", "1", "1", "1" ) == 8 ); + TESTCASE( check_file( fh, 10, ';', 3, widths )== (size_t)-1 ); + /* Line too long */ + TESTCASE( fprintf( fh, "%s;%s;%s\n", "12", "123", "12" ) == 10 ); + TESTCASE( check_file( fh, 10, ';', 3, widths )== (size_t)-1 ); + + fclose( fh ); + remove( "test.txt" ); + + /* trim() */ + + strcpy( buffer, " xyz" ); + TESTCASE( ! strcmp( trim( buffer ), "xyz" ) ); + strcpy( buffer, "xyz " ); + TESTCASE( ! strcmp( trim( buffer ), "xyz" ) ); + strcpy( buffer, " xyz " ); + TESTCASE( ! strcmp( trim( buffer ), "xyz" ) ); + strcpy( buffer, " x" ); + TESTCASE( ! strcmp( trim( buffer ), "x" ) ); + strcpy( buffer, "x " ); + TESTCASE( ! strcmp( trim( buffer ), "x" ) ); + strcpy( buffer, " " ); + TESTCASE( ! strcmp( trim( buffer ), "" ) ); + strcpy( buffer, " " ); + TESTCASE( ! strcmp( trim( buffer ), "" ) ); + strcpy( buffer, "" ); + TESTCASE( ! strcmp( trim( buffer ), "" ) ); + + return TEST_RESULTS; +} + +#endif diff --git a/src/pdclib/auxiliary/uctype/text_utilities.h b/src/pdclib/auxiliary/uctype/text_utilities.h new file mode 100644 index 0000000..f961e6b --- /dev/null +++ b/src/pdclib/auxiliary/uctype/text_utilities.h @@ -0,0 +1,59 @@ +/* text utilities + + This file is part of the Public Domain C Library (PDCLib). + Permission is granted to use, modify, and / or redistribute at will. +*/ + +#ifndef TEXT_UTILITIES_H +#define TEXT_UTILITIES_H TEXT_UTILITIES_H + +#include <inttypes.h> +#include <stdio.h> + +/* Trim leading and trailing whitespace from a given string. + Will return a pointer beyond leading whitespace, and overwrite trailing + whitespace with null bytes. 
+*/ +char * trim( char * s ); + +/* A function similar to strtok(), that returns the next token in a string, + up to the next separator character (which is replaced with a null byte) + or up to end-of-string. + As opposed to strtok(), which treats consecutive separators as one, this + function will work "correctly" for those as well, returning a (pointer + to an) empty string in those cases. + Pass the string as first parameter IN THE FIRST CALL ONLY, and NULL in + subsequent calls. The function holds an internal, static pointer to the + string being processed. This, of course, means the function is not thread- + safe. +*/ +char * next_token( char * s, char delim ); + +/* When processing a file with delimited-values, there are a couple of things + you want to be sure about before parsing it: + - the number of lines (data records) in the file; + - that all lines of the file will fit the intended line buffer size; + - that all records in the file indeed have the expected number of fields; + - that none of the fields for which you are assuming a given size exceeds + that size. + (For line buffer size, consider that the buffer must be large enough for + the line contents, the newline (to check that the line was actually read + in full), and the null terminator.) + This function does all that for you in a single pass. The parameters are: + - FILE handle to the file (function will rewind the file before checking, + and rewind again when it is done); + - the intended line buffer size; + - the field delimiter; + - the expected number of fields; + - a pointer to an array holding the expected maximum width for each field, + with a negative value indicating that this field's width need not be + checked. + The function will return the number of lines in the file, or (size_t)-1 + if one of the checks failed. The reason for the failed check will be + written to stderr. (The file is rewound in this case as well.) 
+ This requires reading and tokenizing the file twice, but removes lots of + error checking from the actual parsing, making for cleaner code. +*/ +size_t check_file( FILE * fh, size_t max_line_length, char delim, size_t fields, int const * widths ); + +#endif diff --git a/src/pdclib/auxiliary/uctype/uctype.c b/src/pdclib/auxiliary/uctype/uctype.c new file mode 100644 index 0000000..ce8d8ef --- /dev/null +++ b/src/pdclib/auxiliary/uctype/uctype.c @@ -0,0 +1,85 @@ +/* uctype + + This file is part of the Public Domain C Library (PDCLib). + Permission is granted to use, modify, and / or redistribute at will. +*/ + +#include "uctype.h" + +size_t get_towupper( size_t codepoint, struct unicode_record_t * ur ) +{ + return towupper_differs( ur, codepoint ) ? ur->simple_uppercase_mapping : codepoint; +} + +size_t get_towlower( size_t codepoint, struct unicode_record_t * ur ) +{ + return towlower_differs( ur, codepoint ) ? ur->simple_lowercase_mapping : codepoint; +} + +size_t get_iswupper( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core ) +{ + return towlower_differs( ur, codepoint ) || lookup_property( core, "Uppercase", codepoint ); +} + +size_t get_iswlower( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core ) +{ + return towupper_differs( ur, codepoint ) || lookup_property( core, "Lowercase", codepoint ); +} + +size_t get_iswalpha( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core ) +{ + return lookup_property( core, "Alphabetic", codepoint ) || ( is_general_category( ur, "Nd" ) && ! 
/* Hexadecimal digit check for iswxdigit(): yields 1 for the code points
   U+0030..U+0039 ('0'-'9'), U+0041..U+0046 ('A'-'F') and U+0061..U+0066
   ('a'-'f'), and 0 for every other code point.
*/
size_t get_iswxdigit( size_t codepoint )
{
    /* Decimal digits */
    if ( codepoint >= 0x0030 && codepoint <= 0x0039 )
    {
        return 1;
    }

    /* Uppercase A-F */
    if ( codepoint >= 0x0041 && codepoint <= 0x0046 )
    {
        return 1;
    }

    /* Lowercase a-f */
    if ( codepoint >= 0x0061 && codepoint <= 0x0066 )
    {
        return 1;
    }

    return 0;
}
is_general_category( ur, "Cs" ); +} + +#ifdef TEST + +#include "test.h" + +int main( void ) +{ + TESTCASE( NO_TESTDRIVER ); + + return TEST_RESULTS; +} + +#endif diff --git a/src/pdclib/auxiliary/uctype/uctype.h b/src/pdclib/auxiliary/uctype/uctype.h new file mode 100644 index 0000000..8cdda43 --- /dev/null +++ b/src/pdclib/auxiliary/uctype/uctype.h @@ -0,0 +1,29 @@ +/* uctype data + + This file is part of the Public Domain C Library (PDCLib). + Permission is granted to use, modify, and / or redistribute at will. +*/ + +#ifndef UCTYPE +#define UCTYPE + +#include "derived_properties.h" +#include "unicode_data.h" + +size_t get_towupper( size_t codepoint, struct unicode_record_t * ur ); +size_t get_towlower( size_t codepoint, struct unicode_record_t * ur ); +size_t get_iswupper( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core ); +size_t get_iswlower( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core ); +size_t get_iswalpha( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core ); +size_t get_iswdigit( size_t codepoint ); +size_t get_iswxdigit( size_t codepoint ); +size_t get_iswblank( size_t codepoint, struct unicode_record_t * ur ); +size_t get_iswspace( size_t codepoint, struct unicode_record_t * ur ); +size_t get_iswcntrl( size_t codepoint, struct unicode_record_t * ur ); +size_t get_iswgraph( size_t codepoint, struct unicode_record_t * ur ); +size_t get_iswprint( size_t codepoint, struct unicode_record_t * ur ); +size_t get_iswpunct( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core ); + + +#endif + diff --git a/src/pdclib/auxiliary/uctype/unicode_data.c b/src/pdclib/auxiliary/uctype/unicode_data.c new file mode 100644 index 0000000..5d92fda --- /dev/null +++ b/src/pdclib/auxiliary/uctype/unicode_data.c @@ -0,0 +1,224 @@ +/* unicode data + + This file is part of the Public Domain C Library (PDCLib). 
+ Permission is granted to use, modify, and / or redistribute at will. +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "text_utilities.h" + +#include "unicode_data.h" + +#define LINE_BUFFER_SIZE 500u + +/* Reads a UnicodeData.txt-style file of semicolon-separated fields into a newly allocated unicode_data_t holding one record per line counted by check_file(). Returns NULL if the file cannot be opened, if check_file() returns (size_t)-1, or if allocating the container or the record array fails. The result is released with release_unicode_data(). */ struct unicode_data_t * read_unicode_data( const char * filename ) +{ + FILE * fh; + char buffer[ LINE_BUFFER_SIZE ]; + struct unicode_data_t * ud = NULL; + size_t lines; + + if ( ( fh = fopen( filename, "r" ) ) == NULL ) + { + fprintf( stderr, "Could not open '%s' for reading.\n", filename ); + return NULL; + } + + /* check_file() yields the record count, or (size_t)-1 on failure. Presumably it also enforces the widths in unicode_record_fields and leaves fh at the start of the file -- confirm in text_utilities. */ if ( ( lines = check_file( fh, LINE_BUFFER_SIZE, ';', sizeof( unicode_record_fields ) / sizeof( int ), unicode_record_fields ) ) != (size_t)-1 ) + { + if ( ( ud = malloc( sizeof( struct unicode_data_t ) ) ) ) + { + ud->size = lines; + + /* calloc() zero-fills, so the pointer members of a record stay zeroed when the matching field is empty. */ if ( ( ud->records = calloc( lines, sizeof( struct unicode_record_t ) ) ) ) + { + size_t i; + + for ( i = 0; i < lines; ++i ) + { + char * p; + + /* NOTE(review): the fgets() result is not checked. */ fgets( buffer, LINE_BUFFER_SIZE, fh ); + + ud->records[ i ].code_point = strtoul( next_token( buffer, ';' ), NULL, 16 ); + + p = next_token( NULL, ';' ); + if ( *p ) + { + /* NOTE(review): the malloc() result goes to strcpy() unchecked; the same holds for decomposition and numeric_value. */ ud->records[ i ].name = malloc( strlen( p ) + 1 ); + strcpy( ud->records[ i ].name, p ); + } + + /* Fixed-size destination; the width limit is presumably enforced by check_file() -- confirm. */ strcpy( ud->records[ i ].general_category, next_token( NULL, ';' ) ); + + p = next_token( NULL, ';' ); + /* Empty numeric fields are stored as -1. */ ud->records[ i ].canonical_combining_class = ( *p ) ? strtol( p, NULL, 10 ) : -1l; + + strcpy( ud->records[ i ].bidi_class, next_token( NULL, ';' ) ); + + p = next_token( NULL, ';' ); + if ( *p ) + { + ud->records[ i ].decomposition = malloc( strlen( p ) + 1 ); + strcpy( ud->records[ i ].decomposition, p ); + } + + p = next_token( NULL, ';' ); + ud->records[ i ].numeric_type = ( *p ) ? strtol( p, NULL, 10 ) : -1l; + + p = next_token( NULL, ';' ); + ud->records[ i ].numeric_digit = ( *p ) ? 
strtol( p, NULL, 10 ) : -1l; + + p = next_token( NULL, ';' ); + if ( *p ) + { + ud->records[ i ].numeric_value = malloc( strlen( p ) + 1 ); + strcpy( ud->records[ i ].numeric_value, p ); + } + + p = next_token( NULL, ';' ); + ud->records[ i ].bidi_mirrored = ( *p ) ? *p : '\0'; + + /* The next two fields are read and discarded. */ next_token( NULL, ';' ); /* Unicode_1_Name */ + next_token( NULL, ';' ); /* ISO_Comment */ + + /* Case mappings are hexadecimal code points. */ ud->records[ i ].simple_uppercase_mapping = strtoul( next_token( NULL, ';' ), NULL, 16 ); + ud->records[ i ].simple_lowercase_mapping = strtoul( next_token( NULL, ';' ), NULL, 16 ); + ud->records[ i ].simple_titlecase_mapping = strtoul( next_token( NULL, ';' ), NULL, 16 ); + } + } + else + { + fprintf( stderr, "Memory allocation failure.\n" ); + free( ud ); + ud = NULL; + } + } + else + { + fprintf( stderr, "Memory allocation failure.\n" ); + } + } + + fclose( fh ); + return ud; +} + +/* Nonzero iff the record has a name and it equals name exactly. A record whose name field was empty carries no name string and never matches. */ int has_name( struct unicode_record_t * ur, const char * name ) +{ + return ur->name != NULL && strcmp( ur->name, name ) == 0; +} + +/* Nonzero iff the record has a name that ends with the given string. The tail is compared directly: a strstr() search stops at the first occurrence, so it misses names containing the string more than once, and it needs an out-of-range pointer when the string is longer than the name. An empty string matches every named record. */ int name_ends_with( struct unicode_record_t * ur, const char * name ) +{ + return ur->name != NULL && strlen( name ) <= strlen( ur->name ) && strcmp( ur->name + ( strlen( ur->name ) - strlen( name ) ), name ) == 0; +} + +/* Nonzero iff the general category of the record equals category exactly. */ int is_general_category( struct unicode_record_t * ur, const char * category ) +{ + return strcmp( ur->general_category, category ) == 0; +} + +/* Nonzero iff the record has a decomposition and substring occurs in it. */ int decomposition_contains( struct unicode_record_t * ur, const char * substring ) +{ + return ur->decomposition && strstr( ur->decomposition, substring ) != NULL; +} + +/* Nonzero iff the record has a nonzero simple uppercase mapping that differs from codepoint. */ int towupper_differs( struct unicode_record_t * ur, size_t codepoint ) +{ + return ur->simple_uppercase_mapping && ( ur->simple_uppercase_mapping != codepoint ); +} + +/* Nonzero iff the record has a nonzero simple lowercase mapping that differs from codepoint. */ int towlower_differs( struct unicode_record_t * ur, size_t codepoint ) +{ + return ur->simple_lowercase_mapping && ( ur->simple_lowercase_mapping != codepoint ); +} + +/* Frees the per-record strings, the record array and ud itself. ud must not be NULL. */ void release_unicode_data( struct unicode_data_t * ud ) +{ + size_t i; + + for ( i = 0; i < ud->size; ++i ) + { + free( ud->records[i].name ); + free( ud->records[i].decomposition ); + free( 
ud->records[i].numeric_value ); + } + + free( ud->records ); + free( ud ); +} + +#ifdef TEST + +#include "test.h" + +#include <inttypes.h> + +/* Test driver: writes two UnicodeData-style lines to test.txt, parses them with read_unicode_data(), then checks every stored field and the is_general_category, decomposition_contains, towupper_differs and towlower_differs helpers. has_name() and name_ends_with() are not exercised here. */ int main( void ) +{ + FILE * fh = fopen( "test.txt", "w" ); + struct unicode_data_t * ud; + int rc; + + /* NOTE(review): if fopen() fails, the fprintf() calls below still receive the null stream unless TESTCASE aborts -- confirm in test.h. */ TESTCASE( fh != NULL ); + /* 38 and 69 are the byte counts of the two formatted lines. */ TESTCASE( fprintf( fh, "%04x;%s;%s;%d;%s;;;;;%c;%s;;;;\n", 0, "<control>", "Cc", 0, "BN", 'N', "NULL" ) == 38 ); + TESTCASE( ( rc = fprintf( fh, "%04x;%s;%s;%d;%s;%s;;;%s;%c;;;%04x;;%04x\n", 0x2170, "SMALL ROMAN NUMERAL ONE", "Nl", 0, "L", "<compat> 0069", "1", 'N', 0x2160, 0x2160 ) ) == 69 ); + + fclose( fh ); + ud = read_unicode_data( "test.txt" ); + /* The file is removed right after parsing; the checks below use the in-memory copy. */ remove( "test.txt" ); + + TESTCASE( ud != NULL ); + TESTCASE( ud->size == 2 ); + + /* Record 0: empty fields come back as a null pointer, -1 or 0 depending on the member. */ TESTCASE( ud->records[0].code_point == 0 ); + TESTCASE( strcmp( ud->records[0].name, "<control>" ) == 0 ); + TESTCASE( strcmp( ud->records[0].general_category, "Cc" ) == 0 ); + TESTCASE( ud->records[0].canonical_combining_class == 0 ); + TESTCASE( strcmp( ud->records[0].bidi_class, "BN" ) == 0 ); + TESTCASE( ud->records[0].decomposition == NULL ); + TESTCASE( ud->records[0].numeric_type == -1 ); + TESTCASE( ud->records[0].numeric_digit == -1 ); + TESTCASE( ud->records[0].numeric_value == NULL ); + TESTCASE( ud->records[0].bidi_mirrored == 'N' ); + TESTCASE( ud->records[0].simple_uppercase_mapping == 0 ); + TESTCASE( ud->records[0].simple_lowercase_mapping == 0 ); + TESTCASE( ud->records[0].simple_titlecase_mapping == 0 ); + + /* Record 1: decomposition, numeric value and two case mappings are present. */ TESTCASE( ud->records[1].code_point == 0x2170 ); + TESTCASE( strcmp( ud->records[1].name, "SMALL ROMAN NUMERAL ONE" ) == 0 ); + TESTCASE( strcmp( ud->records[1].general_category, "Nl" ) == 0 ); + TESTCASE( ud->records[1].canonical_combining_class == 0 ); + TESTCASE( strcmp( ud->records[1].bidi_class, "L" ) == 0 ); + TESTCASE( strcmp( ud->records[1].decomposition, "<compat> 0069" ) == 0 ); + TESTCASE( ud->records[1].numeric_type == -1 ); + TESTCASE( ud->records[1].numeric_digit == -1 ); + TESTCASE( strcmp( ud->records[1].numeric_value, "1" ) == 0 
); + TESTCASE( ud->records[1].bidi_mirrored == 'N' ); + TESTCASE( ud->records[1].simple_uppercase_mapping == 0x2160 ); + TESTCASE( ud->records[1].simple_lowercase_mapping == 0 ); + TESTCASE( ud->records[1].simple_titlecase_mapping == 0x2160 ); + + /* Helper predicates applied to the parsed records. */ TESTCASE( is_general_category( &(ud->records[0]), "Cc" ) ); + TESTCASE( ! is_general_category( &(ud->records[0]), "" ) ); + TESTCASE( is_general_category( &(ud->records[1]), "Nl" ) ); + TESTCASE( ! is_general_category( &(ud->records[1]), "Foo" ) ); + + TESTCASE( decomposition_contains( &(ud->records[1]), "<compat>" ) ); + TESTCASE( ! decomposition_contains( &(ud->records[1]), "Foo" ) ); + + /* A mapping of 0 counts as no mapping, so record 0 never differs. */ TESTCASE( ! towupper_differs( &(ud->records[0]), 0 ) ); + TESTCASE( ! towlower_differs( &(ud->records[0]), 0 ) ); + TESTCASE( towupper_differs( &(ud->records[1]), 0x2170 ) ); + TESTCASE( ! towlower_differs( &(ud->records[1]), 0x2170 ) ); + + release_unicode_data( ud ); + + return TEST_RESULTS; +} + +#endif diff --git a/src/pdclib/auxiliary/uctype/unicode_data.h b/src/pdclib/auxiliary/uctype/unicode_data.h new file mode 100644 index 0000000..8cd4832 --- /dev/null +++ b/src/pdclib/auxiliary/uctype/unicode_data.h @@ -0,0 +1,77 @@ +/* unicode data + + This file is part of the Public Domain C Library (PDCLib). + Permission is granted to use, modify, and / or redistribute at will. +*/ + +#ifndef UNICODE_DATA +#define UNICODE_DATA UNICODE_DATA + +#include <stddef.h> + +/* https://www.unicode.org/reports/tr44/#UnicodeData.txt */ + +/* We do not need all these fields at this point, but we read them anyway so we do not need to change much should the need arise later. 
+*/ +struct unicode_record_t +{ + size_t code_point; + char * name; /* allocated by read_unicode_data(); stays zeroed when the field is empty */ + char general_category[ 3 ]; + int canonical_combining_class; /* -1 when the field is empty */ + char bidi_class[ 4 ]; + char * decomposition; /* allocated by read_unicode_data(); stays zeroed when the field is empty */ + int numeric_type; /* -1 when the field is empty */ + int numeric_digit; /* -1 when the field is empty */ + char * numeric_value; /* kept as text; allocated, stays zeroed when the field is empty */ + char bidi_mirrored; /* first character of the field, or NUL when it is empty */ + /*char * unicode_1_name;*/ /* Obsolete as of 6.2.0 */ + /*char * iso_comment;*/ /* Obsolete as of 5.2.0 */ + size_t simple_uppercase_mapping; /* parsed as hexadecimal; 0 when the field is empty */ + size_t simple_lowercase_mapping; /* parsed as hexadecimal; 0 when the field is empty */ + size_t simple_titlecase_mapping; /* parsed as hexadecimal; 0 when the field is empty */ +}; + +/* All records read from one file; size is the number of entries in records. */ struct unicode_data_t +{ + size_t size; + struct unicode_record_t * records; +}; + +/* The assumed field widths, for use with check_file(). -1 presumably means no width limit -- confirm against check_file(). NOTE(review): this table is defined static in a header, so every translation unit including it gets its own copy. */ +static const int unicode_record_fields[] = { + -1, /* code_point */ + -1, /* name */ + 3, /* general_category */ + -1, /* canonical_combining_class */ + 4, /* bidi_class */ + -1, /* decomposition */ + -1, /* numeric_type */ + -1, /* numeric_digit */ + -1, /* numeric_value */ + 2, /* bidi_mirrored */ + -1, /* unicode_1_name */ + -1, /* iso_comment */ + -1, /* simple_uppercase_mapping */ + -1, /* simple_lowercase_mapping */ + -1 /* simple_titlecase_mapping */ +}; + +/* Reads the named file into a newly allocated unicode_data_t. Returns NULL if the file cannot be opened, fails check_file(), or memory for the container or record array cannot be allocated. Release the result with release_unicode_data(). */ struct unicode_data_t * read_unicode_data( const char * filename ); + +/* Nonzero if the record name equals name (strcmp). */ int has_name( struct unicode_record_t * ur, const char * name ); + +/* Meant to report whether the record name ends with the given string. */ int name_ends_with( struct unicode_record_t * ur, const char * name ); + +/* Nonzero if the general category of the record equals category (strcmp). */ int is_general_category( struct unicode_record_t * ur, const char * category ); + +/* Nonzero if the record has a decomposition and substring occurs in it. */ int decomposition_contains( struct unicode_record_t * ur, const char * substring ); + +/* Nonzero if the record has a nonzero simple uppercase mapping that differs from codepoint. */ int towupper_differs( struct unicode_record_t * ur, size_t codepoint ); + +/* Nonzero if the record has a nonzero simple lowercase mapping that differs from codepoint. */ int towlower_differs( struct unicode_record_t * ur, size_t codepoint ); + +/* Frees the name, decomposition and numeric_value of every record, then the record array and ud itself. */ void release_unicode_data( struct unicode_data_t * ud ); + +#endif |