aboutsummaryrefslogtreecommitdiffstats
path: root/src/pdclib/auxiliary
diff options
context:
space:
mode:
Diffstat (limited to 'src/pdclib/auxiliary')
-rw-r--r--src/pdclib/auxiliary/uctype/Makefile48
-rw-r--r--src/pdclib/auxiliary/uctype/derived_properties.c300
-rw-r--r--src/pdclib/auxiliary/uctype/derived_properties.h34
-rw-r--r--src/pdclib/auxiliary/uctype/main.c300
-rw-r--r--src/pdclib/auxiliary/uctype/test.h19
-rw-r--r--src/pdclib/auxiliary/uctype/text_utilities.c206
-rw-r--r--src/pdclib/auxiliary/uctype/text_utilities.h59
-rw-r--r--src/pdclib/auxiliary/uctype/uctype.c85
-rw-r--r--src/pdclib/auxiliary/uctype/uctype.h29
-rw-r--r--src/pdclib/auxiliary/uctype/unicode_data.c224
-rw-r--r--src/pdclib/auxiliary/uctype/unicode_data.h77
11 files changed, 1381 insertions, 0 deletions
diff --git a/src/pdclib/auxiliary/uctype/Makefile b/src/pdclib/auxiliary/uctype/Makefile
new file mode 100644
index 0000000..0d34b98
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/Makefile
@@ -0,0 +1,48 @@
+TARGET := get-uctypes
+# All source files of the project
+SRCFILES := $(wildcard *.c)
+# All header files of the project
+HDRFILES := $(wildcard *.h)
+# All object files in the project
+OBJFILES := $(patsubst %.c,%.o,$(SRCFILES))
+# All test drivers (_t)
+TSTFILES := $(patsubst %.c,%_t,$(SRCFILES))
+# All dependency files (.d)
+DEPFILES := $(patsubst %.c,%.d,$(SRCFILES))
+# All test driver dependency files (_t.d)
+TSTDEPFILES := $(patsubst %,%.d,$(TSTFILES))
+# Compiler warnings and flags used for all objects and test drivers
+
+WARNINGS := -Wall -Wextra -pedantic -Wno-unused-parameter -Wshadow -Wpointer-arith -Wcast-align -Wwrite-strings -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls -Wnested-externs -Winline -Wno-long-long -Wuninitialized -Wstrict-prototypes -Wdeclaration-after-statement
+CFLAGS := -g -std=c99 $(WARNINGS) $(USERFLAGS) -I.
+
+.PHONY: all clean tests
+
+all: $(TARGET)
+
+$(TARGET): $(OBJFILES)
+ @echo " CC $@"
+ @$(CC) $^ -o $@
+ @echo
+
+tests: testdrivers
+ -@rc=0; count=0; failed=""; for file in $(TSTFILES); do echo " TST $$file"; ./$$file; test=$$?; if [ $$test != 0 ]; then rc=`expr $$rc + $$test`; failed="$$failed $$file"; fi; count=`expr $$count + 1`; done; echo; echo "Tests executed: $$count Tests failed: $$rc"; echo; for file in $$failed; do echo "Failed: $$file"; done; echo
+
+testdrivers: $(TSTFILES)
+ @echo
+
+-include $(DEPFILES) $(TSTDEPFILES)
+
+clean:
+ -@$(RM) $(wildcard $(OBJFILES) $(DEPFILES) $(TSTFILES) $(TSTDEPFILES) $(TARGET) aux.a)
+
+%.o: %.c Makefile
+ @echo " CC $@"
+ @$(CC) $(CFLAGS) -MMD -MP -c $< -o $@
+
+%_t: %.c Makefile aux.a
+ @echo " CC $@"
+ @$(CC) $(CFLAGS) -MMD -MP -DTEST $< aux.a -o $@
+
+aux.a: $(OBJFILES)
+ @ar rc $@ $^
diff --git a/src/pdclib/auxiliary/uctype/derived_properties.c b/src/pdclib/auxiliary/uctype/derived_properties.c
new file mode 100644
index 0000000..c024efe
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/derived_properties.c
@@ -0,0 +1,300 @@
+/* derived properties
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "text_utilities.h"
+
+#include "derived_properties.h"
+
+#define LINE_BUFFER_SIZE 500u
+
+/* Frees a (possibly only partially populated) derived_properties_t.
+   'names' is the number of leading entries of dp->name that may hold an
+   allocated string. Accepts NULL for dp and for any of its member arrays.
+*/
+static void discard_derived_properties( struct derived_properties_t * dp, size_t names )
+{
+    size_t i;
+
+    if ( dp == NULL )
+    {
+        return;
+    }
+
+    if ( dp->name != NULL )
+    {
+        for ( i = 0; i < names; ++i )
+        {
+            free( dp->name[ i ] );
+        }
+    }
+
+    free( dp->name );
+    free( dp->begin );
+    free( dp->end );
+    free( dp->code_points );
+    free( dp );
+}
+
+/* Reads a file of "<code point or range> ; <property name>" records (the
+   format of DerivedCoreProperties.txt) into a newly allocated structure.
+
+   The file is read twice: the first pass sums up the "# Total code points:"
+   comment lines to size the arrays (one such line is expected per property),
+   the second pass stores names and code points. Records of one property
+   must be consecutive.
+
+   Returns the structure (to be freed with release_derived_properties()),
+   or NULL if the file could not be opened, failed check_file(), contained
+   more data than announced, or memory ran out. The reason is written to
+   stderr. The file is closed on every path.
+*/
+struct derived_properties_t * read_derived_properties( const char * filename )
+{
+    FILE * fh;
+    char buffer[ LINE_BUFFER_SIZE ];
+    struct derived_properties_t * dp = NULL;
+    const char * code_point_count = "# Total code points: ";
+    const char * error = NULL;
+    size_t max_code_points = 0;
+    size_t max_properties = 0;
+    size_t code_points = 0;
+    size_t properties = 0;
+    int in_property = 0;
+
+    if ( ( fh = fopen( filename, "r" ) ) == NULL )
+    {
+        fprintf( stderr, "Could not open '%s' for reading.\n", filename );
+        return NULL;
+    }
+
+    if ( check_file( fh, LINE_BUFFER_SIZE, ';', sizeof( derived_properties_fields ) / sizeof( int ), derived_properties_fields ) == (size_t)-1 )
+    {
+        /* check_file() has already reported the reason */
+        fclose( fh );
+        return NULL;
+    }
+
+    /* First pass: determine the required array sizes */
+    while ( fgets( buffer, LINE_BUFFER_SIZE, fh ) )
+    {
+        const char * marker = strstr( buffer, code_point_count );
+
+        if ( marker != NULL )
+        {
+            /* Parse right behind the marker, wherever in the line it is */
+            size_t count = strtoul( marker + strlen( code_point_count ), NULL, 10 );
+
+            if ( ( SIZE_MAX - count ) < max_code_points )
+            {
+                fprintf( stderr, "Summing up total code points in '%s' would overflow.\n", filename );
+                fclose( fh );
+                return NULL;
+            }
+
+            max_code_points += count;
+            ++max_properties;
+        }
+    }
+
+    rewind( fh );
+
+    if ( ( dp = calloc( 1, sizeof( *dp ) ) ) != NULL )
+    {
+        /* At least one element each, so that an input without any
+           property does not depend on what calloc( 0, ... ) returns.
+           calloc() also guards the size multiplication against overflow.
+        */
+        dp->name = calloc( max_properties ? max_properties : 1, sizeof( *dp->name ) );
+        dp->begin = calloc( max_properties ? max_properties : 1, sizeof( *dp->begin ) );
+        dp->end = calloc( max_properties ? max_properties : 1, sizeof( *dp->end ) );
+        dp->code_points = calloc( max_code_points ? max_code_points : 1, sizeof( *dp->code_points ) );
+    }
+
+    if ( dp == NULL || dp->name == NULL || dp->begin == NULL || dp->end == NULL || dp->code_points == NULL )
+    {
+        fprintf( stderr, "Memory allocation failure.\n" );
+        discard_derived_properties( dp, 0 );
+        fclose( fh );
+        return NULL;
+    }
+
+    /* Second pass: store property names and code points */
+    while ( fgets( buffer, LINE_BUFFER_SIZE, fh ) )
+    {
+        char * range;
+        char * p;
+        size_t first;
+        size_t last;
+
+        /* Remove comments */
+        if ( ( p = strchr( buffer, '#' ) ) != NULL )
+        {
+            *p = '\0';
+        }
+
+        /* > 1 because of the newline; skips empty and comment-only lines */
+        if ( strlen( buffer ) <= 1 )
+        {
+            continue;
+        }
+
+        range = next_token( buffer, ';' );
+        p = next_token( NULL, ';' );
+
+        if ( ! range || ! p )
+        {
+            error = "Parse error, malformed input.\n";
+            break;
+        }
+
+        /* A name different from the current one ends the current property */
+        if ( in_property && strcmp( p, dp->name[ properties ] ) != 0 )
+        {
+            /* Index into ->code_points where the previous property ends */
+            dp->end[ properties ] = code_points;
+            ++properties;
+            in_property = 0;
+        }
+
+        /* Start of a new property, including the very first one */
+        if ( ! in_property )
+        {
+            if ( properties >= max_properties )
+            {
+                error = "Parse error, more properties than 'Total code points' lines.\n";
+                break;
+            }
+
+            if ( ( dp->name[ properties ] = malloc( strlen( p ) + 1 ) ) == NULL )
+            {
+                error = "Memory allocation failure.\n";
+                break;
+            }
+
+            strcpy( dp->name[ properties ], p );
+
+            /* Index into ->code_points where this property begins */
+            dp->begin[ properties ] = code_points;
+            in_property = 1;
+        }
+
+        /* Re-using p, as everything related to the property name is done */
+        first = strtoul( range, &p, 16 );
+
+        if ( *p == '\0' )
+        {
+            /* Single code point */
+            last = first;
+        }
+        else
+        {
+            /* Range; skip the separator between the two numbers */
+            while ( *p && ! isxdigit( (unsigned char)*p ) )
+            {
+                ++p;
+            }
+
+            last = strtoul( p, NULL, 16 );
+
+            if ( last <= first )
+            {
+                error = "Parse error, malformed input.\n";
+                break;
+            }
+        }
+
+        /* The range holds ( last - first + 1 ) code points; refuse it if
+           that is more than the room left in ->code_points.
+        */
+        if ( ( last - first ) >= ( max_code_points - code_points ) )
+        {
+            error = "Parse error, more code points than announced by 'Total code points' lines.\n";
+            break;
+        }
+
+        do
+        {
+            dp->code_points[ code_points++ ] = first;
+        } while ( first++ < last );
+    }
+
+    fclose( fh );
+
+    if ( error != NULL )
+    {
+        fprintf( stderr, "%s", error );
+        discard_derived_properties( dp, in_property ? properties + 1 : properties );
+        return NULL;
+    }
+
+    /* Have to end the last property as well */
+    if ( in_property )
+    {
+        dp->end[ properties ] = code_points;
+        ++properties;
+    }
+
+    /* Report the properties actually read, so that every name below
+       ->count is a valid string.
+    */
+    dp->count = properties;
+    return dp;
+}
+
+/* Three-way comparison of two size_t values, used as the comparison
+   callback for bsearch(): returns -1, 0 or 1 if the value behind l is
+   less than, equal to or greater than the value behind r.
+*/
+static int comp( const void * l, const void * r )
+{
+    const size_t left = *(const size_t *)l;
+    const size_t right = *(const size_t *)r;
+
+    return ( left > right ) - ( left < right );
+}
+
+/* Returns non-zero if the given code point is listed under the named
+   property, zero if it is not or if the property is unknown.
+   The code points of a property are binary-searched, i.e. they are
+   expected to be stored in ascending order.
+*/
+int lookup_property( struct derived_properties_t * dp, const char * property, size_t codepoint )
+{
+    size_t index = 0;
+    size_t offset;
+    size_t length;
+
+    /* Find the slot of the requested property */
+    while ( index < dp->count && strcmp( dp->name[ index ], property ) != 0 )
+    {
+        ++index;
+    }
+
+    if ( index == dp->count )
+    {
+        /* No such property */
+        return 0;
+    }
+
+    offset = dp->begin[ index ];
+    length = dp->end[ index ] - offset;
+
+    return bsearch( &codepoint, dp->code_points + offset, length, sizeof( size_t ), comp ) != NULL;
+}
+
+/* Frees a structure returned by read_derived_properties(), including
+   all property names and the arrays it refers to.
+*/
+void release_derived_properties( struct derived_properties_t * dp )
+{
+    size_t remaining = dp->count;
+
+    /* The names are individually allocated strings */
+    while ( remaining > 0 )
+    {
+        free( dp->name[ --remaining ] );
+    }
+
+    free( dp->code_points );
+    free( dp->end );
+    free( dp->begin );
+    free( dp->name );
+    free( dp );
+}
+
+#ifdef TEST
+
+#include "test.h"
+
+/* Test driver: writes a small two-property file, reads it back with
+   read_derived_properties() and checks names and lookups.
+   Returns the number of failed test cases.
+*/
+int main( void )
+{
+    FILE * fh = fopen( "test.txt", "wb+" );
+    struct derived_properties_t * dp;
+
+    TESTCASE( fh != NULL );
+
+    if ( fh == NULL )
+    {
+        /* Nothing below can work without the input file */
+        return TEST_RESULTS;
+    }
+
+    TESTCASE( fprintf( fh, "0000..0006 ; Test1 \n" ) == 20 );
+    TESTCASE( fprintf( fh, "# Total code points: 7\n" ) == 23 );
+    TESTCASE( fprintf( fh, "0001;Test2\n" ) == 11 );
+    TESTCASE( fprintf( fh, "# Total code points: 1\n" ) == 23 );
+
+    fclose( fh );
+    dp = read_derived_properties( "test.txt" );
+
+    TESTCASE( dp != NULL );
+
+    /* Only inspect the result if there is one */
+    if ( dp != NULL )
+    {
+        TESTCASE( dp->count == 2 );
+        TESTCASE( ! strcmp( dp->name[0], "Test1" ) );
+        TESTCASE( ! strcmp( dp->name[1], "Test2" ) );
+
+        TESTCASE( lookup_property( dp, "Test1", 0 ) );
+        TESTCASE( lookup_property( dp, "Test1", 6 ) );
+        TESTCASE( ! lookup_property( dp, "Test1", 7 ) );
+
+        TESTCASE( ! lookup_property( dp, "Test2", 0 ) );
+        TESTCASE( lookup_property( dp, "Test2", 1 ) );
+        TESTCASE( ! lookup_property( dp, "Test2", 2 ) );
+
+        TESTCASE( ! lookup_property( dp, "Test", 0 ) );
+        TESTCASE( ! lookup_property( dp, "Test3", 0 ) );
+
+        release_derived_properties( dp );
+    }
+
+    remove( "test.txt" );
+
+    return TEST_RESULTS;
+}
+
+#endif
diff --git a/src/pdclib/auxiliary/uctype/derived_properties.h b/src/pdclib/auxiliary/uctype/derived_properties.h
new file mode 100644
index 0000000..d06ac84
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/derived_properties.h
@@ -0,0 +1,34 @@
+/* derived properties
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#ifndef DERIVED_PROPERTIES
+#define DERIVED_PROPERTIES DERIVED_PROPERTIES
+
+#include <stddef.h>
+
+/* https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt */
+
+struct derived_properties_t
+{
+ size_t count; /* number of properties, i.e. of entries in name / begin / end */
+ char * * name; /* name[ i ]: individually allocated name of property i */
+ size_t * begin; /* begin[ i ]: index into code_points where property i begins */
+ size_t * end; /* end[ i ]: index into code_points one past the last entry of property i */
+ size_t * code_points; /* code points of all properties, stored property by property */
+};
+
+static const int derived_properties_fields[] = { /* per-field widths handed to check_file(); a negative width is not checked */
+ -1, /* code point or code point range */
+ -1 /* property name */
+};
+
+struct derived_properties_t * read_derived_properties( const char * filename );
+
+int lookup_property( struct derived_properties_t * dp, const char * property, size_t codepoint );
+
+void release_derived_properties( struct derived_properties_t * dp );
+
+#endif
diff --git a/src/pdclib/auxiliary/uctype/main.c b/src/pdclib/auxiliary/uctype/main.c
new file mode 100644
index 0000000..ef60bb4
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/main.c
@@ -0,0 +1,300 @@
+/* main
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef TEST
+#include <wctype.h>
+#endif
+
+#include "uctype.h"
+
+/* RLE Compressed Output
+
+ <wctype.h> requires *11* flags:
+ iswupper, iswlower, iswalpha, iswdigit, iswblank, iswspace,
+ iswcntrl, iswxdigit, iswgraph, iswprint, iswpunct.
+ iswalnum (the 12th classification function) is *defined* as
+ iswalpha || iswdigit. And iswdigit and iswxdigit are defined
+ in a rather restrictive way that can be expressed by simple
+ ranges instead of lookup tables. And iswgraph is defined as
+ iswprint && ! iswspace (which is trivial to check that it holds
+ true for all the records provided by get-unicode-ctype, at
+ least up to Unicode 11.0).
+ So we have only 8 flags we actually need in a lookup... nicely
+ reducing the storage requirement to an unsigned char.
+
+ Another trick is to express toupper / tolower as offsets
+ instead of absolute values, which will allow run-length
+ compression of the data.
+*/
+
+struct output_record_t
+{
+ size_t codepoint; /* the code point this record describes */
+ int toupper_diff; /* get_towupper() result minus codepoint */
+ int tolower_diff; /* get_towlower() result minus codepoint */
+ unsigned char flags; /* from most to least significant bit: upper, lower, alpha, blank, space, cntrl, print, punct */
+};
+
+#ifdef TEST
+/* Prints the name of the last property in 'age' that lists the given
+   code point. Prints nothing if no property lists it.
+*/
+static void print_codepoint_age( size_t codepoint, struct derived_properties_t * age )
+{
+    size_t remaining;
+
+    /* Walk the properties back to front, stop at the first hit */
+    for ( remaining = age->count; remaining > 0; --remaining )
+    {
+        const char * name = age->name[ remaining - 1 ];
+
+        if ( lookup_property( age, name, codepoint ) )
+        {
+            printf( "%s", name );
+            break;
+        }
+    }
+}
+
+/* Prints the fields of the given Unicode record, followed by the
+   classification of the code point by the system library's <wctype.h>
+   functions as a string of eleven 0 / 1 digits (upper, lower, alpha,
+   digit, blank, space, cntrl, xdigit, graph, print, punct).
+*/
+static void print_additional_codepoint_info( size_t codepoint, struct unicode_record_t * ur )
+{
+    /* Implementations are at liberty to return non-zero values other
+       than 1 for "true", so each result is normalized to 0 / 1.
+    */
+    const int syslib[] = {
+        iswupper( codepoint ) != 0,
+        iswlower( codepoint ) != 0,
+        iswalpha( codepoint ) != 0,
+        iswdigit( codepoint ) != 0,
+        iswblank( codepoint ) != 0,
+        iswspace( codepoint ) != 0,
+        iswcntrl( codepoint ) != 0,
+        iswxdigit( codepoint ) != 0,
+        iswgraph( codepoint ) != 0,
+        iswprint( codepoint ) != 0,
+        iswpunct( codepoint ) != 0
+    };
+    size_t i;
+
+    printf( " - %s - %s - %d - %s", ur->name, ur->general_category, ur->canonical_combining_class, ur->bidi_class );
+    printf( " - %s", ( ur->decomposition ? ur->decomposition : "NULL" ) );
+    printf( " - %d - %d", ur->numeric_type, ur->numeric_digit );
+    printf( " - %s", ( ur->numeric_value ? ur->numeric_value : "NULL" ) );
+    printf( " - %c", ur->bidi_mirrored );
+    printf( " - U+%06zx - U+%06zx - U+%06zx - ", ur->simple_uppercase_mapping, ur->simple_lowercase_mapping, ur->simple_titlecase_mapping );
+
+    for ( i = 0; i < sizeof( syslib ) / sizeof( syslib[ 0 ] ); ++i )
+    {
+        printf( "%d", syslib[ i ] );
+    }
+}
+
+/* Prints one line for the given code point: the code point, our upper /
+   lower case mappings and our eleven classification flags. If any of
+   these deviates from what the system library reports, an error note
+   with the code point's age and the record's details is appended.
+   A column header is printed before every code point divisible by 20.
+*/
+static void print_codepoint_info( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core, struct derived_properties_t * age )
+{
+    int rc;
+    int equal = 1;
+
+    if ( codepoint % 20 == 0 )
+    {
+        printf( " cp up low UlA0_WCXGP.\n" );
+    }
+
+    printf( "U+%06zX ", codepoint );
+    rc = get_towupper( codepoint, ur ); equal &= ( (unsigned)rc == towupper( codepoint ) ); printf( "U+%06X ", rc );
+    rc = get_towlower( codepoint, ur ); equal &= ( (unsigned)rc == towlower( codepoint ) ); printf( "U+%06X ", rc );
+
+    /* Both sides are normalized to 0 / 1 before comparing. (Without the
+       parentheses, "a ? 1 : 0 == rc" parses as "a ? 1 : ( 0 == rc )" and
+       never flags a deviation when the system library says "true".)
+    */
+    rc = get_iswupper( codepoint, ur, core ); equal &= ( ( iswupper( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+    rc = get_iswlower( codepoint, ur, core ); equal &= ( ( iswlower( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+    rc = get_iswalpha( codepoint, ur, core ); equal &= ( ( iswalpha( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+    rc = get_iswdigit( codepoint ); equal &= ( ( iswdigit( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+    rc = get_iswblank( codepoint, ur ); equal &= ( ( iswblank( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+    rc = get_iswspace( codepoint, ur ); equal &= ( ( iswspace( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+    rc = get_iswcntrl( codepoint, ur ); equal &= ( ( iswcntrl( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+    rc = get_iswxdigit( codepoint ); equal &= ( ( iswxdigit( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+    rc = get_iswgraph( codepoint, ur ); equal &= ( ( iswgraph( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+    rc = get_iswprint( codepoint, ur ); equal &= ( ( iswprint( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+    rc = get_iswpunct( codepoint, ur, core ); equal &= ( ( iswpunct( codepoint ) != 0 ) == ( rc != 0 ) ); printf( "%d", rc ? 1 : 0 );
+
+    if ( codepoint != ur->code_point )
+    {
+        /* These two may only differ for codepoint "ranges", which are
+           signified by "..., First>" / "..., Last>" pairs in UnicodeData.
+           If they differ and it's NOT a range, that is an error of some
+           kind.
+        */
+        if ( ! strstr( ur->name, ", Last>" ) || codepoint < ( ur - 1 )->code_point )
+        {
+            printf( " ERROR: U+%06zX != U+%06zX outside of First, Last codepoint range. ", codepoint, ur->code_point );
+        }
+    }
+
+    if ( ! equal )
+    {
+        printf( " ERROR: Deviation from SysLib: " );
+        print_codepoint_age( codepoint, age );
+        print_additional_codepoint_info( codepoint, ur );
+    }
+
+    printf( "\n" );
+}
+#else
+/* Builds the output record for one code point: the offsets to its upper
+   and lower case mappings, and the eight classification flags packed
+   into one byte (from most to least significant bit: upper, lower,
+   alpha, blank, space, cntrl, print, punct).
+*/
+static struct output_record_t get_output_record( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core )
+{
+    struct output_record_t rc;
+    unsigned flags = 0;
+
+    rc.codepoint = codepoint;
+
+    /* Convert to int before subtracting: a negative offset computed in
+       size_t would wrap around, and converting that huge value to int is
+       implementation-defined.
+    */
+    rc.toupper_diff = (int)get_towupper( codepoint, ur ) - (int)codepoint;
+    rc.tolower_diff = (int)get_towlower( codepoint, ur ) - (int)codepoint;
+
+    /* Shift the flags in one by one, first flag ends up most significant */
+    flags = ( flags << 1 ) | ( get_iswupper( codepoint, ur, core ) != 0 );
+    flags = ( flags << 1 ) | ( get_iswlower( codepoint, ur, core ) != 0 );
+    flags = ( flags << 1 ) | ( get_iswalpha( codepoint, ur, core ) != 0 );
+    flags = ( flags << 1 ) | ( get_iswblank( codepoint, ur ) != 0 );
+    flags = ( flags << 1 ) | ( get_iswspace( codepoint, ur ) != 0 );
+    flags = ( flags << 1 ) | ( get_iswcntrl( codepoint, ur ) != 0 );
+    flags = ( flags << 1 ) | ( get_iswprint( codepoint, ur ) != 0 );
+    flags = ( flags << 1 ) | ( get_iswpunct( codepoint, ur, core ) != 0 );
+
+    rc.flags = (unsigned char)flags;
+
+    return rc;
+}
+#endif
+
+/* Entry point of get-uctypes.
+
+   Regular build: reads UnicodeData.txt (argv[1]) and
+   DerivedCoreProperties.txt (argv[2]) and writes the run-length encoded
+   classification data to 'ctype.dat'. A third argument is accepted and
+   ignored, as earlier versions insisted on it.
+
+   TEST build: additionally reads DerivedAge.txt (argv[3]) and prints, for
+   every code point, our results compared to the system library.
+
+   Returns EXIT_SUCCESS only if all inputs were read and the output was
+   produced, EXIT_FAILURE otherwise.
+*/
+int main( int argc, char * argv[] )
+{
+    struct unicode_data_t * ud;
+    struct derived_properties_t * core;
+#ifdef TEST
+    struct derived_properties_t * age;
+    const int min_args = 4;
+#else
+    /* The third file is only read by the TEST build */
+    const int min_args = 3;
+#endif
+    int rc = EXIT_FAILURE;
+
+    /* setlocale() returns NULL if the request cannot be honored */
+    char * locale = setlocale( LC_CTYPE, "" );
+
+    if ( locale == NULL || ! strstr( locale, "UTF-8" ) || strstr( locale, "TR" ) || strstr( locale, "tr" ) )
+    {
+        fprintf( stderr, "Need non-turkish locale to work correctly.\n'%s' will not do.\n", locale ? locale : "(null)" );
+        return EXIT_FAILURE;
+    }
+
+    if ( argc < min_args || argc > 4 )
+    {
+        printf( "\n"
+                "Usage: get-uctypes <UnicodeData.txt> <DerivedCoreProperties.txt>"
+#ifdef TEST
+                " <DerivedAge.txt>"
+#endif
+                "\n\n"
+                "Generates lookup tables for <wctype.h> from files available from\n"
+                "the Unicode Consortium.\n"
+                "\n"
+                "The required files can be retrieved from the following URL:\n"
+                "\n"
+                "http://www.unicode.org/Public/UCD/latest/ucd/\n"
+                "\n" );
+        return EXIT_FAILURE;
+    }
+
+    if ( ( ud = read_unicode_data( argv[ 1 ] ) ) != NULL )
+    {
+        if ( ( core = read_derived_properties( argv[ 2 ] ) ) != NULL )
+        {
+#ifndef TEST
+            /* Print (to file) RLE compressed data */
+            FILE * fh = fopen( "ctype.dat", "wb" );
+
+            if ( fh )
+            {
+                size_t codepoint = 0;
+                size_t i = 0;
+                struct unicode_record_t * ur = &(ud->records[i]);
+                /* Name substring indicating a code point _range_ */
+                const char * last = ", Last>";
+
+                struct output_record_t previous = get_output_record( codepoint, ur, core );
+
+                /* Each output line is "<first> <last> <toupper diff>
+                   <tolower diff> <flags>"; this opens the first run.
+                */
+                fprintf( fh, "%zx ", previous.codepoint );
+
+                /* NOTE(review): there is no bounds check on ud->records;
+                   the loop presumably relies on the last record being
+                   U+10FFFD -- confirm against read_unicode_data().
+                */
+                for ( codepoint = 1; codepoint < 0x10fffe; ++codepoint )
+                {
+                    struct output_record_t current;
+
+                    /* Advance to the first record at or beyond codepoint */
+                    while ( codepoint > ur->code_point )
+                    {
+                        ur = &(ud->records[++i]);
+                    }
+
+                    /* In between two records, unless the upper one closes
+                       a "First> / Last>" range, the code point is unassigned
+                    */
+                    if ( codepoint != ur->code_point && ( ur->name && ( strstr( ur->name, last ) != ( ur->name + strlen( ur->name ) - strlen( last ) ) ) ) )
+                    {
+                        /* Unregistered Code Point */
+                        continue;
+                    }
+
+                    current = get_output_record( codepoint, ur, core );
+
+                    /* RLE: close the current run and open a new one as soon
+                       as anything but the code point itself changes, or a
+                       gap in the code points occurs.
+                    */
+                    if ( current.codepoint != previous.codepoint + 1 ||
+                         current.toupper_diff != previous.toupper_diff ||
+                         current.tolower_diff != previous.tolower_diff ||
+                         current.flags != previous.flags )
+                    {
+                        fprintf( fh, "%zx %d %d %hhx\n", previous.codepoint, previous.toupper_diff, previous.tolower_diff, previous.flags );
+                        fprintf( fh, "%zx ", current.codepoint );
+                    }
+
+                    previous = current;
+                }
+
+                /* Close the last run */
+                fprintf( fh, "%zx %d %d %hhx\n", previous.codepoint, previous.toupper_diff, previous.tolower_diff, previous.flags );
+
+                /* fclose() flushes, so this is where write errors surface */
+                if ( fclose( fh ) == 0 )
+                {
+                    rc = EXIT_SUCCESS;
+                }
+                else
+                {
+                    fprintf( stderr, "Could not write 'ctype.dat'.\n" );
+                }
+            }
+            else
+            {
+                fprintf( stderr, "Could not open 'ctype.dat' for writing.\n" );
+            }
+#else
+            if ( ( age = read_derived_properties( argv[ 3 ] ) ) != NULL )
+            {
+                /* Print (to screen) raw data comparing our results
+                   to the system library.
+                   Differences are often because the system library
+                   uses older data, which is why we add the age to
+                   the output.
+                */
+                size_t codepoint = 0;
+                size_t i = 0;
+                struct unicode_record_t * ur = &(ud->records[i]);
+                /* Name substring indicating a code point _range_ */
+                const char * last = ", Last>";
+
+                for ( codepoint = 0; codepoint < 0x10fffe; ++codepoint )
+                {
+                    /* Advance to the first record at or beyond codepoint */
+                    while ( codepoint > ur->code_point )
+                    {
+                        ur = &(ud->records[++i]);
+                    }
+
+                    if ( codepoint != ur->code_point && ! name_ends_with( ur, last ) )
+                    {
+                        /* Unregistered Code Point */
+                        continue;
+                    }
+
+                    print_codepoint_info( codepoint, ur, core, age );
+                }
+
+                release_derived_properties( age );
+                rc = EXIT_SUCCESS;
+            }
+#endif
+
+            release_derived_properties( core );
+        }
+
+        release_unicode_data( ud );
+    }
+
+    return rc;
+}
diff --git a/src/pdclib/auxiliary/uctype/test.h b/src/pdclib/auxiliary/uctype/test.h
new file mode 100644
index 0000000..3cd33a8
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/test.h
@@ -0,0 +1,19 @@
+/* test
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#ifndef TEST_H
+#define TEST_H TEST_H
+
+#include <stdio.h>
+
+#define NO_TESTDRIVER 0
+
+static int TEST_RESULTS = 0;
+
+#define TESTCASE( x ) if ( x ) {} \
+ else { TEST_RESULTS += 1; printf( "FAILED: " __FILE__ ", line %d - %s\n", __LINE__, #x ); }
+
+#endif
diff --git a/src/pdclib/auxiliary/uctype/text_utilities.c b/src/pdclib/auxiliary/uctype/text_utilities.c
new file mode 100644
index 0000000..20973d9
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/text_utilities.c
@@ -0,0 +1,206 @@
+/* text utilities
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#include "text_utilities.h"
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Trims leading and trailing whitespace from s, in place.
+   Returns a pointer to the first non-whitespace character of s (or to
+   its terminator, if there is none); trailing whitespace is overwritten
+   with null bytes. Never reads or writes in front of the returned
+   pointer, so empty and all-whitespace strings are safe.
+*/
+char * trim( char * s )
+{
+    char * end;
+
+    /* Skip over leading whitespace ('\0' is not whitespace, so this
+       stops at the terminator at the latest). The cast is required as
+       <ctype.h> functions are undefined for negative char values.
+    */
+    while ( isspace( (unsigned char)*s ) )
+    {
+        ++s;
+    }
+
+    /* Trim trailing whitespace, but do not back up beyond s */
+    end = s + strlen( s );
+
+    while ( end > s && isspace( (unsigned char)end[ -1 ] ) )
+    {
+        *--end = '\0';
+    }
+
+    return s;
+}
+
+/* Returns the next token of the string being processed, trimmed, or NULL
+   if no string has been set or the previous one has been exhausted.
+   Pass the string in the first call and NULL in subsequent calls; the
+   position is kept in a static pointer. The delimiter ending the token
+   is overwritten with a null byte.
+*/
+char * next_token( char * s, char delim )
+{
+    static char * cursor = NULL;
+    char * token;
+    char * separator;
+
+    if ( s != NULL )
+    {
+        /* Start over with a new string */
+        cursor = s;
+    }
+
+    if ( cursor == NULL )
+    {
+        /* Never initialized, or the previous string is exhausted */
+        return NULL;
+    }
+
+    token = cursor;
+    separator = strchr( token, delim );
+
+    if ( separator != NULL )
+    {
+        /* Terminate this token, continue behind the delimiter next time */
+        *separator = '\0';
+        cursor = separator + 1;
+    }
+    else
+    {
+        /* This was the last token of the string */
+        cursor = NULL;
+    }
+
+    return trim( token );
+}
+
+/* Checks a file of delimited values before it is actually parsed.
+
+   fh          - the file; rewound before and after checking
+   buffer_size - size of the line buffer the caller intends to use; each
+                 line including its newline must fit
+   delim       - the field delimiter
+   fields      - the exact number of fields expected in each record
+   widths      - one entry per field; a field must be shorter than a
+                 non-negative entry, negative entries are not checked
+
+   Text from a '#' to the end of the line is ignored, as are lines that
+   are empty afterwards.
+
+   Returns the number of lines in the file, or (size_t)-1 if a check
+   failed or the buffer could not be allocated. The reason is written to
+   stderr.
+*/
+size_t check_file( FILE * fh, size_t buffer_size, char delim, size_t fields, int const * widths )
+{
+    /* Dynamically allocated buffer */
+    char * buffer = malloc( buffer_size );
+    size_t lines = 0;
+    int failed = 0;
+
+    if ( buffer == NULL )
+    {
+        fprintf( stderr, "Memory allocation failure.\n" );
+        return (size_t)-1;
+    }
+
+    rewind( fh );
+
+    while ( fgets( buffer, buffer_size, fh ) )
+    {
+        size_t length = strlen( buffer );
+        size_t i;
+        char * p;
+
+        ++lines;
+
+        /* Check line for complete read. (The length can be zero if the
+           line starts with a null byte; do not index in front of the
+           buffer then.)
+        */
+        if ( length == 0 || buffer[ length - 1 ] != '\n' )
+        {
+            fprintf( stderr, "Line %zu will not fit into a %zu-character buffer.\n", lines, buffer_size );
+            failed = 1;
+            break;
+        }
+
+        /* Remove comments */
+        if ( ( p = strchr( buffer, '#' ) ) != NULL )
+        {
+            *p = '\0';
+        }
+
+        /* > 1 because of newline */
+        if ( strlen( buffer ) <= 1 )
+        {
+            continue;
+        }
+
+        /* Check field count and field widths */
+        p = next_token( buffer, delim );
+
+        for ( i = 0; i < fields; ++i )
+        {
+            if ( ! p )
+            {
+                fprintf( stderr, "Line %zu contains less than %zu fields.\n", lines, fields );
+                failed = 1;
+                break;
+            }
+
+            if ( widths[ i ] >= 0 && strlen( p ) >= (unsigned)widths[ i ] )
+            {
+                fprintf( stderr, "Line %zu: Field %zu '%s' will not fit in a %d character string.\n", lines, i + 1, p, widths[ i ] );
+                failed = 1;
+                break;
+            }
+
+            p = next_token( NULL, delim );
+        }
+
+        if ( failed )
+        {
+            break;
+        }
+
+        /* Anything left after the expected fields is one field too many */
+        if ( p )
+        {
+            fprintf( stderr, "Line %zu contains more than %zu fields.\n", lines, fields );
+            failed = 1;
+            break;
+        }
+    }
+
+    /* Rewind, free the buffer, and report the result */
+    rewind( fh );
+    free( buffer );
+    return failed ? (size_t)-1 : lines;
+}
+
+#ifdef TEST
+
+#include "test.h"
+
+/* Test driver for check_file(), next_token() and trim().
+   Returns the number of failed test cases.
+*/
+int main( void )
+{
+    FILE * fh = fopen( "test.txt", "wb+" );
+    int widths[] = { 4, 4, 4 };
+    char buffer[ 500 ];
+
+    /* check_file() (and, as dependency, next_token()) */
+
+    TESTCASE( fh != NULL );
+
+    /* The file-based tests cannot run without the file */
+    if ( fh != NULL )
+    {
+        /* All ok */
+        TESTCASE( fprintf( fh, "%s;%s;%s\n", "1", "123", "12" ) == 9 );
+        TESTCASE( fprintf( fh, ";;\n" ) == 3 );
+        TESTCASE( check_file( fh, 10, ';', 3, widths ) == 2 );
+        /* Field 2 too long */
+        TESTCASE( fprintf( fh, "%s;%s;%s\n", "", "1234", "1" ) == 8 );
+        TESTCASE( check_file( fh, 10, ';', 3, widths ) == (size_t)-1 );
+        /* Too few fields */
+        TESTCASE( fprintf( fh, "%s;%s\n", "123", "123" ) == 8 );
+        TESTCASE( check_file( fh, 10, ';', 3, widths )== (size_t)-1 );
+        /* Too many fields */
+        TESTCASE( fprintf( fh, "%s;%s;%s;%s\n", "1", "1", "1", "1" ) == 8 );
+        TESTCASE( check_file( fh, 10, ';', 3, widths )== (size_t)-1 );
+        /* Line too long */
+        TESTCASE( fprintf( fh, "%s;%s;%s\n", "12", "123", "12" ) == 10 );
+        TESTCASE( check_file( fh, 10, ';', 3, widths )== (size_t)-1 );
+
+        fclose( fh );
+        remove( "test.txt" );
+    }
+
+    /* trim() */
+
+    strcpy( buffer, " xyz" );
+    TESTCASE( ! strcmp( trim( buffer ), "xyz" ) );
+    strcpy( buffer, "xyz " );
+    TESTCASE( ! strcmp( trim( buffer ), "xyz" ) );
+    strcpy( buffer, " xyz " );
+    TESTCASE( ! strcmp( trim( buffer ), "xyz" ) );
+    strcpy( buffer, " x" );
+    TESTCASE( ! strcmp( trim( buffer ), "x" ) );
+    strcpy( buffer, "x " );
+    TESTCASE( ! strcmp( trim( buffer ), "x" ) );
+    strcpy( buffer, " " );
+    TESTCASE( ! strcmp( trim( buffer ), "" ) );
+    strcpy( buffer, " " );
+    TESTCASE( ! strcmp( trim( buffer ), "" ) );
+    strcpy( buffer, "" );
+    TESTCASE( ! strcmp( trim( buffer ), "" ) );
+
+    return TEST_RESULTS;
+}
+
+#endif
diff --git a/src/pdclib/auxiliary/uctype/text_utilities.h b/src/pdclib/auxiliary/uctype/text_utilities.h
new file mode 100644
index 0000000..f961e6b
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/text_utilities.h
@@ -0,0 +1,59 @@
+/* text utilities
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#ifndef TEXT_UTILITIES_H
+#define TEXT_UTILITIES_H TEXT_UTILITIES_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+/* Trim leading and trailing whitespace from a given string.
+ Will return a pointer beyond leading whitespace, and overwrite trailing
+ whitespace with null bytes.
+*/
+char * trim( char * s );
+
+/* A function similar to strtok(), that returns the next token in a string,
+ up to the next separator character (which is replaced with a null byte)
+ or up to end-of-string.
+ As opposed to strtok(), which treats consecutive separators as one, this
+ function will work "correctly" for those as well, returning a (pointer
+ to an) empty string in those cases.
+ Pass the string as first parameter IN THE FIRST CALL ONLY, and NULL in
+ subsequent calls. The function holds an internal, static pointer to the
+ string being processed. This, of course, means the function is not thread-
+ safe.
+*/
+char * next_token( char * s, char delim );
+
+/* When processing a file with delimited-values, there are a couple of things
+ you want to be sure about before parsing it:
+ - the number of lines (data records) in the file;
+ - that all lines of the file will fit the intended line buffer size;
+ - that all records in the file indeed have the expected number of fields;
+ - that none of the fields for which you are assuming a given size exceeds
+ that size.
+ (For line buffer size, consider that the buffer must be large enough for
+ the line contents, the newline (to check that the line was actually read
+ in full), and the null terminator.)
+ This function does all that for you in a single pass. The parameters are:
+ - FILE handle to the file (function will rewind the file before checking,
+ and rewind again when it is done);
+ - the intended line buffer size;
+ - the field delimiter;
+ - the expected number of fields;
+ - a pointer to an array holding the expected maximum width for each field,
+ with a negative value indicating that this field's width need not be
+ checked.
+ The function will return the number of lines in the file, or (size_t)-1
+ if one of the checks failed. The reason for the failed check will be
+ written to stderr. (The file is rewound in this case as well.)
+ This requires reading and tokenizing the file twice, but removes lots of
+ error checking from the actual parsing, making for cleaner code.
+*/
+size_t check_file( FILE * fh, size_t max_line_length, char delim, size_t fields, int const * widths );
+
+#endif
diff --git a/src/pdclib/auxiliary/uctype/uctype.c b/src/pdclib/auxiliary/uctype/uctype.c
new file mode 100644
index 0000000..ce8d8ef
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/uctype.c
@@ -0,0 +1,85 @@
+/* uctype
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#include "uctype.h"
+
+/* Returns the record's simple uppercase mapping if towupper_differs()
+   reports one for this code point, the code point itself otherwise.
+*/
+size_t get_towupper( size_t codepoint, struct unicode_record_t * ur )
+{
+    if ( towupper_differs( ur, codepoint ) )
+    {
+        return ur->simple_uppercase_mapping;
+    }
+
+    return codepoint;
+}
+
+/* Returns the record's simple lowercase mapping if towlower_differs()
+   reports one for this code point, the code point itself otherwise.
+*/
+size_t get_towlower( size_t codepoint, struct unicode_record_t * ur )
+{
+    if ( towlower_differs( ur, codepoint ) )
+    {
+        return ur->simple_lowercase_mapping;
+    }
+
+    return codepoint;
+}
+
+/* Returns 1 if the code point has a differing lowercase mapping or is
+   listed under the "Uppercase" derived property, 0 otherwise.
+*/
+size_t get_iswupper( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core )
+{
+    if ( towlower_differs( ur, codepoint ) )
+    {
+        return 1;
+    }
+
+    return lookup_property( core, "Uppercase", codepoint ) != 0;
+}
+
+/* Returns 1 if the code point has a differing uppercase mapping or is
+   listed under the "Lowercase" derived property, 0 otherwise.
+*/
+size_t get_iswlower( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core )
+{
+    if ( towupper_differs( ur, codepoint ) )
+    {
+        return 1;
+    }
+
+    return lookup_property( core, "Lowercase", codepoint ) != 0;
+}
+
+/* Returns 1 if the code point is listed under the "Alphabetic" derived
+   property, or is of general category "Nd" without being one of the
+   digits accepted by get_iswdigit(); 0 otherwise.
+*/
+size_t get_iswalpha( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core )
+{
+    if ( lookup_property( core, "Alphabetic", codepoint ) )
+    {
+        return 1;
+    }
+
+    if ( ! is_general_category( ur, "Nd" ) )
+    {
+        return 0;
+    }
+
+    return ! get_iswdigit( codepoint );
+}
+
+/* Returns 1 for the code points U+0030 through U+0039 ('0' - '9'),
+   0 for everything else.
+*/
+size_t get_iswdigit( size_t codepoint )
+{
+    if ( codepoint < 0x0030 )
+    {
+        return 0;
+    }
+
+    return codepoint <= 0x0039;
+}
+
+/* Returns 1 for the code points of '0' - '9' (U+0030 - U+0039),
+   'A' - 'F' (U+0041 - U+0046) and 'a' - 'f' (U+0061 - U+0066),
+   0 for everything else.
+*/
+size_t get_iswxdigit( size_t codepoint )
+{
+    if ( codepoint >= 0x0030 && codepoint <= 0x0039 )
+    {
+        return 1;
+    }
+
+    if ( codepoint >= 0x0041 && codepoint <= 0x0046 )
+    {
+        return 1;
+    }
+
+    return codepoint >= 0x0061 && codepoint <= 0x0066;
+}
+
+/* Returns 1 for U+0009, and for records of general category "Zs" whose
+   decomposition does not contain "<noBreak>"; 0 otherwise.
+*/
+size_t get_iswblank( size_t codepoint, struct unicode_record_t * ur )
+{
+    if ( codepoint == 0x0009 )
+    {
+        return 1;
+    }
+
+    if ( ! is_general_category( ur, "Zs" ) )
+    {
+        return 0;
+    }
+
+    return ! decomposition_contains( ur, "<noBreak>" );
+}
+
+/* Returns 1 for records of general category "Zl" or "Zp", for records of
+   general category "Zs" whose decomposition does not contain "<noBreak>",
+   for U+0020 and for U+0009 through U+000D; 0 otherwise.
+*/
+size_t get_iswspace( size_t codepoint, struct unicode_record_t * ur )
+{
+    if ( is_general_category( ur, "Zl" ) )
+    {
+        return 1;
+    }
+
+    if ( is_general_category( ur, "Zp" ) )
+    {
+        return 1;
+    }
+
+    if ( is_general_category( ur, "Zs" ) && ! decomposition_contains( ur, "<noBreak>" ) )
+    {
+        return 1;
+    }
+
+    if ( codepoint == 0x0020 )
+    {
+        return 1;
+    }
+
+    return codepoint >= 0x0009 && codepoint <= 0x000D;
+}
+
+/* Returns 1 for records of general category "Zl" or "Zp" and for records
+   named "<control>", 0 otherwise. The code point itself is not looked at.
+*/
+size_t get_iswcntrl( size_t codepoint, struct unicode_record_t * ur )
+{
+    if ( is_general_category( ur, "Zl" ) )
+    {
+        return 1;
+    }
+
+    if ( is_general_category( ur, "Zp" ) )
+    {
+        return 1;
+    }
+
+    return has_name( ur, "<control>" ) != 0;
+}
+
+/* Returns 0 for records of general category "Cs", for records named
+   "<control>" and for everything get_iswspace() accepts; 1 otherwise.
+*/
+size_t get_iswgraph( size_t codepoint, struct unicode_record_t * ur )
+{
+    if ( is_general_category( ur, "Cs" ) )
+    {
+        return 0;
+    }
+
+    if ( has_name( ur, "<control>" ) )
+    {
+        return 0;
+    }
+
+    return ! get_iswspace( codepoint, ur );
+}
+
+/* Returns 0 for records of general category "Zp", "Zl" or "Cs" and for
+   records named "<control>", 1 otherwise. The code point itself is not
+   looked at.
+*/
+size_t get_iswprint( size_t codepoint, struct unicode_record_t * ur )
+{
+    if ( is_general_category( ur, "Zp" ) )
+    {
+        return 0;
+    }
+
+    if ( is_general_category( ur, "Zl" ) )
+    {
+        return 0;
+    }
+
+    if ( is_general_category( ur, "Cs" ) )
+    {
+        return 0;
+    }
+
+    return ! has_name( ur, "<control>" );
+}
+
+/* Returns 0 for everything accepted by get_iswalpha(), get_iswdigit() or
+   get_iswspace(), for records named "<control>" and for records of
+   general category "Cs"; 1 otherwise.
+*/
+size_t get_iswpunct( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core )
+{
+    if ( get_iswalpha( codepoint, ur, core ) )
+    {
+        return 0;
+    }
+
+    if ( get_iswdigit( codepoint ) )
+    {
+        return 0;
+    }
+
+    if ( has_name( ur, "<control>" ) )
+    {
+        return 0;
+    }
+
+    if ( get_iswspace( codepoint, ur ) )
+    {
+        return 0;
+    }
+
+    return ! is_general_category( ur, "Cs" );
+}
+
+#ifdef TEST
+
+#include "test.h"
+
+int main( void )
+{
+ TESTCASE( NO_TESTDRIVER ); /* NO_TESTDRIVER is 0 in test.h, so this always records (and prints) one failure */
+
+ return TEST_RESULTS; /* number of failed test cases */
+}
+
+#endif
diff --git a/src/pdclib/auxiliary/uctype/uctype.h b/src/pdclib/auxiliary/uctype/uctype.h
new file mode 100644
index 0000000..8cdda43
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/uctype.h
@@ -0,0 +1,29 @@
+/* uctype data
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#ifndef UCTYPE
+#define UCTYPE
+
+#include "derived_properties.h"
+#include "unicode_data.h"
+
+/* Case mapping and case / alphabetic classification for one code point.
+   NOTE(review): judging by the names these mirror towupper(), towlower(),
+   iswupper(), iswlower() and iswalpha() -- confirm against the definitions
+   in uctype.c.
+*/
+size_t get_towupper( size_t codepoint, struct unicode_record_t * ur );
+size_t get_towlower( size_t codepoint, struct unicode_record_t * ur );
+size_t get_iswupper( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core );
+size_t get_iswlower( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core );
+size_t get_iswalpha( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core );
+/* Decided from the code point value alone: U+0030..U+0039, plus
+   U+0041..U+0046 and U+0061..U+0066 for the hexadecimal variant.
+   Both return 1 or 0.
+*/
+size_t get_iswdigit( size_t codepoint );
+size_t get_iswxdigit( size_t codepoint );
+/* Decided from the code point together with its UnicodeData record
+   (general category, name, decomposition). All return 1 or 0.
+*/
+size_t get_iswblank( size_t codepoint, struct unicode_record_t * ur );
+size_t get_iswspace( size_t codepoint, struct unicode_record_t * ur );
+size_t get_iswcntrl( size_t codepoint, struct unicode_record_t * ur );
+size_t get_iswgraph( size_t codepoint, struct unicode_record_t * ur );
+size_t get_iswprint( size_t codepoint, struct unicode_record_t * ur );
+/* Returns 1 for code points that are not alphabetic, digit, space, control
+   or surrogate (general category "Cs"), and 0 otherwise.
+*/
+size_t get_iswpunct( size_t codepoint, struct unicode_record_t * ur, struct derived_properties_t * core );
+
+
+#endif
+
diff --git a/src/pdclib/auxiliary/uctype/unicode_data.c b/src/pdclib/auxiliary/uctype/unicode_data.c
new file mode 100644
index 0000000..5d92fda
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/unicode_data.c
@@ -0,0 +1,224 @@
+/* unicode data
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "text_utilities.h"
+
+#include "unicode_data.h"
+
+#define LINE_BUFFER_SIZE 500u
+
+/* Store a copy of token in newly allocated memory at *target.
+   An empty token is not copied; *target is left as it was (NULL in a
+   zero-initialized record).
+   Returns 0 on success, -1 if the allocation failed.
+*/
+static int copy_optional_field( char ** target, const char * token )
+{
+    if ( *token )
+    {
+        if ( ( *target = malloc( strlen( token ) + 1 ) ) == NULL )
+        {
+            return -1;
+        }
+
+        strcpy( *target, token );
+    }
+
+    return 0;
+}
+
+/* Convert a decimal token to int, using -1 to mark an empty field. */
+static int optional_decimal( const char * token )
+{
+    return ( *token ) ? strtol( token, NULL, 10 ) : -1l;
+}
+
+/* Split one semicolon-separated line and store its fields in *record.
+   The fields are consumed strictly in file order, as next_token() keeps
+   its position between calls.
+   Returns 0 on success, -1 if memory for a string field could not be
+   allocated (the record may then be partially filled).
+*/
+static int parse_unicode_record( char * line, struct unicode_record_t * record )
+{
+    char * p;
+
+    record->code_point = strtoul( next_token( line, ';' ), NULL, 16 );
+
+    if ( copy_optional_field( &record->name, next_token( NULL, ';' ) ) != 0 )
+    {
+        return -1;
+    }
+
+    /* The fixed-size members rely on check_file() having validated the
+       field widths beforehand.
+    */
+    strcpy( record->general_category, next_token( NULL, ';' ) );
+
+    record->canonical_combining_class = optional_decimal( next_token( NULL, ';' ) );
+
+    strcpy( record->bidi_class, next_token( NULL, ';' ) );
+
+    if ( copy_optional_field( &record->decomposition, next_token( NULL, ';' ) ) != 0 )
+    {
+        return -1;
+    }
+
+    record->numeric_type = optional_decimal( next_token( NULL, ';' ) );
+    record->numeric_digit = optional_decimal( next_token( NULL, ';' ) );
+
+    if ( copy_optional_field( &record->numeric_value, next_token( NULL, ';' ) ) != 0 )
+    {
+        return -1;
+    }
+
+    p = next_token( NULL, ';' );
+    record->bidi_mirrored = ( *p ) ? *p : '\0';
+
+    next_token( NULL, ';' ); /* Unicode_1_Name */
+    next_token( NULL, ';' ); /* ISO_Comment */
+
+    record->simple_uppercase_mapping = strtoul( next_token( NULL, ';' ), NULL, 16 );
+    record->simple_lowercase_mapping = strtoul( next_token( NULL, ';' ), NULL, 16 );
+    record->simple_titlecase_mapping = strtoul( next_token( NULL, ';' ), NULL, 16 );
+
+    return 0;
+}
+
+/* Read a file in UnicodeData.txt format into memory.
+
+   filename - path of the file to read.
+
+   Returns a newly allocated unicode_data_t holding one record per line, to
+   be released with release_unicode_data(). Returns NULL if the file could
+   not be opened, was rejected by check_file(), could not be read
+   completely, or if memory ran out. Except for a check_file() rejection, a
+   message is written to stderr by this function.
+*/
+struct unicode_data_t * read_unicode_data( const char * filename )
+{
+    FILE * fh;
+    char buffer[ LINE_BUFFER_SIZE ];
+    struct unicode_data_t * ud = NULL;
+    size_t lines;
+    size_t i;
+
+    if ( ( fh = fopen( filename, "r" ) ) == NULL )
+    {
+        fprintf( stderr, "Could not open '%s' for reading.\n", filename );
+        return NULL;
+    }
+
+    lines = check_file( fh, LINE_BUFFER_SIZE, ';', sizeof( unicode_record_fields ) / sizeof( unicode_record_fields[ 0 ] ), unicode_record_fields );
+
+    if ( lines == (size_t)-1 )
+    {
+        fclose( fh );
+        return NULL;
+    }
+
+    if ( ( ud = malloc( sizeof( *ud ) ) ) == NULL )
+    {
+        fprintf( stderr, "Memory allocation failure.\n" );
+        fclose( fh );
+        return NULL;
+    }
+
+    ud->size = lines;
+
+    /* calloc() so that every string member starts out as NULL; this is what
+       makes it safe to release a partially filled table below.
+    */
+    if ( ( ud->records = calloc( lines, sizeof( *ud->records ) ) ) == NULL )
+    {
+        fprintf( stderr, "Memory allocation failure.\n" );
+        free( ud );
+        fclose( fh );
+        return NULL;
+    }
+
+    for ( i = 0; i < lines; ++i )
+    {
+        if ( fgets( buffer, LINE_BUFFER_SIZE, fh ) == NULL )
+        {
+            /* Fewer lines than check_file() counted, or a read error.
+               Never parse a stale or indeterminate buffer.
+            */
+            fprintf( stderr, "Could not read line %lu of '%s'.\n", (unsigned long)( i + 1 ), filename );
+            release_unicode_data( ud );
+            ud = NULL;
+            break;
+        }
+
+        if ( parse_unicode_record( buffer, &ud->records[ i ] ) != 0 )
+        {
+            fprintf( stderr, "Memory allocation failure.\n" );
+            release_unicode_data( ud );
+            ud = NULL;
+            break;
+        }
+    }
+
+    fclose( fh );
+    return ud;
+}
+
+/* Check whether the record's name is exactly the given string.
+
+   ur   - the record to inspect.
+   name - the name to compare against.
+
+   Returns 1 on an exact match, 0 otherwise. A record without a name (name
+   is NULL, as left by read_unicode_data() for an empty name field) matches
+   nothing; this mirrors the NULL handling of decomposition_contains().
+*/
+int has_name( struct unicode_record_t * ur, const char * name )
+{
+    return ur->name != NULL && strcmp( ur->name, name ) == 0;
+}
+
+/* Check whether the record's name ends with the given string.
+
+   ur   - the record to inspect.
+   name - the suffix to look for.
+
+   Returns 1 if the name ends with the suffix, 0 otherwise. A record without
+   a name (NULL) and a suffix longer than the name both yield 0. An empty
+   suffix matches any present name.
+*/
+int name_ends_with( struct unicode_record_t * ur, const char * name )
+{
+    size_t name_length;
+    size_t suffix_length;
+
+    if ( ur->name == NULL )
+    {
+        return 0;
+    }
+
+    name_length = strlen( ur->name );
+    suffix_length = strlen( name );
+
+    /* Checked first so the subtraction below cannot wrap around. */
+    if ( suffix_length > name_length )
+    {
+        return 0;
+    }
+
+    /* Compare the tail directly. Searching with strstr() would stop at the
+       first occurrence and miss a suffix that also appears earlier in the
+       name.
+    */
+    return strcmp( ur->name + ( name_length - suffix_length ), name ) == 0;
+}
+
+/* Returns 1 if the record's general_category equals the given string
+   exactly, 0 otherwise.
+*/
+int is_general_category( struct unicode_record_t * ur, const char * category )
+{
+    return ! strcmp( ur->general_category, category );
+}
+
+/* Returns 1 if the record has a decomposition and the given substring
+   occurs somewhere in it, 0 otherwise (including when decomposition is
+   NULL).
+*/
+int decomposition_contains( struct unicode_record_t * ur, const char * substring )
+{
+    if ( ! ur->decomposition )
+    {
+        return 0;
+    }
+
+    return strstr( ur->decomposition, substring ) != NULL;
+}
+
+/* Returns 1 if the record carries a non-zero simple uppercase mapping that
+   is different from the given code point, 0 otherwise.
+*/
+int towupper_differs( struct unicode_record_t * ur, size_t codepoint )
+{
+    /* A mapping of zero stands for "no mapping given". */
+    if ( ur->simple_uppercase_mapping == 0 )
+    {
+        return 0;
+    }
+
+    return ur->simple_uppercase_mapping != codepoint;
+}
+
+/* Returns 1 if the record carries a non-zero simple lowercase mapping that
+   is different from the given code point, 0 otherwise.
+*/
+int towlower_differs( struct unicode_record_t * ur, size_t codepoint )
+{
+    /* A mapping of zero stands for "no mapping given". */
+    if ( ur->simple_lowercase_mapping == 0 )
+    {
+        return 0;
+    }
+
+    return ur->simple_lowercase_mapping != codepoint;
+}
+
+/* Free a table obtained from read_unicode_data(), including the name,
+   decomposition and numeric_value strings of every record.
+
+   ud - the table to release. NULL is accepted and ignored (like free()),
+        so the result of a failed read_unicode_data() can be passed on
+        without a check.
+*/
+void release_unicode_data( struct unicode_data_t * ud )
+{
+    size_t i;
+
+    if ( ud == NULL )
+    {
+        return;
+    }
+
+    /* Members never set for a record are NULL, which free() ignores. */
+    for ( i = 0; i < ud->size; ++i )
+    {
+        free( ud->records[i].name );
+        free( ud->records[i].decomposition );
+        free( ud->records[i].numeric_value );
+    }
+
+    free( ud->records );
+    free( ud );
+}
+
+#ifdef TEST
+
+#include "test.h"
+
+#include <inttypes.h>
+
+/* Test driver: writes a two-line file in UnicodeData.txt format, reads it
+   back with read_unicode_data(), and checks every parsed field as well as
+   the query helpers.
+   If the file cannot be created or read back, the failure is recorded and
+   the driver stops, instead of passing NULL to fprintf() / dereferencing a
+   NULL table.
+*/
+int main( void )
+{
+    FILE * fh = fopen( "test.txt", "w" );
+    struct unicode_data_t * ud;
+
+    TESTCASE( fh != NULL );
+
+    if ( fh == NULL )
+    {
+        return TEST_RESULTS;
+    }
+
+    /* The expected values are the exact number of characters written. */
+    TESTCASE( fprintf( fh, "%04x;%s;%s;%d;%s;;;;;%c;%s;;;;\n", 0, "<control>", "Cc", 0, "BN", 'N', "NULL" ) == 38 );
+    TESTCASE( fprintf( fh, "%04x;%s;%s;%d;%s;%s;;;%s;%c;;;%04x;;%04x\n", 0x2170, "SMALL ROMAN NUMERAL ONE", "Nl", 0, "L", "<compat> 0069", "1", 'N', 0x2160, 0x2160 ) == 69 );
+
+    fclose( fh );
+    ud = read_unicode_data( "test.txt" );
+    remove( "test.txt" );
+
+    TESTCASE( ud != NULL );
+
+    if ( ud == NULL )
+    {
+        return TEST_RESULTS;
+    }
+
+    TESTCASE( ud->size == 2 );
+
+    TESTCASE( ud->records[0].code_point == 0 );
+    TESTCASE( strcmp( ud->records[0].name, "<control>" ) == 0 );
+    TESTCASE( strcmp( ud->records[0].general_category, "Cc" ) == 0 );
+    TESTCASE( ud->records[0].canonical_combining_class == 0 );
+    TESTCASE( strcmp( ud->records[0].bidi_class, "BN" ) == 0 );
+    TESTCASE( ud->records[0].decomposition == NULL );
+    TESTCASE( ud->records[0].numeric_type == -1 );
+    TESTCASE( ud->records[0].numeric_digit == -1 );
+    TESTCASE( ud->records[0].numeric_value == NULL );
+    TESTCASE( ud->records[0].bidi_mirrored == 'N' );
+    TESTCASE( ud->records[0].simple_uppercase_mapping == 0 );
+    TESTCASE( ud->records[0].simple_lowercase_mapping == 0 );
+    TESTCASE( ud->records[0].simple_titlecase_mapping == 0 );
+
+    TESTCASE( ud->records[1].code_point == 0x2170 );
+    TESTCASE( strcmp( ud->records[1].name, "SMALL ROMAN NUMERAL ONE" ) == 0 );
+    TESTCASE( strcmp( ud->records[1].general_category, "Nl" ) == 0 );
+    TESTCASE( ud->records[1].canonical_combining_class == 0 );
+    TESTCASE( strcmp( ud->records[1].bidi_class, "L" ) == 0 );
+    TESTCASE( strcmp( ud->records[1].decomposition, "<compat> 0069" ) == 0 );
+    TESTCASE( ud->records[1].numeric_type == -1 );
+    TESTCASE( ud->records[1].numeric_digit == -1 );
+    TESTCASE( strcmp( ud->records[1].numeric_value, "1" ) == 0 );
+    TESTCASE( ud->records[1].bidi_mirrored == 'N' );
+    TESTCASE( ud->records[1].simple_uppercase_mapping == 0x2160 );
+    TESTCASE( ud->records[1].simple_lowercase_mapping == 0 );
+    TESTCASE( ud->records[1].simple_titlecase_mapping == 0x2160 );
+
+    TESTCASE( is_general_category( &(ud->records[0]), "Cc" ) );
+    TESTCASE( ! is_general_category( &(ud->records[0]), "" ) );
+    TESTCASE( is_general_category( &(ud->records[1]), "Nl" ) );
+    TESTCASE( ! is_general_category( &(ud->records[1]), "Foo" ) );
+
+    TESTCASE( decomposition_contains( &(ud->records[1]), "<compat>" ) );
+    TESTCASE( ! decomposition_contains( &(ud->records[1]), "Foo" ) );
+
+    TESTCASE( ! towupper_differs( &(ud->records[0]), 0 ) );
+    TESTCASE( ! towlower_differs( &(ud->records[0]), 0 ) );
+    TESTCASE( towupper_differs( &(ud->records[1]), 0x2170 ) );
+    TESTCASE( ! towlower_differs( &(ud->records[1]), 0x2170 ) );
+
+    release_unicode_data( ud );
+
+    return TEST_RESULTS;
+}
+
+#endif
diff --git a/src/pdclib/auxiliary/uctype/unicode_data.h b/src/pdclib/auxiliary/uctype/unicode_data.h
new file mode 100644
index 0000000..8cd4832
--- /dev/null
+++ b/src/pdclib/auxiliary/uctype/unicode_data.h
@@ -0,0 +1,77 @@
+/* unicode data
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#ifndef UNICODE_DATA
+#define UNICODE_DATA UNICODE_DATA
+
+#include <stddef.h>
+
+/* https://www.unicode.org/reports/tr44/#UnicodeData.txt */
+
+/* We do not need all these fields at this point, but we read them anyway
+ so we do not need to change much should the need arise later.
+*/
+struct unicode_record_t
+{
+ size_t code_point; /* parsed from hexadecimal */
+ char * name; /* allocated copy; NULL if the field was empty */
+ char general_category[ 3 ];
+ int canonical_combining_class; /* -1 if the field was empty */
+ char bidi_class[ 4 ];
+ char * decomposition; /* allocated copy; NULL if the field was empty */
+ int numeric_type; /* -1 if the field was empty */
+ int numeric_digit; /* -1 if the field was empty */
+ char * numeric_value; /* allocated copy; NULL if the field was empty */
+ char bidi_mirrored; /* first character of the field; '\0' if empty */
+ /*char * unicode_1_name;*/ /* Obsolete as of 6.2.0 */
+ /*char * iso_comment;*/ /* Obsolete as of 5.2.0 */
+ size_t simple_uppercase_mapping; /* parsed from hexadecimal; 0 if the field was empty */
+ size_t simple_lowercase_mapping; /* parsed from hexadecimal; 0 if the field was empty */
+ size_t simple_titlecase_mapping; /* parsed from hexadecimal; 0 if the field was empty */
+};
+
+/* The complete contents of one data file, as returned by
+   read_unicode_data() and released by release_unicode_data().
+*/
+struct unicode_data_t
+{
+ size_t size; /* number of elements in records (one per line read) */
+ struct unicode_record_t * records; /* allocated array of size elements */
+};
+
+/* The assumed field widths, for use with check_file(). There is one entry
+   per semicolon-separated field of a line, in file order. The entries for
+   general_category and bidi_class equal the sizes of the corresponding
+   char arrays in struct unicode_record_t.
+   NOTE(review): -1 presumably means "no width check", and the positive
+   values presumably include room for the terminating '\0' -- confirm
+   against check_file().
+*/
+static const int unicode_record_fields[] = {
+ -1, /* code_point */
+ -1, /* name */
+ 3, /* general_category */
+ -1, /* canonical_combining_class */
+ 4, /* bidi_class */
+ -1, /* decomposition */
+ -1, /* numeric_type */
+ -1, /* numeric_digit */
+ -1, /* numeric_value */
+ 2, /* bidi_mirrored */
+ -1, /* unicode_1_name */
+ -1, /* iso_comment */
+ -1, /* simple_uppercase_mapping */
+ -1, /* simple_lowercase_mapping */
+ -1 /* simple_titlecase_mapping */
+};
+
+/* Read the given file (UnicodeData.txt format) into memory. Returns NULL if
+   the file could not be opened, did not pass check_file(), or memory for
+   the table could not be allocated. A non-NULL result must be passed to
+   release_unicode_data() when no longer needed.
+*/
+struct unicode_data_t * read_unicode_data( const char * filename );
+
+/* Returns non-zero if the record's name equals the given name exactly. */
+int has_name( struct unicode_record_t * ur, const char * name );
+
+/* Meant to return non-zero if the record's name ends with the given string. */
+int name_ends_with( struct unicode_record_t * ur, const char * name );
+
+/* Returns non-zero if the record's general category equals the given string. */
+int is_general_category( struct unicode_record_t * ur, const char * category );
+
+/* Returns non-zero if the record has a decomposition containing the given
+   substring; zero if it does not, or has no decomposition at all.
+*/
+int decomposition_contains( struct unicode_record_t * ur, const char * substring );
+
+/* Returns non-zero if the record's simple uppercase mapping is non-zero and
+   differs from the given code point.
+*/
+int towupper_differs( struct unicode_record_t * ur, size_t codepoint );
+
+/* Returns non-zero if the record's simple lowercase mapping is non-zero and
+   differs from the given code point.
+*/
+int towlower_differs( struct unicode_record_t * ur, size_t codepoint );
+
+/* Free the table, its record array, and the strings owned by each record. */
+void release_unicode_data( struct unicode_data_t * ud );
+
+#endif