diff options
author | ElOraiby <wael.eloraiby@gmail.com> | 2015-05-31 20:11:46 -0400 |
---|---|---|
committer | ElOraiby <wael.eloraiby@gmail.com> | 2015-05-31 20:11:46 -0400 |
commit | 198b0eef667930341321f43bdd2da283658549b2 (patch) | |
tree | 1540a74e49d0ce5e88a15e590cf4aaa4fb9baf3a | |
parent | 2b81c17e2ca89df942479f51b8057ac713abe2d4 (diff) |
one simple call to convert a utf8 string to arabic codepoints
-rw-r--r-- | README.md | 14 | ||||
-rw-r--r-- | arabtype.c | 42 | ||||
-rw-r--r-- | arabtype.h | 11 | ||||
-rw-r--r-- | mainwindow.cpp | 44 |
4 files changed, 68 insertions, 43 deletions
@@ -7,6 +7,20 @@ This is mostly suitable for lightweight UIs (embedded systems, games, media play It's worthy to note that this is far more lightweight than HarfBuzz (used by Qt/Pango...). Freetype 2 on the other hand lacks this functionality. +<U><B>Usage:</B></U> +One call will convert a utf8 string to a sequence of code points: + +```C +size_t get_presentation_form_b(size_t in_len, unsigned char *in_str, size_t out_len, uint32_t* out_cp); +``` + +Where +`in_len`: the input utf8 string length in bytes +`in_str`: the input utf8 string +`out_len`: output code point buffer size (in bytes) +`out_cp`: output code point buffer +`return`: return the total number of code points transformed + <U><B>Details:</B></U> Arabic letters have 4 forms: Isolated, Initial, Medial and Ending. An arabic letter will have one of these forms depending on the letters preceding and succeeding it: For instance take the letter ﺡ : This is the isolated form (i.e. nothing precedes nor succeeds it), if it comes at the start of a syllable it will have the initial form ( ﺣ ). If it ends a syllable it will have the ending form ( ﺢ ) and if it is in the middle of a syllable it will have the medial form ( ﺤ ). Some even have more complicated ligature forms (such as Lam and Alef together: ﻻ ) . A utf8 arabic string usually only comprises of isolated letters from [Arabic Unicode Block](http://en.wikipedia.org/wiki/Arabic_%28Unicode_block%29). With this library you will transform it to the rendering/presentation form: [Arabic Presentation Forms B](http://en.wikipedia.org/wiki/Arabic_Presentation_Forms-B) @@ -101,7 +101,8 @@ static inline bool is_alef_prev_lam(uint32_t prev, uint32_t cp) { return prev == static inline bool is_linking_type(uint32_t cp) { return is_arabic_letter(cp) && arabic_forms_b[cp - ARABIC_LETTER_START][0][MEDIAL] != 0; } -uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp) { +static uint32_t +get_presentation_form_b_of_char(uint32_t prev, uint32_t next, uint32_t cp) { if( !is_arabic_letter(cp) ) { return cp; /* not an Arabic letter */ @@ -129,7 +130,44 @@ uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp) { #else // optimized code uint32_t index = (((is_lapl | is_arabic_letter(next)) & is_linking_type(cp)) << 1) | is_linking_type(prev); - uint32_t ref = next * is_la + cp * (1 - is_la) - ARABIC_LETTER_START; + uint32_t ref = next * is_la + cp * (!is_la) - ARABIC_LETTER_START; return arabic_forms_b[ref][is_lapl][index]; #endif } + +size_t +get_presentation_form_b(size_t in_len, unsigned char* in_str, size_t out_len, uint32_t* out_cp) { + uint32_t codep = 0; + uint32_t state = 0; + uint32_t prev = 0; + size_t cp_count = out_len / sizeof(uint32_t); + size_t o = 0; + + for( size_t i = 0; i < in_len && o < cp_count; ++i ) { + if( decode(&state, &codep, in_str[i]) == UTF8_ACCEPT ) { + out_cp[o] = codep; + ++o; + } + } + + if( state != UTF8_ACCEPT ) { + // The string is not well-formed" + return 0; + } + + cp_count = o; + size_t s = 0; + + for( o = 0; o < cp_count; ++o) { + uint32_t cp = out_cp[o]; + uint32_t next = o < cp_count - 1 ? out_cp[o + 1] : 0; + uint32_t tcp = get_presentation_form_b_of_char(prev, next, cp); + if( tcp != (uint32_t)-1 ) { + out_cp[s] = tcp; + ++s; + } + prev = cp; + } + + return o; +} @@ -25,12 +25,13 @@ extern "C" { /** * @brief get_presentation_form_b - * @param prev previous character - * @param next next character - * @param cp the current character - * @return 0 ignore the code point, the transformed code point otherise + * @param in_len input utf8 string length + * @param in_str input utf8 string + * @param out_len output code point buffer size (in bytes) + * @param out_cp output code point buffer + * @return */ -uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp); +size_t get_presentation_form_b(size_t in_len, unsigned char *in_str, size_t out_len, uint32_t* out_cp); #ifdef __cplusplus } diff --git a/mainwindow.cpp b/mainwindow.cpp index 14a2871..ad6cde7 100644 --- a/mainwindow.cpp +++ b/mainwindow.cpp @@ -19,38 +19,6 @@ FT_Library MainWindow::ftlib__ = NULL; - -uint32_t get_arabic_form(const std::vector<uint32_t>& code_point, uint32_t idx) -{ - uint ch = code_point[idx]; - uint prev = 0; - uint next = 0; - - if( idx ) - prev = code_point[idx - 1]; - if( idx < code_point.size() - 1 ) - next = code_point[idx + 1]; - - return get_presentation_form_b(prev, next, ch); -} - -std::vector<uint32_t> decode(const std::vector<uint8_t>& in) -{ - std::vector<uint> ret; - uint codep = 0; - uint state = 0; - - for( size_t i = 0; i < in.size(); ++i ) - { - if( decode(&state, &codep, in[i]) == UTF8_ACCEPT ) - ret.push_back(codep); - } - if( state != UTF8_ACCEPT ) - std::cout << "The string is not well-formed" << std::endl; - - return ret; -} - MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent), ui__(new Ui::MainWindow) @@ -75,9 +43,11 @@ MainWindow::MainWindow(QWidget *parent) : std::cout << "size: " << arabic_string.size() << std::endl; ifs.close(); - std::vector<uint> arabic_cp = decode(arabic_string); - for( size_t i = 0; i < arabic_cp.size(); ++i ) + uint32_t* arabic_cp = new uint32_t[arabic_string.size()]; + uint32_t acp_size = get_presentation_form_b(arabic_string.size(), &arabic_string[0], arabic_string.size() * sizeof(uint32_t), arabic_cp); + + for( size_t i = 0; i < acp_size; ++i ) std::cout << "0x" << std::hex << arabic_cp[i] << std::endl; std::cout << std::dec; @@ -113,9 +83,10 @@ MainWindow::MainWindow(QWidget *parent) : int line = font_size; QImage img(data__, width, height, QImage::Format_RGB32); // render the arabic glyphs - for( size_t idx = 0; idx < arabic_cp.size(); ++idx ) + for( size_t idx = 0; idx < acp_size; ++idx ) { - uint ch = get_arabic_form(arabic_cp, idx); + uint ch = arabic_cp[idx]; + if( ch == 0xA ) { line += font_size + 5; col = width - font_size; @@ -172,6 +143,7 @@ MainWindow::MainWindow(QWidget *parent) : FT_Done_Glyph(glyph); } + delete[] arabic_cp; QPixmap pixmap = QPixmap::fromImage(img); ui__->label->setPixmap(pixmap); |