one simple call to convert a utf8 string to arabic codepoints

author ElOraiby <wael.eloraiby@gmail.com>

Mon, 1 Jun 2015 00:11:46 +0000 (20:11 -0400)

committer ElOraiby <wael.eloraiby@gmail.com>

Mon, 1 Jun 2015 00:11:46 +0000 (20:11 -0400)
author ElOraiby <wael.eloraiby@gmail.com>
Mon, 1 Jun 2015 00:11:46 +0000 (20:11 -0400)
committer ElOraiby <wael.eloraiby@gmail.com>
Mon, 1 Jun 2015 00:11:46 +0000 (20:11 -0400)
diff --git a/README.md b/README.md

index bd4002752a045595a8374ba2c1f9f2036789aa02..4b9e9832e8308f3eedc64c9e9a56286f7f2d57c5 100644 (file)
--- a/README.md
+++ b/README.md
@@ -7,6 +7,20 @@ This is mostly suitable for lightweight UIs (embedded systems, games, media play
  It's worth noting that this is far more lightweight than HarfBuzz (used by Qt/Pango...).
  Freetype 2 on the other hand lacks this functionality.
  
+<U><B>Usage:</B></U>
+One call will convert a utf8 string to a sequence of code points:
+
+```C
+size_t get_presentation_form_b(size_t in_len, unsigned char *in_str, size_t out_len, uint32_t* out_cp);
+```
+
+Where:
+- `in_len`: the input utf8 string length in bytes
+- `in_str`: the input utf8 string
+- `out_len`: output code point buffer size (in bytes)
+- `out_cp`: output code point buffer
+- `return`: the total number of code points transformed
+
  <U><B>Details:</B></U>
  Arabic letters have 4 forms: Isolated, Initial, Medial and Ending. An Arabic letter will take one of these forms depending on the letters preceding and following it. For instance, take the letter ﺡ : this is the isolated form (i.e. nothing precedes or follows it). If it comes at the start of a syllable it will have the initial form ( ﺣ ). If it ends a syllable it will have the ending form ( ﺢ ) and if it is in the middle of a syllable it will have the medial form ( ﺤ ).  Some even have more complicated ligature forms (such as Lam and Alef together: ﻻ ). A utf8 Arabic string usually consists only of isolated letters from the [Arabic Unicode Block](http://en.wikipedia.org/wiki/Arabic_%28Unicode_block%29). With this library you will transform it to the rendering/presentation form: [Arabic Presentation Forms B](http://en.wikipedia.org/wiki/Arabic_Presentation_Forms-B)
  
diff --git a/arabtype.c b/arabtype.c

index 74180dfdd1477f11d12cfe7d56adc7b5a9446ca9..9c9469c0de9707b7e1899b4069d2bc228420eef1 100644 (file)
--- a/arabtype.c
+++ b/arabtype.c
@@ -101,7 +101,8 @@ static inline bool is_alef_prev_lam(uint32_t prev, uint32_t cp)     { return prev ==
  static inline bool is_linking_type(uint32_t cp) { return is_arabic_letter(cp) && arabic_forms_b[cp - ARABIC_LETTER_START][0][MEDIAL] != 0; }
  
  
-uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp) {
+static uint32_t
+get_presentation_form_b_of_char(uint32_t prev, uint32_t next, uint32_t cp) {
  
         if( !is_arabic_letter(cp) ) {
                 return cp;      /* not an Arabic letter */
@@ -129,7 +130,44 @@ uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp) {
  #else
         // optimized code
         uint32_t index  = (((is_lapl | is_arabic_letter(next)) & is_linking_type(cp)) << 1) | is_linking_type(prev);
-       uint32_t ref    = next * is_la + cp * (1 - is_la) - ARABIC_LETTER_START;
+       uint32_t ref    = next * is_la + cp * (!is_la) - ARABIC_LETTER_START;
         return arabic_forms_b[ref][is_lapl][index];
  #endif
  }
+
+/**
+ * Decode a utf8 string into code points, then rewrite each code point in
+ * place with the value returned by get_presentation_form_b_of_char().
+ *
+ * @param in_len       input utf8 string length in bytes
+ * @param in_str       input utf8 string
+ * @param out_len      output code point buffer size (in bytes)
+ * @param out_cp       output code point buffer
+ * @return number of code points stored in out_cp, or 0 if the input is not
+ *         well-formed utf8
+ *
+ * Decoding stops as soon as out_cp is full, so any remaining input bytes
+ * are ignored.
+ */
+size_t
+get_presentation_form_b(size_t in_len, unsigned char* in_str, size_t out_len, uint32_t* out_cp) {
+       uint32_t codep  = 0;
+       uint32_t state  = 0;
+       uint32_t prev   = 0;
+       size_t cp_count = out_len / sizeof(uint32_t);   /* capacity in code points */
+       size_t  o       = 0;
+
+       /* pass 1: utf8 bytes -> raw code points, bounded by the buffer capacity */
+       for( size_t i = 0; i < in_len && o < cp_count; ++i ) {
+               if( decode(&state, &codep, in_str[i]) == UTF8_ACCEPT ) {
+                       out_cp[o]       = codep;
+                       ++o;
+               }
+       }
+
+       if( state != UTF8_ACCEPT ) {
+               // The string is not well-formed
+               return 0;
+       }
+
+       cp_count        = o;    /* from here on: number of decoded code points */
+       size_t          s       = 0;    /* write index, never ahead of the read index o */
+
+       /* pass 2: transform in place; prev/next always come from the raw input */
+       for( o = 0; o < cp_count; ++o) {
+               uint32_t        cp      = out_cp[o];
+               uint32_t        next    = o + 1 < cp_count ? out_cp[o + 1] : 0;
+               uint32_t        tcp     = get_presentation_form_b_of_char(prev, next, cp);
+               /* (uint32_t)-1 marks a code point that must not be emitted */
+               if( tcp != (uint32_t)-1 ) {
+                       out_cp[s]       = tcp;
+                       ++s;
+               }
+               prev    = cp;
+       }
+
+       /* s, not o: dropped code points shorten the output, and o == cp_count here */
+       return s;
+}
diff --git a/arabtype.h b/arabtype.h

index 1e5377c28a92d2483502011f89103ecdf4bec496..3a014feaaaf6b4a50b5ef3b1c5c1f588349a29ed 100644 (file)
--- a/arabtype.h
+++ b/arabtype.h
@@ -25,12 +25,13 @@ extern "C" {
  
  /**
   * @brief get_presentation_form_b
- * @param prev previous character
- * @param next next character
- * @param cp the current character
- * @return 0 ignore the code point, the transformed code point otherise
+ * @param in_len       input utf8 string length
+ * @param in_str       input utf8 string
+ * @param out_len      output code point buffer size (in bytes)
+ * @param out_cp       output code point buffer
+ * @return             total number of code points transformed, 0 if the utf8 string is not well-formed
   */
-uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp);
+size_t get_presentation_form_b(size_t in_len, unsigned char *in_str, size_t out_len, uint32_t* out_cp);
  
  #ifdef __cplusplus
  }
diff --git a/mainwindow.cpp b/mainwindow.cpp

index 14a2871a0953ff0e5e792e5beba19fe3057ce51c..ad6cde7392356d518406183213048d57762224c2 100644 (file)
--- a/mainwindow.cpp
+++ b/mainwindow.cpp
@@ -19,38 +19,6 @@
  FT_Library MainWindow::ftlib__ = NULL;
  
  
-
-uint32_t       get_arabic_form(const std::vector<uint32_t>& code_point, uint32_t idx)
-{
-       uint    ch      = code_point[idx];
-       uint    prev    = 0;
-       uint    next    = 0;
-
-       if( idx )
-               prev    = code_point[idx - 1];
-       if( idx < code_point.size() - 1 )
-               next    = code_point[idx + 1];
-
-       return get_presentation_form_b(prev, next, ch);
-}
-
-std::vector<uint32_t> decode(const std::vector<uint8_t>& in)
-{
-       std::vector<uint>       ret;
-       uint codep      = 0;
-       uint state      = 0;
-
-       for( size_t i = 0; i < in.size(); ++i )
-       {
-               if( decode(&state, &codep, in[i]) == UTF8_ACCEPT )
-                       ret.push_back(codep);
-       }
-       if( state != UTF8_ACCEPT )
-               std::cout << "The string is not well-formed" << std::endl;
-
-       return ret;
-}
-
  MainWindow::MainWindow(QWidget *parent) :
         QMainWindow(parent),
         ui__(new Ui::MainWindow)
@@ -75,9 +43,11 @@ MainWindow::MainWindow(QWidget *parent) :
         std::cout << "size: " << arabic_string.size() << std::endl;
         ifs.close();
  
-       std::vector<uint> arabic_cp     = decode(arabic_string);
  
-       for( size_t i = 0; i < arabic_cp.size(); ++i )
+       uint32_t*       arabic_cp       = new uint32_t[arabic_string.size()];
+       uint32_t        acp_size        = get_presentation_form_b(arabic_string.size(), &arabic_string[0], arabic_string.size() * sizeof(uint32_t), arabic_cp);
+
+       for( size_t i = 0; i < acp_size; ++i )
                 std::cout << "0x" << std::hex << arabic_cp[i] << std::endl;
  
         std::cout << std::dec;
@@ -113,9 +83,10 @@ MainWindow::MainWindow(QWidget *parent) :
         int     line    = font_size;
         QImage  img(data__, width, height, QImage::Format_RGB32);
         // render the arabic glyphs
-       for( size_t idx = 0; idx < arabic_cp.size(); ++idx )
+       for( size_t idx = 0; idx < acp_size; ++idx )
         {
-               uint ch = get_arabic_form(arabic_cp, idx);
+               uint ch = arabic_cp[idx];
+
                 if( ch == 0xA ) {
                         line    += font_size + 5;
                         col     = width - font_size;
@@ -172,6 +143,7 @@ MainWindow::MainWindow(QWidget *parent) :
                 FT_Done_Glyph(glyph);
         }
  
+       delete[] arabic_cp;
  
         QPixmap pixmap = QPixmap::fromImage(img);
         ui__->label->setPixmap(pixmap);
author	ElOraiby <wael.eloraiby@gmail.com>
	Mon, 1 Jun 2015 00:11:46 +0000 (20:11 -0400)
committer	ElOraiby <wael.eloraiby@gmail.com>
	Mon, 1 Jun 2015 00:11:46 +0000 (20:11 -0400)
README.md		patch \| blob \| history
arabtype.c		patch \| blob \| history
arabtype.h		patch \| blob \| history
mainwindow.cpp		patch \| blob \| history