aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorElOraiby <wael.eloraiby@gmail.com>2015-05-31 20:11:46 -0400
committerElOraiby <wael.eloraiby@gmail.com>2015-05-31 20:11:46 -0400
commit198b0eef667930341321f43bdd2da283658549b2 (patch)
tree1540a74e49d0ce5e88a15e590cf4aaa4fb9baf3a
parent2b81c17e2ca89df942479f51b8057ac713abe2d4 (diff)
one simple call to convert a utf8 string to arabic codepoints
-rw-r--r--README.md14
-rw-r--r--arabtype.c42
-rw-r--r--arabtype.h11
-rw-r--r--mainwindow.cpp44
4 files changed, 68 insertions, 43 deletions
diff --git a/README.md b/README.md
index bd40027..4b9e983 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,20 @@ This is mostly suitable for lightweight UIs (embedded systems, games, media play
It's worthy to note that this is far more lightweight than HarfBuzz (used by Qt/Pango...).
Freetype 2 on the other hand lacks this functionality.
+<U><B>Usage:</B></U>
+One call will convert a utf8 string to a sequence of code points:
+
+```C
+size_t get_presentation_form_b(size_t in_len, unsigned char *in_str, size_t out_len, uint32_t* out_cp);
+```
+
+Where
+`in_len`: the input utf8 string length in bytes
+`in_str`: the input utf8 string
+`out_len`: output code point buffer size (in bytes)
+`out_cp`: output code point buffer
+`return`: return the total number of code points transformed
+
<U><B>Details:</B></U>
Arabic letters have 4 forms: Isolated, Initial, Medial and Ending. An arabic letter will have one of these forms depending on the letters preceding and succeeding it: For instance take the letter ﺡ : This is the isolated form (i.e. nothing precedes nor succeeds it), if it comes at the start of a syllable it will have the initial form ( ﺣ ). If it ends a syllable it will have the ending form ( ﺢ ) and if it is in the middle of a syllable it will have the medial form ( ﺤ ). Some even have more complicated ligature forms (such as Lam and Alef together: ﻻ ) . A utf8 arabic string usually only comprises of isolated letters from [Arabic Unicode Block](http://en.wikipedia.org/wiki/Arabic_%28Unicode_block%29). With this library you will transform it to the rendering/presentation form: [Arabic Presentation Forms B](http://en.wikipedia.org/wiki/Arabic_Presentation_Forms-B)
diff --git a/arabtype.c b/arabtype.c
index 74180df..9c9469c 100644
--- a/arabtype.c
+++ b/arabtype.c
@@ -101,7 +101,8 @@ static inline bool is_alef_prev_lam(uint32_t prev, uint32_t cp) { return prev ==
static inline bool is_linking_type(uint32_t cp) { return is_arabic_letter(cp) && arabic_forms_b[cp - ARABIC_LETTER_START][0][MEDIAL] != 0; }
-uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp) {
+static uint32_t
+get_presentation_form_b_of_char(uint32_t prev, uint32_t next, uint32_t cp) {
if( !is_arabic_letter(cp) ) {
return cp; /* not an Arabic letter */
@@ -129,7 +130,44 @@ uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp) {
#else
// optimized code
uint32_t index = (((is_lapl | is_arabic_letter(next)) & is_linking_type(cp)) << 1) | is_linking_type(prev);
- uint32_t ref = next * is_la + cp * (1 - is_la) - ARABIC_LETTER_START;
+ uint32_t ref = next * is_la + cp * (!is_la) - ARABIC_LETTER_START;
return arabic_forms_b[ref][is_lapl][index];
#endif
}
+/* Decode the utf8 string in_str (in_len bytes) into out_cp (out_len bytes), then shape it in place; returns the number of code points left in out_cp, or 0 if the input is not well-formed. */
+size_t
+get_presentation_form_b(size_t in_len, unsigned char* in_str, size_t out_len, uint32_t* out_cp) {
+ uint32_t codep = 0;
+ uint32_t state = 0;
+ uint32_t prev = 0;
+ size_t cp_count = out_len / sizeof(uint32_t);
+ size_t o = 0;
+ /* pass 1: decode utf8 into out_cp, stopping once the input ends or the buffer is full */
+ for( size_t i = 0; i < in_len && o < cp_count; ++i ) {
+ if( decode(&state, &codep, in_str[i]) == UTF8_ACCEPT ) {
+ out_cp[o] = codep;
+ ++o;
+ }
+ }
+ /* decode() must have ended in the accept state, otherwise nothing is returned */
+ if( state != UTF8_ACCEPT ) {
+ // The string is not well-formed
+ return 0;
+ }
+
+ cp_count = o;
+ size_t s = 0;
+ /* pass 2: shape in place; s (write index) never passes o (read index), so out_cp[o + 1] is still the decoded value */
+ for( o = 0; o < cp_count; ++o) {
+ uint32_t cp = out_cp[o];
+ uint32_t next = o < cp_count - 1 ? out_cp[o + 1] : 0;
+ uint32_t tcp = get_presentation_form_b_of_char(prev, next, cp);
+ if( tcp != (uint32_t)-1 ) { /* (uint32_t)-1 marks a code point to drop - presumably one absorbed by a ligature, confirm in the helper */
+ out_cp[s] = tcp;
+ ++s;
+ }
+ prev = cp;
+ }
+ /* return s, not o: dropped code points make the output shorter than the decoded input */
+ return s;
+}
diff --git a/arabtype.h b/arabtype.h
index 1e5377c..3a014fe 100644
--- a/arabtype.h
+++ b/arabtype.h
@@ -25,12 +25,13 @@ extern "C" {
/**
* @brief get_presentation_form_b
- * @param prev previous character
- * @param next next character
- * @param cp the current character
- * @return 0 ignore the code point, the transformed code point otherise
+ * @param in_len input utf8 string length
+ * @param in_str input utf8 string
+ * @param out_len output code point buffer size (in bytes)
+ * @param out_cp output code point buffer
+ * @return the total number of code points transformed (0 if the utf8 input is not well-formed)
*/
-uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp);
+size_t get_presentation_form_b(size_t in_len, unsigned char *in_str, size_t out_len, uint32_t* out_cp);
#ifdef __cplusplus
}
diff --git a/mainwindow.cpp b/mainwindow.cpp
index 14a2871..ad6cde7 100644
--- a/mainwindow.cpp
+++ b/mainwindow.cpp
@@ -19,38 +19,6 @@
FT_Library MainWindow::ftlib__ = NULL;
-
-uint32_t get_arabic_form(const std::vector<uint32_t>& code_point, uint32_t idx)
-{
- uint ch = code_point[idx];
- uint prev = 0;
- uint next = 0;
-
- if( idx )
- prev = code_point[idx - 1];
- if( idx < code_point.size() - 1 )
- next = code_point[idx + 1];
-
- return get_presentation_form_b(prev, next, ch);
-}
-
-std::vector<uint32_t> decode(const std::vector<uint8_t>& in)
-{
- std::vector<uint> ret;
- uint codep = 0;
- uint state = 0;
-
- for( size_t i = 0; i < in.size(); ++i )
- {
- if( decode(&state, &codep, in[i]) == UTF8_ACCEPT )
- ret.push_back(codep);
- }
- if( state != UTF8_ACCEPT )
- std::cout << "The string is not well-formed" << std::endl;
-
- return ret;
-}
-
MainWindow::MainWindow(QWidget *parent) :
QMainWindow(parent),
ui__(new Ui::MainWindow)
@@ -75,9 +43,11 @@ MainWindow::MainWindow(QWidget *parent) :
std::cout << "size: " << arabic_string.size() << std::endl;
ifs.close();
- std::vector<uint> arabic_cp = decode(arabic_string);
- for( size_t i = 0; i < arabic_cp.size(); ++i )
+ uint32_t* arabic_cp = new uint32_t[arabic_string.size()];
+ uint32_t acp_size = get_presentation_form_b(arabic_string.size(), &arabic_string[0], arabic_string.size() * sizeof(uint32_t), arabic_cp);
+
+ for( size_t i = 0; i < acp_size; ++i )
std::cout << "0x" << std::hex << arabic_cp[i] << std::endl;
std::cout << std::dec;
@@ -113,9 +83,10 @@ MainWindow::MainWindow(QWidget *parent) :
int line = font_size;
QImage img(data__, width, height, QImage::Format_RGB32);
// render the arabic glyphs
- for( size_t idx = 0; idx < arabic_cp.size(); ++idx )
+ for( size_t idx = 0; idx < acp_size; ++idx )
{
- uint ch = get_arabic_form(arabic_cp, idx);
+ uint ch = arabic_cp[idx];
+
if( ch == 0xA ) {
line += font_size + 5;
col = width - font_size;
@@ -172,6 +143,7 @@ MainWindow::MainWindow(QWidget *parent) :
FT_Done_Glyph(glyph);
}
+ delete[] arabic_cp;
QPixmap pixmap = QPixmap::fromImage(img);
ui__->label->setPixmap(pixmap);