one simple call to convert a utf8 string to arabic codepoints

author: ElOraiby <wael.eloraiby@gmail.com> 2015-05-31 20:11:46 -0400
committer: ElOraiby <wael.eloraiby@gmail.com> 2015-05-31 20:11:46 -0400
commit: 198b0eef667930341321f43bdd2da283658549b2 (patch)
tree: 1540a74e49d0ce5e88a15e590cf4aaa4fb9baf3a
parent: 2b81c17e2ca89df942479f51b8057ac713abe2d4 (diff)
4 files changed, 68 insertions, 43 deletions
diff --git a/README.md b/README.md
index bd40027..4b9e983 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,20 @@ This is mostly suitable for lightweight UIs (embedded systems, games, media play
 It's worthy to note that this is far more lightweight than HarfBuzz (used by Qt/Pango...).
 Freetype 2 on the other hand lacks this functionality.
 
+<U><B>Usage:</B></U>
+One call will convert a utf8 string to a sequence of code points:
+
+```C
+size_t get_presentation_form_b(size_t in_len, unsigned char *in_str, size_t out_len, uint32_t* out_cp);
+```
+
+Where:
+- `in_len`: the length of the input utf8 string, in bytes
+- `in_str`: the input utf8 string
+- `out_len`: the size of the output code point buffer, in bytes
+- `out_cp`: the output code point buffer
+- return value: the total number of code points transformed
+
 <U><B>Details:</B></U>
 Arabic letters have 4 forms: Isolated, Initial, Medial and Ending. An arabic letter will have one of these forms depending on the letters preceding and succeeding it: For instance take the letter ﺡ : This is the isolated form (i.e. nothing precedes nor succeeds it), if it comes at the start of a syllable it will have the initial form ( ﺣ ). If it ends a syllable it will have the ending form ( ﺢ ) and if it is in the middle of a syllable it will have the medial form ( ﺤ ).  Some even have more complicated ligature forms (such as Lam and Alef together: ﻻ ) . A utf8 arabic string usually only comprises of isolated letters from [Arabic Unicode Block](http://en.wikipedia.org/wiki/Arabic_%28Unicode_block%29). With this library you will transform it to the rendering/presentation form: [Arabic Presentation Forms B](http://en.wikipedia.org/wiki/Arabic_Presentation_Forms-B)
 
diff --git a/arabtype.c b/arabtype.c
index 74180df..9c9469c 100644
--- a/arabtype.c
+++ b/arabtype.c
@@ -101,7 +101,8 @@ static inline bool is_alef_prev_lam(uint32_t prev, uint32_t cp)	{ return prev ==
 static inline bool is_linking_type(uint32_t cp) { return is_arabic_letter(cp) && arabic_forms_b[cp - ARABIC_LETTER_START][0][MEDIAL] != 0; }
 
 
-uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp) {
+static uint32_t
+get_presentation_form_b_of_char(uint32_t prev, uint32_t next, uint32_t cp) {
 
 	if( !is_arabic_letter(cp) ) {
 		return cp;	/* not an Arabic letter */
@@ -129,7 +130,44 @@ uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp) {
 #else
 	// optimized code
 	uint32_t index	= (((is_lapl | is_arabic_letter(next)) & is_linking_type(cp)) << 1) | is_linking_type(prev);
-	uint32_t ref	= next * is_la + cp * (1 - is_la) - ARABIC_LETTER_START;
+	uint32_t ref	= next * is_la + cp * (!is_la) - ARABIC_LETTER_START;
 	return arabic_forms_b[ref][is_lapl][index];
 #endif
 }
+
+size_t
+get_presentation_form_b(size_t in_len, unsigned char* in_str, size_t out_len, uint32_t* out_cp) {
+	/* Decode the UTF-8 bytes of in_str into out_cp, reshape them in place, and return how many code points were kept (0 if the input is not well-formed). */
+	uint32_t codep	= 0;
+	uint32_t state	= 0;
+	uint32_t prev	= 0;
+	size_t cp_count	= out_len / sizeof(uint32_t);	/* out_len is in bytes: convert to a capacity in code points */
+	size_t	o	= 0;
+
+	for( size_t i = 0; i < in_len && o < cp_count; ++i ) {	/* pass 1: decode until the input or the capacity runs out */
+		if( decode(&state, &codep, in_str[i]) == UTF8_ACCEPT ) {
+			out_cp[o]	= codep;
+			++o;
+		}
+	}
+
+	if( state != UTF8_ACCEPT ) {
+		// The string is not well-formed
+		return 0;
+	}
+
+	cp_count	= o;
+	size_t		s	= 0;	/* write cursor: s <= o always holds, so compaction never overwrites unread entries */
+	for( o = 0; o < cp_count; ++o) {	/* pass 2: shape each code point from its neighbours (0 stands for "no neighbour") */
+		uint32_t	cp	= out_cp[o];
+		uint32_t	next	= o < cp_count - 1 ? out_cp[o + 1] : 0;
+		uint32_t	tcp	= get_presentation_form_b_of_char(prev, next, cp);
+		if( tcp != (uint32_t)-1 ) {	/* (uint32_t)-1 from the helper means "emit nothing for this code point" */
+			out_cp[s]	= tcp;
+			++s;
+		}
+		prev	= cp;
+	}
+
+	return s;	/* s, not o: count only the code points actually kept in out_cp */
+}
diff --git a/arabtype.h b/arabtype.h
index 1e5377c..3a014fe 100644
--- a/arabtype.h
+++ b/arabtype.h
@@ -25,12 +25,13 @@ extern "C" {
 
 /**
  * @brief get_presentation_form_b
- * @param prev previous character
- * @param next next character
- * @param cp the current character
- * @return 0 ignore the code point, the transformed code point otherise
+ * @param in_len	input utf8 string length
+ * @param in_str	input utf8 string
+ * @param out_len	output code point buffer size (in bytes)
+ * @param out_cp	output code point buffer
+ * @return		the total number of code points transformed (0 if the utf8 input is not well-formed)
  */
-uint32_t get_presentation_form_b(uint32_t prev, uint32_t next, uint32_t cp);
+size_t get_presentation_form_b(size_t in_len, unsigned char *in_str, size_t out_len, uint32_t* out_cp);
 
 #ifdef __cplusplus
 }
diff --git a/mainwindow.cpp b/mainwindow.cpp
index 14a2871..ad6cde7 100644
--- a/mainwindow.cpp
+++ b/mainwindow.cpp
@@ -19,38 +19,6 @@
 FT_Library MainWindow::ftlib__	= NULL;
 
 
-
-uint32_t	get_arabic_form(const std::vector<uint32_t>& code_point, uint32_t idx)
-{
-	uint	ch	= code_point[idx];
-	uint	prev	= 0;
-	uint	next	= 0;
-
-	if( idx )
-		prev	= code_point[idx - 1];
-	if( idx < code_point.size() - 1 )
-		next	= code_point[idx + 1];
-
-	return get_presentation_form_b(prev, next, ch);
-}
-
-std::vector<uint32_t> decode(const std::vector<uint8_t>& in)
-{
-	std::vector<uint>	ret;
-	uint codep	= 0;
-	uint state	= 0;
-
-	for( size_t i = 0; i < in.size(); ++i )
-	{
-		if( decode(&state, &codep, in[i]) == UTF8_ACCEPT )
-			ret.push_back(codep);
-	}
-	if( state != UTF8_ACCEPT )
-		std::cout << "The string is not well-formed" << std::endl;
-
-	return ret;
-}
-
 MainWindow::MainWindow(QWidget *parent) :
 	QMainWindow(parent),
 	ui__(new Ui::MainWindow)
@@ -75,9 +43,11 @@ MainWindow::MainWindow(QWidget *parent) :
 	std::cout << "size: " << arabic_string.size() << std::endl;
 	ifs.close();
 
-	std::vector<uint> arabic_cp	= decode(arabic_string);
 
-	for( size_t i = 0; i < arabic_cp.size(); ++i )
+	uint32_t*	arabic_cp	= new uint32_t[arabic_string.size()];
+	uint32_t	acp_size	= get_presentation_form_b(arabic_string.size(), &arabic_string[0], arabic_string.size() * sizeof(uint32_t), arabic_cp);
+
+	for( size_t i = 0; i < acp_size; ++i )
 		std::cout << "0x" << std::hex << arabic_cp[i] << std::endl;
 
 	std::cout << std::dec;
@@ -113,9 +83,10 @@ MainWindow::MainWindow(QWidget *parent) :
 	int	line	= font_size;
 	QImage	img(data__, width, height, QImage::Format_RGB32);
 	// render the arabic glyphs
-	for( size_t idx = 0; idx < arabic_cp.size(); ++idx )
+	for( size_t idx = 0; idx < acp_size; ++idx )
 	{
-		uint ch	= get_arabic_form(arabic_cp, idx);
+		uint ch	= arabic_cp[idx];
+
 		if( ch == 0xA ) {
 			line	+= font_size + 5;
 			col	= width - font_size;
@@ -172,6 +143,7 @@ MainWindow::MainWindow(QWidget *parent) :
 		FT_Done_Glyph(glyph);
 	}
 
+	delete[] arabic_cp;
 
 	QPixmap	pixmap = QPixmap::fromImage(img);
 	ui__->label->setPixmap(pixmap);
author	ElOraiby <wael.eloraiby@gmail.com>	2015-05-31 20:11:46 -0400
committer	ElOraiby <wael.eloraiby@gmail.com>	2015-05-31 20:11:46 -0400
commit	198b0eef667930341321f43bdd2da283658549b2 (patch)
tree	1540a74e49d0ce5e88a15e590cf4aaa4fb9baf3a
parent	2b81c17e2ca89df942479f51b8057ac713abe2d4 (diff)