* utf8.c (utf8_from_uc, utf8_decode): Use upper case for hex constants.

If bytes decode to U+DCxx, treat this sequence as invalid. This way we can't be fooled by an attacker into accepting some U+DCxx which on output we will then convert to byte xx.

(utf8_to_uc): Use upper case for hex constants.
author: Kaz Kylheku <kaz@kylheku.com> 2012-02-02 22:54:17 -0800
committer: Kaz Kylheku <kaz@kylheku.com> 2012-02-02 22:54:17 -0800
commit: a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7 (patch)
tree: 1696c8ed5bd8d5da2bda03f9a0a78507139bdfa0 /utf8.c
parent: 905b074cea7303553777e169529efc8aeccdc35a (diff)
download: txr-a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7.tar.gz
txr-a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7.tar.bz2
txr-a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7.zip
1 file changed, 29 insertions, 25 deletions
diff --git a/utf8.c b/utf8.c
index fcc4dc98..0c9c109c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -61,7 +61,7 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
         break;
       src = backtrack;
       if (wdst)
-        *wdst++ = 0xdc00 | *src;
+        *wdst++ = 0xDC00 | *src;
       nchar++;
       state = utf8_init;
       continue;
@@ -73,15 +73,15 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
         if (wdst)
           *wdst++ = ch;
         nchar++;
-      } else if (ch >= 0xc2 && ch <= 0xe0) {
+      } else if (ch >= 0xC2 && ch <= 0xE0) {
         state = utf8_more1;
-        wch = (ch & 0x1f);
+        wch = (ch & 0x1F);
 	wch_min = 0x80;
-      } else if (ch >= 0xe0 && ch <= 0xef) {
+      } else if (ch >= 0xE0 && ch <= 0xEF) {
         state = utf8_more2;
-        wch = (ch & 0xf);
+        wch = (ch & 0xF);
 	wch_min = 0x800;
-      } else if (ch >= 0xf0 && ch < 0xf5) {
+      } else if (ch >= 0xF0 && ch < 0xF5) {
 #ifdef FULL_UNICODE
         state = utf8_more3;
         wch = (ch & 0x7);
@@ -91,7 +91,7 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
 #endif
       } else {
         if (wdst)
-          *wdst++ = 0xdc00 | ch;
+          *wdst++ = 0xDC00 | ch;
         nchar++;
       }
       backtrack = src;
@@ -99,15 +99,17 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
     case utf8_more1:
     case utf8_more2:
     case utf8_more3:
-      if (ch >= 0x80 && ch < 0xc0) {
+      if (ch >= 0x80 && ch < 0xC0) {
         wch <<= 6;
-        wch |= (ch & 0x3f);
+        wch |= (ch & 0x3F);
         state = (enum utf8_state) (state - 1);
         if (state == utf8_init) {
-	  if (wch < wch_min) {
+	  if (wch < wch_min &&
+	      (wch <= 0xFFFF && (wch & 0xFF00) == 0xDC00))
+	  {
 	    src = backtrack;
 	    if (wdst)
-	      *wdst++ = 0xdc00 | *src;
+	      *wdst++ = 0xDC00 | *src;
 	  } else {
 	    if (wdst)
 	      *wdst++ = wch;
@@ -117,7 +119,7 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
       } else {
         src = backtrack;
         if (wdst)
-          *wdst++ = 0xdc00 | *src;
+          *wdst++ = 0xDC00 | *src;
         nchar++;
         state = utf8_init;
       }
@@ -155,7 +157,7 @@ size_t utf8_to_uc(unsigned char *dst, const wchar_t *wsrc)
       if ((wch & 0xFF00) == 0xDC00) {
 	nbyte += 1;
 	if (dst)
-	  *dst++ = (wch & 0xff);
+	  *dst++ = (wch & 0xFF);
       } else {
 	nbyte += 3;
 	if (dst) {
@@ -267,7 +269,7 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
       if (ud->state == utf8_init) {
         return WEOF;
       } else {
-        wchar_t wch = 0xdc00 | ud->buf[ud->back];
+        wchar_t wch = 0xDC00 | ud->buf[ud->back];
         ud->tail = ud->back = (ud->back + 1) % 8;
         ud->state = utf8_init;
         return wch;
@@ -279,15 +281,15 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
       if (ch < 0x80) {
         ud->back = ud->tail;
         return ch;
-      } else if (ch >= 0xc0 && ch <= 0xe0) {
+      } else if (ch >= 0xC0 && ch <= 0xE0) {
         ud->state = utf8_more1;
-        ud->wch = (ch & 0x1f);
+        ud->wch = (ch & 0x1F);
 	ud->wch_min = 0x80;
-      } else if (ch >= 0xe0 && ch <= 0xef) {
+      } else if (ch >= 0xE0 && ch <= 0xEF) {
         ud->state = utf8_more2;
-        ud->wch = (ch & 0xf);
+        ud->wch = (ch & 0xF);
 	ud->wch_min = 0x800;
-      } else if (ch >= 0xf0 && ch < 0xf5) {
+      } else if (ch >= 0xF0 && ch < 0xF5) {
 #ifdef FULL_UNICODE
         ud->state = utf8_more3;
         ud->wch = (ch & 0x7);
@@ -297,19 +299,21 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
 #endif
       } else {
         ud->back = ud->tail;
-        return 0xdc00 | ch;
+        return 0xDC00 | ch;
       }
       break;
     case utf8_more1:
     case utf8_more2:
     case utf8_more3:
-      if (ch >= 0x80 && ch < 0xc0) {
+      if (ch >= 0x80 && ch < 0xC0) {
         ud->wch <<= 6;
-        ud->wch |= (ch & 0x3f);
+        ud->wch |= (ch & 0x3F);
         ud->state = (enum utf8_state) (ud->state - 1);
         if (ud->state == utf8_init) {
-	  if (ud->wch < ud->wch_min) {
-	    wchar_t wch = 0xdc00 | ud->buf[ud->back];
+	  if (ud->wch < ud->wch_min || 
+	      (ud->wch <= 0xFFFF && (ud->wch & 0xFF00) == 0xDC00))
+	  {
+	    wchar_t wch = 0xDC00 | ud->buf[ud->back];
 	    ud->tail = ud->back = (ud->back + 1) % 8;
 	    return wch;
 	  } else {
@@ -318,7 +322,7 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
 	  }
         }
       } else {
-        wchar_t wch = 0xdc00 | ud->buf[ud->back];
+        wchar_t wch = 0xDC00 | ud->buf[ud->back];
         ud->tail = ud->back = (ud->back + 1) % 8;
         ud->state = utf8_init;
         return wch;
author	Kaz Kylheku <kaz@kylheku.com>	2012-02-02 22:54:17 -0800
committer	Kaz Kylheku <kaz@kylheku.com>	2012-02-02 22:54:17 -0800
commit	a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7 (patch)
tree	1696c8ed5bd8d5da2bda03f9a0a78507139bdfa0 /utf8.c
parent	905b074cea7303553777e169529efc8aeccdc35a (diff)
download	txr-a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7.tar.gz txr-a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7.tar.bz2 txr-a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7.zip