3 files changed, 151 insertions, 21 deletions
diff --git a/ChangeLog b/ChangeLog
index 82ee1edf..d729a960 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,19 @@
 2009-11-12  Kaz Kylheku  <kkylheku@gmail.com>
 
+	* utf8.c (utf8_from): Fix total breakage.
+	Was writing out incomplete wide characters on internal
+	state transitions while traversing a single multi-byte character.
+	Also, improved handling of bad bytes close to EOF: if EOF
+	occurs in a multi-byte character, it will backtrack,
+	and skip one bad byte, etc.
+	(utf8_encode, utf8_decoder_init, utf8_decode): New functions.
+
+	* utf8.h (enum utf8_state): New enum.
+	(struct utf8_decoder, utf8_decoder_t): New struct.
+	(utf8_encode, utf8_decoder_init, utf8_decode): Declared.
+
+2009-11-12  Kaz Kylheku  <kkylheku@gmail.com>
+
 	Documenting extended characters in man page.
 	Cleaned up some more issues related to extended characters.
 
diff --git a/utf8.c b/utf8.c
index a3a23e8e..ca2e9016 100644
--- a/utf8.c
+++ b/utf8.c
@@ -27,31 +27,45 @@
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <wchar.h>
 #include "lib.h"
+#include "utf8.h"
 
 size_t utf8_from(wchar_t *wdst, const unsigned char *src)
 {
   size_t nchar = 1;
-  enum { init, more1, more2, more3 } state;
+  enum utf8_state state = utf8_init;
   const char *backtrack = 0;
-  int ch;
   wchar_t wch = 0;
 
-  for (state = init; (ch = *src); src++) {
+  for (;;) {
+    int ch = *src++;
+
+    if (ch == 0) {
+      if (state == utf8_init)
+        break;
+      src = backtrack;
+      if (wdst)
+        *wdst++ = 0xdc00 | *src;
+      nchar++;
+      state = utf8_init;
+      continue;
+    }
+
     switch (state) {
-    case init:
+    case utf8_init:
       if (ch < 0x80) {
         if (wdst)
           *wdst++ = ch;
         nchar++;
       } else if (ch >= 0xc2 && ch <= 0xe0) {
-        state = more1;
+        state = utf8_more1;
         wch = (ch & 0x1f);
       } else if (ch >= 0xe0 && ch <= 0xef) {
-        state = more2;
+        state = utf8_more2;
         wch = (ch & 0xf);
       } else if (ch >= 0xf0 && ch < 0xf5) {
-        state = more3;
+        state = utf8_more3;
         wch = (ch & 0x7);
       } else {
         if (wdst)
@@ -60,33 +74,28 @@ size_t utf8_from(wchar_t *wdst, const unsigned char *src)
       }
       backtrack = src;
       break;
-    case more1:
-    case more2:
-    case more3:
+    case utf8_more1:
+    case utf8_more2:
+    case utf8_more3:
       if (ch >= 0x80 && ch < 0xc0) {
         wch <<= 6;
         wch |= (ch & 0x3f);
-        if (wdst)
-          *wdst++ = wch;
-        nchar++;
-        state--;
+        if (--state == utf8_init) {
+          if (wdst)
+            *wdst++ = wch;
+          nchar++;
+        }
       } else {
         src = backtrack;
         if (wdst)
           *wdst++ = 0xdc00 | *src;
         nchar++;
-        state = init;
+        state = utf8_init;
       }
       break;
     }
   }
 
-  if (state != init) {
-    if (wdst)
-      *wdst++ = 0xdc00 | *backtrack;
-    nchar++;
-  }
-
   if (wdst)
     *wdst++ = 0;
   return nchar;
@@ -147,6 +156,99 @@ unsigned char *utf8_dup_to(const wchar_t *wstr)
   return str;
 }
 
+/* Feeds the UTF-8 bytes of wch to put, one call per byte, in order.
+   Codes below 0x80 return put's own result; longer forms return 1 if every
+   put call returned nonzero, else 0 (stopping at the first zero). Codes at
+   or above 0x110000 return 0 without calling put. */
+int utf8_encode(wchar_t wch, int (*put)(int ch, void *ctx), void *ctx)
+{
+  if (wch < 0x80)
+    return put(wch, ctx);
+  if (wch < 0x800)
+    return put(0xC0 | (wch >> 6), ctx) && put(0x80 | (wch & 0x3F), ctx);
+  if (wch < 0x10000)
+    return put(0xE0 | (wch >> 12), ctx) &&
+           put(0x80 | ((wch >> 6) & 0x3F), ctx) && put(0x80 | (wch & 0x3F), ctx);
+  if (wch >= 0x110000)
+    return 0;
+  return put(0xF0 | (wch >> 18), ctx) &&
+         put(0x80 | ((wch >> 12) & 0x3F), ctx) &&
+         put(0x80 | ((wch >> 6) & 0x3F), ctx) &&
+         put(0x80 | (wch & 0x3F), ctx);
+}
+
+void utf8_decoder_init(utf8_decoder_t *ud)
+{
+  ud->back = ud->tail = ud->head = 0; /* all three buf[] indices at slot 0 */
+  ud->wch = 0;                        /* no partially assembled character */
+  ud->state = utf8_init;              /* next byte is read as a first byte */
+}
+
+/* Decodes and returns the next character from the byte source get(ctx);
+   returns WEOF when get yields EOF with no character in progress.
+   ud carries the decoding state between calls and is expected to have been
+   prepared with utf8_decoder_init; ctx is handed to get unchanged.
+   A byte that cannot start or continue a character comes back as
+   0xdc00 | byte, and decoding then resumes right after the broken
+   character's first byte by replaying the following bytes out of ud->buf. */
+wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(void *ctx), void *ctx)
+{
+  for (;;) {
+    int ch;
+
+    if (ud->tail != ud->head) {
+      /* Bytes read ahead before a backtrack are consumed first. */
+      ch = ud->buf[ud->tail];
+      ud->tail = (ud->tail + 1) % 8;
+    } else {
+      /* Otherwise fetch a fresh value and record it for possible replay. */
+      ch = get(ctx);
+      ud->buf[ud->head] = ch;
+      ud->head = ud->tail = (ud->head + 1) % 8;
+    }
+
+    if (ud->state == utf8_init) {
+      if (ch == EOF) {
+        ud->back = ud->tail; /* stay in step in case get yields more later */
+        return WEOF;
+      } else if (ch < 0x80) {
+        ud->back = ud->tail;
+        return ch;
+      } else if (ch >= 0xc2 && ch <= 0xdf) {
+        /* Two-byte leads stop at 0xdf; 0xe0 begins a three-byte form. */
+        ud->state = utf8_more1;
+        ud->wch = (ch & 0x1f);
+      } else if (ch >= 0xe0 && ch <= 0xef) {
+        ud->state = utf8_more2;
+        ud->wch = (ch & 0xf);
+      } else if (ch >= 0xf0 && ch < 0xf5) {
+        ud->state = utf8_more3;
+        ud->wch = (ch & 0x7);
+      } else {
+        /* 0x80-0xc1 or 0xf5 and up: cannot begin a character. */
+        ud->back = ud->tail;
+        return 0xdc00 | ch;
+      }
+    } else if (ch >= 0x80 && ch < 0xc0) {
+      /* Continuation byte: append its low six bits. */
+      ud->wch <<= 6;
+      ud->wch |= (ch & 0x3f);
+      if (--ud->state == utf8_init) {
+        /* Stepping down to utf8_init means the last byte has arrived. */
+        ud->back = ud->tail;
+        return ud->wch;
+      }
+    } else {
+      /* EOF or a non-continuation byte cut the character short: report
+         its first byte as bad and restart from the byte after it. */
+      wchar_t wch = 0xdc00 | ud->buf[ud->back];
+      ud->tail = ud->back = (ud->back + 1) % 8;
+      ud->state = utf8_init;
+      return wch;
+    }
+  }
+}
+
 FILE *w_fopen(const wchar_t *wname, const wchar_t *wmode)
 {
   char *name = (char *) utf8_dup_to(wname);
diff --git a/utf8.h b/utf8.h
index 28e67fe2..542a84fa 100644
--- a/utf8.h
+++ b/utf8.h
@@ -28,5 +28,19 @@ size_t utf8_from(wchar_t *, const unsigned char *);
 size_t utf8_to(unsigned char *, const wchar_t *);
 wchar_t *utf8_dup_from(const unsigned char *);
 unsigned char *utf8_dup_to(const wchar_t *);
+
+enum utf8_state { utf8_init, utf8_more1, utf8_more2, utf8_more3 }; /* moreN: N bytes due; --state steps down */
+
+typedef struct utf8_decoder { /* state kept between utf8_decode calls */
+  enum utf8_state state; /* utf8_init, or how many bytes are still due */
+  wchar_t wch; /* bits of the character assembled so far */
+  int head, tail, back; /* buf[] slots: next write, next read, char start */
+  int buf[8]; /* ring of recent values returned by get, kept for replay */
+} utf8_decoder_t;
+
+int utf8_encode(wchar_t, int (*put)(int ch, void *ctx), void *ctx); /* sends the UTF-8 bytes of a character to put */
+void utf8_decoder_init(utf8_decoder_t *); /* resets a decoder to its starting condition */
+wint_t utf8_decode(utf8_decoder_t *,int (*get)(void *ctx), void *ctx); /* next character from get, or WEOF */
+
 FILE *w_fopen(const wchar_t *, const wchar_t *);
 FILE *w_popen(const wchar_t *, const wchar_t *);