1 files changed, 1064 insertions, 12 deletions
diff --git a/dfa.c b/dfa.c
index 97164127..77a6fb83 100644
--- a/dfa.c
+++ b/dfa.c
@@ -1,5 +1,5 @@
 /* dfa.c - deterministic extended regexp routines for GNU
-   Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
+   Copyright 1988, 1998, 2000, 2002 Free Software Foundation, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -46,6 +46,16 @@ extern void free();
 #include <strings.h>
 #endif
 
+#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
+/* We can handle multibyte string.  */
+# define MBS_SUPPORT
+#endif
+
+#ifdef MBS_SUPPORT
+# include <wchar.h>
+# include <wctype.h>
+#endif
+
 #ifndef DEBUG	/* use the same approach as regex.c */
 #undef assert
 #define assert(e)
@@ -91,17 +101,22 @@ extern void free();
    host does not conform to Posix.  */
 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
 
+/* Don't use gettext if ENABLE_NLS is not defined */
 /* If we (don't) have I18N.  */
 /* glibc defines _ */
-#ifndef _
-# ifdef HAVE_LIBINTL_H
-#  include <libintl.h>
-#  ifndef _
-#   define _(Str) gettext (Str)
+#ifdef ENABLE_NLS
+# ifndef _
+#  ifdef HAVE_LIBINTL_H
+#   include <libintl.h>
+#   ifndef _
+#    define _(Str) gettext (Str)
+#   endif
+#  else
+#   define _(Str) (Str)
 #  endif
-# else
-#  define _(Str) (Str)
 # endif
+#else
+# define _(Str) (Str)
 #endif
 
 #include "regex.h"
@@ -234,6 +249,10 @@ prtok (token t)
 	case ORTOP: s = "ORTOP"; break;
 	case LPAREN: s = "LPAREN"; break;
 	case RPAREN: s = "RPAREN"; break;
+#ifdef MBS_SUPPORT
+	case ANYCHAR: s = "ANYCHAR"; break;
+	case MBCSET: s = "MBCSET"; break;
+#endif /* MBS_SUPPORT */
 	default: s = "CSET"; break;
 	}
       fprintf(stderr, "%s", s);
@@ -349,8 +368,117 @@ static int laststart;		/* True if we're separated from beginning or (, |
 static int parens;		/* Count of outstanding left parens. */
 static int minrep, maxrep;	/* Repeat counts for {m,n}. */
 
+#ifdef MBS_SUPPORT
+/* These variables are used only if (MB_CUR_MAX > 1).  */
+static mbstate_t mbs;		/* Mbstate for mbrlen().  */
+static int cur_mb_len;		/* Byte length of the current scanning
+				   multibyte character.  */
+static int cur_mb_index;        /* Byte index of the current scanning multibyte
+                                   character.
+
+				   singlebyte character : cur_mb_index = 0
+				   multibyte character
+				       1st byte : cur_mb_index = 1
+				       2nd byte : cur_mb_index = 2
+				         ...
+				       nth byte : cur_mb_index = n  */
+static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
+				   Each element store the amount of remain
+				   byte of corresponding multibyte character
+				   in the input string.  A element's value
+				   is 0 if corresponding character is a
+				   singlebyte chracter.
+				   e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
+				    mblen_buf :   0,       3,       2,       1
+				*/
+				  
+static wchar_t *inputwcs;	/* Wide character representation of input
+				   string in dfaexec().
+				   The length of this array is same as
+				   the length of input string(char array).
+				   inputstring[i] is a single-byte char,
+				   or 1st byte of a multibyte char.
+				   And inputwcs[i] is the codepoint.  */
+static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
+static unsigned char const *buf_end;	/* refference to end in dfaexec().  */
+#endif /* MBS_SUPPORT  */
+
+#ifdef MBS_SUPPORT
+/* This function update cur_mb_len, and cur_mb_index.
+   p points current lexptr, len is the remaining buffer length.  */
+static void
+update_mb_len_index (unsigned char const *p, int len)
+{
+  /* If last character is a part of a multibyte character,
+     we update cur_mb_index.  */
+  if (cur_mb_index)
+    cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
+			: cur_mb_index + 1;
+
+  /* If last character is a single byte character, or the
+     last portion of a multibyte character, we check whether
+     next character is a multibyte character or not.  */
+  if (! cur_mb_index)
+    {
+      cur_mb_len = mbrlen(p, len, &mbs);
+      if (cur_mb_len > 1)
+	/* It is a multibyte character.
+	   cur_mb_len was already set by mbrlen().  */
+	cur_mb_index = 1;
+      else if (cur_mb_len < 1)
+	/* Invalid sequence.  We treat it as a singlebyte character.
+	   cur_mb_index is aleady 0.  */
+	cur_mb_len = 1;
+      /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
+	 cur_mb_index is aleady 0.  */
+    }
+}
+#endif /* MBS_SUPPORT */
+
+#ifdef MBS_SUPPORT
 /* Note that characters become unsigned here. */
-#define FETCH(c, eoferr)   	      \
+# define FETCH(c, eoferr)			\
+  {						\
+    if (! lexleft)				\
+     {						\
+	if (eoferr != 0)			\
+	  dfaerror (eoferr);			\
+	else					\
+	  return lasttok = END;			\
+      }						\
+    if (MB_CUR_MAX > 1)				\
+      update_mb_len_index(lexptr, lexleft);	\
+    (c) = (unsigned char) *lexptr++;		\
+    --lexleft;					\
+  }
+
+/* This function fetch a wide character, and update cur_mb_len,
+   used only if the current locale is a multibyte environment.  */
+static wint_t
+fetch_wc (char const *eoferr)
+{
+  wchar_t wc;
+  if (! lexleft)
+    {
+      if (eoferr != 0)
+	dfaerror (eoferr);
+      else
+	return WEOF;
+    }
+
+  cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
+  if (cur_mb_len <= 0)
+   {
+      cur_mb_len = 1;
+      wc = *lexptr;
+    }
+  lexptr += cur_mb_len;
+  lexleft -= cur_mb_len;
+  return wc;
+}
+#else
+/* Note that characters become unsigned here. */
+# define FETCH(c, eoferr)   	      \
   {			   	      \
     if (! lexleft)	   	      \
       {				      \
@@ -362,6 +490,202 @@ static int minrep, maxrep;	/* Repeat counts for {m,n}. */
     (c) = (unsigned char) *lexptr++;  \
     --lexleft;		   	      \
   }
+#endif /* MBS_SUPPORT */
+
+#ifdef MBS_SUPPORT
+/* Multibyte character handling sub-routin for lex.
+   This function  parse a bracket expression and build a struct
+   mb_char_classes.  */
+static void
+parse_bracket_exp_mb ()
+{
+  wint_t wc, wc1, wc2;
+
+  /* Work area to build a mb_char_classes.  */
+  struct mb_char_classes *work_mbc;
+  int chars_al, range_sts_al, range_ends_al, ch_classes_al,
+    equivs_al, coll_elems_al;
+
+  REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
+		       dfa->mbcsets_alloc, dfa->nmbcsets + 1);
+  /* dfa->multibyteb_prop[] hold the index of dfa->mbcsets.
+     We will update dfa->multibyte_prop[] in addtok(), because we can't
+     decide the index in dfa->tokens[].  */
+
+  /* Initialize work are */
+  work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
+
+  chars_al = 1;
+  range_sts_al = range_ends_al = 0;
+  ch_classes_al = equivs_al = coll_elems_al = 0;
+  MALLOC(work_mbc->chars, wchar_t, chars_al);
+
+  work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
+  work_mbc->nequivs = work_mbc->ncoll_elems = 0;
+  work_mbc->chars = work_mbc->ch_classes = NULL;
+  work_mbc->range_sts = work_mbc->range_ends = NULL;
+  work_mbc->equivs = work_mbc->coll_elems = NULL;
+
+  wc = fetch_wc(_("Unbalanced ["));
+  if (wc == L'^')
+    {
+      wc = fetch_wc(_("Unbalanced ["));
+      work_mbc->invert = 1;
+    }
+  else
+    work_mbc->invert = 0;
+  do
+    {
+      wc1 = WEOF; /* mark wc1 is not initialized".  */
+
+      /* Note that if we're looking at some other [:...:] construct,
+	 we just treat it as a bunch of ordinary characters.  We can do
+	 this because we assume regex has checked for syntax errors before
+	 dfa is ever called. */
+      if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
+	{
+#define BRACKET_BUFFER_SIZE 128
+	  char str[BRACKET_BUFFER_SIZE];
+	  wc1 = wc;
+	  wc = fetch_wc(_("Unbalanced ["));
+
+	  /* If pattern contains `[[:', `[[.', or `[[='.  */
+	  if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
+	    {
+	      unsigned char c;
+	      unsigned char delim = (unsigned char)wc;
+	      int len = 0;
+	      for (;;)
+		{
+		  if (! lexleft)
+		    dfaerror (_("Unbalanced ["));
+		  c = (unsigned char) *lexptr++;
+		  --lexleft;
+
+		  if ((c == delim && *lexptr == ']') || lexleft == 0)
+		    break;
+		  if (len < BRACKET_BUFFER_SIZE)
+		    str[len++] = c;
+		  else
+		    /* This is in any case an invalid class name.  */
+		    str[0] = '\0';
+		}
+	      str[len] = '\0';
+
+	      if (lexleft == 0)
+		{
+		  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+				       work_mbc->nchars + 2);
+		  work_mbc->chars[work_mbc->nchars++] = L'[';
+		  work_mbc->chars[work_mbc->nchars++] = delim;
+		  break; 
+		}
+
+	      if (--lexleft, *lexptr++ != ']')
+		dfaerror (_("Unbalanced ["));
+	      if (delim == ':')
+		/* build character class.  */
+		{
+		  wctype_t wt;
+		  /* Query the character class as wctype_t.  */
+		  wt = wctype (str);
+
+		  if (ch_classes_al == 0)
+		    MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
+		  REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
+				       ch_classes_al,
+				       work_mbc->nch_classes + 1);
+		  work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
+
+ 		}
+	      else if (delim == '=' || delim == '.')
+		{
+		  char *elem;
+		  MALLOC(elem, char, len + 1);
+		  strncpy(elem, str, len + 1);
+
+		  if (delim == '=')
+		    /* build equivalent class.  */
+		    {
+		      if (equivs_al == 0)
+			MALLOC(work_mbc->equivs, char*, ++equivs_al);
+		      REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
+					   equivs_al,
+					   work_mbc->nequivs + 1);
+		      work_mbc->equivs[work_mbc->nequivs++] = elem;
+		    }
+
+		  if (delim == '.')
+		    /* build collating element.  */
+		    {
+		      if (coll_elems_al == 0)
+			MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
+		      REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
+					   coll_elems_al,
+					   work_mbc->ncoll_elems + 1);
+		      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
+		    }
+ 		}
+	      wc = WEOF;
+	    }
+	  else
+	    /* We treat '[' as a normal character here.  */
+	    {
+	      wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
+	    }
+	}
+      else
+	{
+	  if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+	    wc = fetch_wc(("Unbalanced ["));
+	}
+
+      if (wc1 == WEOF)
+	wc1 = fetch_wc(_("Unbalanced ["));
+
+      if (wc1 == L'-')
+	/* build range characters.  */
+	{
+	  wc2 = fetch_wc(_("Unbalanced ["));
+	  if (wc2 == L']')
+	    {
+	      /* In the case [x-], the - is an ordinary hyphen,
+		 which is left in c1, the lookahead character. */
+	      lexptr -= cur_mb_len;
+	      lexleft += cur_mb_len;
+	      wc2 = wc;
+	    }
+	  else
+	    {
+	      if (wc2 == L'\\'
+		  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+		wc2 = fetch_wc(_("Unbalanced ["));
+	      wc1 = fetch_wc(_("Unbalanced ["));
+	    }
+
+	  if (range_sts_al == 0)
+	    {
+	      MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
+	      MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
+	    }
+	  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+			       range_sts_al, work_mbc->nranges + 1);
+	  work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
+	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+			       range_ends_al, work_mbc->nranges + 1);
+	  work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
+	}
+      else if (wc != WEOF)
+	/* build normal characters.  */
+	{
+	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+			       work_mbc->nchars + 1);
+	  work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
+	}
+    }
+  while ((wc = wc1) != L']');
+}
+#endif /* MBS_SUPPORT */
 
 #ifdef __STDC__
 #define FUNC(F, P) static int F(int c) { return P(c); }
@@ -442,6 +766,14 @@ lex (void)
   for (i = 0; i < 2; ++i)
     {
       FETCH(c, 0);
+#ifdef MBS_SUPPORT
+      if (MB_CUR_MAX > 1 && cur_mb_index)
+	/* If this is a part of a multi-byte character, we must treat
+	   this byte data as a normal character.
+	   e.g. In case of SJIS encoding, some character contains '\',
+	        but they must not be backslash.  */
+	goto normal_char;
+#endif /* MBS_SUPPORT  */
       switch (c)
 	{
 	case '\\':
@@ -662,6 +994,15 @@ lex (void)
 	case '.':
 	  if (backslash)
 	    goto normal_char;
+#ifdef MBS_SUPPORT
+	  if (MB_CUR_MAX > 1)
+	    {
+	      /* In multibyte environment period must match with a single
+		 character not a byte.  So we use ANYCHAR.  */
+	      laststart = 0;
+	      return lasttok = ANYCHAR;
+	    }
+#endif /* MBS_SUPPORT */
 	  zeroset(ccl);
 	  notset(ccl);
 	  if (!(syntax_bits & RE_DOT_NEWLINE))
@@ -687,6 +1028,17 @@ lex (void)
 	case '[':
 	  if (backslash)
 	    goto normal_char;
+#ifdef MBS_SUPPORT
+	  if (MB_CUR_MAX > 1)
+	    {
+	      /* In multibyte environment a bracket expression may contain
+		 multibyte characters, which must be treated as characters
+		 (not bytes).  So we parse it by parse_bracket_exp_mb().  */
+              laststart = 0;
+              parse_bracket_exp_mb();
+	      return lasttok = MBCSET;
+	    }
+#endif
 	  zeroset(ccl);
 	  FETCH(c, _("Unbalanced ["));
 	  if (c == '^')
@@ -815,6 +1167,26 @@ static int depth;		/* Current depth of a hypothetical stack
 static void
 addtok (token t)
 {
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    {
+      REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
+			   dfa->tindex);
+      /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */
+      if (t == MBCSET)
+        dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
+      else if (t < NOTCHAR)
+	dfa->multibyte_prop[dfa->tindex]
+	  = (cur_mb_len == 1)? 3 /* single-byte char */
+	  : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
+	     + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
+      else
+        /* It may be unnecessary, but it is safer to treat other symbols
+           as single byte characters.  */
+        dfa->multibyte_prop[dfa->tindex] = 3;
+    }
+#endif
+
   REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
   dfa->tokens[dfa->tindex++] = t;
 
@@ -858,7 +1230,10 @@ addtok (token t)
      atom
 
    atom:
+     <multibyte character>
      <normal character>
+     ANYCHAR
+     MBCSET
      CSET
      BACKREF
      BEGLINE
@@ -876,10 +1251,31 @@ atom (void)
 {
   if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
       || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
+#ifdef MBS_SUPPORT
+      || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
+#endif /* MBS_SUPPORT */
       || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
     {
       addtok(tok);
       tok = lex();
+#ifdef MBS_SUPPORT
+      /* We treat a multibyte character as a single atom, so that DFA
+	 can treat a multibyte character as a single expression.
+
+         e.g. We construct following tree from "<mb1><mb2>".
+              <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
+              <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
+      */
+      if (MB_CUR_MAX > 1)
+	{
+	  while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
+	    {
+	      addtok(tok);
+	      addtok(CAT);
+	      tok = lex();
+            }
+	}
+#endif /* MBS_SUPPORT  */
     }
   else if (tok == LPAREN)
     {
@@ -998,6 +1394,14 @@ dfaparse (char *s, size_t len, struct dfa *d)
   lasttok = END;
   laststart = 1;
   parens = 0;
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    {
+      cur_mb_index = 0;
+      cur_mb_len = 0;
+      memset(&mbs, 0, sizeof(mbstate_t));
+    }
+#endif /* MBS_SUPPORT  */
 
   if (! syntax_bits_set)
     dfaerror(_("No regexp syntax bits specified"));
@@ -1139,6 +1543,10 @@ state_index (struct dfa *d, position_set *s, int newline, int letter)
   d->states[i].backref = 0;
   d->states[i].constraint = 0;
   d->states[i].first_end = 0;
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    d->states[i].mbps.nelem = 0;
+#endif
   for (j = 0; j < s->nelem; ++j)
     if (d->tokens[s->elems[j].index] < 0)
       {
@@ -1181,6 +1589,10 @@ epsclosure (position_set *s, struct dfa *d)
   for (i = 0; i < s->nelem; ++i)
     if (d->tokens[s->elems[i].index] >= NOTCHAR
 	&& d->tokens[s->elems[i].index] != BACKREF
+#ifdef MBS_SUPPORT
+	&& d->tokens[s->elems[i].index] != ANYCHAR
+	&& d->tokens[s->elems[i].index] != MBCSET
+#endif
 	&& d->tokens[s->elems[i].index] < CSET)
       {
 	old = s->elems[i];
@@ -1462,6 +1874,10 @@ dfaanalyze (struct dfa *d, int searchflag)
      it with its epsilon closure. */
   for (i = 0; i < d->tindex; ++i)
     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
+#ifdef MBS_SUPPORT
+        || d->tokens[i] == ANYCHAR
+        || d->tokens[i] == MBCSET
+#endif
 	|| d->tokens[i] >= CSET)
       {
 #ifdef DEBUG
@@ -1563,6 +1979,9 @@ dfastate (int s, struct dfa *d, int trans[])
   int wants_letter;		/* New state wants to know letter context. */
   int state_letter;		/* New state on a letter transition. */
   static int initialized;	/* Flag for static initialization. */
+#ifdef MBS_SUPPORT
+  int next_isnt_1st_byte = 0;	/* Flag if we can't add state0.  */
+#endif 
   int i, j, k;
 
   /* Initialize the set of letters, if necessary. */
@@ -1584,6 +2003,23 @@ dfastate (int s, struct dfa *d, int trans[])
 	setbit(d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
 	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
+#ifdef MBS_SUPPORT
+      else if (d->tokens[pos.index] == ANYCHAR
+               || d->tokens[pos.index] == MBCSET)
+      /* MB_CUR_MAX > 1  */
+	{
+	  /* ANYCHAR and MBCSET must match with a single character, so we
+	     must put it to d->states[s].mbps, which contains the positions
+	     which can match with a single character not a byte.  */
+	  if (d->states[s].mbps.nelem == 0)
+	    {
+	      MALLOC(d->states[s].mbps.elems, position,
+		     d->states[s].elems.nelem);
+	    }
+	  insert(pos, &(d->states[s].mbps));
+	  continue;
+	}
+#endif /* MBS_SUPPORT */
       else
 	continue;
 
@@ -1720,9 +2156,46 @@ dfastate (int s, struct dfa *d, int trans[])
 	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
 	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
 
+#ifdef MBS_SUPPORT
+      if (MB_CUR_MAX > 1)
+	{
+	  /* If a token in follows.elems is not 1st byte of a multibyte
+	     character, or the states of follows must accept the bytes
+	     which are not 1st byte of the multibyte character.
+	     Then, if a state of follows encounter a byte, it must not be
+	     a 1st byte of a multibyte character nor singlebyte character.
+	     We cansel to add state[0].follows to next state, because
+	     state[0] must accept 1st-byte
+
+	     For example, we assume <sb a> is a certain singlebyte
+	     character, <mb A> is a certain multibyte character, and the
+	     codepoint of <sb a> equals the 2nd byte of the codepoint of
+	     <mb A>.
+	     When state[0] accepts <sb a>, state[i] transit to state[i+1]
+	     by accepting accepts 1st byte of <mb A>, and state[i+1]
+	     accepts 2nd byte of <mb A>, if state[i+1] encounter the
+	     codepoint of <sb a>, it must not be <sb a> but 2nd byte of
+	     <mb A>, so we can not add state[0].  */
+
+	  next_isnt_1st_byte = 0;
+	  for (j = 0; j < follows.nelem; ++j)
+	    {
+	      if (!(d->multibyte_prop[follows.elems[j].index] & 1))
+		{
+		  next_isnt_1st_byte = 1;
+		  break;
+		}
+	    }
+	}
+#endif
+
       /* If we are building a searching matcher, throw in the positions
 	 of state 0 as well. */
+#ifdef MBS_SUPPORT
+      if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
+#else
       if (d->searchflag)
+#endif
 	for (j = 0; j < d->states[0].elems.nelem; ++j)
 	  insert(d->states[0].elems.elems[j], &follows);
 
@@ -1871,6 +2344,434 @@ build_state_zero (struct dfa *d)
   build_state(0, d);
 }
 
+#ifdef MBS_SUPPORT
+/* Multibyte character handling sub-routins for dfaexec.  */
+
+/* Initial state may encounter the byte which is not a singlebyte character
+   nor 1st byte of a multibyte character.  But it is incorrect for initial
+   state to accept such a byte.
+   For example, in sjis encoding the regular expression like "\\" accepts
+   the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
+   0x815c. Then Initial state must skip the bytes which are not a singlebyte
+   character nor 1st byte of a multibyte character.  */
+#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)		\
+  if (s == 0)						\
+    {							\
+      while (inputwcs[p - buf_begin] == 0		\
+	     && mblen_buf[p - buf_begin] > 0		\
+             && (unsigned char const *)p < buf_end)	\
+        ++p;						\
+      if ((char*)p >= end)				\
+        {						\
+          free(mblen_buf);				\
+          free(inputwcs);				\
+	  return NULL;					\
+	}						\
+    }
+
+static void
+realloc_trans_if_necessary(struct dfa *d, int new_state)
+{
+  /* Make sure that the trans and fail arrays are allocated large enough
+     to hold a pointer for the new state. */
+  if (new_state >= d->tralloc)
+    {
+      int oldalloc = d->tralloc;
+
+      while (new_state >= d->tralloc)
+	d->tralloc *= 2;
+      REALLOC(d->realtrans, int *, d->tralloc + 1);
+      d->trans = d->realtrans + 1;
+      REALLOC(d->fails, int *, d->tralloc);
+      REALLOC(d->success, int, d->tralloc);
+      REALLOC(d->newlines, int, d->tralloc);
+      while (oldalloc < d->tralloc)
+	{
+	  d->trans[oldalloc] = NULL;
+	  d->fails[oldalloc++] = NULL;
+	}
+    }
+}
+
+/* Return values of transit_state_singlebyte(), and
+   transit_state_consume_1char.  */
+typedef enum
+{
+  TRANSIT_STATE_IN_PROGRESS,	/* State transition has not finished.  */
+  TRANSIT_STATE_DONE,		/* State transition has finished.  */
+  TRANSIT_STATE_END_BUFFER	/* Reach the end of the buffer.  */
+} status_transit_state;
+
+/* Consume a single byte and transit state from 's' to '*next_state'.
+   This function is almost same as the state transition routin in dfaexec().
+   But state transition is done just once, otherwise matching succeed or
+   reach the end of the buffer.  */
+static status_transit_state
+transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
+				  int *next_state)
+{
+  int *t;
+  int works = s;
+
+  status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
+
+  while (rval == TRANSIT_STATE_IN_PROGRESS)
+    {
+      if ((t = d->trans[works]) != NULL)
+	{
+	  works = t[*p];
+	  rval = TRANSIT_STATE_DONE;
+	  if (works < 0)
+	    works = 0;
+	}
+      else
+	{
+	  if (works >= 0 && d->fails[works])
+	    {
+	      works = d->fails[works][*p];
+	      rval = TRANSIT_STATE_DONE;
+	    }
+
+	  else  if (works >= 0)
+	    {
+	      build_state(works, d);
+	    }
+	  else
+	    {
+	      works = 0;
+	    }
+	}
+    }
+  *next_state = works;
+  return rval;
+}
+
+/* Check whether period can match or not in the current context.  If it can,
+   return the amount of the bytes with which period can match, otherwise
+   return 0.
+   `pos' is the position of the period.  `index' is the index from the
+   buf_begin, and it is the current position in the buffer.  */
+static int
+match_anychar (struct dfa *d, int s, position pos, int index)
+{
+  int newline = 0;
+  int letter = 0;
+  wchar_t wc;
+  int mbclen;
+
+  wc = inputwcs[index];
+  mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
+
+  /* Check context.  */
+  if (wc == (wchar_t)eolbyte)
+    {
+      if (!(syntax_bits & RE_DOT_NEWLINE))
+	return 0;
+      newline = 1;
+    }
+  else if (wc == (wchar_t)'\0')
+    {
+      if (syntax_bits & RE_DOT_NOT_NULL)
+	return 0;
+      newline = 1;
+    }
+
+  if (iswalnum(wc) || wc == L'_')
+    letter = 1;
+
+  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
+			   newline, d->states[s].letter, letter))
+    return 0;
+
+  return mbclen;
+}
+
+/* Check whether bracket expression can match or not in the current context.
+   If it can, return the amount of the bytes with which expression can match,
+   otherwise return 0.
+   `pos' is the position of the bracket expression.  `index' is the index
+   from the buf_begin, and it is the current position in the buffer.  */
+int
+match_mb_charset (struct dfa *d, int s, position pos, int index)
+{
+  int i;
+  int match;		/* Flag which represent that matching succeed.  */
+  int match_len;	/* Length of the character (or collating element)
+			   with which this operator match.  */
+  int op_len;		/* Length of the operator.  */
+  char buffer[128];
+  wchar_t wcbuf[6];
+
+  /* Pointer to the structure to which we are currently reffering.  */
+  struct mb_char_classes *work_mbc;
+
+  int newline = 0;
+  int letter = 0;
+  wchar_t wc;		/* Current reffering character.  */
+
+  wc = inputwcs[index];
+
+  /* Check context.  */
+  if (wc == (wchar_t)eolbyte)
+    {
+      if (!(syntax_bits & RE_DOT_NEWLINE))
+	return 0;
+      newline = 1;
+    }
+  else if (wc == (wchar_t)'\0')
+    {
+      if (syntax_bits & RE_DOT_NOT_NULL)
+	return 0;
+      newline = 1;
+    }
+  if (iswalnum(wc) || wc == L'_')
+    letter = 1;
+  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
+			   newline, d->states[s].letter, letter))
+    return 0;
+
+  /* Assign the current reffering operator to work_mbc.  */
+  work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
+  match = !work_mbc->invert;
+  match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
+
+  /* match with a character class?  */
+  for (i = 0; i<work_mbc->nch_classes; i++)
+    {
+      if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
+	goto charset_matched;
+    }
+
+  strncpy(buffer, buf_begin + index, match_len);
+  buffer[match_len] = '\0';
+
+  /* match with an equivalent class?  */
+  for (i = 0; i<work_mbc->nequivs; i++)
+    {
+      op_len = strlen(work_mbc->equivs[i]);
+      strncpy(buffer, buf_begin + index, op_len);
+      buffer[op_len] = '\0';
+      if (strcoll(work_mbc->equivs[i], buffer) == 0)
+	{
+	  match_len = op_len;
+	  goto charset_matched;
+	}
+    }
+
+  /* match with a collating element?  */
+  for (i = 0; i<work_mbc->ncoll_elems; i++)
+    {
+      op_len = strlen(work_mbc->coll_elems[i]);
+      strncpy(buffer, buf_begin + index, op_len);
+      buffer[op_len] = '\0';
+
+      if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
+	{
+	  match_len = op_len;
+	  goto charset_matched;
+	}
+    }
+
+  wcbuf[0] = wc;
+  wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
+
+  /* match with a range?  */
+  for (i = 0; i<work_mbc->nranges; i++)
+    {
+      wcbuf[2] = work_mbc->range_sts[i];
+      wcbuf[4] = work_mbc->range_ends[i];
+
+      if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
+	  wcscoll(wcbuf+4, wcbuf) >= 0)
+	goto charset_matched;
+    }
+
+  /* match with a character?  */
+  for (i = 0; i<work_mbc->nchars; i++)
+    {
+      if (wc == work_mbc->chars[i])
+	goto charset_matched;
+    }
+
+  match = !match;
+
+ charset_matched:
+  return match ? match_len : 0;
+}
+
+/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
+   array which corresponds to `d->states[s].mbps.elem' and each element of
+   the array contains the amount of the bytes with which the element can
+   match.
+   `index' is the index from the buf_begin, and it is the current position
+   in the buffer.
+   Caller MUST free the array which this function return.  */
+static int*
+check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
+{
+  int i;
+  int* rarray;
+
+  MALLOC(rarray, int, d->states[s].mbps.nelem);
+  for (i = 0; i < d->states[s].mbps.nelem; ++i)
+    {
+      position pos = d->states[s].mbps.elems[i];
+      switch(d->tokens[pos.index])
+	{
+	case ANYCHAR:
+	  rarray[i] = match_anychar(d, s, pos, index);
+	  break;
+	case MBCSET:
+	  rarray[i] = match_mb_charset(d, s, pos, index);
+	  break;
+	default:
+	  break; /* can not happen.  */
+	}
+    }
+  return rarray;
+}
+
+/* Consume a single character and enumerate all of the positions which can
+   be next position from the state `s'.
+   `match_lens' is the input. It can be NULL, but it can also be the output
+   of check_matching_with_multibyte_ops() for optimization.
+   `mbclen' and `pps' are the output.  `mbclen' is the length of the
+   character consumed, and `pps' is the set this function enumerate.  */
+static status_transit_state 
+transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
+			     int *match_lens, int *mbclen, position_set *pps)
+{
+  int i, j;
+  int s1, s2;
+  int* work_mbls;
+  status_transit_state rs = TRANSIT_STATE_DONE;
+
+  /* Calculate the length of the (single/multi byte) character
+     to which p points.  */
+  *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
+    : mblen_buf[*pp - buf_begin];
+
+  /* Calculate the state which can be reached from the state `s' by
+     consuming `*mbclen' single bytes from the buffer.  */
+  s1 = s;
+  for (i = 0; i < *mbclen; i++)
+    {
+      s2 = s1;
+      rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
+    }
+  /* Copy the positions contained by `s1' to the set `pps'.  */
+  copy(&(d->states[s1].elems), pps);
+
+  /* Check (inputed)match_lens, and initialize if it is NULL.  */
+  if (match_lens == NULL && d->states[s].mbps.nelem != 0)
+    work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
+  else
+    work_mbls = match_lens;
+
+  /* Add all of the positions which can be reached from `s' by consuming
+     a single character.  */
+  for (i = 0; i < d->states[s].mbps.nelem ; i++)
+   {
+      if (work_mbls[i] == *mbclen)
+	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
+	     j++)
+	  insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
+		 pps);
+    }
+
+  if (match_lens == NULL && work_mbls != NULL)
+    free(work_mbls);
+  return rs;
+}
+
+/* Transit state from s, then return new state and update the pointer of the
+   buffer.  This function is for some operator which can match with a multi-
+   byte character or a collating element(which may be multi characters).  */
+static int
+transit_state (struct dfa *d, int s, unsigned char const **pp)
+{
+  int s1;
+  int mbclen;		/* The length of current input multibyte character. */
+  int maxlen = 0;
+  int i, j;
+  int *match_lens = NULL;
+  int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
+  position_set follows;
+  unsigned char const *p1 = *pp;
+  status_transit_state rs;
+  wchar_t wc;
+
+  if (nelem > 0)
+    /* This state has (a) multibyte operator(s).
+       We check whether each of them can match or not.  */
+    {
+      /* Note: caller must free the return value of this function.  */
+      match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
+
+      for (i = 0; i < nelem; i++)
+	/* Search the operator which match the longest string,
+	   in this state.  */
+	{
+	  if (match_lens[i] > maxlen)
+	    maxlen = match_lens[i];
+	}
+    }
+
+  if (nelem == 0 || maxlen == 0)
+    /* This state has no multibyte operator which can match.
+       We need to  check only one singlebyte character.  */
+    {
+      status_transit_state rs;
+      rs = transit_state_singlebyte(d, s, *pp, &s1);
+
+      /* We must update the pointer if state transition succeeded.  */
+      if (rs == TRANSIT_STATE_DONE)
+	++*pp;
+
+      if (match_lens != NULL)
+	free(match_lens);
+      return s1;
+    }
+
+  /* This state has some operators which can match a multibyte character.  */
+  follows.nelem = 0;
+  MALLOC(follows.elems, position, d->nleaves);
+
+  /* `maxlen' may be longer than the length of a character, because it may
+     not be a character but a (multi character) collating element.
+     We enumerate all of the positions which `s' can reach by consuming
+     `maxlen' bytes.  */
+  rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
+
+  wc = inputwcs[*pp - mbclen - buf_begin];
+  s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
+  realloc_trans_if_necessary(d, s1);
+
+  while (*pp - p1 < maxlen)
+    {
+      follows.nelem = 0;
+      rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
+
+      for (i = 0; i < nelem ; i++)
+	{
+	  if (match_lens[i] == *pp - p1)
+	    for (j = 0;
+		 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
+	      insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
+		     &follows);
+	}
+
+      wc = inputwcs[*pp - mbclen - buf_begin];
+      s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
+      realloc_trans_if_necessary(d, s1);
+    }
+  free(match_lens);
+  free(follows.elems);
+  return s1;
+}
+
+#endif
+
 /* Search through a buffer looking for a match to the given struct dfa.
    Find the first occurrence of a string matching the regexp in the buffer,
    and the shortest possible version thereof.  Return a pointer to the first
@@ -1914,8 +2815,80 @@ dfaexec (struct dfa *d, char *begin, char *end,
   trans = d->trans;
   *end = eol;
 
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    {
+      int remain_bytes, i;
+      buf_begin = begin;
+      buf_end = end;
+
+      /* initialize mblen_buf, and inputwcs.  */
+      MALLOC(mblen_buf, unsigned char, end - begin + 2);
+      MALLOC(inputwcs, wchar_t, end - begin + 2);
+      memset(&mbs, 0, sizeof(mbstate_t));
+      remain_bytes = 0;
+      for (i = 0; i < end - begin + 1; i++)
+	{
+	  if (remain_bytes == 0)
+	    {
+	      remain_bytes
+		= mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
+	      if (remain_bytes <= 1)
+		{
+		  remain_bytes = 0;
+		  inputwcs[i] = (wchar_t)begin[i];
+		  mblen_buf[i] = 0;
+		}
+	      else
+		{
+		  mblen_buf[i] = remain_bytes;
+		  remain_bytes--;
+		}
+	    }
+	  else
+	    {
+	      mblen_buf[i] = remain_bytes;
+	      inputwcs[i] = 0;
+	      remain_bytes--;
+	    }
+	}
+      mblen_buf[i] = 0;
+      inputwcs[i] = 0; /* sentinel */
+    }
+#endif /* MBS_SUPPORT */
+
   for (;;)
     {
+#ifdef MBS_SUPPORT
+      if (MB_CUR_MAX > 1)
+        while ((t = trans[s]))
+	  {
+            if ((char *) p > end)
+              break;
+            s1 = s;
+	    if (d->states[s].mbps.nelem != 0)
+	      {
+		/* Can match with a multibyte character( and multi character
+		   collating element).  */
+		unsigned char const *nextp;
+
+		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
+
+		nextp = p;
+		s = transit_state(d, s, &nextp);
+		p = (unsigned char *)nextp;
+
+		/* Trans table might be updated.  */
+		trans = d->trans;
+	      }
+	    else
+	      {
+		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
+		s = t[*p++];
+	      }
+	  }
+      else
+#endif /* MBS_SUPPORT */
       while ((t = trans[s]) != 0) { /* hand-optimized loop */
 	s1 = t[*p++];
         if ((t = trans[s1]) == 0) {
@@ -1931,10 +2904,30 @@ dfaexec (struct dfa *d, char *begin, char *end,
 	    {
 	      if (backref)
 		*backref = (d->states[s].backref != 0);
-	      return (char *) p;
+#ifdef MBS_SUPPORT
+              if (MB_CUR_MAX > 1)
+		{
+		  free(mblen_buf);
+		  free(inputwcs);
+		}
+#endif /* MBS_SUPPORT */
+              return (char *) p;
 	    }
 
 	  s1 = s;
+#ifdef MBS_SUPPORT
+	  if (MB_CUR_MAX > 1)
+	    {
+	      unsigned char const *nextp;
+	      nextp = p;
+	      s = transit_state(d, s, &nextp);
+	      p = (unsigned char *)nextp;
+
+	      /* Trans table might be updated.  */
+	      trans = d->trans;
+	    }
+	  else
+#endif /* MBS_SUPPORT */
 	  s = d->fails[s][*p++];
 	  continue;
 	}
@@ -1945,7 +2938,16 @@ dfaexec (struct dfa *d, char *begin, char *end,
 
       /* Check if we've run off the end of the buffer. */
       if ((char *) p > end)
-	return NULL;
+        {
+#ifdef MBS_SUPPORT
+          if (MB_CUR_MAX > 1)
+	    {
+	      free(mblen_buf);
+	      free(inputwcs);
+	    }
+#endif /* MBS_SUPPORT */
+          return NULL;
+        }
 
       if (s >= 0)
 	{
@@ -1976,6 +2978,16 @@ dfainit (struct dfa *d)
   d->talloc = 1;
   MALLOC(d->tokens, token, d->talloc);
   d->tindex = d->depth = d->nleaves = d->nregexps = 0;
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    {
+      d->nmultibyte_prop = 1;
+      MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
+      d->nmbcsets = 0;
+      d->mbcsets_alloc = 1;
+      MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
+    }
+#endif
 
   d->searchflag = 0;
   d->tralloc = 0;
@@ -2036,6 +3048,37 @@ dfafree (struct dfa *d)
 
   free((ptr_t) d->charclasses);
   free((ptr_t) d->tokens);
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    {
+      free((ptr_t) d->multibyte_prop);
+      for (i = 0; i < d->nmbcsets; ++i)
+	{
+	  int j;
+	  struct mb_char_classes *p = &(d->mbcsets[i]);
+	  if (p->chars != NULL)
+	    free(p->chars);
+	  if (p->ch_classes != NULL)
+	    free(p->ch_classes);
+	  if (p->range_sts != NULL)
+	    free(p->range_sts);
+	  if (p->range_ends != NULL)
+	    free(p->range_ends);
+
+	  for (j = 0; j < p->nequivs; ++j)
+	    free(p->equivs[j]);
+	  if (p->equivs != NULL)
+	    free(p->equivs);
+
+	  for (j = 0; j < p->ncoll_elems; ++j)
+	    free(p->coll_elems[j]);
+	  if (p->coll_elems != NULL)
+	    free(p->coll_elems);
+	}
+      free((ptr_t) d->mbcsets);
+    }
+#endif /* MBS_SUPPORT */
+
   for (i = 0; i < d->sindex; ++i)
     free((ptr_t) d->states[i].elems.elems);
   free((ptr_t) d->states);
@@ -2091,6 +3134,10 @@ dfafree (struct dfa *d)
 	----	----		-----		--		--
 	char c	# c		# c		# c		# c
 
+	ANYCHAR	ZERO		ZERO		ZERO		ZERO
+
+	MBCSET	ZERO		ZERO		ZERO		ZERO
+
 	CSET	ZERO		ZERO		ZERO		ZERO
 
 	STAR	ZERO		ZERO		ZERO		ZERO
@@ -2536,7 +3583,12 @@ dfamust (struct dfa *dfa)
 	      /* not on *my* shift */
 	      goto done;
 	    }
-	  else if (t >= CSET)
+	  else if (t >= CSET
+#ifdef MBS_SUPPORT
+		   || t == ANYCHAR
+		   || t == MBCSET
+#endif /* MBS_SUPPORT */
+		   )
 	    {
 	      /* easy enough */
 	      resetmust(mp);