summaryrefslogtreecommitdiffstats
path: root/lib/regex_internal.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/regex_internal.c')
-rw-r--r--lib/regex_internal.c150
1 files changed, 114 insertions, 36 deletions
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index 55fe298..2129888 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -1,5 +1,6 @@
/* Extended regular expression matching and search library.
- Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 Free Software
+ Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
@@ -488,27 +489,34 @@ re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
mbstate_t prev_st;
Idx rawbuf_idx;
size_t mbclen;
- wchar_t wc = 0;
+ wint_t wc = WEOF;
/* Skip the characters which are not necessary to check. */
for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
rawbuf_idx < new_raw_idx;)
{
+ wchar_t wc2;
Idx remain_len;
remain_len = pstr->len - rawbuf_idx;
prev_st = pstr->cur_state;
- mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
+ mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
remain_len, &pstr->cur_state);
if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
{
- /* We treat these cases as a singlebyte character. */
+ /* We treat these cases as a single byte character. */
+ if (mbclen == 0 || remain_len == 0)
+ wc = L'\0';
+ else
+ wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
mbclen = 1;
pstr->cur_state = prev_st;
}
+ else
+ wc = wc2;
/* Then proceed the next character. */
rawbuf_idx += mbclen;
}
- *last_wc = (wint_t) wc;
+ *last_wc = wc;
return rawbuf_idx;
}
#endif /* RE_ENABLE_I18N */
@@ -590,34 +598,98 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
if (BE (offset != 0, 1))
{
- /* Are the characters which are already checked remain? */
- if (BE (offset < pstr->valid_raw_len, 1)
-#ifdef RE_ENABLE_I18N
- /* Handling this would enlarge the code too much.
- Accept a slowdown in that case. */
- && pstr->offsets_needed == 0
-#endif
- )
+ /* Should the already checked characters be kept? */
+ if (BE (offset < pstr->valid_raw_len, 1))
{
/* Yes, move them to the front of the buffer. */
- pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
#ifdef RE_ENABLE_I18N
- if (pstr->mb_cur_max > 1)
- memmove (pstr->wcs, pstr->wcs + offset,
- (pstr->valid_len - offset) * sizeof (wint_t));
+ if (BE (pstr->offsets_needed, 0))
+ {
+ Idx low = 0, high = pstr->valid_len, mid;
+ do
+ {
+ mid = (high + low) / 2;
+ if (pstr->offsets[mid] > offset)
+ high = mid;
+ else if (pstr->offsets[mid] < offset)
+ low = mid + 1;
+ else
+ break;
+ }
+ while (low < high);
+ if (pstr->offsets[mid] < offset)
+ ++mid;
+ pstr->tip_context = re_string_context_at (pstr, mid - 1,
+ eflags);
+ /* This can be quite complicated, so handle specially
+ only the common and easy case where the character with
+ different length representation of lower and upper
+ case is present at or after offset. */
+ if (pstr->valid_len > offset
+ && mid == offset && pstr->offsets[mid] == offset)
+ {
+ memmove (pstr->wcs, pstr->wcs + offset,
+ (pstr->valid_len - offset) * sizeof (wint_t));
+ memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
+ pstr->valid_len -= offset;
+ pstr->valid_raw_len -= offset;
+ for (low = 0; low < pstr->valid_len; low++)
+ pstr->offsets[low] = pstr->offsets[low + offset] - offset;
+ }
+ else
+ {
+ /* Otherwise, just find out how long the partial multibyte
+ character at offset is and fill it with WEOF/255. */
+ pstr->len = pstr->raw_len - idx + offset;
+ pstr->stop = pstr->raw_stop - idx + offset;
+ pstr->offsets_needed = 0;
+ while (mid > 0 && pstr->offsets[mid - 1] == offset)
+ --mid;
+ while (mid < pstr->valid_len)
+ if (pstr->wcs[mid] != WEOF)
+ break;
+ else
+ ++mid;
+ if (mid == pstr->valid_len)
+ pstr->valid_len = 0;
+ else
+ {
+ pstr->valid_len = pstr->offsets[mid] - offset;
+ if (pstr->valid_len)
+ {
+ for (low = 0; low < pstr->valid_len; ++low)
+ pstr->wcs[low] = WEOF;
+ memset (pstr->mbs, 255, pstr->valid_len);
+ }
+ }
+ pstr->valid_raw_len = pstr->valid_len;
+ }
+ }
+ else
+#endif
+ {
+ pstr->tip_context = re_string_context_at (pstr, offset - 1,
+ eflags);
+#ifdef RE_ENABLE_I18N
+ if (pstr->mb_cur_max > 1)
+ memmove (pstr->wcs, pstr->wcs + offset,
+ (pstr->valid_len - offset) * sizeof (wint_t));
#endif /* RE_ENABLE_I18N */
- if (BE (pstr->mbs_allocated, 0))
- memmove (pstr->mbs, pstr->mbs + offset,
- pstr->valid_len - offset);
- pstr->valid_len -= offset;
- pstr->valid_raw_len -= offset;
+ if (BE (pstr->mbs_allocated, 0))
+ memmove (pstr->mbs, pstr->mbs + offset,
+ pstr->valid_len - offset);
+ pstr->valid_len -= offset;
+ pstr->valid_raw_len -= offset;
#if DEBUG
- assert (pstr->valid_len > 0);
+ assert (pstr->valid_len > 0);
#endif
+ }
}
else
{
/* No, skip all characters until IDX. */
+ Idx prev_valid_len = pstr->valid_len;
+
#ifdef RE_ENABLE_I18N
if (BE (pstr->offsets_needed, 0))
{
@@ -627,7 +699,6 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
}
#endif
pstr->valid_len = 0;
- pstr->valid_raw_len = 0;
#ifdef RE_ENABLE_I18N
if (pstr->mb_cur_max > 1)
{
@@ -636,12 +707,14 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
if (pstr->is_utf8)
{
- const unsigned char *raw, *p, *q, *end;
+ const unsigned char *raw, *p, *end;
/* Special case UTF-8. Multi-byte chars start with any
byte other than 0x80 - 0xbf. */
raw = pstr->raw_mbs + pstr->raw_mbs_idx;
end = raw + (offset - pstr->mb_cur_max);
+ if (end < pstr->raw_mbs)
+ end = pstr->raw_mbs;
p = raw + offset - 1;
#ifdef _LIBC
/* We know the wchar_t encoding is UCS4, so for the simple
@@ -649,7 +722,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
if (isascii (*p) && BE (pstr->trans == NULL, 1))
{
memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
- pstr->valid_len = 0;
+ /* pstr->valid_len = 0; */
wc = (wchar_t) *p;
}
else
@@ -663,13 +736,11 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
unsigned char buf[6];
size_t mbclen;
- q = p;
if (BE (pstr->trans != NULL, 0))
{
int i = mlen < 6 ? mlen : 6;
while (--i >= 0)
buf[i] = pstr->trans[p[i]];
- q = buf;
}
/* XXX Don't use mbrtowc, we know which conversion
to use (UTF-8 -> UCS4). */
@@ -690,25 +761,30 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
if (wc == WEOF)
pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
+ if (wc == WEOF)
+ pstr->tip_context
+ = re_string_context_at (pstr, prev_valid_len - 1, eflags);
+ else
+ pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
+ && IS_WIDE_WORD_CHAR (wc))
+ ? CONTEXT_WORD
+ : ((IS_WIDE_NEWLINE (wc)
+ && pstr->newline_anchor)
+ ? CONTEXT_NEWLINE : 0));
if (BE (pstr->valid_len, 0))
{
for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
pstr->wcs[wcs_idx] = WEOF;
if (pstr->mbs_allocated)
- memset (pstr->mbs, -1, pstr->valid_len);
+ memset (pstr->mbs, 255, pstr->valid_len);
}
pstr->valid_raw_len = pstr->valid_len;
- pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
- && IS_WIDE_WORD_CHAR (wc))
- ? CONTEXT_WORD
- : ((IS_WIDE_NEWLINE (wc)
- && pstr->newline_anchor)
- ? CONTEXT_NEWLINE : 0));
}
else
#endif /* RE_ENABLE_I18N */
{
int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
+ pstr->valid_raw_len = 0;
if (pstr->trans)
c = pstr->trans[c];
pstr->tip_context = (bitset_contain (pstr->word_char, c)
@@ -1329,7 +1405,6 @@ static Idx
internal_function
re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
{
- int type = token.type;
if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
{
size_t new_nodes_alloc = dfa->nodes_alloc * 2;
@@ -1365,8 +1440,11 @@ re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
dfa->nodes[dfa->nodes_len] = token;
dfa->nodes[dfa->nodes_len].constraint = 0;
#ifdef RE_ENABLE_I18N
+ {
+ int type = token.type;
dfa->nodes[dfa->nodes_len].accept_mb =
(type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
+ }
#endif
dfa->nexts[dfa->nodes_len] = REG_MISSING;
re_node_set_init_empty (dfa->edests + dfa->nodes_len);