Allow Unicode characters in identifiers.

* parser.l (unicode_ident): New static function. (BSCHR, NSCHR): Include UONLY match. (grammar): Use unicode_ident function to validate tokens obtained from BTOK and NTOK. * txr.1: Documented the changed definitions of bident and lident.
author: Kaz Kylheku <kaz@kylheku.com> 2018-05-11 06:55:25 -0700
committer: Kaz Kylheku <kaz@kylheku.com> 2018-05-11 06:55:25 -0700
commit: 78e12d9c43b606f7402100a7c3b3367057d103d9 (patch)
tree: 486f65f1faab122d002be6fd3291ce528f2e661b
parent: 5bb5391fd3ad9874fdb266dd5b6d57f084626d13 (diff)
download: txr-78e12d9c43b606f7402100a7c3b3367057d103d9.tar.gz
txr-78e12d9c43b606f7402100a7c3b3367057d103d9.tar.bz2
txr-78e12d9c43b606f7402100a7c3b3367057d103d9.zip
2 files changed, 59 insertions, 6 deletions
diff --git a/parser.l b/parser.l
index 5fd70a51..774ade24 100644
--- a/parser.l
+++ b/parser.l
@@ -186,6 +186,47 @@ static wchar_t num_esc(scanner_t *scn, char *num)
   return val;
 }
 
+/* Convert the UTF-8 lexeme to a wide string via utf8_dup_from and report,
+ * through yyerror, the first character an identifier may not contain: a
+ * surrogate, a private-use character, U+FFFE/U+FFFF, or one of the Unicode
+ * spaces listed below. The wide string is returned in every case. */
+static wchar_t *unicode_ident(scanner_t *scn, const char *lex)
+{
+  wchar_t *wlex = utf8_dup_from(lex), *ptr = wlex, wch;
+
+  while ((wch = *ptr++)) {
+    /* Fast path. U+3000 and the high surrogates must not be skipped here,
+     * otherwise the checks for them further down can never be reached. */
+    if (wch < 0x1680 || (wch > 0x3000 && wch < 0xd800))
+      continue;
+
+    if ((wch >= 0xd800 && wch <= 0xdfff) ||
+#if FULL_UNICODE
+        (wch >= 0xf0000 && wch <= 0xffffd) ||
+        (wch >= 0x100000 && wch <= 0x10fffd) ||
+#endif
+        (wch >= 0xe000 && wch <= 0xf8ff) ||
+        (wch == 0xfffe) || (wch == 0xffff))
+    {
+      yyerror(scn, yyget_extra(scn),
+              "disallowed Unicode character in identifier");
+      return wlex;
+    }
+
+    switch (wch) {
+    case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002:
+    case 0x2003: case 0x2004: case 0x2005: case 0x2006: case 0x2007:
+    case 0x2008: case 0x2009: case 0x200a: case 0x2028: case 0x2029:
+    case 0x205f: case 0x3000:
+      yyerror(scn, yyget_extra(scn),
+              "Unicode space occurs in identifier");
+      return wlex;
+    }
+  }
+
+  return wlex;
+}
+
 %}
 
 %option stack noinput reentrant bison-bridge extra-type="parser_t *"
@@ -202,8 +243,8 @@ DOTFLO  [.]{DIG}+
 XNUM    #x{SGN}?{XDIG}+
 ONUM    #o{SGN}?[0-7]+
 BNUM    #b{SGN}?[0-1]+
-BSCHR   [a-zA-Z0-9!$%&*+\-<=>?\\_~]
-NSCHR   [a-zA-Z0-9!$%&*+\-<=>?\\_~/]
+BSCHR   ([a-zA-Z0-9!$%&*+\-<=>?\\_~]|{UONLY})
+NSCHR   ([a-zA-Z0-9!$%&*+\-<=>?\\_~/]|{UONLY})
 ID_END  [^a-zA-Z0-9!$%&*+\-<=>?\\_~/]
 EXTRA   [#^]
 BT0     {BSCHR}({BSCHR}|{EXTRA})*
@@ -395,7 +436,7 @@ UONLY   {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
       || yy_top_state(yyscanner) == QWLIT)
     yy_pop_state(yyscanner);
 
-  yylval->lexeme = utf8_dup_from(yytext);
+  yylval->lexeme = unicode_ident(yyscanner, yytext);
   return SYMTOK;
 }
 
@@ -408,7 +449,7 @@ UONLY   {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
       || yy_top_state(yyscanner) == QWLIT)
     yy_pop_state(yyscanner);
 
-  yylval->lexeme = utf8_dup_from(yytext);
+  yylval->lexeme = unicode_ident(yyscanner, yytext);
   return SYMTOK;
 }
 
@@ -422,7 +463,7 @@ UONLY   {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
   yyerrorf(yyg, lit("bad token: ~a"),
                     string_own(utf8_dup_from(yytext)),
                     nao);
-  yylval->lexeme = utf8_dup_from(yytext);
+  yylval->lexeme = unicode_ident(yyscanner, yytext);
   return SYMTOK;
 }
 
diff --git a/txr.1 b/txr.1
index b8472938..3ad47907 100644
--- a/txr.1
+++ b/txr.1
@@ -1782,6 +1782,13 @@ which are not allowed in a
  ! $ % & * + - < = > ? \e ~
 .cble
 
+Moreover, most Unicode characters beyond U+007F may appear in a
+.metn bident ,
+with certain exceptions. A character may not be used if it is any of the
+Unicode space characters, a member of the high or low surrogate region,
+a member of any Unicode private use area, or is one of the two characters
+U+FFFE or U+FFFF.
+
 The rule still holds that a name cannot look like a number so
 .code +123
 is not a valid
@@ -10825,7 +10832,12 @@ underscores:
 
 and of course, may not look like a number.
 
-The character allowed in a
+A
+.meta lident
+may also include all of the Unicode characters which are permitted in a
+.metn bident .
+
+The one character which is allowed in a
 .meta lident
 but not in a
 .meta bident
author	Kaz Kylheku <kaz@kylheku.com>	2018-05-11 06:55:25 -0700
committer	Kaz Kylheku <kaz@kylheku.com>	2018-05-11 06:55:25 -0700
commit	78e12d9c43b606f7402100a7c3b3367057d103d9 (patch)
tree	486f65f1faab122d002be6fd3291ce528f2e661b
parent	5bb5391fd3ad9874fdb266dd5b6d57f084626d13 (diff)
download	txr-78e12d9c43b606f7402100a7c3b3367057d103d9.tar.gz txr-78e12d9c43b606f7402100a7c3b3367057d103d9.tar.bz2 txr-78e12d9c43b606f7402100a7c3b3367057d103d9.zip