/* Copyright 2009-2022
 * Kaz Kylheku <kaz@kylheku.com>
 * Vancouver, Canada
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

%{

#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <stdlib.h>
#include <limits.h>
#include <wchar.h>
#include <signal.h>
#include "config.h"
#include "lib.h"
#include "gc.h"
#include "stream.h"
#include "utf8.h"
#include "signal.h"
#include "unwind.h"
#include "hash.h"
#include "y.tab.h"
#include "parser.h"
#include "txr.h"

#define YY_INPUT(buf, result, max_size)                                 \
  do {                                                                  \
    val self = lit("parser");                                           \
    val n = get_bytes(self, yyextra->stream,                            \
                      coerce(mem_t *, buf), max_size);                  \
    result = c_num(n, self);                                            \
  } while (0)

#define YY_DECL \
  static int yylex_impl(YYSTYPE *yylval_param, yyscan_t yyscanner)

int opt_loglevel = 1;   /* 0 - quiet; 1 - normal; 2 - verbose */

val form_to_ln_hash;

static int directive_tok(scanner_t *yyg, int tok, int state);

#define FLEX_NUM_VERSION 10000*YY_FLEX_MAJOR_VERSION + \
                         100*YY_FLEX_MINOR_VERSION + \
                         YY_FLEX_SUBMINOR_VERSION

#if FLEX_NUM_VERSION < 20509
int yylex_destroy(void)
{
  return 0;
}
#endif

/* Missing prototypes not generated by flex. */
int yyget_column(yyscan_t);
void yyset_column (int column_no, yyscan_t yyscanner);

/* The following function is all that libflex provides.
   By providing it here, we eliminate the need to link libflex. */
#define YY_SKIP_YYWRAP
INLINE int yywrap(yyscan_t scanner)
{
  (void) scanner;
  return 1;
}

void yyerror(scanner_t *scanner, parser_t *parser, const char *s)
{
  yyerrorf(scanner, lit("~a"), string_utf8(s), nao);
  if (parser->prepared_msg) {
    yyerrorf(scanner, lit("~a"), parser->prepared_msg, nao);
    parser->prepared_msg = nil;
  }
}

void yyerrorf(scanner_t *scanner, val fmt, ...)
{
  parser_t *parser = yyget_extra(scanner);

  if (opt_loglevel >= 1) {
    va_list vl;
    va_start (vl, fmt);
    if (opt_compat && opt_compat <= 114)
      format(std_error, lit("~a: (~a:~d): "), prog_string,
             parser->name, num(parser->lineno), nao);
    else
      format(std_error, lit("~a:~d: "),
             parser->name, num(parser->lineno), nao);

    vformat(std_error, fmt, vl);
    put_char(chr('\n'), std_error);
    va_end (vl);
  }
  parser->errors++;
}

static void yyerrprepf(scanner_t *scanner, val fmt, ...)
{
  parser_t *parser = yyget_extra(scanner);

  if (opt_loglevel >= 1) {
    va_list vl;
    va_start (vl, fmt);
    set(mkloc(parser->prepared_msg, parser->parser),
        vformat_to_string(fmt, vl));
    va_end (vl);
  }
}

static void out_of_range_float(scanner_t *scanner, const char *tok)
{
    yyerrorf(scanner, lit("out-of-range floating-point literal: ~a"),
             string_utf8(tok), nao);
}

static wchar_t char_esc(int letter)
{
  switch (letter) {
  case ' ': return L' ';
  case 'a': return L'\a';
  case 'b': return L'\b';
  case 't': return L'\t';
  case 'n': return L'\n';
  case 'v': return L'\v';
  case 'f': return L'\f';
  case 'r': return L'\r';
  case 'e': return 27;
  case '"': return L'"';
  case '\'': return L'\'';
  case '`': return L'`';
  case '/': return L'/';
  case '@': return L'@';
  case '\\': return L'\\';
  }

  internal_error("unhandled escape character");
}

static wchar_t num_esc(scanner_t *scn, char *num)
{
  long val = 0;

  if (num[0] == 'x' || num[0] == 'u') {
    if (strlen(num) > 7)
      yyerror(scn, yyget_extra(scn), "too many digits in hex character escape");
    else
      val = strtol(num + 1, 0, 16);
  } else {
    if (num[0] == 'o')
      num++;
    if (strlen(num) > 8)
      yyerror(scn, yyget_extra(scn), "too many digits in octal character escape");
    else
      val = strtol(num, 0, 8);
  }

  if (val < 0 || val > 0x10FFFF || convert(wchar_t, val) != val) {
    yyerror(scn, yyget_extra(scn), "numeric character escape out of range");
    val = 0;
  }

  return val;
}

static wchar_t *unicode_ident(scanner_t *scn, const char *lex)
{
  wchar_t *wlex = utf8_dup_from(lex), *ptr = wlex, wch;

  while ((wch = *ptr++)) {
    if (wch < 0x1680 || (wch >= 0x3000 && wch < 0xdc00))
      continue;

    if ((wch >= 0xdc00 && wch <= 0xdcff) ||
        (wch >= 0xd800 && wch <= 0xdbff) ||
#if FULL_UNICODE
        (wch >= 0xf0000 && wch <= 0xffffd) ||
        (wch >= 0x100000 && wch <= 0x10fffd) ||
#endif
        (wch >= 0xe000 && wch <= 0xf8ff) ||
        (wch == 0xfffe) ||
        (wch == 0xffff))
    {
      yyerror(scn, yyget_extra(scn),
              "disallowed Unicode character in identifier");
      break;
    }

    switch (wch) {
    case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002:
    case 0x2003: case 0x2004: case 0x2005: case 0x2006: case 0x2007:
    case 0x2008: case 0x2009: case 0x200a: case 0x2028: case 0x2029:
    case 0x205f: case 0x3000:
      yyerror(scn, yyget_extra(scn),
              "Unicode space occurs in identifier");
      break;
    default:
      continue;
    }

    break;
  }

  return wlex;
}

%}

%option stack noinput reentrant bison-bridge extra-type="parser_t *"
%option never-interactive

TOK     [a-zA-Z0-9_]+
SGN     [+\-]
EXP     [eE][+\-]?[0-9]+
DIG     [0-9]
DIG19   [1-9]
XDIG    [0-9A-Fa-f]
NUM     {SGN}?{DIG}+
FLO     {SGN}?({DIG}*[.]{DIG}+{EXP}?|{DIG}+[.]?{EXP})
FLODOT  {SGN}?{DIG}+[.]
DOTFLO  [.]{DIG}+
XNUM    #x{SGN}?{XDIG}+
ONUM    #o{SGN}?[0-7]+
BNUM    #b{SGN}?[0-1]+
BSCHR   ([a-zA-Z0-9!$%&*+\-<=>?\\_~]|{UONLY})
NSCHR   ([a-zA-Z0-9!$%&*+\-<=>?\\_~/]|{UONLY})
ID_END  [^a-zA-Z0-9!$%&*+\-<=>?\\_~/]
EXTRA   [#^]
BT0     {BSCHR}({BSCHR}|{EXTRA})*
BT1     @{BT0}+
BT2     ({BSCHR}|{EXTRA})+
BTREG   ({BT0}|{BT1})?:{BT2}?|({BT0}|{BT1})(:{BT2})?
BTKWUN  @?#?:{BT2}?
BTOK    {BTREG}|{BTKWUN}
NT0     {NSCHR}({NSCHR}|{EXTRA})*
NT1     @{NT0}+
NT2     ({NSCHR}|{EXTRA})+
NTREG   ({NT0}|{NT1})?:{NT2}?|({NT0}|{NT1})(:{NT2})?
NTKWUN  @?#?:{NT2}?
NTOK    {NTREG}|{NTKWUN}
WS      [\t ]*
REQWS   [\t ]+
NL      (\n|\r|\r\n)
HEX     [0-9A-Fa-f]
OCT     [0-7]

REGOP   [/()|.*?+~&%\[\]\-]

ASC     [\x00-\x7f]
ASCN    [\x00-\t\v-\x7f]
U       [\x80-\xbf]
U2      [\xc2-\xdf]
U3      [\xe0-\xef]
U4      [\xf0-\xf4]

UANY    {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
UANYN   {ASCN}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
UONLY   {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}

JNUM    -?(0|{DIG19}{DIG}*)([.]{DIG}+)?{EXP}?
JPUNC   [(){},:\[\]"~*^]
NJPUNC  [^(){},:\[\]"~*^ \t\r\n]

%x      SPECIAL BRACED NESTED REGEX SREGEX STRLIT CHRLIT
%x      QSILIT QSPECIAL WLIT QWLIT BUFLIT
%x      JSON JLIT JMARKER

%%

<SPECIAL,QSPECIAL,NESTED,BRACED>{NUM} {
  val str = string_own(utf8_dup_from(yytext));

  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);

  yylval->val = int_str(str, num(10));
  return NUMBER;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>({XNUM}|{ONUM}|{BNUM}) {
  val str = string_own(utf8_dup_from(yytext + 2));
  int base;

  switch (yytext[1]) {
  case 'x': base = 16; break;
  case 'o': base = 8; break;
  case 'b': default: base = 2; break;
  }

  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);

  yylval->val = int_str(str, num_fast(base));
  return NUMBER;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>({BNUM}|{ONUM}|{XNUM}){TOK} {
  int base = 0;
  val str = string_own(utf8_dup_from(yytext + 2));

  switch (yytext[1]) {
  case 'x': base = 16; break;
  case 'o': base = 8; break;
  case 'b': default: base = 2; break;
  }

  yyerrorf(yyg, lit("trailing junk in numeric literal: ~a~a~a"),
           chr(yytext[0]), chr(yytext[1]), str, nao);

  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);

  yylval->val = int_str(str, num_fast(base));
  return NUMBER;
}

<SPECIAL,NESTED,BRACED>{WS}{FLO} {
  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);

  if ((yylval->val = flo_str_utf8(yytext)) == nil)
    out_of_range_float(yyg, yytext);

  return NUMBER;
}

<SPECIAL>{WS}({FLO}[.]?|{FLODOT}){TOK} |
<BRACED>{WS}({FLO}[.]?|{FLODOT}){BTOK} |
<NESTED>{WS}({FLO}[.]?|{FLODOT}){NTOK} {
  val str = string_utf8(yytext);

  yyerrorf(yyg, lit("trailing junk in floating-point literal: ~a"), str, nao);

  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);

  if ((yylval->val = flo_str_utf8(yytext)) == nil)
    out_of_range_float(yyg, yytext);

  return NUMBER;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>{FLODOT}/[^.] {
  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);

  if ((yylval->val = flo_str_utf8(yytext)) == nil)
    out_of_range_float(yyg, yytext);

  return NUMBER;
}

<NESTED,BRACED,QSILIT,QWLIT>@{NUM} |
<QSPECIAL>{NUM} {
  val str = string_own(utf8_dup_from(yytext + 1));

  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);
  yylval->val = int_str(str, num(10));
  return METANUM;
}

<NESTED,QSILIT,QWLIT>@{XNUM} |
<QSPECIAL>{XNUM} {
  val str = string_own(utf8_dup_from(yytext + 3));

  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);
  yylval->val = int_str(str, num(16));
  return METANUM;
}

<NESTED,QSILIT,QWLIT>@{ONUM} |
<QSPECIAL>{ONUM} {
  val str = string_own(utf8_dup_from(yytext + 3));

  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);
  yylval->val = int_str(str, num(8));
  return METANUM;
}

<NESTED,QSILIT,QWLIT,QSPECIAL>@{BNUM} {
  val str = string_own(utf8_dup_from(yytext + 3));

  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);
  yylval->val = int_str(str, num(2));
  return METANUM;
}

<SPECIAL,QSPECIAL>{TOK}/{DOTFLO} |
<BRACED>{BTOK}/{DOTFLO} |
<NESTED>{NTOK}/{DOTFLO} {
  yyerrorf(yyg, lit("cramped floating-point literal: "
                    "space needed between ~a and dot."),
                    string_own(utf8_dup_from(yytext)),
                    nao);

  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);

  yylval->lexeme = unicode_ident(yyg, yytext);
  return SYMTOK;
}


<SPECIAL,QSPECIAL>{TOK} |
<BRACED>{BTOK} |
<NESTED>{NTOK} {
  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);

  yylval->lexeme = unicode_ident(yyg, yytext);
  return SYMTOK;
}

<BRACED>{BTOK}{BTOK} |
<NESTED>{NTOK}{NTOK} {
  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);

  yyerrorf(yyg, lit("bad token: ~a"),
                    string_own(utf8_dup_from(yytext)),
                    nao);
  yylval->lexeme = unicode_ident(yyg, yytext);
  return SYMTOK;
}

<SPECIAL>\({WS}({NT0}?:)?all{WS}\) {
  return directive_tok(yyg, ALL, 0);
}

<SPECIAL>\({WS}({NT0}?:)?some/{ID_END} {
  return directive_tok(yyg, SOME, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?none{WS}\) {
  return directive_tok(yyg, NONE, 0);
}

<SPECIAL>\({WS}({NT0}?:)?maybe{WS}\) {
  return directive_tok(yyg, MAYBE, 0);
}

<SPECIAL>\({WS}({NT0}?:)?cases{WS}\) {
  return directive_tok(yyg, CASES, 0);
}

<SPECIAL>\({WS}({NT0}?:)?block/{ID_END} {
  return directive_tok(yyg, BLOCK, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?choose/{ID_END} {
  return directive_tok(yyg, CHOOSE, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?gather/{ID_END} {
  return directive_tok(yyg, GATHER, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?and{WS}\) {
  return directive_tok(yyg, AND, 0);
}

<SPECIAL>\({WS}({NT0}?:)?or{WS}\) {
  return directive_tok(yyg, OR, 0);
}

<SPECIAL>\({WS}({NT0}?:)?end{WS}\) {
  return directive_tok(yyg, END, 0);
}

<SPECIAL>\({WS}({NT0}?:)?collect/{ID_END} {
  return directive_tok(yyg, COLLECT, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?coll/{ID_END} {
  return directive_tok(yyg, COLL, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?until/{ID_END} {
  return directive_tok(yyg, UNTIL, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?output/{ID_END}  {
  return directive_tok(yyg, OUTPUT, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?repeat/{ID_END}  {
  return directive_tok(yyg, REPEAT, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?rep/{ID_END} {
  return directive_tok(yyg, REP, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?single{WS}\) {
  return directive_tok(yyg, SINGLE, 0);
}

<SPECIAL>\({WS}({NT0}?:)?first{WS}\) {
  return directive_tok(yyg, FIRST, 0);
}

<SPECIAL>\({WS}({NT0}?:)?last/{ID_END} {
  return directive_tok(yyg, LAST, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?empty{WS}\) {
  return directive_tok(yyg, EMPTY, 0);
}

<SPECIAL>\({WS}({NT0}?:)?mod/{ID_END} {
  return directive_tok(yyg, MOD, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?modlast/{ID_END} {
  return directive_tok(yyg, MODLAST, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?define/{ID_END} {
  return directive_tok(yyg, DEFINE, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?try{WS}\) {
  return directive_tok(yyg, TRY, 0);
}

<SPECIAL>\({WS}({NT0}?:)?catch/{ID_END} {
  return directive_tok(yyg, CATCH, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?finally{WS}\) {
  return directive_tok(yyg, FINALLY, 0);
}

<SPECIAL>\({WS}({NT0}?:)?if/{ID_END} {
  return directive_tok(yyg, IF, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?elif/{ID_END} {
  return directive_tok(yyg, ELIF, NESTED);
}

<SPECIAL>\({WS}({NT0}?:)?else{WS}\) {
  return directive_tok(yyg, ELSE, 0);
}

<SPECIAL,QSPECIAL>[{] {
  yy_push_state(BRACED, yyscanner);
  yylval->lineno = yyextra->lineno;
  return yytext[0];
}

<SPECIAL,QSPECIAL,NESTED,BRACED>[(\[] {
 yy_push_state(NESTED, yyscanner);
 yylval->lineno = yyextra->lineno;
 return yytext[0];
}

<NESTED,BRACED,QSPECIAL>@ {
  yylval->lineno = yyextra->lineno;
  return (opt_compat && opt_compat <= 248) ? OLD_AT : '@';
}

<NESTED,QSPECIAL,BRACED>,[*] {
  yylval->chr = '*';
  return SPLICE;
}

<QSPECIAL,NESTED,BRACED>[,'^] {
  yylval->chr = yytext[0];
  return yytext[0];
}

<BRACED>[}] {
  yy_pop_state(yyscanner);
  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);
  return yytext[0];
}

<SPECIAL,QSPECIAL,NESTED>[)\]]/{DOTFLO} {
  yyerrorf(yyg, lit("cramped floating-point literal: "
                    "space or 0 needed between ~a and dot."),
                    string_own(utf8_dup_from(yytext)),
                    nao);

  yy_pop_state(yyscanner);
  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);
  return yytext[0];
}


<SPECIAL,QSPECIAL,NESTED>[)\]}] {
  yy_pop_state(yyscanner);
  if (yy_top_state(yyscanner) == INITIAL
      || yy_top_state(yyscanner) == QSILIT
      || yy_top_state(yyscanner) == QWLIT)
    yy_pop_state(yyscanner);
  return yytext[0];
}

<SPECIAL,QSPECIAL,NESTED,BRACED>{WS} {
  /* Eat whitespace in directive */
}

<SPECIAL,QSPECIAL,NESTED,BRACED>\" {
  yy_push_state(STRLIT, yyscanner);
  return '"';
}

<SPECIAL,QSPECIAL,NESTED,BRACED>#\\ {
  yy_push_state(CHRLIT, yyscanner);
  yylval->lineno = yyextra->lineno;
  return HASH_BACKSLASH;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>#b' {
  yy_push_state(BUFLIT, yyscanner);
  yylval->lineno = yyextra->lineno;
  return HASH_B_QUOTE;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>#[/] {
  yy_push_state(REGEX, yyscanner);
  yylval->lineno = yyextra->lineno;
  return HASH_SLASH;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>` {
  yy_push_state(QSILIT, yyscanner);
  return '`';
}

<SPECIAL,QSPECIAL,NESTED,BRACED>#\" {
  yy_push_state(WLIT, yyscanner);
  yylval->lineno = yyextra->lineno;
  return WORDS;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>#\*\" {
  yy_push_state(WLIT, yyscanner);
  yylval->lineno = yyextra->lineno;
  return WSPLICE;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>#\` {
  yy_push_state(QWLIT, yyscanner);
  yylval->lineno = yyextra->lineno;
  return QWORDS;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>#\*\` {
  yy_push_state(QWLIT, yyscanner);
  yylval->lineno = yyextra->lineno;
  return QWSPLICE;
}

<NESTED,BRACED># {
  return '#';
}

<NESTED,BRACED>#H {
  yylval->lineno = yyextra->lineno;
  return HASH_H;
}

<NESTED,BRACED>#S {
  yylval->lineno = yyextra->lineno;
  return HASH_S;
}

<NESTED,BRACED>#R {
  yylval->lineno = yyextra->lineno;
  return HASH_R;
}

<NESTED,BRACED>#N {
  yylval->lineno = yyextra->lineno;
  return HASH_N;
}

<NESTED,BRACED>#T {
  yylval->lineno = yyextra->lineno;
  return HASH_T;
}

<NESTED,BRACED>#J {
  yylval->lineno = yyextra->lineno;
  yy_push_state(JSON, yyscanner);
  return HASH_J;
}

<NESTED,BRACED>#; {
  yylval->lineno = yyextra->lineno;
  return HASH_SEMI;
}

<NESTED,BRACED,JSON>#{DIG}+= {
  val str = string_own(utf8_dup_from(yytext + 1));
  yylval->val = int_str(str, num(10));
  return HASH_N_EQUALS;
}

<NESTED,BRACED,JSON>#{DIG}+# {
  val str = string_own(utf8_dup_from(yytext + 1));
  yylval->val = int_str(str, num(10));
  return HASH_N_HASH;
}

<NESTED>{WS}\.\. {
  yylval->lineno = yyextra->lineno;
  return (opt_compat && opt_compat <= 185) ? OLD_DOTDOT : DOTDOT;
}

<SPECIAL>@ {
  yy_pop_state(yyscanner);
  yylval->lexeme = chk_strdup(L"@");
  return TEXT;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>{NL} {
  yyextra->lineno++;
}

<SPECIAL,BRACED>[/] {
  yy_push_state(REGEX, yyscanner);
  return '/';
}

<SPECIAL,QSPECIAL,NESTED>{REQWS}\.{REQWS} {
  yylval->chr = '.';
  return CONSDOT;
}

<SPECIAL,QSPECIAL,NESTED>\.{REQWS} {
  yylval->chr = '.';
  return LAMBDOT;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>{REQWS}\. {
  yylval->chr = '.';
  return UREFDOT;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>\. {
  yylval->chr = '.';
  return '.';
}

<SPECIAL,QSPECIAL,NESTED>\.\? {
  yylval->chr = '.';
  return OREFDOT;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>{REQWS}\.\? {
  yylval->chr = '.';
  return UOREFDOT;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>[\\]{NL}{WS} {
  if (YYSTATE == SPECIAL)
    yy_pop_state(yyscanner);  /* @\ continuation */
  yyextra->lineno++;
}

<SPECIAL>[\\][abtnvfre ] {
  wchar_t lexeme[2];
  lexeme[0] = char_esc(yytext[1]);
  lexeme[1] = 0;
  yylval->lexeme = chk_strdup(lexeme);
  yy_pop_state(yyscanner);
  return TEXT;
}

<SPECIAL>[\\](x{HEX}+|{OCT}+);? {
  wchar_t lexeme[2];
  lexeme[0] = num_esc(yyg, yytext + 1);
  lexeme[1] = 0;
  yylval->lexeme = chk_strdup(lexeme);

  {
    char lastchar = yytext[yyleng-1];
    if (lastchar == ';' && opt_compat && opt_compat <= 109)
      unput(lastchar);
  }

  yy_pop_state(yyscanner);
  return TEXT;
}

<SPECIAL>[\\]x {
  yyerrorf(yyg, lit("\\x escape without digits"), nao);
}

<SPECIAL>[\\]. {
  yyerrorf(yyg, lit("unrecognized escape \\~a"), chr(yytext[1]), nao);
}

<SPECIAL,QSPECIAL,NESTED,BRACED>[;][^\n\r]* {
  /* comment */
}

<SPECIAL,QSPECIAL,NESTED,BRACED>{UANYN} {
  val ch = chr_str(string_utf8(yytext), zero);
  if (chr_isspace(ch))
    yyerrprepf(yyg, lit("unexpected whitespace character #\\x~,02x"),
               ch, nao);
  else if (chr_isunisp(ch))
    yyerrprepf(yyg, lit("unexpected Unicode space character #\\x~,02x"),
               ch, nao);
  else if (chr_iscntrl(ch))
    yyerrprepf(yyg, lit("unexpected control character #\\x~,02x"),
               ch, nao);
  else
    yyerrprepf(yyg, lit("unexpected character #\\~a"),
               ch, nao);
  return ERRTOK;
}

<SPECIAL,QSPECIAL,NESTED,BRACED>. {
  yyerrprepf(yyg, lit("non-UTF-8 byte #x~02x in directive"),
             num(convert(unsigned char, yytext[0])), nao);
  return ERRTOK;
}

<REGEX,SREGEX>[/] {
  yylval->chr = '/';
  return (YYSTATE == SREGEX) ? REGCHAR : '/';
}

<REGEX,SREGEX>[\\][abtnvfre\\ ] {
  yylval->chr = char_esc(yytext[1]);
  return REGCHAR;
}

<REGEX,SREGEX>[\\](x{HEX}+|{OCT}+);? {
  yylval->chr = num_esc(yyg, yytext + 1);
  return REGCHAR;
}

<REGEX,SREGEX>[\\][sSdDwW] {
  yylval->chr = yytext[1];
  return REGTOKEN;
}

<REGEX,SREGEX>{WS}[\\]{NL}{WS} {
  yyextra->lineno++;
}

<REGEX>{NL} {
  yyextra->lineno++;
  yyerrprepf(yyg, lit("newline in regex"), nao);
  return ERRTOK;
}

<SREGEX>{NL} {
  yyextra->lineno++;
  yylval->chr = yytext[0];
  return REGCHAR;
}

<REGEX,SREGEX>{REGOP} {
  yylval->chr = yytext[0];
  return yytext[0];
}

<REGEX,SREGEX>[\\]{REGOP} {
  yylval->chr = yytext[1];
  return REGCHAR;
}

<REGEX,SREGEX>[\\]. {
  if (opt_compat && opt_compat <= 105) {
    yylval->chr = yytext[1];
    return REGCHAR;
  }

  if (yytext[1] == 'x')
    yyerrprepf(yyg, lit("\\x escape without digits in regex"), nao);
  else
    yyerrprepf(yyg, lit("unrecognized escape in regex"), nao);
  return ERRTOK;
}

<REGEX,SREGEX>[\\] {
  yyerrprepf(yyg, lit("dangling backslash in regex"), nao);
  return ERRTOK;
}

<REGEX,SREGEX>{UANYN}  {
  wchar_t wchr[8];
  if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) {
    yylval->lexeme = chk_strdup(wchr);
    return TEXT;
  }
  yylval->chr = wchr[0];
  return REGCHAR;
}

<SREGEX,REGEX>. {
  yylval->chr = convert(unsigned char, yytext[0]) + 0xDC00;
  return REGCHAR;
}

<INITIAL>[ ]+ {
  yylval->lexeme = utf8_dup_from(yytext);
  return SPACE;
}

<INITIAL>({UONLY}|[^@\n\r ])+ {
  yylval->lexeme = utf8_dup_from(yytext);
  return TEXT;
}

<INITIAL>{NL} {
  yyextra->lineno++;
  return '\n';
}

<INITIAL>@{WS}\* {
  yy_push_state(SPECIAL, yyscanner);
  return '*';
}

<INITIAL>@ {
  yy_push_state(SPECIAL, yyscanner);
}

<INITIAL>^@{WS}*[#;].*{NL} {
  /* eat whole line comment */
  yyextra->lineno++;
}

<INITIAL>@{WS}*[#;].* {
  /* comment to end of line */
}

<STRLIT,WLIT>\" {
  yy_pop_state(yyscanner);
  return yytext[0];
}

<QSILIT,QWLIT>\` {
  yy_pop_state(yyscanner);
  return yytext[0];
}

<STRLIT,QSILIT,WLIT,QWLIT>[\\][abtnvfre "`'\\ ] {
  yylval->chr = char_esc(yytext[1]);
  return LITCHAR;
}

<QSILIT,QWLIT>[\\]@ {
  yylval->chr = char_esc(yytext[1]);
  return LITCHAR;
}

<STRLIT,QSILIT>{WS}[\\]{NL}{WS} {
  yyextra->lineno++;
}

<STRLIT,QSILIT,WLIT,QWLIT>{WS}[\\]{NL}{WS} {
  yyextra->lineno++;

  if (!opt_compat || opt_compat > 109)
    return ' ';
}


<STRLIT,QSILIT,WLIT,QWLIT>[\\](x{HEX}+|{OCT}+);?  {
  yylval->chr = num_esc(yyg, yytext+1);
  return LITCHAR;
}

<STRLIT,QSILIT,WLIT,QWLIT>[\\]x {
  yyerrorf(yyg, lit("\\x escape without digits"), nao);
}

<STRLIT,QSILIT,WLIT,QWLIT>[\\]. {
  yyerrorf(yyg, lit("unrecognized escape: \\~a"), chr(yytext[1]), nao);
}

<CHRLIT>(x{HEX}+|o{OCT}+) {
  yylval->chr = num_esc(yyg, yytext);
  return LITCHAR;
}

<CHRLIT>{TOK} {
  yylval->lexeme = utf8_dup_from(yytext);
  return SYMTOK;
}

<CHRLIT>[^ \t\n\r] {
  yylval->lexeme = utf8_dup_from(yytext);
  return SYMTOK; /* hack */
}

<STRLIT>{NL} {
  yyerrprepf(yyg, lit("newline in string literal"), nao);
  yyextra->lineno++;
  yylval->chr = yytext[0];
  return ERRTOK;
}

<CHRLIT>{NL} {
  yyerrprepf(yyg, lit("newline in character literal"), nao);
  yyextra->lineno++;
  yylval->chr = yytext[0];
  return ERRTOK;
}

<QSILIT>{NL} {
  yyerrprepf(yyg, lit("newline in string quasiliteral"), nao);
  yyextra->lineno++;
  yylval->chr = yytext[0];
  return ERRTOK;
}

<WLIT,QWLIT>{NL} {
  yyextra->lineno++;

  if (opt_compat && opt_compat <= 109)
    return ' ';

  yyerrprepf(yyg, lit("newline in word list literal"), nao);
  yylval->chr = yytext[0];
  return ERRTOK;
}

<QSILIT,QWLIT>@/([[({'^,@]|{TOK}) {
  yy_push_state(QSPECIAL, yyscanner);
  return yytext[0];
}

<QSILIT,QWLIT>@ {
  yyerrprepf(yyg, lit("malformed @ expression in quasiliteral"), nao);
  return ERRTOK;
}

<WLIT,QWLIT>{WS} {
  return ' ';
}

<JLIT>\" {
  yy_pop_state(yyscanner);
  return yytext[0];
}

<JLIT>[\\][bfnrt"\\/] {
  yylval->chr = char_esc(yytext[1]);
  return LITCHAR;
}

<JLIT>[\\]u[Dd][8-9A-Fa-f]{HEX}{2}[\\]u[Dd][C-Fc-f]{HEX}{2} {
  wchar_t ch0, ch1;
  yytext[6] = 0;
  ch0 = num_esc(yyg, yytext + 1);
  ch1 = num_esc(yyg, yytext + 7);
  yylval->chr = ((ch0 - 0xD800) << 10 | (ch1 - 0xDC00)) + 0x10000;
  return LITCHAR;
}

<JLIT>[\\]u{HEX}{4} {
  wchar_t ch = num_esc(yyg, yytext + 1);
  yylval->chr = if3(ch, ch, 0xDC00);
  return LITCHAR;
}

<JLIT>[\\]u {
  yyerrorf(yyg, lit("JSON \\u escape needs four digits"), nao);
}

<JLIT>[\\]. {
  yyerrorf(yyg, lit("unrecognized JSON escape: \\~a"), chr(yytext[1]), nao);
}

<JLIT>{NL} {
  yyerrprepf(yyg, lit("newline in JSON string"), nao);
  yyextra->lineno++;
  yylval->chr = yytext[0];
  return ERRTOK;
}

<STRLIT,CHRLIT,QSILIT,WLIT,QWLIT,JLIT>{UANYN} {
  wchar_t wchr[8];
  if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) {
    yylval->lexeme = chk_strdup(wchr);
    return TEXT;
  }
  yylval->chr = wchr[0];
  return LITCHAR;
}

<BUFLIT>{HEX} {
  yylval->chr = strtol(yytext, 0, 16);
  return LITCHAR;
}

<BUFLIT>' {
  return '\'';
}

<BUFLIT>{WS} {
}

<BUFLIT>{NL} {
  yyextra->lineno++;
}

<BUFLIT>. {
  yyerrorf(yyg, lit("bad character ~s in buffer literal"),
           chr(yytext[0]), nao);
}

<STRLIT,CHRLIT,QSILIT,WLIT,QWLIT,JLIT>. {
  yylval->chr = convert(unsigned char, yytext[0]) + 0xDC00;
  return LITCHAR;
}

<JSON>{JNUM} {
  if ((yylval->val = flo_str_utf8(yytext)) == nil)
    out_of_range_float(yyg, yytext);
  return NUMBER;
}

<JSON>true/({JPUNC}|[ \t\n]) {
  yylval->val = t;
  return JSKW;
}

<JSON>false/({JPUNC}|[ \t\n]) {
  yylval->val = nil;
  return JSKW;
}

<JSON>null/({JPUNC}|[ \t\n]) {
  yylval->val = null_s;
  return JSKW;
}

<JSON>{NJPUNC}+ {
  if (strcmp("true", yytext) == 0) {
    yylval->val = t;
    return JSKW;
  }

  if (strcmp("false", yytext) == 0) {
    yylval->val = nil;
    return JSKW;
  }

  if (strcmp("null", yytext) == 0) {
    yylval->val = null_s;
    return JSKW;
  }

  {
    val str = string_own(utf8_dup_from(yytext));
    yyerrorf(yyg, lit("unrecognized JSON syntax: ~a"), str, nao);
  }
}

<JSON>\" {
  yy_push_state(JLIT, yyscanner);
  return yytext[0];
}

<JSON>~[*] {
  yy_push_state(JMARKER, yyscanner);
  yy_push_state(NESTED, yyscanner);
  return JSPLICE;
}

<JSON>~ {
  yy_push_state(JMARKER, yyscanner);
  yy_push_state(NESTED, yyscanner);
  return yytext[0];
}

<JSON>{JPUNC} {
  return yytext[0];
}

<JSON>{NL} {
  yyextra->lineno++;
}

<JSON>{WS} {
}

<JSON>. {
  yyerrorf(yyg, lit("bad character ~s in JSON literal"),
           chr(yytext[0]), nao);
}

<JMARKER>. {
  internal_error("scanner processed input JMARKER state");
}

%%

static int directive_tok(scanner_t *yyscanner, int tok, int state)
{
  struct yyguts_t *yyg = convert(struct yyguts_t *, yyscanner);
  char *pstart = yytext + 1 + strspn(yytext + 1, " \t");
  char *pcolon = strchr(pstart, ':');
  char *pend = pstart + strspn(pstart, ":-abcdefghijklmnopqrstuvwxyz");

  *pend = 0;

  if (pcolon != 0) {
    val pkgname = string_utf8((*pcolon = 0, pstart));
    val package = if3(pstart[0], find_package(pkgname), keyword_package);
    if (!package) {
      yyerrprepf(yyg, lit("package ~a not found"), pkgname, nao);
      tok = ERRTOK;
    }
    if (package != user_package && package != keyword_package) {
      val sym = string_utf8(pcolon + 1);
      yyerrprepf(yyg, lit("~a:~a: original usr package expected, not ~a"),
                 pkgname, sym, pkgname, nao);
      tok = ERRTOK;
    }
  } else {
    val symname = string_utf8(pstart);
    val sym = intern_fallback(symname, cur_package);
    val package = symbol_package(sym);

    if (package != user_package && package != keyword_package) {
      yyerrprepf(yyg, lit("~a: this is ~a:~a, not usr:~a"),
                 symname, package_name(package), symname, symname, nao);
      tok = ERRTOK;
    }
  }

  if (state != 0)
    yy_push_state(state, yyscanner);
  else
    yy_pop_state(yyscanner);
  yylval->lineno = yyextra->lineno;
  return tok;
}

void end_of_regex(scanner_t *yyg)
{
  if (YYSTATE != REGEX && YYSTATE != SREGEX)
    internal_error("end_of_regex called in wrong scanner state");

  yy_pop_state(yyg);

  if (YYSTATE != INITIAL) {
    if (yy_top_state(yyg) == INITIAL
      || yy_top_state(yyg) == QSILIT
      || yy_top_state(yyg) == QWLIT)
      yy_pop_state(yyg);
  }
}

void end_of_char(scanner_t *yyg)
{
  if (YYSTATE != CHRLIT)
    internal_error("end_of_char called in wrong scanner state");

  yy_pop_state(yyg);
}

void end_of_buflit(scanner_t *yyg)
{
  if (YYSTATE != BUFLIT)
    internal_error("end_of_buflit called in wrong scanner state");

  yy_pop_state(yyg);
}

void end_of_json(scanner_t *yyg)
{
  if (YYSTATE == JLIT)
    yy_pop_state(yyg);

  if (YYSTATE != JSON)
    internal_error("end_of_json called in wrong scanner state");

  yy_pop_state(yyg);
}

/* The complexity here is necessary because TXR Lisp parsing looks ahead
 * by one token. (The reason for *that* is the support of a.b.c referencing dot
 * syntax in TXR Lisp.)
 *
 * Consider these two different cases:
 *
 * ^#J[,~(+ 2.0 2.0)]
 * ^#J[,~(+ 2.0 2.0) #J42]
 *
 * This end_of_json_unquote function gets called when the (+ 2.0 2.0)
 * has been parsed, but the Yacc-generated parser has shifted one tokan
 * ahead.  It has read the ] token in the one case or the #J token in
 * the other. These tokens have totally different effects on the Lex
 * start condition. When the lexer reads the ] token, it pops off a NESTED
 * state, whereas the #J token wants to push on a new JSON state.
 * By the time end_of_json_unquote has been called, this has already happened.
 *
 * To deal with this, we use the dummy JMARKER start state which serves as a
 * kind of parenthesis inside the start condition stack. BHefore scanning Lisp
 * unquote within JSON, we push JMARKER state first, then the NESTED state.
 *
 * If the lookahead token is like ], and pops off a state, it will pop off
 * our NESTED state, so we are left at the JMARKER state. If the lookahead
 * token is something else like #J (HASH_J), then it will push a new
 * state like JSON on top, and we have JMARKER NESTED JSON.
 *
 * So what we are doing here is popping off everything until we get down
 * to the JMARKER state, and putting it into our little save area.
 *
 * Then we lose the JMARKER state.
 *
 * If the save area is empty, it means that the lookahead token consumed
 * our NESTED state, and so we are done.
 *
 * If the save area is not empty, it means the lookahead put something
 * extra over our NESTED state. We drop that state from our save area,
 * and restore the rest of the save area back into the stack.
 * Effectively, we are deleting the unquote-related states from the
 * interior of the start condition stack, not to disturb new material
 * initiated by the lookahead token.
 */
void end_of_json_unquote(scanner_t *yyg)
{
  int stacksave[8];
  int top = 0;

  while (YYSTATE != JMARKER) {
    stacksave[top++] = YYSTATE;
    yy_pop_state(yyg);
  }

  yy_pop_state(yyg);

  if (top-- > 0) {
    while (top > 0)
      yy_push_state(stacksave[--top], yyg);
  }
}

val source_loc(val form)
{
  return gethash(form_to_ln_hash, form);
}

val source_loc_str(val form, val alt)
{
  cons_bind (line, file, gethash(form_to_ln_hash, form));
  if (missingp(alt))
    alt = lit("source location n/a");
  return if3(line, format(nil, lit("~a:~d"), file, line, nao), alt);
}

int yylex(YYSTYPE *yylval_param, yyscan_t yyscanner)
{
  struct yyguts_t * yyg = convert(struct yyguts_t *, yyscanner);
  int yy_char;

  if (yyextra->tok_idx > 0) {
    struct yy_token *tok = &yyextra->tok_pushback[--yyextra->tok_idx];
    yyextra->recent_tok = *tok;
    *yylval_param = tok->yy_lval;
    if (tok->yy_lex_state && tok->yy_lex_state != YYSTATE)
      yy_push_state(tok->yy_lex_state, yyg);
    return tok->yy_char;
  }

  yy_char = yyextra->recent_tok.yy_char = yylex_impl(yylval_param, yyscanner);
  yyextra->recent_tok.yy_lval = *yylval_param;
  yyextra->recent_tok.yy_lex_state = YYSTATE;

  return yy_char;
}

void prime_scanner(scanner_t *yyg, enum prime_parser prim)
{
  while (YYSTATE != INITIAL)
    yy_pop_state(yyg);

  switch (prim) {
  case prime_lisp:
  case prime_interactive:
    yy_push_state(SPECIAL, yyg);
    yy_push_state(NESTED, yyg);
    yy_push_state(NESTED, yyg);
    break;
  case prime_regex:
    yy_push_state(SREGEX, yyg);
    break;
  case prime_json:
    yy_push_state(JSON, yyg);
    break;
  }
}

void scrub_scanner(scanner_t *yyg, int yy_char, wchar_t *lexeme)
{
  struct yy_token *rtok = &yyextra->recent_tok;

  if (rtok->yy_char == yy_char && rtok->yy_lval.lexeme == lexeme) {
    rtok->yy_char = 0;
    rtok->yy_lval.lexeme = 0;
  }
}

void parser_l_init(void)
{
  prot1(&form_to_ln_hash);
  form_to_ln_hash = make_eq_hash(hash_weak_keys);
}