/* PSPP - a program for statistical analysis.
   Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */

#include <config.h>

#include "language/lexer/lexer.h"

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <math.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <unictype.h>
#include <unistd.h>
#include <unistr.h>
#include <uniwidth.h>

#include "language/command.h"
#include "language/lexer/macro.h"
#include "language/lexer/scan.h"
#include "language/lexer/segment.h"
#include "language/lexer/token.h"
#include "libpspp/assertion.h"
#include "libpspp/cast.h"
#include "libpspp/deque.h"
#include "libpspp/i18n.h"
#include "libpspp/ll.h"
#include "libpspp/message.h"
#include "libpspp/misc.h"
#include "libpspp/str.h"
#include "libpspp/u8-istream.h"
#include "output/journal.h"
#include "output/output-item.h"

#include "gl/c-ctype.h"
#include "gl/minmax.h"
#include "gl/xalloc.h"
#include "gl/xmemdup0.h"

#include "gettext.h"
#define _(msgid) gettext (msgid)
#define N_(msgid) msgid

/* A token within a lex_source.

   Besides the token proper, this records where the token came from so that
   its original syntax can be recovered later. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call.

       src->tail <= line_pos <= token_pos <= src->head. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos. */

    /* For a token obtained through macro expansion, this is just this token.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       'macro_rep' and 'ref_cnt' are shared storage: lex_token_destroy()
       decrements *ref_cnt and frees both once the count drops to zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };

/* Uninitializes the token inside T and frees T itself.  If T has a reference
   count, drops one reference; when that was the last one, the macro expansion
   text and the counter are freed too. */
static void
lex_token_destroy (struct lex_token *t)
{
  token_uninit (&t->token);

  size_t *n_refs = t->ref_cnt;
  if (n_refs != NULL)
    {
      assert (*n_refs > 0);
      *n_refs -= 1;
      if (*n_refs == 0)
        {
          free (t->macro_rep);
          free (n_refs);
        }
    }

  free (t);
}

/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source.

   'deque' supplies the indexes into 'tokens'; the lex_stage_*() functions
   below are the only code that should touch either member. */
struct lex_stage
  {
    struct deque deque;         /* Index bookkeeping for 'tokens'. */
    struct lex_token **tokens;  /* Token pointers, indexed through 'deque'. */
  };

static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

static struct lex_token *lex_stage_last (struct lex_stage *);
static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);

/* Pops and destroys tokens from STAGE until none remain. */
static void
lex_stage_clear (struct lex_stage *stage)
{
  for (;;)
    {
      if (deque_is_empty (&stage->deque))
        break;
      lex_stage_pop_first (stage);
    }
}

/* Destroys every token in STAGE, then releases the array that held the token
   pointers. */
static void
lex_stage_uninit (struct lex_stage *stage)
{
  lex_stage_clear (stage);

  struct lex_token **storage = stage->tokens;
  free (storage);
}

/* Reports whether STAGE holds no tokens at all. */
static bool
lex_stage_is_empty (const struct lex_stage *stage)
{
  bool empty = deque_is_empty (&stage->deque);
  return empty;
}

/* Returns how many tokens STAGE currently holds. */
static size_t
lex_stage_count (const struct lex_stage *stage)
{
  size_t n_tokens = deque_count (&stage->deque);
  return n_tokens;
}

/* Returns the token at the far end of STAGE, that is, the one that requires
   the greatest lookahead to reach.  STAGE must be nonempty. */
static struct lex_token *
lex_stage_last (struct lex_stage *stage)
{
  size_t idx = deque_front (&stage->deque, 0);
  return stage->tokens[idx];
}

/* Returns the token at the near end of STAGE, that is, the one reached with
   the least lookahead.  STAGE must be nonempty. */
static struct lex_token *
lex_stage_first (struct lex_stage *stage)
{
  struct lex_token *first = lex_stage_nth (stage, 0);
  return first;
}

/* Returns the token at the given INDEX in STAGE.  Index 0 is the first token
   (least lookahead), index 1 the second, and so on.  STAGE must contain at
   least INDEX + 1 tokens. */
static struct lex_token *
lex_stage_nth (struct lex_stage *stage, size_t index)
{
  size_t idx = deque_back (&stage->deque, index);
  return stage->tokens[idx];
}

/* Appends TOKEN to STAGE as its new last token, growing the token array first
   if the deque has no room left. */
static void
lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
{
  struct deque *d = &stage->deque;

  if (deque_is_full (d))
    stage->tokens = deque_expand (d, stage->tokens, sizeof *stage->tokens);

  size_t slot = deque_push_front (d);
  stage->tokens[slot] = token;
}

/* Takes the first token out of STAGE and destroys it. */
static void
lex_stage_pop_first (struct lex_stage *stage)
{
  size_t idx = deque_pop_back (&stage->deque);
  lex_token_destroy (stage->tokens[idx]);
}

/* Moves the first N tokens of SRC to the end of DST, preserving their order.
   Ownership of the tokens transfers to DST; nothing is destroyed. */
static void
lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
{
  size_t remaining = n;

  while (remaining > 0)
    {
      struct lex_token *moved = lex_stage_first (src);
      lex_stage_push_last (dst, moved);

      /* Drop SRC's slot without destroying the token now owned by DST. */
      deque_pop_back (&src->deque);
      remaining--;
    }
}

/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct lexer *lexer;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through all of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'lookahead'.

       - lookahead: Tokens available to the client for parsing. */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_stage lookahead;
  };

static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);

/* Lexer.

   Created by lex_create() and destroyed, along with all of its sources and
   macros, by lex_destroy(). */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Macros added with lex_define_macro(). */
  };

static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_get_syntax__ (const struct lex_source *,
                                      int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static bool lex_source_get_lookahead (struct lex_source *);
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);

/* Sets READER's class to CLASS and gives every other member set here a
   reasonable default.  The caller should then fill in the other members as
   desired. */
void
lex_reader_init (struct lex_reader *reader,
                 const struct lex_reader_class *class)
{
  reader->class = class;

  /* Behavior defaults. */
  reader->syntax = SEG_MODE_AUTO;
  reader->error = LEX_ERROR_CONTINUE;

  /* Nothing is known about the input yet. */
  reader->file_name = NULL;
  reader->encoding = NULL;
  reader->line_number = 0;
  reader->eof = false;
}

/* Frees any file name already in READER and replaces it by a copy of
   FILE_NAME, or if FILE_NAME is null then clears any existing name.

   The copy is made before the old name is freed, so it is safe for FILE_NAME
   to point to (or into) READER's current file name. */
void
lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
{
  char *new_name = xstrdup_if_nonnull (file_name);
  free (reader->file_name);
  reader->file_name = new_name;
}

/* Allocates a lexer with an empty list of sources and a freshly created macro
   set, and returns it.  The caller eventually disposes of it with
   lex_destroy(). */
struct lexer *
lex_create (void)
{
  struct lexer *new_lexer = xmalloc (sizeof *new_lexer);
  struct macro_set *macros = macro_set_create ();

  *new_lexer = (struct lexer) {
    .macros = macros,
    .sources = LL_INITIALIZER (new_lexer->sources),
  };
  return new_lexer;
}

/* Destroys LEXER together with all of its sources and its macro set.  Does
   nothing if LEXER is null. */
void
lex_destroy (struct lexer *lexer)
{
  if (lexer == NULL)
    return;

  struct lex_source *source, *next;
  ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
    lex_source_destroy (source);

  macro_set_destroy (lexer->macros);
  free (lexer);
}

/* Registers macro M with LEXER, replacing any existing macro of the same name.
   LEXER takes ownership of M. */
void
lex_define_macro (struct lexer *lexer, struct macro *m)
{
  struct macro_set *set = lexer->macros;
  macro_set_add (set, m);
}

/* Inserts READER into LEXER so that the next token read by LEXER comes from
   READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
   token. */
void
lex_include (struct lexer *lexer, struct lex_reader *reader)
{
  assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
  ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
}

/* Appends READER to LEXER, so that it will be read after all other current
   readers have already been read. */
void
lex_append (struct lexer *lexer, struct lex_reader *reader)
{
  ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
}

/* Advancing. */

/* Consumes LEXER's current token and advances to the next one.

   If the current source cannot supply another token it is destroyed and the
   next source, if any, takes over.  Does nothing if LEXER has no sources. */
void
lex_get (struct lexer *lexer)
{
  struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return;

  /* Discard the current token, if there is one. */
  if (!lex_stage_is_empty (&src->lookahead))
    lex_stage_pop_first (&src->lookahead);

  /* Refill until some source yields a token or no sources remain. */
  while (lex_stage_is_empty (&src->lookahead))
    {
      if (lex_source_get_lookahead (src))
        continue;

      lex_source_destroy (src);
      src = lex_source__ (lexer);
      if (src == NULL)
        return;
    }
}

/* Issuing errors. */

/* Reports a syntax error at LEXER's current token.  FORMAT, which may be
   null, and the arguments that follow it supply the message, printf-style. */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, 0, 0, format, ap);
  va_end (ap);
}

/* Same as lex_error(), except that the arguments for FORMAT, which may be
   null, arrive as the va_list ARGS. */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  const int current = 0;

  lex_next_error_valist (lexer, current, current, format, args);
}

/* Reports a syntax error for the tokens N0 through N1 ahead of the current
   one.  FORMAT, which may be null, and the arguments that follow it supply
   the message, printf-style. */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, n0, n1, format, ap);
  va_end (ap);
}

/* Reports a syntax error stating that one of the strings passed as variable
   arguments was expected.  The argument list ends at the first NULL. */
void
(lex_error_expecting) (struct lexer *lexer, ...)
{
  va_list ap;

  va_start (ap, lexer);
  lex_error_expecting_valist (lexer, ap);
  va_end (ap);
}

/* Reports a syntax error stating that one of the options in ARGS was
   expected.  Options are read from ARGS up to the first NULL, or until
   MAX_OPTIONS of them have been collected, whichever comes first. */
void
lex_error_expecting_valist (struct lexer *lexer, va_list args)
{
  enum { MAX_OPTIONS = 9 };
  const char *options[MAX_OPTIONS];
  int n;

  for (n = 0; n < MAX_OPTIONS; n++)
    {
      const char *option = va_arg (args, const char *);
      if (option == NULL)
        break;

      options[n] = option;
    }

  lex_error_expecting_array (lexer, options, n);
}

/* Reports a syntax error stating that one of the N strings in OPTIONS was
   expected.  Between 1 and 8 options are listed in the message; for any other
   N a syntax error without further detail is reported instead.

   Each count has its own complete format string so that translators see
   whole sentences. */
void
lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
{
  const char **o = options;

  switch (n)
    {
    case 1:
      lex_error (lexer, _("expecting %s"), o[0]);
      return;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), o[0], o[1]);
      return;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), o[0], o[1], o[2]);
      return;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3]);
      return;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3], o[4]);
      return;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3], o[4], o[5]);
      return;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3], o[4], o[5], o[6]);
      return;

    case 8:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3], o[4], o[5], o[6], o[7]);
      return;

    case 0:
    default:
      /* Nothing to list, or too many options to list. */
      lex_error (lexer, NULL);
      return;
    }
}

/* Reports that subcommand SBC was given more than once although it may be
   specified only once.

   There is deliberately no lexer argument and no call to lex_error() here:
   the output would ordinarily read "Syntax error at SUBCOMMAND: Subcommand
   SUBCOMMAND may only be specified once.", which is redundant and does not
   help the user find the error. */
void
lex_sbc_only_once (const char *sbc)
{
  const char *subcommand = sbc;

  msg (SE, _("Subcommand %s may only be specified once."), subcommand);
}

/* Reports that required subcommand SBC was not specified.

   There is deliberately no lexer argument and no call to lex_error() here: a
   missing subcommand can normally be detected only once the whole command has
   been parsed, so lex_error() would always say "Syntax error at end of
   command", which does not help the user find the error. */
void
lex_sbc_missing (const char *sbc)
{
  const char *subcommand = sbc;

  msg (SE, _("Required subcommand %s was not specified."), subcommand);
}

/* Reports, as a syntax error at the current token, that specification SPEC
   was given more than once within subcommand SBC. */
void
lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
{
  const char *specification = spec;
  const char *subcommand = sbc;

  lex_error (lexer, _("%s may only be specified once within subcommand %s"),
             specification, subcommand);
}

/* Reports, as a syntax error at the current token, that required
   specification SPEC is missing within subcommand SBC.

   The format names the specification first and the subcommand second, so the
   arguments are passed as SPEC, SBC, the same order lex_spec_only_once()
   uses. */
void
lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
{
  lex_error (lexer, _("Required %s specification missing from %s subcommand"),
             spec, sbc);
}

/* Prints a syntax error message containing the current token and
   given message MESSAGE (if non-null). */
void
lex_next_error_valist (struct lexer *lexer, int n0, int n1,
                       const char *format, va_list args)
{
  struct lex_source *src = lex_source__ (lexer);

  if (src != NULL)
    lex_source_error_valist (src, n0, n1, format, args);
  else
    {
      struct string s;

      ds_init_empty (&s);
      ds_put_format (&s, _("Syntax error at end of input"));
      if (format != NULL)
        {
          ds_put_cstr (&s, ": ");
          ds_put_vformat (&s, format, args);
        }
      ds_put_byte (&s, '.');
      msg (SE, "%s", ds_cstr (&s));
      ds_destroy (&s);
    }
}

/* Checks that we're at end of command.
   If so, returns a successful command completion code.
   If not, flags a syntax error and returns an error command
   completion code. */
int
lex_end_of_command (struct lexer *lexer)
{
  if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
    {
      lex_error (lexer, _("expecting end of command"));
      return CMD_FAILURE;
    }
  else
    return CMD_SUCCESS;
}

/* Token testing functions. */

/* Tests whether LEXER's current token is a number. */
bool
lex_is_number (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_number (lexer, current);
}

/* Tests whether LEXER's current token is a string. */
bool
lex_is_string (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_string (lexer, current);
}

/* Returns the numeric value of LEXER's current token, which must be a
   floating point number. */
double
lex_number (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_number (lexer, current);
}

/* Tests whether LEXER's current token is an integer. */
bool
lex_is_integer (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_integer (lexer, current);
}

/* Returns the integer value of LEXER's current token, which must be an
   integer. */
long
lex_integer (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_integer (lexer, current);
}

/* Token testing functions with lookahead.

   A value of 0 for N as an argument to any of these functions refers to the
   current token.  Lookahead is limited to the current command.  Any N greater
   than the number of tokens remaining in the current command will be treated
   as referring to a T_ENDCMD token. */

/* Tests whether the token N ahead of LEXER's current token is a number. */
bool
lex_next_is_number (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_is_number (t);
}

/* Tests whether the token N ahead of LEXER's current token is a string. */
bool
lex_next_is_string (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_is_string (t);
}

/* Returns the numeric value of the token N ahead of LEXER's current token.
   That token must be a floating point number. */
double
lex_next_number (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}

/* Tests whether the token N ahead of LEXER's current token is an integer. */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_is_integer (t);
}

/* Returns the integer value of the token N ahead of LEXER's current token.
   That token must be an integer. */
long
lex_next_integer (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_integer (t);
}

/* Token matching functions. */

/* Consumes the current token and returns true if it has the given TYPE.
   Otherwise leaves LEXER alone and returns false. */
bool
lex_match (struct lexer *lexer, enum token_type type)
{
  if (lex_token (lexer) != type)
    return false;

  lex_get (lexer);
  return true;
}

/* Consumes the current token and returns true if it matches IDENTIFIER, which
   may be abbreviated to its first three letters.  Otherwise returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  enum { MIN_ABBREV = 3 };

  return lex_match_id_n (lexer, identifier, MIN_ABBREV);
}

/* Consumes the current token and returns true if it is IDENTIFIER, which may
   be abbreviated to its first N letters.  Otherwise returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
{
  if (lex_token (lexer) != T_ID)
    return false;

  if (!lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
    return false;

  lex_get (lexer);
  return true;
}

/* Consumes the current token and returns true if it is the integer X.
   Otherwise returns false. */
bool
lex_match_int (struct lexer *lexer, int x)
{
  if (!lex_is_integer (lexer))
    return false;

  if (lex_integer (lexer) != x)
    return false;

  lex_get (lexer);
  return true;
}

/* Forced matches. */

/* Consumes the current token and returns true if it is IDENTIFIER, which may
   be abbreviated to its first 3 letters.  Otherwise reports an error saying
   that IDENTIFIER was expected and returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_force_match_id (struct lexer *lexer, const char *identifier)
{
  bool matched = lex_match_id (lexer, identifier);

  if (!matched)
    lex_error_expecting (lexer, identifier);
  return matched;
}

/* Consumes the current token and returns true if it has the given TYPE.
   Otherwise reports an error naming the expected token and returns false.

   The error shows TYPE's string form in quotes when it has one, and falls
   back to TYPE's name otherwise. */
bool
lex_force_match (struct lexer *lexer, enum token_type type)
{
  if (lex_token (lexer) == type)
    {
      lex_get (lexer);
      return true;
    }

  const char *type_string = token_type_to_string (type);
  if (type_string == NULL)
    {
      lex_error_expecting (lexer, token_type_to_name (type));
      return false;
    }

  char *quoted = xasprintf ("`%s'", type_string);
  lex_error_expecting (lexer, quoted);
  free (quoted);
  return false;
}

/* Returns true, without consuming anything, if the current token is a string.
   Otherwise reports an error and returns false. */
bool
lex_force_string (struct lexer *lexer)
{
  bool ok = lex_is_string (lexer);

  if (!ok)
    lex_error (lexer, _("expecting string"));
  return ok;
}

/* Returns true, without consuming anything, if the current token is a string
   or an identifier.  Otherwise reports an error and returns false.

   This is for syntax where a quoted string is the preferred form but
   identifiers are also accepted for compatibility (file names are one
   example).  For that reason the error issued for any other token says only
   that a string is expected and does not mention identifiers. */
bool
lex_force_string_or_id (struct lexer *lexer)
{
  if (lex_token (lexer) == T_ID)
    return true;

  return lex_force_string (lexer);
}

/* Returns true, without consuming anything, if the current token is an
   integer.  Otherwise reports an error and returns false. */
bool
lex_force_int (struct lexer *lexer)
{
  bool ok = lex_is_integer (lexer);

  if (!ok)
    lex_error (lexer, _("expecting integer"));
  return ok;
}

/* If the current token is an integer in the range MIN...MAX (inclusive), does
   nothing and returns true.  Otherwise, reports an error and returns false.
   If NAME is nonnull, then it is used in the error message.

   The wording of the error depends on the range: a bound is mentioned only
   when it is not extreme (beyond half of INT_MIN or INT_MAX) or when it is
   the bound that the token actually violated.  Every path that returns false
   reports exactly one error. */
bool
lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
{
  bool is_integer = lex_is_integer (lexer);
  bool too_small = is_integer && lex_integer (lexer) < min;
  bool too_big = is_integer && lex_integer (lexer) > max;
  if (is_integer && !too_small && !too_big)
    return true;

  if (min > max)
    {
      /* Weird, maybe a bug in the caller.  Just report that we needed an
         integer. */
      if (name)
        lex_error (lexer, _("Integer expected for %s."), name);
      else
        lex_error (lexer, _("Integer expected."));
    }
  else if (min == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld for %s."), min, name);
      else
        lex_error (lexer, _("Expected %ld."), min);
    }
  else if (min + 1 == max)
    {
      /* MIN < MAX here, so MIN + 1 cannot overflow. */
      if (name)
        lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
      else
        lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
    }
  else
    {
      bool report_lower_bound = (min > INT_MIN / 2) || too_small;
      bool report_upper_bound = (max < INT_MAX / 2) || too_big;

      if (report_lower_bound && report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Expected integer between %ld and %ld for %s."),
                       min, max, name);
          else
            lex_error (lexer, _("Expected integer between %ld and %ld."),
                       min, max);
        }
      else if (report_lower_bound)
        {
          if (min == 0)
            {
              if (name)
                lex_error (lexer, _("Expected non-negative integer for %s."),
                           name);
              else
                lex_error (lexer, _("Expected non-negative integer."));
            }
          else if (min == 1)
            {
              if (name)
                lex_error (lexer, _("Expected positive integer for %s."),
                           name);
              else
                lex_error (lexer, _("Expected positive integer."));
            }
          else
            {
              /* Any other lower bound: without this branch the function
                 would fail silently. */
              if (name)
                lex_error (lexer, _("Expected integer %ld or greater for %s."),
                           min, name);
              else
                lex_error (lexer, _("Expected integer %ld or greater."), min);
            }
        }
      else if (report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Expected integer less than or equal to %ld for %s."),
                       max, name);
          else
            lex_error (lexer, _("Expected integer less than or equal to %ld."),
                       max);
        }
      else
        {
          if (name)
            lex_error (lexer, _("Integer expected for %s."), name);
          else
            lex_error (lexer, _("Integer expected."));
        }
    }
  return false;
}

/* Returns true, without consuming anything, if the current token is a number.
   Otherwise reports an error and returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  bool ok = lex_is_number (lexer);

  if (!ok)
    lex_error (lexer, _("expecting number"));
  return ok;
}

/* Returns true, without consuming anything, if the current token is an
   identifier.  Otherwise reports an error and returns false. */
bool
lex_force_id (struct lexer *lexer)
{
  bool ok = lex_token (lexer) == T_ID;

  if (!ok)
    lex_error (lexer, _("expecting identifier"));
  return ok;
}

/* Token accessors. */

/* Returns the token type of the token LEXER is currently positioned at. */
enum token_type
lex_token (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_token (lexer, current);
}

/* Returns the numeric value of LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_tokval (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_tokval (lexer, current);
}

/* Returns the null-terminated, UTF-8 encoded string in LEXER's current token.

   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value".  */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_tokcstr (lexer, current);
}

/* Returns the UTF-8 encoded string in LEXER's current token, as a substring.
   The string is null-terminated (but the null terminator is not included in
   the returned substring's 'length').

   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value".  */
struct substring
lex_tokss (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_tokss (lexer, current);
}

/* Looking ahead.

   A value of 0 for N as an argument to any of these functions refers to the
   current token.  Lookahead is limited to the current command.  Any N greater
   than the number of tokens remaining in the current command will be treated
   as referring to a T_ENDCMD token. */

/* Returns the lex_token N ahead of the current one in LEXER_'s current
   source.  If LEXER_ has no source, returns a static T_STOP token instead. */
static const struct lex_token *
lex_next__ (const struct lexer *lexer_, int n)
{
  static const struct lex_token stop_token = { .token = { .type = T_STOP } };

  struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
  struct lex_source *src = lex_source__ (lexer);

  return src != NULL ? lex_source_next__ (src, n) : &stop_token;
}

static const struct lex_token *
lex_source_next__ (const struct lex_source *src_, int n)
{
  struct lex_source *src = CONST_CAST (struct lex_source *, src_);
  while (lex_stage_count (&src->lookahead) <= n)
    {
      if (!lex_stage_is_empty (&src->lookahead))
        {
          const struct lex_token *t = lex_stage_last (&src->lookahead);
          if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
            return t;
        }

      lex_source_get_lookahead (src);
    }

  return lex_stage_nth (&src->lookahead, n);
}

/* Returns the "struct token" of the token N after the current one in LEXER.
   The returned pointer can be invalidated by pretty much any succeeding call
   into the lexer, although the string pointer within the returned token is
   only invalidated by consuming the token (e.g. with lex_get()). */
const struct token *
lex_next (const struct lexer *lexer, int n)
{
  return &lex_next__ (lexer, n)->token;
}

/* Returns the type of the token N after the current one in LEXER. */
enum token_type
lex_next_token (const struct lexer *lexer, int n)
{
  return lex_next (lexer, n)->type;
}

/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}

/* Returns the null-terminated string in the token N after the current one, in
   UTF-8 encoding.

   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this functions this function will always return NULL.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value".  */
const char *
lex_next_tokcstr (const struct lexer *lexer, int n)
{
  return lex_next_tokss (lexer, n).string;
}

/* Returns the string in the token N after the current one, in UTF-8 encoding.
   The string is null-terminated (but the null terminator is not included in
   the returned substring's 'length').

   Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
   tokens this functions this function will always return NULL.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value".  */
struct substring
lex_next_tokss (const struct lexer *lexer, int n)
{
  return lex_next (lexer, n)->string;
}

/* Returns the syntax text for the tokens N0 ahead of the current one through
   N1 ahead of the current one, inclusive.  (With N0 and N1 both zero, this is
   the syntax of the current token.)  The caller must eventually free the
   returned string (with free()).

   The text is UTF-8 and in the original form supplied to the lexer, so if it
   spans several tokens it may include comments, spaces, and new-lines.  Macro
   expansion, however, has already been performed. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  struct lex_source *src = lex_source__ (lexer);
  return lex_source_get_syntax__ (src, n0, n1);
}

/* Reports whether the token N ahead of the current one in LEXER came out of a
   macro expansion: true if the token carries a macro representation, false
   otherwise. */
bool
lex_next_is_from_macro (const struct lexer *lexer, int n)
{
  if (lex_next__ (lexer, n)->macro_rep)
    return true;
  return false;
}

/* Returns true if ACTUAL matches EXPECTED, false otherwise.

   The two tokens must have the same type.  Beyond that, numbers must compare
   equal, identifiers are compared with lex_id_match() (EXPECTED acting as the
   keyword and ACTUAL as the candidate), and strings must have the same length
   and bytes.  Tokens of every other type match on type alone. */
static bool
lex_tokens_match (const struct token *actual, const struct token *expected)
{
  if (actual->type != expected->type)
    return false;

  if (actual->type == T_POS_NUM || actual->type == T_NEG_NUM)
    return actual->number == expected->number;
  else if (actual->type == T_ID)
    return lex_id_match (expected->string, actual->string);
  else if (actual->type == T_STRING)
    {
      size_t n = actual->string.length;
      return (n == expected->string.length
              && memcmp (actual->string.string, expected->string.string,
                         n) == 0);
    }
  else
    return true;
}

/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips it and returns true.  Otherwise, returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  struct string_lexer slex;
  struct token token;
  int i;

  i = 0;
  string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
  while (string_lexer_next (&slex, &token))
    {
      bool match = lex_tokens_match (lex_next (lexer, i++), &token);
      token_uninit (&token);
      if (!match)
        return false;
    }

  while (i-- > 0)
    lex_get (lexer);
  return true;
}

/* Returns the number of new-line bytes among the LENGTH bytes starting at
   S. */
static int
count_newlines (char *s, size_t length)
{
  int count = 0;
  char *end = s + length;

  for (char *p = memchr (s, '\n', length); p != NULL;
       p = memchr (p + 1, '\n', end - (p + 1)))
    count++;

  return count;
}

/* Returns the line number just past the last line of TOKEN's syntax within
   SRC's buffer: TOKEN's first line, plus the number of new-lines inside the
   token's text, plus 1.  Returns 0 if TOKEN has no line number (that is, its
   'first_line' is 0). */
static int
lex_token_get_last_line_number (const struct lex_source *src,
                                const struct lex_token *token)
{
  if (!token->first_line)
    return 0;

  char *text = &src->buffer[token->token_pos - src->tail];
  int n_newlines = count_newlines (text, token->token_len);
  return token->first_line + n_newlines + 1;
}

/* Returns 1 plus the display width, in columns, of the LENGTH bytes of UTF-8
   text at S_.

   Each character contributes the width that uc_width() reports for it, except
   that characters with zero or negative width contribute nothing and a tab
   advances the count to a multiple of 8 (via ROUND_UP). */
static int
count_columns (const char *s_, size_t length)
{
  const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
  int columns = 0;

  size_t ofs = 0;
  while (ofs < length)
    {
      ucs4_t uc;
      int mblen = u8_mbtouc (&uc, s + ofs, length - ofs);

      if (uc == '\t')
        columns = ROUND_UP (columns + 1, 8);
      else
        {
          int width = uc_width (uc, "UTF-8");
          if (width > 0)
            columns += width;
        }

      ofs += mblen;
    }

  return columns + 1;
}

/* Returns the 1-based column at which TOKEN starts, measured by passing the
   text between the start of TOKEN's line and the start of TOKEN itself, in
   SRC's buffer, to count_columns(). */
static int
lex_token_get_first_column (const struct lex_source *src,
                            const struct lex_token *token)
{
  const char *line_start = &src->buffer[token->line_pos - src->tail];
  size_t prefix_len = token->token_pos - token->line_pos;
  return count_columns (line_start, prefix_len);
}

/* Returns the 1-based column just past the end of TOKEN in SRC's buffer.  If
   the text from the start of TOKEN's line through the end of TOKEN contains a
   new-line, columns are measured from just after the last such new-line. */
static int
lex_token_get_last_column (const struct lex_source *src,
                           const struct lex_token *token)
{
  char *line = &src->buffer[token->line_pos - src->tail];
  char *token_end = &src->buffer[(token->token_pos + token->token_len)
                                 - src->tail];

  /* A token may span lines; measure only its final line. */
  char *last_nl = memrchr (line, '\n', token_end - line);
  char *start = last_nl != NULL ? last_nl + 1 : line;
  return count_columns (start, token_end - start);
}

static struct msg_location
lex_token_location (const struct lex_source *src,
                    const struct lex_token *t0,
                    const struct lex_token *t1)
{
  return (struct msg_location) {
    .file_name = src->reader->file_name,
    .first_line = t0->first_line,
    .last_line = lex_token_get_last_line_number (src, t1),
    .first_column = lex_token_get_first_column (src, t0),
    .last_column = lex_token_get_last_column (src, t1),
  };
}

static struct msg_location *
lex_token_location_rw (const struct lex_source *src,
                       const struct lex_token *t0,
                       const struct lex_token *t1)
{
  struct msg_location location = lex_token_location (src, t0, t1);
  return msg_location_dup (&location);
}

/* Returns the location, as produced by lex_token_location_rw(), that spans
   the tokens N0 through N1 ahead of the current one in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}

/* Returns the 1-based line number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   source (the T_STOP case) or if the token is drawn from a source that does
   not have line numbers. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_next__ (src, n)->first_line;
}

/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   source (the T_STOP case) or if the token is drawn from a source that does
   not have line numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_token_get_last_line_number (src, lex_source_next__ (src, n));
}

/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   source (the T_STOP case).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_token_get_first_column (src, lex_source_next__ (src, n));
}

/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no source (the T_STOP case).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_token_get_last_column (src, lex_source_next__ (src, n));
}

/* Returns the file name recorded in the reader of LEXER's current source, that
   is, the name of the syntax file from which the current command is drawn.
   Returns NULL if LEXER has no source (the T_STOP case).  The result may also
   be NULL when the reader itself carries no file name.

   There is no version of this function that takes an N argument because
   lookahead only works to the end of a command and any given command is always
   within a single syntax file. */
const char *
lex_get_file_name (const struct lexer *lexer)
{
  struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return NULL;
  return src->reader->file_name;
}

/* Returns a newly allocated msg_location for the syntax that represents tokens
   with 0-based offsets N0...N1, inclusive, from the current token.  The line
   information comes from lex_get_lines(); the first column is that of token N0
   and the last column that of token N1.  The caller must eventually free the
   location (with msg_location_destroy()). */
struct msg_location *
lex_get_location (const struct lexer *lexer, int n0, int n1)
{
  struct msg_location *location = lex_get_lines (lexer, n0, n1);
  int first_column = lex_get_first_column (lexer, n0);
  int last_column = lex_get_last_column (lexer, n1);

  location->first_column = first_column;
  location->last_column = last_column;
  return location;
}

/* Returns a newly allocated msg_location for the syntax that represents tokens
   with 0-based offsets N0...N1, inclusive, from the current token.  The
   location only covers the tokens' lines, not the columns.  The caller must
   eventually free the location (with msg_location_destroy()). */
struct msg_location *
lex_get_lines (const struct lexer *lexer, int n0, int n1)
{
  struct msg_location *loc = xmalloc (sizeof *loc);
  *loc = (struct msg_location) {
    .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
    .first_line = lex_get_first_line_number (lexer, n0),
    .last_line = lex_get_last_line_number (lexer, n1),
  };
  return loc;
}

/* Returns the encoding recorded in the reader of LEXER's current source, or
   NULL if LEXER has no source. */
const char *
lex_get_encoding (const struct lexer *lexer)
{
  struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return NULL;
  return src->reader->encoding;
}

/* Returns the syntax mode for the syntax file from which the current command
   is drawn, as recorded in the current source's reader.  Returns SEG_MODE_AUTO
   if LEXER has no source (the T_STOP case).

   There is no version of this function that takes an N argument because
   lookahead only works to the end of a command and any given command is always
   within a single syntax file. */
enum segmenter_mode
lex_get_syntax_mode (const struct lexer *lexer)
{
  struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return SEG_MODE_AUTO;
  return src->reader->syntax;
}

/* Returns the error mode for the syntax file from which the current drawn is
   drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
   source does not have line numbers.

   There is no version of this function that takes an N argument because
   lookahead only works to the end of a command and any given command is always
   within a single syntax file. */
enum lex_error_mode
lex_get_error_mode (const struct lexer *lexer)
{
  struct lex_source *src = lex_source__ (lexer);
  return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
}

/* If the source that LEXER is currently reading has error mode
   LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
   token to be read comes directly from whatever is next read from the stream.

   It makes sense to call this function after encountering an error in a
   command entered on the console, because usually the user would prefer not to
   have cascading errors. */
void
lex_interactive_reset (struct lexer *lexer)
{
  struct lex_source *src = lex_source__ (lexer);
  if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
    {
      src->head = src->tail = 0;
      src->journal_pos = src->seg_pos = src->line_pos = 0;
      src->n_newlines = 0;
      src->suppress_next_newline = false;
      src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
                                       false);
      lex_stage_clear (&src->pp);
      lex_stage_clear (&src->merge);
      lex_stage_clear (&src->lookahead);
      lex_source_push_endcmd__ (src);
    }
}

/* Consumes tokens from LEXER until the current token is T_ENDCMD or T_STOP.
   The terminating token itself is not consumed. */
void
lex_discard_rest_of_command (struct lexer *lexer)
{
  for (;;)
    {
      if (lex_token (lexer) == T_STOP || lex_token (lexer) == T_ENDCMD)
        break;
      lex_get (lexer);
    }
}

/* Discards all lookahead tokens in LEXER, then discards all input sources
   until it encounters one with error mode LEX_ERROR_TERMINAL or until it
   runs out of input sources. */
void
lex_discard_noninteractive (struct lexer *lexer)
{
  struct lex_source *src = lex_source__ (lexer);

  if (src != NULL)
    {
      lex_stage_clear (&src->pp);
      lex_stage_clear (&src->merge);
      lex_stage_clear (&src->lookahead);

      for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
           src = lex_source__ (lexer))
        lex_source_destroy (src);
    }
}

/* Returns the greatest position to which SRC_'s 'tail' may be advanced without
   discarding buffered data that is still needed: the least of 'journal_pos',
   'line_pos', and the 'line_pos' of the first token in the first nonempty
   stage among 'lookahead', 'merge', and 'pp' (checked in that order). */
static size_t
lex_source_max_tail__ (const struct lex_source *src_)
{
  struct lex_source *src = CONST_CAST (struct lex_source *, src_);

  assert (src->seg_pos >= src->line_pos);
  size_t limit = MIN (src->journal_pos, src->line_pos);

  /* Find the stage that holds the oldest token, if there is any token. */
  struct lex_stage *stages[] = { &src->lookahead, &src->merge, &src->pp };
  const size_t n_stages = sizeof stages / sizeof *stages;
  size_t idx = 0;
  while (idx < n_stages && lex_stage_is_empty (stages[idx]))
    idx++;

  if (idx < n_stages)
    {
      struct lex_token *oldest = lex_stage_first (stages[idx]);
      assert (oldest->token_pos >= oldest->line_pos);
      limit = MIN (limit, oldest->line_pos);
    }
  return limit;
}

/* Makes sure that there is room to append data at the head of SRC's buffer.
   If the buffer is full, first tries to reclaim space by sliding the data
   still in use down to the start of the buffer; if nothing can be reclaimed,
   enlarges the buffer instead. */
static void
lex_source_expand__ (struct lex_source *src)
{
  if (src->head - src->tail < src->allocated)
    {
      /* There's space available at the head of the buffer.  Nothing to do. */
      return;
    }

  size_t new_tail = lex_source_max_tail__ (src);
  if (new_tail <= src->tail)
    {
      /* Nothing can be dropped: the buffer is completely full.  Expand it. */
      src->buffer = x2realloc (src->buffer, &src->allocated);
      return;
    }

  /* Advance the tail, freeing up room at the head. */
  size_t n_dropped = new_tail - src->tail;
  memmove (src->buffer, src->buffer + n_dropped, src->head - new_tail);
  src->tail = new_tail;
}

/* Reads more input from SRC's reader into SRC's buffer, repeating until the
   data not yet segmented (from 'seg_pos' through 'head') contains a new-line.
   If the reader returns no data, marks the reader as at end of input and
   returns. */
static void
lex_source_read__ (struct lex_source *src)
{
  for (;;)
    {
      lex_source_expand__ (src);

      size_t used = src->head - src->tail;
      size_t avail = src->allocated - used;
      enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
      size_t n_read = src->reader->class->read (src->reader,
                                                &src->buffer[used], avail,
                                                prompt);
      assert (n_read <= avail);

      if (!n_read)
        {
          /* End of input. */
          src->reader->eof = true;
          lex_source_expand__ (src);
          return;
        }
      src->head += n_read;

      /* Stop as soon as a whole line is available to the segmenter. */
      if (memchr (&src->buffer[src->seg_pos - src->tail], '\n',
                  src->head - src->seg_pos) != NULL)
        return;
    }
}

/* Returns the source at the head of LEXER's list of sources, or NULL if the
   list is empty. */
static struct lex_source *
lex_source__ (const struct lexer *lexer)
{
  if (ll_is_empty (&lexer->sources))
    return NULL;
  return ll_data (ll_head (&lexer->sources), struct lex_source, ll);
}

/* Returns the text of the syntax in SRC for tokens N0 ahead of the current
   one, through N1 ahead of the current one, inclusive.  (For example, if N0
   and N1 are both zero, this requests the syntax for the current token.)  The
   caller must eventually free the returned string (with free()).  The syntax
   is encoded in UTF-8 and in the original form supplied to the lexer so that,
   for example, it may include comments, spaces, and new-lines if it spans
   multiple tokens.  Macro expansion, however, has already been performed. */
static char *
lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
{
  struct string s = DS_EMPTY_INITIALIZER;
  for (size_t i = n0; i <= n1; )
    {
      /* Find [I,J) as the longest sequence of tokens not produced by macro
         expansion, or otherwise the longest sequence expanded from a single
         macro call. */
      const struct lex_token *first = lex_source_next__ (src, i);
      size_t j;
      for (j = i + 1; j <= n1; j++)
        {
          const struct lex_token *cur = lex_source_next__ (src, j);
          if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
              || first->macro_rep != cur->macro_rep)
            break;
        }
      const struct lex_token *last = lex_source_next__ (src, j - 1);

      /* Now add the syntax for this sequence of tokens to SRC. */
      if (!ds_is_empty (&s))
        ds_put_byte (&s, ' ');
      if (!first->macro_rep)
        {
          size_t start = first->token_pos;
          size_t end = last->token_pos + last->token_len;
          ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
                                           end - start));
        }
      else
        {
          size_t start = first->ofs;
          size_t end = last->ofs + last->len;
          ds_put_substring (&s, ss_buffer (first->macro_rep + start,
                                           end - start));
        }

      i = j;
    }
  return ds_steal_cstr (&s);
}

/* Returns true if any of the tokens N0...N1 (inclusive) ahead of the current
   one in SRC has a macro representation, that is, was produced by macro
   expansion.  Returns false otherwise. */
static bool
lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
{
  size_t idx = n0;
  while (idx <= n1)
    {
      if (lex_source_next__ (src, idx)->macro_rep != NULL)
        return true;
      idx++;
    }
  return false;
}

/* If tokens N0...N1 (inclusive) in SRC contain a macro call, this returns the
   raw UTF-8 syntax for the macro call (not for the expansion) and for any
   other tokens included in that range.  The syntax is in the original form
   supplied to the lexer so that, for example, it may include comments, spaces,
   and new-lines if it spans multiple tokens.

   Returns an empty string if the token range doesn't include a macro call.

   The returned substring points into SRC's buffer, so the caller must not
   modify or free it. */
static struct substring
lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
{
  if (!lex_source_contains_macro_call (src, n0, n1))
    return ss_empty ();

  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, MAX (n0, n1));

  const char *text = &src->buffer[first->token_pos - src->tail];
  size_t length = (last->token_pos + last->token_len) - first->token_pos;
  return ss_buffer (text, length);
}

static void
lex_source_error_valist (struct lex_source *src, int n0, int n1,
                         const char *format, va_list args)
{
  const struct lex_token *token;
  struct string s;

  ds_init_empty (&s);

  token = lex_source_next__ (src, n0);
  if (token->token.type == T_ENDCMD)
    ds_put_cstr (&s, _("Syntax error at end of command"));
  else
    {
      /* Get the syntax that caused the error. */
      char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
      char syntax[64];
      str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
      free (raw_syntax);

      /* Get the macro call(s) that expanded to the syntax that caused the
         error. */
      char call[64];
      str_ellipsize (lex_source_get_macro_call (src, n0, n1),
                     call, sizeof call);

      if (syntax[0])
        {
          if (call[0])
            ds_put_format (&s,
                           _("Syntax error at `%s' (in expansion of `%s')"),
                           syntax, call);
          else
            ds_put_format (&s, _("Syntax error at `%s'"), syntax);
        }
      else
        {
          if (call[0])
            ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
                           call);
          else
            ds_put_cstr (&s, _("Syntax error"));
        }
    }

  if (format)
    {
      ds_put_cstr (&s, ": ");
      ds_put_vformat (&s, format, args);
    }
  if (ds_last (&s) != '.')
    ds_put_byte (&s, '.');

  struct msg *m = xmalloc (sizeof *m);
  *m = (struct msg) {
    .category = MSG_C_SYNTAX,
    .severity = MSG_S_ERROR,
    .location = lex_source_get_location (src, n0, n1),
    .text = ds_steal_cstr (&s),
  };
  msg_emit (m);
}

/* Emits a syntax error message for TOKEN in SRC, whose text could not be
   turned into a valid token.  The message quotes TOKEN's syntax (ellipsized to
   fit a 64-byte buffer), followed by the explanation that is stored in the
   token's string. */
static void
lex_get_error (struct lex_source *src, const struct lex_token *token)
{
  const char *text = &src->buffer[token->token_pos - src->tail];
  char syntax[64];
  str_ellipsize (ss_buffer (text, token->token_len), syntax, sizeof syntax);

  struct string s = DS_EMPTY_INITIALIZER;
  ds_put_format (&s, _("Syntax error at `%s'"), syntax);
  ds_put_format (&s, ": %s", token->token.string.string);

  struct msg *m = xmalloc (sizeof *m);
  *m = (struct msg) {
    .category = MSG_C_SYNTAX,
    .severity = MSG_S_ERROR,
    .location = lex_token_location_rw (src, token, token),
    .text = ds_steal_cstr (&s),
  };
  msg_emit (m);
}

/* Attempts to append an additional token to 'pp' in SRC, reading more from the
   underlying lex_reader if necessary.  Exactly one segment is consumed per
   call.  As a side effect, any line of syntax that the segment completes is
   submitted to the output engine as a syntax text item.

   Returns true if a new token was added to 'pp'.  Returns false if the segment
   yielded no token (TOKENIZE_EMPTY) or an invalid one (TOKENIZE_ERROR, which
   is reported via lex_get_error()).  The caller should retry failures unless
   SRC's 'eof' marker was set to true indicating that there will be no more
   tokens from this source. */
static bool
lex_source_try_get_pp (struct lex_source *src)
{
  /* Allocate a new token and initialize it.  Its type starts out as T_STOP
     and its position is wherever segmentation currently stands. */
  struct lex_token *token = xmalloc (sizeof *token);
  token->token = (struct token) { .type = T_STOP };
  token->macro_rep = NULL;
  token->ref_cnt = NULL;
  token->line_pos = src->line_pos;
  token->token_pos = src->seg_pos;
  if (src->reader->line_number > 0)
    token->first_line = src->reader->line_number + src->n_newlines;
  else
    token->first_line = 0;

  /* Extract a segment.  A negative length from segmenter_push() means that
     the segmenter cannot decide yet, so read more input and try again. */
  const char *segment;
  enum segment_type seg_type;
  int seg_len;
  for (;;)
    {
      segment = &src->buffer[src->seg_pos - src->tail];
      seg_len = segmenter_push (&src->segmenter, segment,
                                src->head - src->seg_pos,
                                src->reader->eof, &seg_type);
      if (seg_len >= 0)
        break;

      /* The segmenter needs more input to produce a segment. */
      assert (!src->reader->eof);
      lex_source_read__ (src);
    }

  /* Update state based on the segment.  A new-line segment starts a new
     line, which begins just past the segment. */
  token->token_len = seg_len;
  src->seg_pos += seg_len;
  if (seg_type == SEG_NEWLINE)
    {
      src->line_pos = src->seg_pos;
      src->n_newlines++;
    }

  /* Get a token from the segment. */
  enum tokenize_result result = token_from_segment (
    seg_type, ss_buffer (segment, seg_len), &token->token);

  /* If we've reached the end of a line, or the end of a command, then pass
     the line to the output engine as a syntax text item.

     A new-line normally completes one line.  An end-of-command segment emits
     its line right away and sets 'suppress_next_newline' so that the new-line
     that follows does not emit the same line a second time. */
  int n_lines = seg_type == SEG_NEWLINE;
  if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
    {
      n_lines++;
      src->suppress_next_newline = true;
    }
  else if (n_lines > 0 && src->suppress_next_newline)
    {
      n_lines--;
      src->suppress_next_newline = false;
    }
  for (int i = 0; i < n_lines; i++)
    {
      /* Beginning of line. */
      const char *line = &src->buffer[src->journal_pos - src->tail];

      /* Calculate line length, including \n or \r\n end-of-line if present.

         We use src->head even though that may be beyond what we've actually
         converted to tokens (which is only through line_pos).  That's because,
         if we're emitting the line due to SEG_END_COMMAND, we want to take the
         whole line through the newline, not just through the '.'. */
      size_t max_len = src->head - src->journal_pos;
      const char *newline = memchr (line, '\n', max_len);
      size_t line_len = newline ? newline - line + 1 : max_len;

      /* Calculate line length excluding end-of-line. */
      size_t copy_len = line_len;
      if (copy_len > 0 && line[copy_len - 1] == '\n')
        copy_len--;
      if (copy_len > 0 && line[copy_len - 1] == '\r')
        copy_len--;

      /* Submit the line as syntax. */
      output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
                                                   xmemdup0 (line, copy_len),
                                                   NULL));

      /* The next line to journal starts just past this one. */
      src->journal_pos += line_len;
    }

  switch (result)
    {
    case TOKENIZE_ERROR:
      /* Report the bad token, then discard it like an empty one. */
      lex_get_error (src, token);
      /* Fall through. */
    case TOKENIZE_EMPTY:
      lex_token_destroy (token);
      return false;

    case TOKENIZE_TOKEN:
      /* A token that is still T_STOP marks the end of the source.  Hand it
         on as T_ENDCMD and remember that SRC is exhausted. */
      if (token->token.type == T_STOP)
        {
          token->token.type = T_ENDCMD;
          src->eof = true;
        }
      lex_stage_push_last (&src->pp, token);
      return true;
    }
  NOT_REACHED ();
}

/* Appends a new token to 'pp' in SRC, calling lex_source_try_get_pp() as many
   times as it takes.  Returns true if successful, false on failure.  On
   failure, the end of SRC has been reached ('src->eof' is true) and no more
   tokens will be forthcoming from it.

   The new token lands in 'pp' only; it is not yet available for lookahead. */
static bool
lex_source_get_pp (struct lex_source *src)
{
  for (;;)
    {
      if (src->eof)
        return false;
      if (lex_source_try_get_pp (src))
        return true;
    }
}

/* Attempts to move tokens from the front of 'pp' in SRC_ into 'merge',
   expanding a macro call if one starts there.  Reads more tokens into 'pp' as
   needed.

   Returns true if at least one token was appended to 'merge'.  Returns false
   if 'pp' is empty and SRC_ has no more tokens to offer, or if a macro call
   was consumed but expanded to no tokens at all.  lex_source_get_merge()
   retries in the latter case. */
static bool
lex_source_try_get_merge (const struct lex_source *src_)
{
  struct lex_source *src = CONST_CAST (struct lex_source *, src_);

  if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
    return false;

  if (!settings_get_mexpand ())
    {
      /* Macro expansion is disabled: pass everything straight through. */
      lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
      return true;
    }

  /* Now pass tokens one-by-one to the macro expander.

     In the common case where there is no macro to expand, the loop is not
     entered.  While 'n_call' is zero, the expander needs to see more tokens
     to decide. */
  struct macro_call *mc;
  int n_call = macro_call_create (src->lexer->macros,
                                  &lex_stage_first (&src->pp)->token, &mc);
  for (int ofs = 1; !n_call; ofs++)
    {
      if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
        {
          /* This should not be reachable because we always get a T_ENDCMD at
             the end of an input file (transformed from T_STOP by
             lex_source_try_get_pp()) and the macro_expander should always
             terminate expansion on T_ENDCMD. */
          NOT_REACHED ();
        }

      const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
      size_t start = t->token_pos;
      size_t end = t->token_pos + t->token_len;
      const struct macro_token mt = {
        .token = t->token,
        .syntax = ss_buffer (&src->buffer[start - src->tail], end - start),
      };
      const struct msg_location loc = lex_token_location (src, t, t);
      n_call = macro_call_add (mc, &mt, &loc);
    }
  if (n_call < 0)
    {
      /* False alarm: no macro expansion after all.  Use first token as
         lookahead.  We'll retry macro expansion from the second token next
         time around. */
      macro_call_destroy (mc);
      lex_stage_shift (&src->merge, &src->pp, 1);
      return true;
    }

  /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
     are a macro call.  (These are likely to be the only tokens in 'pp'.)
     Expand them.  */
  const struct lex_token *c0 = lex_stage_first (&src->pp);
  const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
  struct macro_tokens expansion = { .n = 0 };
  struct msg_location loc = lex_token_location (src, c0, c1);
  macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
  macro_call_destroy (mc);

  /* Convert the macro expansion into syntax for possible error messages
     later. */
  size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
  size_t *len = xnmalloc (expansion.n, sizeof *len);
  struct string s = DS_EMPTY_INITIALIZER;
  macro_tokens_to_syntax (&expansion, &s, ofs, len);

  if (settings_get_mprint ())
    output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
                                          _("Macro Expansion")));

  /* Append the macro expansion tokens to 'merge'.  They all share one copy of
     the expansion's syntax, 'macro_rep', along with a count of the tokens that
     refer to it, 'ref_cnt'.  Every expanded token is given the position of the
     whole macro call in SRC's buffer. */
  char *macro_rep = ds_steal_cstr (&s);
  size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
  *ref_cnt = expansion.n;
  for (size_t i = 0; i < expansion.n; i++)
    {
      struct lex_token *token = xmalloc (sizeof *token);
      *token = (struct lex_token) {
        .token = expansion.mts[i].token,
        .token_pos = c0->token_pos,
        .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
        .line_pos = c0->line_pos,
        .first_line = c0->first_line,
        .macro_rep = macro_rep,
        .ofs = ofs[i],
        .len = len[i],
        .ref_cnt = ref_cnt,
      };
      lex_stage_push_last (&src->merge, token);

      ss_dealloc (&expansion.mts[i].syntax);
    }
  if (expansion.n == 0)
    {
      /* The macro expanded to nothing, so no token took ownership of the
         shared representation or its reference count.  Free them here, since
         nothing else ever will. */
      free (macro_rep);
      free (ref_cnt);
    }
  free (expansion.mts);
  free (ofs);
  free (len);

  /* Destroy the tokens for the call. */
  for (int i = 0; i < n_call; i++)
    lex_stage_pop_first (&src->pp);

  return expansion.n > 0;
}

/* Obtains at least one new token into 'merge' in SRC, calling
   lex_source_try_get_merge() as many times as it takes.

   Returns true if successful, false on failure.  In the latter case, SRC is
   exhausted and 'src->eof' is now true. */
static bool
lex_source_get_merge (struct lex_source *src)
{
  for (;;)
    {
      if (src->eof)
        return false;
      if (lex_source_try_get_merge (src))
        return true;
    }
}

/* Attempts to obtain at least one new token into 'lookahead' in SRC.  Tokens
   from 'merge' are fed one at a time to a merger, which either lets the first
   token through unchanged or replaces a sequence of tokens by a single
   combined token.

   Returns true if successful, false on failure.  In the latter case, SRC is
   exhausted and 'src->eof' is now true. */
static bool
lex_source_get_lookahead (struct lex_source *src)
{
  struct merger m = MERGER_INIT;
  for (size_t i = 0; ; i++)
    {
      while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
        {
          /* We always get a T_ENDCMD at the end of an input file
             (transformed from T_STOP by lex_source_try_get_pp()) and
             merger_add() should never return -1 on T_ENDCMD. */
          assert (lex_stage_is_empty (&src->merge));
          return false;
        }

      struct token out;
      int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
                               &out);
      if (!retval)
        {
          /* Nothing to merge: pass the first token along as is. */
          lex_stage_shift (&src->lookahead, &src->merge, 1);
          return true;
        }
      else if (retval > 0)
        {
          /* Add a token that merges all the tokens together. */
          const struct lex_token *first = lex_stage_first (&src->merge);
          const struct lex_token *last = lex_stage_nth (&src->merge,
                                                        retval - 1);
          bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
          struct lex_token *t = xmalloc (sizeof *t);
          *t = (struct lex_token) {
            .token = out,
            .token_pos = first->token_pos,
            .token_len = (last->token_pos - first->token_pos) + last->token_len,
            .line_pos = first->line_pos,
            .first_line = first->first_line,

            /* This works well if all the tokens were not expanded from macros,
               or if they came from the same macro expansion.  It just gives up
               in the other (corner) cases.

               The merged token shares the reference count only when it also
               shares the macro representation.  A token that held a reference
               without the matching 'macro_rep' pointer could end up as the
               last holder with no way to reach the text that must be
               freed. */
            .macro_rep = macro ? first->macro_rep : NULL,
            .ofs = macro ? first->ofs : 0,
            .len = macro ? (last->ofs - first->ofs) + last->len : 0,
            .ref_cnt = macro ? first->ref_cnt : NULL,
          };
          if (t->ref_cnt)
            ++*t->ref_cnt;
          lex_stage_push_last (&src->lookahead, t);

          /* Discard the tokens that the merged token replaces. */
          for (int k = 0; k < retval; k++)
            lex_stage_pop_first (&src->merge);
          return true;
        }

      /* Otherwise the merger cannot decide yet: go around again so that it
         sees the next token. */
    }
}

static void
lex_source_push_endcmd__ (struct lex_source *src)
{
  assert (lex_stage_is_empty (&src->lookahead));
  struct lex_token *token = xmalloc (sizeof *token);
  *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
  lex_stage_push_last (&src->lookahead, token);
}

static struct lex_source *
lex_source_create (struct lexer *lexer, struct lex_reader *reader)
{
  struct lex_source *src = xmalloc (sizeof *src);
  *src = (struct lex_source) {
    .reader = reader,
    .segmenter = segmenter_init (reader->syntax, false),
    .lexer = lexer,
  };

  lex_source_push_endcmd__ (src);

  return src;
}

static void
lex_source_destroy (struct lex_source *src)
{
  char *file_name = src->reader->file_name;
  char *encoding = src->reader->encoding;
  if (src->reader->class->destroy != NULL)
    src->reader->class->destroy (src->reader);
  free (file_name);
  free (encoding);
  free (src->buffer);
  lex_stage_uninit (&src->pp);
  lex_stage_uninit (&src->merge);
  lex_stage_uninit (&src->lookahead);
  ll_remove (&src->ll);
  free (src);
}

/* A lex_reader that reads syntax through a u8_istream opened on a file or on
   stdin (see lex_reader_for_file()). */
struct lex_file_reader
  {
    /* Embedded generic reader.  lex_file_reader_cast() recovers the enclosing
       lex_file_reader from a pointer to this member. */
    struct lex_reader reader;

    /* Stream that lex_file_read() reads from and that lex_file_close()
       releases. */
    struct u8_istream *istream;
  };

/* Forward declaration.  The definition follows the callbacks that it refers
   to. */
static struct lex_reader_class lex_file_reader_class;

/* Creates and returns a new lex_reader that will read from file FILE_NAME (or
   from stdin if FILE_NAME is "-").  The file is expected to be encoded with
   ENCODING, which should take one of the forms accepted by
   u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
   mode of the new reader, respectively.  The reader keeps its own copies of
   FILE_NAME and (if nonnull) ENCODING, and starts counting lines at 1.

   Reports an error and returns a null pointer if FILE_NAME cannot be
   opened. */
struct lex_reader *
lex_reader_for_file (const char *file_name, const char *encoding,
                     enum segmenter_mode syntax,
                     enum lex_error_mode error)
{
  struct u8_istream *istream;
  if (strcmp (file_name, "-") == 0)
    istream = u8_istream_for_fd (encoding, STDIN_FILENO);
  else
    istream = u8_istream_for_file (encoding, file_name, O_RDONLY);

  if (!istream)
    {
      msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
      return NULL;
    }

  struct lex_file_reader *r = xmalloc (sizeof *r);
  lex_reader_init (&r->reader, &lex_file_reader_class);
  r->istream = istream;
  r->reader.file_name = xstrdup (file_name);
  r->reader.encoding = xstrdup_if_nonnull (encoding);
  r->reader.line_number = 1;
  r->reader.syntax = syntax;
  r->reader.error = error;

  return &r->reader;
}

/* Returns the lex_file_reader in which R is embedded as the 'reader'
   member. */
static struct lex_file_reader *
lex_file_reader_cast (struct lex_reader *r)
{
  struct lex_file_reader *file_reader
    = UP_CAST (r, struct lex_file_reader, reader);
  return file_reader;
}

/* Read callback for file readers.  Asks R_'s stream for up to N bytes, storing
   them in BUF, and returns the count that u8_istream_read() reports.  If the
   stream reports an error, issues an error message and returns 0.
   PROMPT_STYLE is not used. */
static size_t
lex_file_read (struct lex_reader *r_, char *buf, size_t n,
               enum prompt_style prompt_style UNUSED)
{
  struct lex_file_reader *fr = lex_file_reader_cast (r_);

  ssize_t result = u8_istream_read (fr->istream, buf, n);
  if (result >= 0)
    return result;

  msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
  return 0;
}

/* Destroy callback for file readers.  Disposes of R_'s stream and frees the
   lex_file_reader that contains R_.  A failure to close the stream is reported
   as an error message. */
static void
lex_file_close (struct lex_reader *r_)
{
  struct lex_file_reader *fr = lex_file_reader_cast (r_);
  struct u8_istream *input = fr->istream;

  /* A stream on stdin is only freed, not closed (presumably so that the
     descriptor stays usable afterward). */
  if (u8_istream_fileno (input) == STDIN_FILENO)
    u8_istream_free (input);
  else if (u8_istream_close (input) != 0)
    msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));

  free (fr);
}

/* Class for the readers that lex_reader_for_file() creates.  The members are
   initialized positionally (presumably the read callback followed by the
   destroy callback; confirm against struct lex_reader_class). */
static struct lex_reader_class lex_file_reader_class =
  {
    lex_file_read,
    lex_file_close
  };

/* A lex_reader whose input is a string held in memory. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Base reader; see lex_string_reader_cast(). */
    struct substring s;         /* Text to read; freed by lex_string_close(). */
    size_t offset;              /* Number of bytes of S already handed out. */
  };

/* Declared here so that lex_reader_for_substring_nocopy() can refer to the
   class before its initializer, which follows the callbacks that it names. */
static struct lex_reader_class lex_string_reader_class;

/* Creates and returns a new lex_reader that reads the contents of S, which
   must be encoded in ENCODING.  The reader uses SEG_MODE_AUTO as its syntax
   mode and keeps its own copy of ENCODING.

   Ownership of S passes to the new reader, which frees it with ss_dealloc()
   when the reader is closed. */
struct lex_reader *
lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
{
  struct lex_string_reader *sr = xmalloc (sizeof *sr);

  lex_reader_init (&sr->reader, &lex_string_reader_class);
  sr->s = s;
  sr->offset = 0;
  sr->reader.syntax = SEG_MODE_AUTO;
  sr->reader.encoding = xstrdup_if_nonnull (encoding);

  return &sr->reader;
}

/* Creates and returns a new lex_reader for a copy of null-terminated string S,
   which must be encoded in ENCODING.  The caller retains ownership of S. */
struct lex_reader *
lex_reader_for_string (const char *s, const char *encoding)
{
  struct substring ss;
  ss_alloc_substring (&ss, ss_cstr (s));
  return lex_reader_for_substring_nocopy (ss, encoding);
}

/* Formats FORMAT, a printf()-like format string, using the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the formatted
   text, which must be encoded in ENCODING.  The new reader owns the formatted
   text. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}

/* Returns the lex_string_reader in which R is embedded as the 'reader'
   member. */
static struct lex_string_reader *
lex_string_reader_cast (struct lex_reader *r)
{
  struct lex_string_reader *string_reader
    = UP_CAST (r, struct lex_string_reader, reader);
  return string_reader;
}

/* Read callback for string readers.  Copies up to N bytes of R_'s string that
   have not been handed out yet into BUF, advances the reader's offset past
   them, and returns the number of bytes copied.  Returns 0 once the whole
   string has been consumed (or if N is 0).  PROMPT_STYLE is not used. */
static size_t
lex_string_read (struct lex_reader *r_, char *buf, size_t n,
                 enum prompt_style prompt_style UNUSED)
{
  struct lex_string_reader *r = lex_string_reader_cast (r_);
  size_t remaining = r->s.length - r->offset;
  size_t chunk = MIN (n, remaining);

  /* Skip the copy entirely when there is nothing to transfer: memcpy()
     requires valid pointers even for a zero length, and an empty substring
     handed to lex_reader_for_substring_nocopy() may have a null string
     pointer. */
  if (chunk > 0)
    {
      memcpy (buf, r->s.string + r->offset, chunk);
      r->offset += chunk;
    }

  return chunk;
}

/* Destroy callback for string readers.  Frees the string that R_ owns and
   then the lex_string_reader that contains R_. */
static void
lex_string_close (struct lex_reader *r_)
{
  struct lex_string_reader *sr = lex_string_reader_cast (r_);

  ss_dealloc (&sr->s);
  free (sr);
}

/* Class for the readers that lex_reader_for_substring_nocopy() creates.  The
   members are initialized positionally (presumably the read callback followed
   by the destroy callback; confirm against struct lex_reader_class). */
static struct lex_reader_class lex_string_reader_class =
  {
    lex_string_read,
    lex_string_close
  };
