#!/usr/bin/python3
#
# parse.py: parse the Exidy Sorcerer BASIC file for Wizard's Castle
#
# ### license
#
# NOTE: this is not a full/proper parser for Sorcerer BASIC. It is *just*
# enough to parse the original Wizard's Castle.
#

import os
import sys
import re

# The DATA statements' values are loaded into this list. parse() appends one
# entry per comma-separated value, with surrounding double quotes stripped.
DATA_VALUES = [ ]

# User-defined functions. parse_stmt() fills this in when it sees DEFFN: the
# key is the single character following "DEFFN", and the value is the
# Python expression text produced by parse_expr_list().
FN = { }

# Various internal tokens. These are prefixed with '!' to ensure uniqueness.
# The script's main block reports any statement whose first element does NOT
# start with '!' as unparsed.

# Internal token representing a string constant in the program. parse()
# wraps each double-quoted run of text as (SCONST, text).
SCONST = '!STR'

# Assignment statement. parse_stmt() returns (ASSIGN, varname, expr-text).
ASSIGN = '!ASSIGN'

# Fallback for text that neither starts with a known keyword nor looks like
# an assignment. parse_stmt() returns (EXPR, text). These are transitory:
# collapse_expressions() folds them into the expression list of the
# PRINT / assignment / INPUT statement that precedes them.
EXPR = '!EXPR'

# Statement kinds, used as the first element of the statement tuples.
STMT_PRINT = '!PRINT'
STMT_PNOCR = '!PNOCR'  # Don't print a trailing newline
STMT_IF = '!IF'
STMT_THEN = '!THEN'
STMT_GOTO = '!GOTO'
STMT_GOSUB = '!GOSUB'
STMT_RETURN = '!RETURN'
STMT_ON = '!ON'
STMT_FOR = '!FOR'
STMT_NEXT = '!NEXT'
STMT_INPUT = '!INPUT'
# Emitted for statements that need no representation of their own
# (CLEAR, DIM, DEFFN, DATA, REM).
STMT_NOOP = '!NOOP'
STMT_END = '!END'
STMT_POKE = '!POKE'
STMT_READ = '!READ'
STMT_RESTORE = '!RESTORE'

# Keywords recognized at the start of a statement. parse_stmt() tests each
# one with str.startswith(), in the order listed here.
TOKENS = [
  # From Sorcerer BASIC
  #'REM',  # Handled specifically in parse()
  'CLEAR',
  'DIM',
  'PRINT',
  'POKE',
  'PEEK',
  'DEFFN',
  'FOR',
  'NEXT',
  'INPUT',
  'GOSUB',
  'RETURN',
  'GOTO',
  'IF',
  'THEN',
  'ON',
  'RESTORE',
  'READ',
  #'DATA',  # Handled specifically in parse()
  'END',
  ]


# An assignment: a one- or two-character variable name, an optional '$', an
# optional parenthesized subscript, then '=' and the right-hand side.
_RE_ASSIGN = re.compile('([A-Z][A-Z123]?\\$?(\\([0-9,AQ-]+\\))?)=(.*)')

def parse_stmt(txt):
  """Classify one statement fragment and return it as a tuple.

  TXT is a piece of a BASIC line with the quoted strings already removed.
  The first entry of TOKENS that TXT starts with decides the result:

    GOTO / GOSUB    -> (STMT_GOTO | STMT_GOSUB, line-number as int)
    RETURN/NEXT/END/RESTORE -> a one-element tuple with the statement kind
    CLEAR / DIM     -> (STMT_NOOP,)
    DEFFN           -> (STMT_NOOP,), after recording the function in FN
    POKE            -> (STMT_POKE, arg, arg, ...)
    READ            -> (STMT_READ, [name, ...])
    anything else   -> (keyword, remaining-text), for later rewriting

  When no keyword matches, an assignment yields (ASSIGN, varname, expr-text)
  with '$' in the variable name replaced by '_S'. Everything else comes back
  as (EXPR, TXT).
  """
  keyword = next((t for t in TOKENS if txt.startswith(t)), None)

  if keyword is None:
    found = _RE_ASSIGN.match(txt)
    if found is None:
      return (EXPR, txt)
    # Group 2 (the optional subscript) is already part of group 1.
    target = found.group(1)
    value = found.group(3)
    return (ASSIGN, target.replace('$', '_S'), value)

  remainder = txt[len(keyword):]

  jumps = {'GOTO': STMT_GOTO, 'GOSUB': STMT_GOSUB}
  if keyword in jumps:
    return (jumps[keyword], int(remainder))

  # Statements whose arguments (if any) are not needed. For NEXT, the
  # variable is dropped: each NEXT statement properly names its closest
  # enclosing loop.
  bare = {
    'RETURN': STMT_RETURN,
    'NEXT': STMT_NEXT,
    'END': STMT_END,
    'CLEAR': STMT_NOOP,
    'DIM': STMT_NOOP,
    'RESTORE': STMT_RESTORE,
  }
  if keyword in bare:
    return (bare[keyword],)

  if keyword == 'DEFFN':
    # Fixed offsets: character 5 is the function's name, and the body starts
    # at character 10 (presumably skipping a "(v)=" parameter clause).
    name = txt[5]
    FN[name] = parse_expr_list([txt[10:]])
    return (STMT_NOOP,)

  if keyword == 'POKE':
    return (STMT_POKE,) + tuple(remainder.split(','))

  if keyword == 'READ':
    return (STMT_READ, remainder.split(','))

  # PRINT, PEEK, FOR, INPUT, IF, THEN, ON: handed on as-is.
  return (keyword, remainder)


# A comparison between two simple operands. A few arithmetic operators are
# tolerated inside the operands, and so is the lower-case int() that the
# INT( rewrite produces.
_RE_COMPARATOR = re.compile('([A-Z0-9][A-Z_(),0-9+-]*)'
                            '(<|>|<=|>=|<>|=)'
                            '([A-Zi0-9][A-Znt_(),0-9+*]*)')

# A restricted version for the various ON ... GO* statements.
_RE_COMPARATOR_STRICT = re.compile('([A-Z]+)(<|=)([0-9]+)')

# A comparison whose right-hand side is missing because it was a quoted
# string that has been split off. Be wary of line 810.
_RE_STR_COMPARE = re.compile('(.*?)([A-Z_]+|LEFT_S\\(O_S,2\\))(=|<>)')

# Array rename, to avoid conflict with plainly-named variables.
_RE_ARRAY_RENAME = re.compile('\\b([CT])\\(')

def parse_expr_list(elist, concat=False, strict=False):
  """Translate a list of expression fragments into Python expression text.

  ELIST holds plain strings (BASIC source text) and (SCONST, text) tuples
  (string constants). The fragments are translated one by one and the
  results are joined into a single string, which is returned.

  With CONCAT set, consecutive fragments are joined with '+' and the
  non-constant ones are wrapped in str(). With STRICT set, the restricted
  comparison pattern is used instead of the general one.

  NOTE: this destroys the parameter, ELIST; it is empty on return.
  """
  out = [ ]

  # Line 730 is annoying. When the list holds a bare 'MID$(' fragment, only
  # the first '+' is emitted and nothing is wrapped in str().
  wrap_in_str = 'MID$(' not in elist

  comparator = _RE_COMPARATOR_STRICT if strict else _RE_COMPARATOR

  while elist:
    term = elist.pop(0)

    if concat and out and (wrap_in_str or len(out) == 1):
      out.append('+')  # string concat

    if isinstance(term, tuple) and term[0] == SCONST:
      out.append(repr(term[1]))
      continue

    if term.startswith('-('):
      # This is line 680. The regex doesn't work well on it. Force it.
      out.append('-_CMP(C(Q,1), "=", X)*_CMP(C(Q,2), "=", Y)*_CMP(C(Q,3), "=", Z)')
      continue

    if term.startswith('VF'):
      # This is line 2680. Again, poor regex behavior. Force it.
      out.append('VF+_CMP(PEEK(FND(Z)), "=", 25)')
      continue

    if ';' in term:
      # Translate the text before the semicolon on its own, then carry on
      # below with the text after it.
      lead, term = term.split(';')
      out.extend(parse_expr_list([lead]))
      out.append('+')

    # String variables get an _S suffix, the lone float literal becomes an
    # integer, and AND / OR / INT( become their Python spellings. The spaces
    # around 'and' / 'or' also separate them from adjacent names.
    for basic, python in (('$', '_S'),
                          ('1E3', '1000'),
                          ('AND', ' and '),
                          ('OR', ' or '),
                          ('INT(', 'int(')):
      term = term.replace(basic, python)

    # C() and T() are arrays; rename them so they do not clash with the
    # plain variables C and T.
    term = _RE_ARRAY_RENAME.sub('\\1_A(', term)

    # Must follow the AND/OR rewrite, which introduced the separating spaces.
    term = comparator.sub('_CMP(\\1, "\\2", \\3)', term)

    # A dangling comparison means the right-hand side is the string constant
    # that comes next in the list; consume it here.
    found = _RE_STR_COMPARE.search(term)
    if found:
      out.append('%s_CMP(%s, "%s", ' % found.groups())
      assert elist[0][0] == SCONST
      assert not concat
      out.append(repr(elist[0][1]))
      out.append(')')
      elist.pop(0)
      continue

    # When concatenating, the fragment may not be a string; force it.
    out.append('str(%s)' % term if concat and wrap_in_str else term)

  return ''.join(out)


# A THEN clause that is just a line number (an implicit GOTO).
_RE_DIGITS = re.compile(' *[0-9]+')
# Splits "ON <expr> GOTO|GOSUB <lines>" text around the GOTO/GOSUB keyword.
_RE_ON_GO = re.compile('(.*)(GOTO|GOSUB)(.*)')

def split_parts(parts):
  """Expand the IF, ON and FOR items of PARTS into structured statements.

  PARTS is the list of tuples produced by parse_stmt() interleaved with
  (SCONST, text) tuples. The result is a new list in which:

    * each IF becomes (STMT_IF, condition), (STMT_THEN,), followed either by
      (STMT_GOTO, line) or by the parsed THEN clause, which is itself run
      through this expansion;
    * each ON becomes (STMT_ON, selector, 'GOTO'|'GOSUB', [line, ...]);
    * each FOR becomes (STMT_FOR, variable, start-expr, limit-expr);
    * every other item is copied unchanged.

  The caller's list is left untouched.
  """
  queue = parts[:]
  expanded = [ ]

  while queue:
    head = queue.pop(0)
    keyword = head[0]

    if keyword == 'IF':
      # The condition may be interrupted by string constants. Keep gathering
      # text/constant pairs until the piece carrying THEN shows up.
      cond = [ ]
      while not (head[0] == 'THEN' or 'THEN' in head[1]):
        cond.append(head[1])
        sconst = queue.pop(0)
        assert sconst[0] == SCONST
        cond.append(sconst)
        head = queue.pop(0)

      if head[0] == 'THEN':
        then_clause = head[1]
      else:
        trailing, then_clause = head[1].split('THEN', 1)
        cond.append(trailing)
      expanded.append((STMT_IF, parse_expr_list(cond)))
      expanded.append((STMT_THEN,))

      if _RE_DIGITS.match(then_clause):
        expanded.append((STMT_GOTO, int(then_clause)))
      else:
        # Parse the clause and put it at the head of the queue so that it
        # gets expanded on the next iteration.
        queue.insert(0, parse_stmt(then_clause))

    elif keyword == 'ON':
      text = head[1]
      if 'GO' in text:
        selector, op, lines = _RE_ON_GO.match(text).groups()
        cond = [selector]
      else:
        # We already know this doesn't need to be generalized. There is a
        # single SCONST, and the rest of the condition.
        cond = [text]
        sconst = queue.pop(0)
        assert sconst[0] == SCONST
        cond.append(sconst)
        tail, op, lines = _RE_ON_GO.match(queue.pop(0)[1]).groups()
        cond.append(tail)
      selector_expr = parse_expr_list(cond, strict=True)
      targets = [int(n) for n in lines.split(',')]
      expanded.append((STMT_ON, selector_expr, op, targets))

    elif keyword == 'FOR':
      spec = head[1]
      eq_at = spec.index('=')
      to_at = spec.index('TO')
      start = parse_expr_list([spec[eq_at+1:to_at]])
      limit = parse_expr_list([spec[to_at+2:]])
      expanded.append((STMT_FOR, spec[:eq_at], start, limit))

    else:
      expanded.append(head)

  return expanded


def build_statement(stmt, expr):
  """Finish a pending statement by attaching its collected expression.

  STMT is the partial statement tuple; EXPR is the list of fragments gathered
  for it. The return value is STMT with one more element appended:

    * ASSIGN: the Python expression text built from EXPR;
    * STMT_INPUT: the prompt string, or '' when there is none;
    * anything else: the EXPR list itself.
  """
  kind = stmt[0]

  if kind == ASSIGN:
    return stmt + (parse_expr_list(expr),)

  if kind != STMT_INPUT:
    return stmt + (expr,)

  # Either: [(SCONST, ""), ";O$"] or ["O$"]. Only the two-element form
  # carries a prompt, as the text of its leading string constant.
  prompt = expr[0][1] if len(expr) == 2 else ''
  return stmt + (prompt,)


def collapse_expressions(parts):
  """Merge each PRINT / assignment / INPUT with the fragments that follow it.

  PARTS is a list of statement tuples. A PRINT, ASSIGN or INPUT item opens a
  pending statement; every (SCONST, text) and (EXPR, text) item after it is
  collected into that statement's expression list (empty texts are dropped).
  The first item of any other kind closes the pending statement, which is
  completed by build_statement() and added to the result.

  Items that do not belong to a pending statement are copied through
  unchanged. Returns the new list; PARTS itself is not modified.
  """
  result = [ ]
  stmt = None
  expr = [ ]

  for item in parts:
    token = item[0]

    if stmt:
      # Tokens are compared by value. Identity ('is') would only work while
      # every producer happens to hand over the very same string objects.
      if token == SCONST:
        if item[1]:
          expr.append(item)
        continue
      if token == EXPR:
        if item[1]:
          expr.append(item[1])
        continue
      result.append(build_statement(stmt, expr))
      stmt = None
      # FALLTHRU to deal with ITEM

    if token == 'PRINT':
      stmt = (STMT_PRINT,)
      # Skip any empty values, and the "clear screen" character.
      expr = [item[1]] if item[1] and item[1] != 'CHR$(12)' else [ ]
    elif token == ASSIGN:
      # Rename the arrays, to avoid conflicts with the plain-named vars.
      stmt = (ASSIGN, item[1].replace('T(', 'T_A(').replace('C(', 'C_A('))
      expr = [item[2]] if item[2] else [ ]
    elif token == 'INPUT':
      # Hard code: we know O$ is used for all INPUT statements
      stmt = (STMT_INPUT, 'O_S')
      expr = [item[1]] if item[1] else [ ]
    else:
      result.append(item)

  # A statement still pending at the end of the line is completed here.
  if stmt:
    result.append(build_statement(stmt, expr))
  return result


def examine_print(parts):
  """Turn each collected PRINT statement into its final form, in place.

  Every (STMT_PRINT, [fragment, ...]) entry of PARTS is replaced by
  (STMT_PRINT, expr) or, when the fragment list ends in a semicolon,
  (STMT_PNOCR, expr). EXPR is Python expression text that concatenates the
  fragments as strings. A PRINT with nothing to print gets the expression
  '""'. All other entries are left alone.

  PARTS is modified in place and also returned.
  """
  for i, part in enumerate(parts):
    if part[0] != STMT_PRINT:
      continue

    items = part[1]
    if not items:
      # eval() will produce an empty string to print
      parts[i] = (STMT_PRINT, '""')
      continue

    stmt = STMT_PRINT
    last = items[-1]
    if last == ';':
      # A lone trailing semicolon only suppresses the newline.
      stmt = STMT_PNOCR
      del items[-1]
    elif isinstance(last, str) and last.endswith(';'):
      stmt = STMT_PNOCR

    # Semicolons at either end of a text fragment are separators only. A
    # real conditional expression is needed here: with "cond and a or b", a
    # fragment consisting solely of ';' strips to '' (falsy) and would be
    # passed on unstripped, yielding a stray unary '+' in the output.
    cleaned = [s.strip(';') if isinstance(s, str) else s for s in items]

    # If nothing is left to print, fall back to the empty-string expression
    # rather than emitting empty expression text.
    parts[i] = (stmt, parse_expr_list(cleaned, concat=True) or '""')

  return parts


def rewrite(parts):
  """Run the three rewriting passes over PARTS and return the result.

  In order: split_parts() expands IF / ON / FOR, collapse_expressions()
  attaches expression fragments to their statements, and examine_print()
  finalizes the PRINT statements.
  """
  return examine_print(collapse_expressions(split_parts(parts)))


def parse(line):
  """Return a list of statement tuples representing one line of code.

  LINE is the text of a BASIC line without its line number. DATA lines add
  their values to DATA_VALUES and, like REM lines, yield a single
  (STMT_NOOP,). Any other line is split into statements and string
  constants, and the pieces are handed to rewrite().
  """
  # DATA statements are parsed with a bit of care. We don't want the quote
  # parsing to interfere. The commas in the lines are enough for us.
  if line.startswith('DATA'):
    for value in line[4:].split(','):
      DATA_VALUES.append(value.strip('"'))
    return [(STMT_NOOP,)]

  # Ignore all REM statements
  if line.startswith('REM'):
    return [(STMT_NOOP,)]

  # Splitting on the double quote yields [UNQUOTED QUOTED]* UNQUOTED, so the
  # odd-numbered chunks are the string constants. Hiding them this way keeps
  # their contents away from the statement splitting below.
  pieces = [ ]
  for idx, chunk in enumerate(line.split('"')):
    if idx % 2:
      pieces.append((SCONST, chunk))
    else:
      # Unquoted text: one statement per ':'-separated section.
      pieces.extend(parse_stmt(s) for s in chunk.split(':'))

  # Rewrite the pieces of this statement list into more reasonable bits.
  # In particular, this collapses various parts into a singular statement.
  return rewrite(pieces)


def read_raw_program(fp):
  """Read a BASIC listing from FP and return {line-number: statements}.

  Each non-blank line must be a line number, a single space, and the
  program text. The number becomes the (integer) key and the value is
  whatever parse() returns for the text. Blank lines are skipped.
  """
  program = { }
  for raw in fp.readlines():
    stripped = raw.strip()
    if stripped:
      number, source = stripped.split(' ', 1)
      program[int(number)] = parse(source)
  return program


if __name__ == '__main__':
  import pprint

  # Parse the listing supplied on stdin and dump the parsed program.
  PROG = read_raw_program(sys.stdin)
  pprint.pprint(PROG)
  # For debugging, DATA_VALUES and FN can be dumped the same way:
  #pprint.pprint(DATA_VALUES)
  #pprint.pprint(FN)

  # Anything unparsed? Every fully-handled statement starts with one of the
  # internal '!' tokens, so print whatever does not.
  for stmts in PROG.values():
    for s in stmts:
      if s[0].startswith('!'):
        continue
      print(s)