#!/usr/bin/python3
#
# parse.py: parse the Exidy Sorcerer BASIC file for Wizard's Castle
#
# ### license
#
# NOTE: this is not a full/proper parser for Sorcerer BASIC. It is *just*
#       enough to parse the original Wizard's Castle.
#

import os
import sys
import re

# The DATA statements' values are loaded into this list.
DATA_VALUES = []

# User-defined functions: maps the single-letter name taken from a DEFFN
# statement to the translated Python expression text.
FN = {}

# Various internal tokens. These are prefixed with '!' to ensure uniqueness.

# Internal token representing a string constant in the program
SCONST = '!STR'
ASSIGN = '!ASSIGN'

### some kind of expression. likely an assignment. transitory.
EXPR = '!EXPR'

STMT_PRINT = '!PRINT'
STMT_PNOCR = '!PNOCR'  # Don't print a trailing newline
STMT_IF = '!IF'
STMT_THEN = '!THEN'
STMT_GOTO = '!GOTO'
STMT_GOSUB = '!GOSUB'
STMT_RETURN = '!RETURN'
STMT_ON = '!ON'
STMT_FOR = '!FOR'
STMT_NEXT = '!NEXT'
STMT_INPUT = '!INPUT'
STMT_NOOP = '!NOOP'
STMT_END = '!END'
STMT_POKE = '!POKE'
STMT_READ = '!READ'
STMT_RESTORE = '!RESTORE'

# Keywords recognized at the start of a statement. Order matters only in
# that the first keyword that prefixes the text wins.
TOKENS = [
    # From Sorcerer BASIC
    #'REM',  # Handled specifically in parse()
    'CLEAR',
    'DIM',
    'PRINT',
    'POKE',
    'PEEK',
    'DEFFN',
    'FOR',
    'NEXT',
    'INPUT',
    'GOSUB',
    'RETURN',
    'GOTO',
    'IF',
    'THEN',
    'ON',
    'RESTORE',
    'READ',
    #'DATA',  # Handled specifically in parse()
    'END',
]

_RE_ASSIGN = re.compile(r'([A-Z][A-Z123]?\$?(\([0-9,AQ-]+\))?)=(.*)')


def parse_stmt(txt):
    """Classify one piece of statement text into a token tuple.

    TXT is a fragment of a program line that contains no string constants
    (those are split out by parse() beforehand).

    Returns a tuple whose first element identifies the statement:

    * a fully-handled statement such as (STMT_GOTO, 100) or (STMT_NOOP,);
    * (KEYWORD, remainder) for keywords that need further processing by
      rewrite() (e.g. 'PRINT', 'IF', 'ON', 'FOR', 'INPUT', 'THEN');
    * (ASSIGN, varname, expr_text) when the text looks like an assignment;
    * (EXPR, txt) for anything else.

    Side effect: a DEFFN statement records its translated body in FN.
    Raises ValueError if a GOTO/GOSUB target is not an integer.
    """
    for t in TOKENS:
        if txt.startswith(t):
            remainder = txt[len(t):]
            if t == 'GOTO':
                return (STMT_GOTO, int(remainder))
            if t == 'GOSUB':
                return (STMT_GOSUB, int(remainder))
            if t == 'RETURN':
                return (STMT_RETURN,)
            if t == 'NEXT':
                # We don't need the variable. Each NEXT statement properly
                # names its closest enclosing loop.
                return (STMT_NEXT,)
            if t == 'END':
                return (STMT_END,)
            if t in ['CLEAR', 'DIM']:
                return (STMT_NOOP,)
            if t == 'RESTORE':
                return (STMT_RESTORE,)
            if t == 'DEFFN':
                # Fixed layout: "DEFFN" + one-letter name + a four-character
                # parameter/equals section, then the body from index 10.
                fn = txt[5]
                expr = parse_expr_list([txt[10:]])
                FN[fn] = expr
                return (STMT_NOOP,)
            if t == 'POKE':
                return (STMT_POKE,) + tuple(remainder.split(','))
            if t == 'READ':
                return (STMT_READ, remainder.split(','))
            # Everything else is finished later, in rewrite().
            return (t, remainder)

    match = _RE_ASSIGN.match(txt)
    if match:
        varname, _, expr = match.groups()
        return (ASSIGN, varname.replace('$', '_S'), expr)

    return (EXPR, txt)


# Allow a few operators. Allow lower-case int().
_RE_COMPARATOR = re.compile('([A-Z0-9][A-Z_(),0-9+-]*)'
                            '(<|>|<=|>=|<>|=)'
                            '([A-Zi0-9][A-Znt_(),0-9+*]*)')

# A restricted version for the various ON ... GO* statements.
_RE_COMPARATOR_STRICT = re.compile('([A-Z]+)(<|=)([0-9]+)')

# Be wary of line 810.
_RE_STR_COMPARE = re.compile(r'(.*?)([A-Z_]+|LEFT_S\(O_S,2\))(=|<>)')

# Array rename, to avoid conflict with plainly-named variables.
_RE_ARRAY_RENAME = re.compile(r'\b([CT])\(')


def parse_expr_list(elist, concat=False, strict=False):
    """Parse an expression list, returning a Python expression.

    ELIST is a list whose items are either plain strings (BASIC expression
    text) or (SCONST, text) tuples for string constants. The pieces are
    translated and joined into one string of Python expression text.
    Comparisons are rewritten as calls to a helper named _CMP, which is
    not defined in this file.

    CONCAT: join the pieces with '+' and wrap non-constant pieces in str()
    (used for PRINT statements).
    STRICT: use the restricted comparison pattern (used for ON ... GO*).

    NOTE: this destroys the parameter, ELIST.
    """
    expr = []

    # Line 730 is annoying. Force some proper behavior: skip the + operators,
    # and don't wrap the subexpressions in str().
    stringize = 'MID$(' not in elist

    while elist:
        e = elist.pop(0)
        if concat and expr:
            if stringize or len(expr) == 1:
                expr.append('+')  # string concat
        if isinstance(e, tuple) and e[0] == SCONST:
            expr.append(repr(e[1]))
        elif e.startswith('-('):
            # This is line 680. The regex doesn't work well on it. Force it.
            expr.append('-_CMP(C(Q,1), "=", X)'
                        '*_CMP(C(Q,2), "=", Y)'
                        '*_CMP(C(Q,3), "=", Z)')
        elif e.startswith('VF'):
            # This is line 2680. Again, poor regex behavior. Force it.
            expr.append('VF+_CMP(PEEK(FND(Z)), "=", 25)')
        else:
            # There might be a semicolon. Fix it.
            if ';' in e:
                lead, e = e.split(';')
                # The recursive call returns a string, so extend() adds it
                # character by character. That is kept on purpose: the
                # len(expr) test above depends on the element count.
                expr.extend(parse_expr_list([lead]))
                expr.append('+')
                # FALLTHRU to handle the trailer (E).

            # Use special name for string variables. Avoid floating point.
            # Convert AND, OR, and INT to their direct Python equivalents.
            # Also: rename any C() or T() array usage, as they conflict with
            # the plainly-named vars.
            e = e.replace('$', '_S').replace('1E3', '1000') \
                 .replace('AND', ' and ').replace('OR', ' or ') \
                 .replace('INT(', 'int(')

            # Distinguish arrays from normal variables.
            e = _RE_ARRAY_RENAME.sub(r'\1_A(', e)

            # Perform this AFTER the AND/OR substitution, in order to
            # introduce spaces to break apart operator/varnames.
            if strict:
                e = _RE_COMPARATOR_STRICT.sub(r'_CMP(\1, "\2", \3)', e)
            else:
                e = _RE_COMPARATOR.sub(r'_CMP(\1, "\2", \3)', e)

            # Handle comparisons where the RHS is a string constant.
            match = _RE_STR_COMPARE.search(e)
            if match:
                expr.append('%s_CMP(%s, "%s", ' % match.groups())
                # The right-hand side must be the next item in the list.
                assert elist[0][0] == SCONST
                assert not concat
                expr.append(repr(elist[0][1]))
                expr.append(')')
                elist.pop(0)
                continue

            # If we're concatenating, then the subexpression may not be a
            # string. Ensure it's in string format.
            if concat and stringize:
                expr.append('str(%s)' % e)
            else:
                expr.append(e)

    return ''.join(expr)


_RE_DIGITS = re.compile(' *[0-9]+')
_RE_ON_GO = re.compile('(.*)(GOTO|GOSUB)(.*)')


def split_parts(parts):
    """Expand IF, ON and FOR items into their final statement tuples.

    PARTS is the list of token tuples produced by parse_stmt(), interleaved
    with (SCONST, text) tuples. The caller's list is not modified.

    Returns a new list in which:

    * 'IF' becomes (STMT_IF, condition), (STMT_THEN,) followed by either
      (STMT_GOTO, line) or the parsed THEN clause;
    * 'ON' becomes (STMT_ON, condition, 'GOTO'|'GOSUB', [line, ...]);
    * 'FOR' becomes (STMT_FOR, variable, start_expr, end_expr);
    * every other item is passed through unchanged.
    """
    result = []

    # Avoid modifying caller-provided list
    parts = parts[:]

    while parts:
        item = parts.pop(0)
        if item[0] == 'IF':
            cond = []
            # The condition may be interrupted by string constants. Gather
            # text/constant pairs until the piece holding THEN shows up.
            while item[0] != 'THEN' and 'THEN' not in item[1]:
                cond.append(item[1])
                item = parts.pop(0)
                assert item[0] == SCONST
                cond.append(item)
                item = parts.pop(0)
            if item[0] == 'THEN':
                result.append((STMT_IF, parse_expr_list(cond)))
                result.append((STMT_THEN,))
                then_clause = item[1]
            else:
                trailing, then_clause = item[1].split('THEN', 1)
                cond.append(trailing)
                result.append((STMT_IF, parse_expr_list(cond)))
                result.append((STMT_THEN,))
            if _RE_DIGITS.match(then_clause):
                # "THEN <line>" is an implicit GOTO.
                result.append((STMT_GOTO, int(then_clause)))
            else:
                # parse and push into PARTS for further expansion
                parts.insert(0, parse_stmt(then_clause))
        elif item[0] == 'ON':
            if 'GO' in item[1]:
                cond, op, lines = _RE_ON_GO.match(item[1]).groups()
                cond = [cond]
            else:
                # We already know this doesn't need to be generalized. There
                # is a single SCONST, and the rest of the condition.
                cond = [item[1]]
                item = parts.pop(0)
                assert item[0] == SCONST
                cond.append(item)
                item = parts.pop(0)
                trailing, op, lines = _RE_ON_GO.match(item[1]).groups()
                cond.append(trailing)
            result.append((STMT_ON,
                           parse_expr_list(cond, strict=True),
                           op,
                           [int(n) for n in lines.split(',')]))
        elif item[0] == 'FOR':
            text = item[1]
            idx1 = text.index('=')
            idx2 = text.index('TO')
            result.append((STMT_FOR,
                           text[:idx1],
                           parse_expr_list([text[idx1 + 1:idx2]]),
                           parse_expr_list([text[idx2 + 2:]])))
        else:
            result.append(item)

    return result


def build_statement(stmt, expr):
    """Finish a statement tuple by appending its collected expression.

    STMT is the partial tuple started by collapse_expressions(); EXPR is
    the list of pieces gathered for it.

    * ASSIGN: the pieces are translated via parse_expr_list().
    * STMT_INPUT: the prompt text is appended ('' when there is none).
    * anything else (STMT_PRINT): the raw piece list is appended.
    """
    if stmt[0] == ASSIGN:
        return stmt + (parse_expr_list(expr),)
    if stmt[0] == STMT_INPUT:
        # Either: [(SCONST, ""), ";O$"] or ["O$"]
        if len(expr) == 2:
            expr = expr[0][1]
        else:
            expr = ''
    return stmt + (expr,)


def collapse_expressions(parts):
    """Fold the pieces that follow PRINT/ASSIGN/INPUT into one statement.

    After a 'PRINT', ASSIGN or 'INPUT' item, every following SCONST or EXPR
    item belongs to that statement; empty pieces are dropped. Any other
    token ends the statement. Items outside such a run pass through.

    NOTE: this consumes the parameter, PARTS.

    Returns the new list of statement tuples.
    """
    result = []

    stmt = None
    expr = []
    while parts:
        item = parts.pop(0)
        token = item[0]
        if stmt:
            # Compare tokens by value. Identity ("is") only works while
            # every token happens to be the very same string object.
            if token == SCONST:
                if item[1]:
                    expr.append(item)
                continue
            elif token == EXPR:
                if item[1]:  # and item[1] != 'CHR$(12)':
                    expr.append(item[1])
                continue
            result.append(build_statement(stmt, expr))
            stmt = None
            # FALLTHRU to deal with ITEM

        if token == 'PRINT':
            stmt = (STMT_PRINT,)
            # Skip any empty values, and the "clear screen" character.
            if item[1] and item[1] != 'CHR$(12)':
                expr = [item[1]]
            else:
                expr = []
        elif token == ASSIGN:
            # Rename the arrays, to avoid conflicts with the plain-named vars.
            stmt = (ASSIGN,
                    item[1].replace('T(', 'T_A(').replace('C(', 'C_A('))
            if item[2]:
                expr = [item[2]]
            else:
                expr = []
        elif token == 'INPUT':
            # Hard code: we know O$ is used for all INPUT statements
            stmt = (STMT_INPUT, 'O_S')
            if item[1]:
                expr = [item[1]]
            else:
                expr = []
        else:
            result.append(item)

    if stmt:
        result.append(build_statement(stmt, expr))

    return result


def examine_print(parts):
    """Turn each (STMT_PRINT, pieces) item into its final form.

    The piece list is translated into a single concatenation expression.
    A trailing ';' selects STMT_PNOCR (no trailing newline) instead of
    STMT_PRINT. A PRINT without pieces prints the empty string.

    PARTS is updated in place and also returned.
    """
    for i, part in enumerate(parts):
        if part[0] != STMT_PRINT:
            continue
        pieces = part[1]
        if pieces:
            stmt = STMT_PRINT
            last = pieces[-1]
            if last == ';':
                stmt = STMT_PNOCR
                del pieces[-1]
            elif isinstance(last, str) and last.endswith(';'):
                stmt = STMT_PNOCR
            # Strip the ';' separators from the text pieces. This must be a
            # real conditional expression: with "and/or", a piece that is
            # only ';' strips to '' (falsy) and would be passed on unstripped,
            # which made parse_expr_list() emit a doubled '+'.
            cleaned = [s.strip(';') if isinstance(s, str) else s
                       for s in pieces]
            parts[i] = (stmt, parse_expr_list(cleaned, concat=True))
        else:
            # eval() will produce an empty string to print
            parts[i] = (STMT_PRINT, '""')
    return parts


def rewrite(parts):
    """Run the three rewriting passes over PARTS and return the result."""
    parts = split_parts(parts)
    parts = collapse_expressions(parts)
    parts = examine_print(parts)
    return parts


def parse(line):
    "Return a list of (TOKEN, INFO) pairs, representing the line of code."

    # DATA statements are parsed with a bit of care. We don't want the quote
    # parsing to interfere. The commas in the lines are enough for us.
    if line.startswith('DATA'):
        DATA_VALUES.extend(s.strip('"') for s in line[4:].split(','))
        return [(STMT_NOOP,)]

    # Ignore all REM statements
    if line.startswith('REM'):
        return [(STMT_NOOP,)]

    # First off, we need to hide the text strings. This will allow easier
    # splitting/parsing of the remaining text.
    parts = line.split('"')
    # PARTS is now: [UNQUOTED QUOTED]* UNQUOTED
    for i in range(1, len(parts), 2):
        parts[i] = (SCONST, parts[i])

    # For the remaining text, split each statement around ':'
    parts2 = []
    for p in parts:
        if isinstance(p, str):
            parts2.extend(parse_stmt(s) for s in p.split(':'))
        else:
            parts2.append(p)

    # Rewrite the pieces of this statement list into more reasonable bits.
    # In particular, this collapses various parts into a singular statement.
    return rewrite(parts2)


def read_raw_program(fp):
    """Read a BASIC listing from FP and parse every line.

    FP is a text file object. Each non-blank line must have the form
    "<line number> <statement text>".

    Returns a dict mapping the integer line number to the statement list
    produced by parse().
    """
    lines = [l.strip() for l in fp.readlines()]
    result = {}
    for line in lines:
        if not line:
            continue
        lnum, text = line.split(' ', 1)
        result[int(lnum)] = parse(text)
    return result


if __name__ == '__main__':
    PROG = read_raw_program(sys.stdin)
    import pprint
    pprint.pprint(PROG)
    #pprint.pprint(DATA_VALUES)
    #pprint.pprint(FN)

    # Anything unparsed?
    for stmts in PROG.values():
        for s in stmts:
            if not s[0].startswith('!'):
                print(s)