#!/usr/bin/env python3.11
# author: Roman Necas (xnecasr00)
# date: 26.2.2025
"""
parse.py - A compact parser and static analyzer for SOL25.

This script reads SOL25 source code from standard input and performs the following:
  • Lexical analysis using Lark (version 1.2.2)
  • Syntactic parsing according to the SOL25 grammar to produce a parse tree
  • Transformation of the parse tree into an Abstract Syntax Tree (AST)
  • Static semantic analysis (e.g., checking for reserved identifiers, undefined
    classes/variables, arity mismatches, duplicate definitions, etc.)
  • Generation of an XML representation of the AST for further processing

Exit codes:
  10 - Wrong or extra command-line parameters
  11 - Error opening input
  21 - Lexical error (e.g., illegal escape sequences, newline in string literal)
  22 - Syntactic error or misuse of reserved identifiers
  31 - Missing Main class or parameterless run method
  32 - Use of undefined variable/class/method
  33 - Arity mismatch in method block literal
  34 - Assignment to a formal parameter
  35 - Duplicate formal parameters or class redefinition
  99 - Internal error
"""

# `re` and `xml.dom.minidom` are no longer used below; the imports are kept so
# the module's import surface stays unchanged.
import re
import sys
import xml.dom.minidom
import xml.etree.ElementTree as ET

from lark import Lark, Transformer, UnexpectedCharacters, UnexpectedToken

# Identifiers that may not be used as variable, parameter or selector names.
RESERVED_IDENTIFIERS = frozenset({"self", "super", "nil", "true", "false", "class"})

# Selectors (without a trailing colon) accepted on Integer literals.
# NOTE(review): this list is hard-coded; confirm it against the language
# definition (e.g. whether methods inherited from Object should be accepted).
INTEGER_INSTANCE_METHODS = frozenset({
    "equalTo", "greaterThan", "plus", "minus", "multiplyBy", "divBy",
    "asString", "asInteger", "timesRepeat",
})

# Selectors (without a trailing colon) accepted on String instances.
STRING_INSTANCE_METHODS = frozenset({
    "print", "equalTo", "asString", "asInteger", "concatenateWith",
    "startsWith", "endsBefore",
})


# --- Helper: Error Handling ---
def error_exit(code, msg=""):
    """
    Print an error message (if provided) to stderr and exit.

    Args:
        code: Process exit code passed to ``sys.exit``.
        msg: Optional message; written to stderr followed by a newline.

    Raises:
        SystemExit: always.
    """
    if msg:
        sys.stderr.write(msg + "\n")
    # Flush explicitly so the message is visible before the process ends.
    sys.stderr.flush()
    sys.exit(code)


# --- AST Node Definitions ---
class Program:
    """Root AST node: a list of class definitions plus an optional description."""

    def __init__(self, classes, description=None):
        self.classes = classes          # list of ClassDef
        self.description = description  # optional text, not read elsewhere in this module


class ClassDef:
    """A class definition: its name, its parent's name and its methods."""

    def __init__(self, name, parent, methods):
        self.name = name        # class name (str)
        self.parent = parent    # parent class name (str)
        self.methods = methods  # list of MethodDef


class MethodDef:
    """A method definition: a selector and the block that forms its body."""

    def __init__(self, selector, block):
        self.selector = selector  # selector string, e.g. "run" or "at:put:"
        self.block = block        # Block instance
        # Every ':' in the selector stands for one expected block parameter.
        self.expected_params = selector.count(':')


class Block:
    """A block literal: formal parameter names and a list of assignments."""

    def __init__(self, parameters, assignments):
        self.parameters = parameters    # list of parameter names (str)
        self.assignments = assignments  # list of Assignment


class Assignment:
    """A single ``var := expr.`` statement."""

    def __init__(self, var, expr):
        self.var = var    # assigned variable name (str)
        self.expr = expr  # Expr instance


# --- Expression Hierarchy ---
class Expr:
    """Base class for all expression nodes."""


class LiteralExpr(Expr):
    """A literal; ``lit_class`` is e.g. "Integer", "String", "Nil" or "class"."""

    def __init__(self, lit_class, value):
        self.lit_class = lit_class
        self.value = value  # literal value as a string


class VarExpr(Expr):
    """A reference to a variable by name."""

    def __init__(self, name):
        self.name = name


class BlockExpr(Expr):
    """A block literal used as an expression."""

    def __init__(self, block):
        self.block = block


class MessageSendExpr(Expr):
    """
    A message send.

    ``sends`` is a list of ``(selector, argument)`` tuples where the argument
    is ``None`` for selectors without a colon.  After flattening, a compound
    send has ``compound`` set, an empty ``sends`` list, and its selector and
    arguments in ``compound_selector`` / ``compound_args``.
    """

    def __init__(self, receiver, sends):
        self.receiver = receiver
        self.sends = sends
        self.compound = False
        self.compound_selector = ""
        self.compound_args = None


# --- Lark Grammar for SOL25 ---
# SEND_SELECTOR carries a (?!=) lookahead so that "x:=1" lexes as IDENT ":="
# instead of the selector "x:" followed by a stray "=".
SOL25_GRAMMAR = r"""
?start: program

program: (class_def)*

class_def: "class" CLASS_ID ":" CLASS_ID "{" (method_def)* "}"

method_def: selector block_literal

selector: IDENT | send_selector

send_selector: SEND_SELECTOR+

block_literal: "[" block_params? "|" statement_list "]"

block_params: BLOCK_PARAM+

statement_list: (assignment_statement)*

assignment_statement: IDENT ":=" expression "."

?expression: primary (send_part)*

?primary: literal
        | IDENT -> var_expr
        | CLASS_ID -> class_literal
        | block_literal -> block_expr
        | "(" expression ")"

send_part: SEND_SELECTOR expression
         | IDENT

literal: INT -> int_literal
       | STRING -> string_literal

CLASS_ID: /[A-Z][A-Za-z0-9]*/
IDENT: /[a-z_][A-Za-z0-9_]*/
SEND_SELECTOR: /[a-z_][A-Za-z0-9_]*:(?!=)/
INT: /[+-]?[0-9]+/
STRING: /'(?:[^\r'\\\n]|\\(?!n)(?:'|\\)|\\n|\n)*'/
COMMENT: /"((?:[^"\\]|\\.)*)"/

%import common.WS
%ignore WS
%ignore COMMENT

BLOCK_PARAM: /:[a-z_][A-Za-z0-9_]*/
"""


# --- Transformer: Converting Parse Tree to AST ---
class SOL25Transformer(Transformer):
    """Turn the Lark parse tree into the AST classes defined above."""

    def start(self, children):
        """Return the Program node produced for the start symbol."""
        return children[0]

    def program(self, children):
        """Wrap the class definitions in a Program node."""
        return Program(children)

    def class_def(self, children):
        """Build a ClassDef from (name, parent, *methods)."""
        return ClassDef(str(children[0]), str(children[1]), children[2:])

    def method_def(self, children):
        """Build a MethodDef; exit with code 33 if the block arity differs from the selector."""
        method = MethodDef(children[0], children[1])
        if method.expected_params != len(method.block.parameters):
            error_exit(33, f"Arity mismatch in method {method.selector}: expected {method.expected_params} parameters, got {len(method.block.parameters)}")
        return method

    def selector(self, children):
        """Return the selector as a string; exit with code 22 if it is a reserved identifier."""
        selector = "".join(str(child) for child in children)
        if selector in RESERVED_IDENTIFIERS:
            error_exit(22, f"Reserved identifier used as method selector: {selector}")
        return selector

    def send_selector(self, children):
        """Concatenate the parts of a multi-part selector into one string."""
        return "".join(str(child) for child in children)

    def block_literal(self, children):
        """Build a Block; the parameter list is absent when the block declares none."""
        params = children[0] if len(children) == 2 else []
        return Block(params, children[-1])

    def block_params(self, children):
        """
        Return the parameter names without their leading ':'.

        Exits with code 22 for a reserved name and 35 for duplicate names.
        """
        params = [token.value[1:] for token in children]
        for name in params:
            if name in RESERVED_IDENTIFIERS:
                error_exit(22, f"Reserved identifier used as parameter: {name}")
        if len(params) != len(set(params)):
            error_exit(35, "Duplicate formal parameters in block literal")
        return params

    def statement_list(self, children):
        """Return the list of Assignment nodes unchanged."""
        return children

    def assignment_statement(self, children):
        """Build an Assignment; exit with code 22 if the target is a reserved identifier."""
        var_name = str(children[0])
        if var_name in RESERVED_IDENTIFIERS:
            error_exit(22, f"Reserved identifier used in assignment: {var_name}")
        return Assignment(var_name, children[1])

    def expression(self, children):
        """Return a lone primary as is; otherwise build a MessageSendExpr."""
        if len(children) == 1:
            return children[0]
        return MessageSendExpr(children[0], children[1:])

    def int_literal(self, children):
        """Build an Integer literal holding the token text."""
        return LiteralExpr("Integer", str(children[0]))

    def string_literal(self, children):
        """
        Build a String literal from the quoted token.

        The surrounding quotes are dropped and the escape sequences
        (backslash followed by n, ' or backslash) are kept verbatim rather
        than decoded.  Any other escape, or a raw newline, exits with code 21.
        """
        inner = children[0].value[1:-1]
        pieces = []
        i = 0
        while i < len(inner):
            ch = inner[i]
            if ch == "\\":
                escape = inner[i:i + 2]
                # A trailing lone backslash yields a 1-char slice and is rejected too.
                if escape not in ("\\n", "\\'", "\\\\"):
                    error_exit(21, "Illegal escape sequence in string literal")
                pieces.append(escape)
                i += 2
            else:
                if ch == "\n":
                    error_exit(21, "Illegal newline in string literal")
                pieces.append(ch)
                i += 1
        return LiteralExpr("String", "".join(pieces))

    def var_expr(self, children):
        """Map nil/true/false to literals; any other identifier becomes a VarExpr."""
        ident = str(children[0])
        pseudo_literals = {"nil": "Nil", "true": "True", "false": "False"}
        if ident in pseudo_literals:
            return LiteralExpr(pseudo_literals[ident], ident)
        return VarExpr(ident)

    def class_literal(self, children):
        """Build a class literal (lit_class "class") holding the class name."""
        return LiteralExpr("class", str(children[0]))

    def block_expr(self, children):
        """Wrap a Block in a BlockExpr."""
        return BlockExpr(children[0])

    def send_part(self, children):
        """
        Return a ``(selector, argument)`` tuple; the argument is None when absent.

        Exits with code 22 if a reserved identifier is used as the selector.
        """
        sel = str(children[0])
        if sel in RESERVED_IDENTIFIERS:
            error_exit(22, f"Reserved identifier used as message selector: {sel}")
        return (sel, children[1]) if len(children) == 2 else (sel, None)

    def literal(self, children):
        """Pass the literal expression through."""
        return children[0]


# --- Flattening of Compound Message Sends ---
def flatten_message_send(ast):
    """
    Recursively regroup compound message sends.

    The grammar nests ``a compute: b and: c`` as ``a compute: (b and: c)``.
    This function turns such chains back into one send with selector
    ``compute:and:...``, and likewise ``ifTrue: [..] ifFalse: [..]`` into
    ``ifTrue:ifFalse:`` when both arguments are block literals.

    NOTE(review): only these two selector families are regrouped; other
    multi-keyword selectors stay nested.  Parentheses are not recorded in the
    AST, so ``a compute: (b and: c)`` is regrouped as well.

    Args:
        ast: Any expression node.

    Returns:
        The same node, modified in place where applicable.
    """
    if not isinstance(ast, MessageSendExpr):
        # Block literals used as receiver/argument still need their bodies
        # flattened; flatten() descends into BlockExpr and leaves other
        # non-send nodes untouched.
        return flatten(ast)

    ast.receiver = flatten_message_send(ast.receiver)
    ast.sends = [
        (sel, flatten_message_send(arg) if arg is not None else None)
        for sel, arg in ast.sends
    ]

    # Compound "compute:and:..." chain.
    if (len(ast.sends) == 1 and ast.sends[0][0] == "compute:"
            and isinstance(ast.sends[0][1], MessageSendExpr)):
        args = []
        current = ast.sends[0][1]
        # Every node visited here was already flattened by the recursion above.
        while (isinstance(current, MessageSendExpr)
               and len(current.sends) == 1
               and current.sends[0][0] == "and:"):
            args.append(current.receiver)
            current = current.sends[0][1]
        args.append(current)
        ast.compound = True
        ast.compound_selector = "compute:" + "and:" * (len(args) - 1)
        ast.compound_args = args
        ast.sends = []

    # Compound "ifTrue:ifFalse:" with two block-literal arguments.
    if (len(ast.sends) == 1 and ast.sends[0][0] == "ifTrue:"
            and isinstance(ast.sends[0][1], MessageSendExpr)):
        inner = ast.sends[0][1]
        if (len(inner.sends) == 1
                and inner.sends[0][0] == "ifFalse:"
                and isinstance(inner.receiver, BlockExpr)
                and isinstance(inner.sends[0][1], BlockExpr)):
            ast.compound = True
            ast.compound_selector = "ifTrue:ifFalse:"
            ast.compound_args = [inner.receiver, inner.sends[0][1]]
            ast.sends = []

    return ast


def flatten(ast):
    """
    Walk the AST and flatten every compound message send in it.

    Args:
        ast: Any AST node (Program, ClassDef, MethodDef, Block, Assignment
            or an expression).

    Returns:
        The same node, modified in place.
    """
    if isinstance(ast, Program):
        ast.classes = [flatten(c) for c in ast.classes]
    elif isinstance(ast, ClassDef):
        ast.methods = [flatten(m) for m in ast.methods]
    elif isinstance(ast, (MethodDef, BlockExpr)):
        ast.block = flatten(ast.block)
    elif isinstance(ast, Block):
        ast.assignments = [flatten(a) for a in ast.assignments]
    elif isinstance(ast, Assignment):
        ast.expr = flatten(ast.expr)
    elif isinstance(ast, MessageSendExpr):
        ast = flatten_message_send(ast)
    return ast


# --- XML Generation ---
def xml_expr(e):
    """
    Convert an expression node into its XML element.

    Literals become ``<literal>``, variables ``<var>``, blocks ``<block>`` and
    message sends ``<send>`` holding the receiver in ``<expr>`` followed by
    one ``<arg order="n">`` per argument.  Unknown node types yield an
    ``<unknown>`` element containing ``str(e)``.
    """
    if isinstance(e, LiteralExpr):
        return ET.Element("literal", {"class": e.lit_class, "value": e.value})
    if isinstance(e, VarExpr):
        return ET.Element("var", {"name": e.name})
    if isinstance(e, BlockExpr):
        return xml_block(e.block)
    if isinstance(e, MessageSendExpr):
        if getattr(e, "compound", False):
            selector = e.compound_selector
            numbered_args = list(enumerate(e.compound_args, start=1))
        else:
            selector = "".join(sel for sel, _ in e.sends)
            # The order is the position of the send in the chain, so sends
            # without an argument still consume a number.
            numbered_args = [
                (order, arg)
                for order, (_, arg) in enumerate(e.sends, start=1)
                if arg is not None
            ]
        send = ET.Element("send", {"selector": selector})
        ET.SubElement(send, "expr").append(xml_expr(e.receiver))
        for order, arg in numbered_args:
            arg_elem = ET.SubElement(send, "arg", {"order": str(order)})
            ET.SubElement(arg_elem, "expr").append(xml_expr(arg))
        return send

    # Fallback for unknown expression types.
    unknown = ET.Element("unknown")
    unknown.text = str(e)
    return unknown


def xml_block(b):
    """
    Convert a Block into a ``<block arity="n">`` element.

    Parameters become ``<parameter name order>`` children, followed by one
    ``<assign order>`` per statement containing ``<var>`` and ``<expr>``.
    """
    block_elem = ET.Element("block", {"arity": str(len(b.parameters))})
    for order, name in enumerate(b.parameters, start=1):
        ET.SubElement(block_elem, "parameter", {"name": name, "order": str(order)})
    for order, assign in enumerate(b.assignments, start=1):
        assign_elem = ET.SubElement(block_elem, "assign", {"order": str(order)})
        ET.SubElement(assign_elem, "var", {"name": assign.var})
        ET.SubElement(assign_elem, "expr").append(xml_expr(assign.expr))
    return block_elem


def generate_xml(prog, comment):
    """
    Build the XML tree for the whole program.

    Args:
        prog: The Program node.
        comment: Text for the root's ``description`` attribute; omitted when
            empty or None.  The text is stored unescaped because ElementTree
            escapes attribute values on serialization.
            NOTE(review): newlines in the comment are therefore emitted as
            ``&#10;`` references - confirm this is the expected encoding.

    Returns:
        The root ``<program language="SOL25">`` element.
    """
    root = ET.Element("program", {"language": "SOL25"})
    if comment:
        root.set("description", comment)
    for cls in prog.classes:
        class_elem = ET.SubElement(root, "class", {"name": cls.name, "parent": cls.parent})
        for method in cls.methods:
            method_elem = ET.SubElement(class_elem, "method", {"selector": method.selector})
            method_elem.append(xml_block(method.block))
    return root


def _serialize_xml(root):
    """Return the indented XML document for *root*, including the XML declaration."""
    ET.indent(root, space="  ")
    body = ET.tostring(root, encoding="unicode")
    return '<?xml version="1.0" encoding="UTF-8"?>\n' + body + "\n"


# --- Static Semantic Analysis ---
def is_valid_integer_instance_method(selector):
    """Return True if *selector* (with or without a trailing ':') is a known Integer method."""
    base_sel = selector[:-1] if selector.endswith(":") else selector
    return base_sel in INTEGER_INSTANCE_METHODS


def is_valid_string_instance_method(selector):
    """Return True if *selector* (with or without a trailing ':') is a known String method."""
    base_sel = selector[:-1] if selector.endswith(":") else selector
    return base_sel in STRING_INSTANCE_METHODS


def is_string_or_subclass(class_name, inheritance_map):
    """
    Return True if *class_name* is "String" or reaches it through its parents.

    Args:
        class_name: Name of the class to test.
        inheritance_map: Mapping of user-defined class name to parent name.
            It must be free of cycles, otherwise the walk does not terminate.
    """
    current = class_name
    while current != "String" and current in inheritance_map:
        current = inheritance_map[current]
    return current == "String"


def detect_circular_inheritance(prog):
    """Return True if following parent links from any class revisits a class."""
    inheritance_map = {cls.name: cls.parent for cls in prog.classes}
    for class_name in inheritance_map:
        visited = set()
        current = class_name
        while current in inheritance_map:
            if current in visited:
                return True
            visited.add(current)
            current = inheritance_map[current]
    return False


def semantic_check_program(prog):
    """
    Perform static semantic analysis on the entire program.

    Checks, in this order, and exits with the given code on failure:
      - duplicate class definitions (35),
      - circular inheritance (35),
      - undefined parent classes (32),
      - every method body, via semantic_check_block,
      - presence of a Main class with a parameterless run method (31).
    """
    seen = set()
    for cls in prog.classes:
        if cls.name in seen:
            error_exit(35, f"Class redefinition: {cls.name}")
        seen.add(cls.name)

    if detect_circular_inheritance(prog):
        error_exit(35, "Circular inheritance detected")

    # Environment maps a known name to True when it is a formal parameter
    # (not assignable) and to False otherwise.
    builtins = {"Object", "Integer", "String", "Block", "Nil", "True", "False",
                "nil", "true", "false", "self", "super"}
    global_env = {name: False for name in builtins | seen}

    inheritance_map = {cls.name: cls.parent for cls in prog.classes}

    for cls in prog.classes:
        if cls.parent not in global_env:
            error_exit(32, f"Undefined class: {cls.parent}")

    for cls in prog.classes:
        for method in cls.methods:
            params = method.block.parameters
            if cls.name == "Main" and method.selector == "run" and len(params) > 0:
                error_exit(33, f"Arity mismatch in method run: expected 0 parameters, got {len(params)}")

            env = global_env.copy()
            env.update({p: True for p in params})
            # Pseudo-variables are always visible and never count as parameters.
            for pseudo in ("self", "super", "nil", "true", "false"):
                env[pseudo] = False
            semantic_check_block(method.block, env, inheritance_map)

    has_main_run = any(
        cls.name == "Main"
        and any(m.selector == "run" and len(m.block.parameters) == 0 for m in cls.methods)
        for cls in prog.classes
    )
    if not has_main_run:
        error_exit(31, "Missing Main class or parameterless run method")


def semantic_check_block(b, env, inheritance_map):
    """
    Perform semantic analysis on a block.

    Each assignment's expression is checked first; then the target is
    rejected with code 34 if it is a formal parameter, and otherwise becomes
    a defined local for the following statements.  *env* is not modified.
    """
    local = dict(env)
    for assign in b.assignments:
        semantic_check_expr(assign.expr, local, inheritance_map)
        if local.get(assign.var) is True:
            error_exit(34, f"Assignment to a formal parameter: {assign.var}")
        local[assign.var] = False


def semantic_check_expr(e, env, inheritance_map):
    """
    Recursively perform semantic checks on an expression.

    - Variables and class literals must be present in *env* (code 32).
    - Block literals are checked in a copy of *env* extended by their
      parameters.
    - Message sends are checked by _check_message_send.
    """
    if isinstance(e, VarExpr):
        if e.name not in env:
            error_exit(32, f"Undefined variable: {e.name}")
    elif isinstance(e, LiteralExpr) and e.lit_class == "class":
        if e.value not in env:
            error_exit(32, f"Undefined class: {e.value}")
    elif isinstance(e, BlockExpr):
        new_env = dict(env)
        new_env.update({p: True for p in e.block.parameters})
        semantic_check_block(e.block, new_env, inheritance_map)
    elif isinstance(e, MessageSendExpr):
        _check_message_send(e, env, inheritance_map)


def _check_message_send(e, env, inheritance_map):
    """Check the receiver, selectors and arguments of one message send."""
    semantic_check_expr(e.receiver, env, inheritance_map)

    if e.compound:
        # Flattened sends keep their arguments in compound_args and have an
        # empty sends list, so they must be visited explicitly.
        for arg in e.compound_args:
            semantic_check_expr(arg, env, inheritance_map)
        return

    receiver = e.receiver
    if isinstance(receiver, LiteralExpr) and receiver.lit_class == "Integer":
        for sel, arg in e.sends:
            if not is_valid_integer_instance_method(sel):
                error_exit(32, f"Undefined method: {sel} for Integer instance")
            if arg is not None:
                semantic_check_expr(arg, env, inheritance_map)
    elif isinstance(receiver, LiteralExpr) and receiver.lit_class == "class":
        _check_class_message(e, env, inheritance_map)
    else:
        for sel, arg in e.sends:
            if sel in RESERVED_IDENTIFIERS:
                error_exit(22, f"Reserved identifier used as message selector: {sel}")
            if arg is not None:
                semantic_check_expr(arg, env, inheritance_map)


def _check_class_message(e, env, inheritance_map):
    """
    Check a send whose receiver is a class literal.

    Accepted as the first send: ``new``, ``from:`` (its argument is checked)
    and ``read`` for String and its subclasses.  Anything else, and any
    further send in the chain, exits with code 32.
    """
    class_name = e.receiver.value
    for index, (sel, arg) in enumerate(e.sends):
        if index > 0:
            error_exit(32, "Invalid method chain: class methods cannot be chained")
        base_sel = sel[:-1] if sel.endswith(":") else sel
        if base_sel == "new" or sel == "from:":
            if arg is not None:
                semantic_check_expr(arg, env, inheritance_map)
        elif base_sel == "read":
            if not is_string_or_subclass(class_name, inheritance_map):
                error_exit(32, f"Undefined class method: {base_sel} for class {class_name}")
        else:
            error_exit(32, f"Undefined class method: {base_sel} for class {class_name}")

    # Special case kept from the original: for `X from: (Integer from: ... sel ...)`
    # the sends following the inner `from:` must be Integer instance methods.
    if len(e.sends) == 1 and e.sends[0][0] == "from:":
        inner = e.sends[0][1]
        if (isinstance(inner, MessageSendExpr)
                and isinstance(inner.receiver, LiteralExpr)
                and inner.receiver.lit_class == "class"
                and inner.receiver.value == "Integer"
                and inner.sends
                and inner.sends[0][0] == "from:"):
            for inner_sel, _ in inner.sends[1:]:
                if not is_valid_integer_instance_method(inner_sel):
                    error_exit(32, f"Undefined method: {inner_sel} for Integer instance")


# --- Help Message Function ---
def print_help():
    """Write the usage text, feature summary and exit-code table to stdout."""
    help_text = (
        "Usage: parse.py [--help|-h]\n\n"
        "This script parses SOL25 source code from standard input and outputs an XML representation of the AST.\n\n"
        "Functionality:\n"
        " - Lexical analysis and parsing using Lark (version 1.2.2).\n"
        " - Transformation of the parse tree into an Abstract Syntax Tree (AST).\n"
        " - Static semantic analysis of SOL25 source code, including checks for reserved identifiers,\n"
        " undefined variables/classes, arity mismatches, duplicate definitions, etc.\n"
        " - XML generation of the AST.\n\n"
        "Exit Codes:\n"
        " 10 : Wrong or extra command-line parameters.\n"
        " 11 : Error opening input.\n"
        " 21 : Lexical error (e.g., illegal escape sequences).\n"
        " 22 : Syntactic error or misuse of reserved identifiers.\n"
        " 31 : Missing Main class or parameterless run method.\n"
        " 32 : Use of undefined variable/class/method.\n"
        " 33 : Arity mismatch in method block literal.\n"
        " 34 : Assignment to a formal parameter.\n"
        " 35 : Duplicate formal parameters or class redefinition.\n"
        " 99 : Internal error.\n\n"
        "Examples:\n"
        " cat source.sol25 | python3.11 parse.py\n"
        " python3.11 parse.py --help\n"
    )
    sys.stdout.write(help_text)


# --- Main Entry Point ---
def main():
    """
    Run the parser: read SOL25 from stdin, write the XML document to stdout.

    Exits with 0 on success or with one of the codes listed in the module
    docstring; ``--help``/``-h`` prints the help text and exits with 0.
    """
    if len(sys.argv) > 1:
        if len(sys.argv) == 2 and sys.argv[1] in ("--help", "-h"):
            print_help()
            sys.exit(0)
        error_exit(10, "Wrong or extra command-line parameters")

    try:
        raw = sys.stdin.read()
    except Exception:
        error_exit(11, "Error opening input")

    # Comments are collected by the lexer itself, so quotes inside string
    # literals can never be mistaken for a comment.
    comments = []
    try:
        parser = Lark(
            SOL25_GRAMMAR,
            start="start",
            parser="lalr",
            lexer="basic",
            lexer_callbacks={"COMMENT": comments.append},
        )
        tree = parser.parse(raw)
        ast = flatten(SOL25Transformer().transform(tree))
        semantic_check_program(ast)

        # The first comment, without its surrounding quotes, is the description.
        comment = str(comments[0])[1:-1] if comments else None
        sys.stdout.write(_serialize_xml(generate_xml(ast, comment)))
    except UnexpectedToken:
        error_exit(22, "Syntactic error")
    except UnexpectedCharacters:
        error_exit(21, "Lexical error")
    except Exception as e:
        # SystemExit raised by error_exit is not an Exception and passes through.
        error_exit(99, f"Internal error: {str(e)}")

    sys.exit(0)


if __name__ == "__main__":
    main()