Projects/2BIT/summer-semester/IPP1/parse.py
2026-04-14 19:28:46 +02:00

716 lines
No EOL
29 KiB
Python

#!/usr/bin/env python3.11
# author: Roman Necas (xnecasr00)
# date: 26.2.2025
"""
parse.py - A compact parser and static analyzer for SOL25.
This script reads SOL25 source code from standard input and performs the following:
• Lexical analysis using Lark (version 1.2.2)
• Syntactic parsing according to the SOL25 grammar to produce a parse tree
• Transformation of the parse tree into an Abstract Syntax Tree (AST)
• Static semantic analysis (e.g., checking for reserved identifiers, undefined classes/variables,
arity mismatches, duplicate definitions, etc.)
• Generation of an XML representation of the AST for further processing
Exit codes:
10 - Wrong or extra command-line parameters
11 - Error opening input
21 - Lexical error (e.g., illegal escape sequences, newline in string literal)
22 - Syntactic error or misuse of reserved identifiers
31 - Missing Main class or parameterless run method
32 - Use of undefined variable/class/method
33 - Arity mismatch in method block literal
34 - Assignment to a formal parameter
35 - Duplicate formal parameters or class redefinition
99 - Internal error
"""
import sys
import re
import xml.etree.ElementTree as ET
import xml.dom.minidom
from lark import Lark, Transformer, UnexpectedToken, UnexpectedCharacters
# --- Helper: Error Handling ---
def error_exit(code, msg=""):
    """
    Terminate the process with the given exit code.
    A non-empty message is written to stderr (followed by a newline) and
    stderr is flushed before the interpreter exits.
    """
    if not msg:
        sys.exit(code)
    err_stream = sys.stderr
    err_stream.write(msg + "\n")
    # Flush explicitly so the diagnostic is emitted before termination.
    err_stream.flush()
    sys.exit(code)
# --- AST Node Definitions ---
# Each class below represents a node in the AST.
class Program:
    """Root AST node holding every class definition of the parsed program."""

    def __init__(self, classes, description=None):
        # description: optional text (e.g., from a comment in the source)
        # classes: list of class definitions
        self.description = description
        self.classes = classes
class ClassDef:
    """AST node for one class definition."""

    def __init__(self, name, parent, methods):
        # name    - class name (string)
        # parent  - name of the parent class (string)
        # methods - list of method definitions belonging to the class
        self.name, self.parent, self.methods = name, parent, methods
class MethodDef:
    """AST node for one method: a selector together with its body block."""

    def __init__(self, selector, block):
        # Every ':' in the selector stands for one expected parameter.
        self.expected_params = selector.count(':')
        # selector - the method's selector (string)
        # block    - the block forming the method body
        self.selector, self.block = selector, block
class Block:
    """AST node for a block: formal parameters plus a sequence of assignments."""

    def __init__(self, parameters, assignments):
        # assignments - list of assignment statements (Assignment instances)
        self.assignments = assignments
        # parameters  - list of parameter names (strings)
        self.parameters = parameters
class Assignment:
    """AST node for a single assignment statement."""

    def __init__(self, var, expr):
        # var  - name of the variable being assigned to (string)
        # expr - the assigned expression (an instance of an Expr subclass)
        self.var, self.expr = var, expr
# --- Expression Hierarchy ---
# Base class for all expressions.
class Expr:
    """Common base class of all expression nodes; carries no state itself."""
class LiteralExpr(Expr):
    """Expression node for a literal value."""

    def __init__(self, lit_class, value):
        # lit_class - kind of literal (e.g., "Integer", "String", "Nil", etc.)
        # value     - the literal value as a string
        self.lit_class, self.value = lit_class, value
class VarExpr(Expr):
    """Expression node referring to a variable by name."""

    def __init__(self, name):
        # name - the variable's name (string)
        self.name = name
class BlockExpr(Expr):
    """Expression node wrapping a block literal."""

    def __init__(self, block):
        # block - the Block instance that represents the literal
        self.block = block
class MessageSendExpr(Expr):
    """Expression node for a message send to a receiver."""

    def __init__(self, receiver, sends):
        # Fields describing a flattened compound send; they start out unset.
        self.compound_args = None
        self.compound_selector = ""
        self.compound = False
        # receiver - expression the message is sent to
        # sends    - list of (selector, argument) tuples
        self.receiver, self.sends = receiver, sends
# --- Lark Grammar for SOL25 ---
# Lark grammar text handed to the Lark constructor in main().
# Structure described by the rules below:
#   - a program is a sequence of class definitions,
#   - a class definition names the class and its parent and lists methods,
#   - a method is a selector followed by a block literal,
#   - a block literal holds optional ':name' parameters, a '|' separator and
#     a list of assignment statements, each terminated by '.'.
# Rule aliases (int_literal, string_literal, var_expr, class_literal,
# block_expr) match the method names of SOL25Transformer.
# The STRING pattern also accepts a raw newline inside the quotes; the
# transformer's string_literal callback rejects it with exit code 21.
# Whitespace and double-quoted COMMENT tokens are ignored by the lexer.
SOL25_GRAMMAR = r"""
?start: program
program: (class_def)*
class_def: "class" CLASS_ID ":" CLASS_ID "{" (method_def)* "}"
method_def: selector block_literal
selector: IDENT | send_selector
send_selector: SEND_SELECTOR+
block_literal: "[" block_params? "|" statement_list "]"
block_params: BLOCK_PARAM+
statement_list: (assignment_statement)*
assignment_statement: IDENT ":=" expression "."
?expression: primary (send_part)*
?primary: literal | IDENT -> var_expr
| CLASS_ID -> class_literal
| block_literal -> block_expr
| "(" expression ")"
send_part: SEND_SELECTOR expression | IDENT
literal: INT -> int_literal
| STRING -> string_literal
CLASS_ID: /[A-Z][A-Za-z0-9]*/
IDENT: /[a-z_][A-Za-z0-9_]*/
SEND_SELECTOR: /[a-z_][A-Za-z0-9_]*:/
INT: /[+-]?[0-9]+/
STRING: /'(?:[^\r'\\\n]|\\(?!n)(?:'|\\)|\\n|\n)*'/
COMMENT: /"((?:[^"\\]|\\.)*)"/
%import common.WS
%ignore WS
%ignore COMMENT
BLOCK_PARAM: /:[a-z_][A-Za-z0-9_]*/
"""
# --- Transformer: Converting Parse Tree to AST ---
class SOL25Transformer(Transformer):
    """
    Converts the Lark parse tree into AST nodes.
    Each method is named after a grammar rule (or rule alias) and receives the
    already transformed children of that rule. Several callbacks perform early
    checks and terminate the program through error_exit.
    """

    # Identifiers that may not serve as selectors, parameters or assignment targets.
    _RESERVED_WORDS = frozenset({"self", "super", "nil", "true", "false", "class"})

    def start(self, children):
        # The start symbol simply yields its first child (the Program node).
        return children[0]

    def program(self, children):
        # All children are class definitions.
        return Program(children)

    def class_def(self, children):
        # Layout of children: class name, parent name, then the method definitions.
        class_name = str(children[0])
        parent_name = str(children[1])
        return ClassDef(class_name, parent_name, children[2:])

    def method_def(self, children):
        # Build the method and verify right away that the number of block
        # parameters equals the number of colons in the selector.
        selector, block = children[0], children[1]
        method = MethodDef(selector, block)
        declared = len(block.parameters)
        if declared != method.expected_params:
            error_exit(33, f"Arity mismatch in method {selector}: expected {method.expected_params} parameters, got {declared}")
        return method

    def selector(self, children):
        # Join all parts (converted to strings) into a single selector string.
        joined = "".join(str(part) for part in children)
        # A reserved word can never contain ':', so membership alone decides.
        if joined in self._RESERVED_WORDS:
            error_exit(22, f"Reserved identifier used as method selector: {joined}")
        return joined

    def send_selector(self, children):
        # A multi-part selector is the concatenation of its keyword parts.
        return "".join(map(str, children))

    def block_literal(self, children):
        # The statement list is always last; a parameter list precedes it
        # only when two children are present.
        statements = children[-1]
        if len(children) == 2:
            return Block(children[0], statements)
        return Block([], statements)

    def block_params(self, children):
        # Drop the leading ':' of every parameter token.
        names = [tok.value[1:] for tok in children]
        # The first reserved name (if any) is reported with exit code 22.
        offender = next((n for n in names if n in self._RESERVED_WORDS), None)
        if offender is not None:
            error_exit(22, f"Reserved identifier used as parameter: {offender}")
        # Duplicate names are reported with exit code 35.
        if len(set(names)) != len(names):
            error_exit(35, "Duplicate formal parameters in block literal")
        return names

    def statement_list(self, children):
        # The children already form the list of assignment statements.
        return children

    def assignment_statement(self, children):
        # The target must not be a reserved identifier.
        target = str(children[0])
        if target in self._RESERVED_WORDS:
            error_exit(22, f"Reserved identifier used in assignment: {target}")
        return Assignment(target, children[1])

    def expression(self, children):
        # A lone child is passed through; otherwise the first child is the
        # receiver and the remaining ones are its sends.
        if len(children) == 1:
            return children[0]
        return MessageSendExpr(children[0], children[1:])

    def int_literal(self, children):
        # Integer literals keep their textual form.
        return LiteralExpr("Integer", str(children[0]))

    def string_literal(self, children):
        # Validate the escape sequences of a string literal and build its
        # stored value. Recognised escapes are kept in escaped form.
        body = children[0].value[1:-1]
        escapes = {"n": "\\n", "'": "\\&apos;", "\\": "\\\\"}
        pieces = []
        pos, end = 0, len(body)
        while pos < end:
            ch = body[pos]
            if ch != "\\":
                if ch == "\n":
                    error_exit(21, "Illegal newline in string literal")
                pieces.append(ch)
                pos += 1
                continue
            # A backslash must be followed by one of the known escape characters.
            if pos + 1 >= end:
                error_exit(21, "Illegal escape sequence in string literal")
            replacement = escapes.get(body[pos + 1])
            if replacement is None:
                error_exit(21, "Illegal escape sequence in string literal")
            pieces.append(replacement)
            pos += 2
        return LiteralExpr("String", "".join(pieces))

    def var_expr(self, children):
        # nil, true and false become literals; any other identifier is a variable.
        ident = str(children[0])
        constant_classes = {"nil": "Nil", "true": "True", "false": "False"}
        if ident in constant_classes:
            return LiteralExpr(constant_classes[ident], ident)
        return VarExpr(ident)

    def class_literal(self, children):
        # A class name used as an expression.
        return LiteralExpr("class", str(children[0]))

    def block_expr(self, children):
        # Wrap the Block so it can be used as an expression.
        return BlockExpr(children[0])

    def send_part(self, children):
        # Yields (selector, argument); the argument is None when only a
        # selector is present.
        sel = str(children[0])
        # Reserved words carry no ':', so membership alone identifies them.
        if sel in self._RESERVED_WORDS:
            error_exit(22, f"Reserved identifier used as message selector: {sel}")
        argument = children[1] if len(children) == 2 else None
        return (sel, argument)

    def literal(self, children):
        # Pass the literal expression through unchanged.
        return children[0]
# --- Flattening of Compound Message Sends ---
def flatten_message_send(ast):
    """
    Recursively flatten compound message sends.
    Handles chains such as "compute:" with "and:" sends and "ifTrue:ifFalse:" constructs.
    Block literals met as receivers or arguments are flattened as well, so
    compound sends written inside nested blocks are not left in nested form.
    Nodes of any other kind are returned unchanged. The node is modified in
    place and returned.
    """
    if isinstance(ast, BlockExpr):
        # A block literal carries its own statements; delegate to flatten()
        # so the message sends inside the block are processed too.
        return flatten(ast)
    if not isinstance(ast, MessageSendExpr):
        return ast
    # Recursively flatten the receiver and all arguments.
    ast.receiver = flatten_message_send(ast.receiver)
    ast.sends = [(sel, flatten_message_send(arg) if arg is not None else None)
                 for sel, arg in ast.sends]
    # Handle compound "compute:" chain: the argument of "compute:" is itself
    # a chain of single "and:" sends whose receivers are the real arguments.
    if len(ast.sends) == 1 and ast.sends[0][0] == "compute:" and isinstance(ast.sends[0][1], MessageSendExpr):
        args = []
        current = ast.sends[0][1]
        while (isinstance(current, MessageSendExpr) and
               len(current.sends) == 1 and current.sends[0][0] == "and:"):
            args.append(flatten_message_send(current.receiver))
            current = current.sends[0][1]
        # Whatever remains after the last "and:" is the final argument.
        args.append(flatten_message_send(current))
        ast.compound = True
        ast.compound_selector = "compute:" + "and:" * (len(args) - 1)
        ast.compound_args = args
        # The arguments now live in compound_args only.
        ast.sends = []
    # Handle compound "ifTrue:ifFalse:" chain: the argument of "ifTrue:" is a
    # block receiving a single "ifFalse:" send whose argument is also a block.
    if (len(ast.sends) == 1 and ast.sends[0][0] == "ifTrue:" and
            isinstance(ast.sends[0][1], MessageSendExpr)):
        inner = ast.sends[0][1]
        if (len(inner.sends) == 1 and inner.sends[0][0] == "ifFalse:" and
                isinstance(inner.receiver, BlockExpr) and
                isinstance(inner.sends[0][1], BlockExpr)):
            ast.compound = True
            ast.compound_selector = "ifTrue:ifFalse:"
            ast.compound_args = [inner.receiver, inner.sends[0][1]]
            ast.sends = []
    return ast
def flatten(ast):
    """
    Walk the AST top-down and flatten every compound message send reached.
    Container nodes are updated in place; the (possibly replaced) node is
    returned. Nodes of other kinds are returned untouched.
    """
    if isinstance(ast, MessageSendExpr):
        return flatten_message_send(ast)
    if isinstance(ast, Program):
        ast.classes = list(map(flatten, ast.classes))
    elif isinstance(ast, ClassDef):
        ast.methods = list(map(flatten, ast.methods))
    elif isinstance(ast, (MethodDef, BlockExpr)):
        # Both node kinds wrap a single Block in their 'block' attribute.
        ast.block = flatten(ast.block)
    elif isinstance(ast, Block):
        ast.assignments = list(map(flatten, ast.assignments))
    elif isinstance(ast, Assignment):
        ast.expr = flatten(ast.expr)
    return ast
# --- XML Generation ---
def xml_expr(e):
    """
    Build the XML element for an expression node, recursing into
    sub-expressions. Literals, variables, blocks and message sends each have
    their own element; anything else becomes an <unknown> element whose text
    is str(e).
    """
    def wrapped(node):
        # <expr> element holding the XML of a single sub-expression.
        holder = ET.Element("expr")
        holder.append(xml_expr(node))
        return holder

    def argument(position, node):
        # <arg order="..."> element wrapping one argument expression.
        arg_elem = ET.Element("arg", {"order": str(position)})
        arg_elem.append(wrapped(node))
        return arg_elem

    if isinstance(e, LiteralExpr):
        return ET.Element("literal", {"class": e.lit_class, "value": e.value})
    if isinstance(e, VarExpr):
        return ET.Element("var", {"name": e.name})
    if isinstance(e, BlockExpr):
        return xml_block(e.block)
    if not isinstance(e, MessageSendExpr):
        # Fallback for unknown expression types.
        unknown = ET.Element("unknown")
        unknown.text = str(e)
        return unknown
    if getattr(e, "compound", False):
        # Flattened send: selector and arguments were precomputed.
        selector = e.compound_selector
        numbered_args = list(enumerate(e.compound_args, start=1))
    else:
        # Plain send: the selector is the concatenation of all parts; an
        # argument keeps the 1-based position of its send within the list.
        selector = "".join(part for part, _ in e.sends)
        numbered_args = [(position, arg)
                         for position, (_, arg) in enumerate(e.sends, start=1)
                         if arg is not None]
    send_elem = ET.Element("send", {"selector": selector})
    send_elem.append(wrapped(e.receiver))
    for position, arg in numbered_args:
        send_elem.append(argument(position, arg))
    return send_elem
def xml_block(b):
    """
    Build the <block> element for a Block node.
    The element carries the block's arity, one <parameter> child per formal
    parameter and one <assign> child per assignment statement, each numbered
    from 1 through its "order" attribute.
    """
    block_elem = ET.Element("block", {"arity": str(len(b.parameters))})
    for position, param in enumerate(b.parameters, start=1):
        block_elem.append(ET.Element("parameter", {"name": param, "order": str(position)}))
    for position, stmt in enumerate(b.assignments, start=1):
        assign_elem = ET.SubElement(block_elem, "assign", {"order": str(position)})
        ET.SubElement(assign_elem, "var", {"name": stmt.var})
        # The assigned expression is wrapped in its own <expr> element.
        ET.SubElement(assign_elem, "expr").append(xml_expr(stmt.expr))
    return block_elem
def generate_xml(prog, comment):
    """
    Build the <program> root element for the whole AST.
    A non-empty comment becomes the "description" attribute. Every class is
    emitted as a <class> child containing one <method> element per method.
    """
    attributes = {"language": "SOL25"}
    if comment:
        # '<' and newlines are replaced by entity text before being stored.
        attributes["description"] = comment.replace("<", "&lt;").replace("\n", "&#10;")
    root = ET.Element("program", attributes)
    for cls in prog.classes:
        class_elem = ET.Element("class", {"name": cls.name, "parent": cls.parent})
        root.append(class_elem)
        for method in cls.methods:
            method_elem = ET.SubElement(class_elem, "method", {"selector": method.selector})
            method_elem.append(xml_block(method.block))
    return root
# --- Static Semantic Analysis ---
def is_valid_integer_instance_method(selector):
    """
    Tell whether the selector names a known instance method of Integer.
    A single trailing colon, if present, is ignored for the comparison.
    """
    known_methods = (
        "equalTo", "greaterThan", "plus", "minus", "multiplyBy", "divBy",
        "asString", "asInteger", "timesRepeat",
    )
    return selector.removesuffix(":") in known_methods
def is_valid_string_instance_method(selector):
    """
    Tell whether the selector names a known instance method of String.
    A single trailing colon, if present, is ignored for the comparison.
    """
    known_methods = (
        "print", "equalTo", "asString", "asInteger", "concatenateWith",
        "startsWith", "endsBefore",
    )
    return selector.removesuffix(":") in known_methods
def is_string_or_subclass(class_name, inheritance_map):
    """
    Check if a class is String or inherits from String.
    class_name: name of the class to test.
    inheritance_map: dict mapping a class name to the name of its parent.
    Returns True when following the parent links from class_name reaches
    "String", False otherwise. A cyclic inheritance_map yields False rather
    than looping forever.
    """
    visited = set()
    current = class_name
    while current != "String" and current in inheritance_map:
        if current in visited:
            # The parent links form a cycle that never reaches String.
            return False
        visited.add(current)
        current = inheritance_map[current]
    return current == "String"
def detect_circular_inheritance(prog):
    """
    Report whether the parent links of the program's classes contain a cycle.
    Returns True as soon as walking upwards from some class revisits a class
    already seen on that walk, False when no walk does.
    """
    parent_of = {cls.name: cls.parent for cls in prog.classes}

    def revisits_itself(start):
        # Climb parent links until leaving the set of user-defined classes.
        seen = set()
        node = start
        while node in parent_of:
            if node in seen:
                return True
            seen.add(node)
            node = parent_of[node]
        return False

    return any(revisits_itself(name) for name in parent_of)
# Function removed as we're now doing direct character-by-character counting
def semantic_check_program(prog):
    """
    Run the static semantic checks for a whole program.
    The checks are performed in this order, each terminating through
    error_exit on failure:
    - duplicate class definitions (35),
    - circular inheritance (35),
    - undefined parent classes (32),
    - per-method checks, including a Main.run parameter check (33) and the
      analysis of every method body,
    - presence of a Main class with a parameterless run method (31).
    """
    defined = set()
    for cls in prog.classes:
        if cls.name in defined:
            error_exit(35, f"Class redefinition: {cls.name}")
        defined.add(cls.name)
    # Check for circular inheritance
    if detect_circular_inheritance(prog):
        error_exit(35, "Circular inheritance detected")
    # Global environment: built-in identifiers plus user-defined classes.
    # The value False marks a name that is not a formal parameter.
    predefined = ("Object", "Integer", "String", "Block", "Nil", "True", "False",
                  "nil", "true", "false", "self", "super")
    global_env = dict.fromkeys(predefined, False)
    global_env.update(dict.fromkeys(defined, False))
    # Class -> parent mapping used for the class-method checks.
    inheritance_map = {cls.name: cls.parent for cls in prog.classes}
    # The first class whose parent is unknown is reported.
    for cls in prog.classes:
        if cls.parent in global_env:
            continue
        error_exit(32, f"Undefined class: {cls.parent}")
    pseudo_variables = ("self", "super", "nil", "true", "false")
    for cls in prog.classes:
        in_main = cls.name == "Main"
        for m in cls.methods:
            param_names = m.block.parameters
            # Main.run special check - must be parameterless
            if in_main and m.selector == "run" and len(param_names) > 0:
                error_exit(33, f"Arity mismatch in method run: expected 0 parameters, got {len(param_names)}")
            # Method-local environment: parameters are marked True, after
            # which the pseudo-variables are reset to False.
            env = global_env.copy()
            env.update(dict.fromkeys(param_names, True))
            env.update(dict.fromkeys(pseudo_variables, False))
            semantic_check_block(m.block, env, inheritance_map)
    # Check for Main class with run method
    has_entry_point = False
    for cls in prog.classes:
        if cls.name != "Main":
            continue
        for m in cls.methods:
            if m.selector == "run" and len(m.block.parameters) == 0:
                has_entry_point = True
    if not has_entry_point:
        error_exit(31, "Missing Main class or parameterless run method")
def semantic_check_block(b, env, inheritance_map):
    """
    Analyse the statements of one block.
    Works on a private copy of env, so the caller's environment is never
    modified. For each assignment the right-hand side is checked first; then
    an assignment to a formal parameter (a name mapped to True) is reported
    with exit code 34; finally the target is recorded as a local variable.
    """
    scope = dict(env)
    for stmt in b.assignments:
        semantic_check_expr(stmt.expr, scope, inheritance_map)
        target = stmt.var
        if scope.get(target) is True:
            error_exit(34, f"Assignment to a formal parameter: {target}")
        scope[target] = False
def semantic_check_expr(e, env, inheritance_map):
    """
    Recursively perform semantic checks on an expression.
    - Verifies that variables are defined.
    - For class literals, ensures the referenced class exists.
    - For block expressions, creates a new environment.
    - For message sends, checks the receiver, the selectors and every
      argument, including the arguments of flattened compound sends.
    e: expression node to check.
    env: dict of visible names; a value of True marks a formal parameter.
    inheritance_map: dict mapping a class name to the name of its parent.
    Violations terminate the program through error_exit (codes 22 and 32).
    """
    if isinstance(e, VarExpr):
        if e.name not in env:
            error_exit(32, f"Undefined variable: {e.name}")
    elif isinstance(e, LiteralExpr) and e.lit_class == "class":
        if e.value not in env:
            error_exit(32, f"Undefined class: {e.value}")
    elif isinstance(e, BlockExpr):
        # Block parameters are added on top of a copy of the current environment.
        new_env = dict(env)
        new_env.update({p: True for p in e.block.parameters})
        semantic_check_block(e.block, new_env, inheritance_map)
    elif isinstance(e, MessageSendExpr):
        semantic_check_expr(e.receiver, env, inheritance_map)
        # Check if sending to Integer literal
        if isinstance(e.receiver, LiteralExpr) and e.receiver.lit_class == "Integer":
            for sel, arg in e.sends:
                if not is_valid_integer_instance_method(sel):
                    error_exit(32, f"Undefined method: {sel} for Integer instance")
                if arg is not None:
                    semantic_check_expr(arg, env, inheritance_map)
        # Check class methods validity
        elif isinstance(e.receiver, LiteralExpr) and e.receiver.lit_class == "class":
            class_name = e.receiver.value
            # Look at all sends in the chain
            for i, (sel, arg) in enumerate(e.sends):
                base_sel = sel[:-1] if sel.endswith(":") else sel
                # Only the first send can be a class method.
                if i == 0:
                    # All classes have new and from: methods
                    if base_sel == "new" or (base_sel == "from" and sel.endswith(":")):
                        # Check the argument to from:
                        if arg is not None:
                            semantic_check_expr(arg, env, inheritance_map)
                    # String class (and subclasses) additionally has read method
                    elif base_sel == "read":
                        if not is_string_or_subclass(class_name, inheritance_map):
                            error_exit(32, f"Undefined class method: {base_sel} for class {class_name}")
                    else:
                        error_exit(32, f"Undefined class method: {base_sel} for class {class_name}")
                else:
                    # Subsequent sends in the chain would be to instances, not class methods
                    error_exit(32, "Invalid method chain: class methods cannot be chained")
            # Special check for the test case Integer from: (Integer from:1 be: 2)
            if len(e.sends) == 1 and e.sends[0][0] == "from:" and isinstance(e.sends[0][1], MessageSendExpr):
                inner_msg = e.sends[0][1]
                # The argument is itself "Integer from: ..." followed by further sends.
                if (isinstance(inner_msg.receiver, LiteralExpr) and
                        inner_msg.receiver.lit_class == "class" and
                        inner_msg.receiver.value == "Integer" and
                        len(inner_msg.sends) >= 1 and
                        inner_msg.sends[0][0] == "from:"):
                    # Every send after from: must be an Integer instance method.
                    for inner_sel, _ in inner_msg.sends[1:]:
                        if not is_valid_integer_instance_method(inner_sel):
                            error_exit(32, f"Undefined method: {inner_sel} for Integer instance")
        else:
            for sel, arg in e.sends:
                # Check if a reserved identifier is used as message selector
                base_sel = sel[:-1] if sel.endswith(":") else sel
                if base_sel in {"self", "super", "nil", "true", "false", "class"} and ":" not in sel:
                    error_exit(22, f"Reserved identifier used as message selector: {sel}")
                if arg is not None:
                    semantic_check_expr(arg, env, inheritance_map)
        # A flattened compound send has an empty 'sends' list; its arguments
        # live in 'compound_args' and must be checked as well, otherwise
        # undefined names inside them would go unnoticed.
        if getattr(e, "compound", False):
            for arg in e.compound_args or ():
                semantic_check_expr(arg, env, inheritance_map)
# --- Help Message Function ---
def print_help():
    """
    Write the help text (usage, functionality, exit codes and examples) to
    standard output.
    """
    lines = [
        "Usage: parse.py [--help|-h]",
        "",
        "This script parses SOL25 source code from standard input and outputs an XML representation of the AST.",
        "",
        "Functionality:",
        " - Lexical analysis and parsing using Lark (version 1.2.2).",
        " - Transformation of the parse tree into an Abstract Syntax Tree (AST).",
        " - Static semantic analysis of SOL25 source code, including checks for reserved identifiers,",
        " undefined variables/classes, arity mismatches, duplicate definitions, etc.",
        " - XML generation of the AST.",
        "",
        "Exit Codes:",
        " 10 : Wrong or extra command-line parameters.",
        " 11 : Error opening input.",
        " 21 : Lexical error (e.g., illegal escape sequences).",
        " 22 : Syntactic error or misuse of reserved identifiers.",
        " 31 : Missing Main class or parameterless run method.",
        " 32 : Use of undefined variable/class/method.",
        " 33 : Arity mismatch in method block literal.",
        " 34 : Assignment to a formal parameter.",
        " 35 : Duplicate formal parameters or class redefinition.",
        " 99 : Internal error.",
        "",
        "Examples:",
        " cat source.sol25 | python3.11 parse.py",
        " python3.11 parse.py --help",
    ]
    # Every line, including the last one, is terminated by a newline.
    sys.stdout.write("\n".join(lines) + "\n")
# --- Main Entry Point ---
def main():
    """
    Script entry point.
    Validates the command line, reads SOL25 source from standard input,
    parses and analyses it, and writes the XML representation to standard
    output. Always terminates through sys.exit / error_exit:
    0 on success, 10 for bad arguments, 11 when reading input fails,
    21 for lexical errors, 22 for syntactic errors, 99 for any other
    exception; the analysis steps may exit with their own codes.
    """
    # Handle command-line arguments.
    if len(sys.argv) > 1:
        if len(sys.argv) == 2 and sys.argv[1] in ("--help", "-h"):
            print_help()
            sys.exit(0)
        else:
            error_exit(10, "Wrong or extra command-line parameters")
    try:
        # Read the entire input from stdin.
        raw = sys.stdin.read()
    except Exception:
        error_exit(11, "Error opening input")
    # COMMENT tokens seen by the lexer, in source order. Collecting them from
    # the lexer (rather than searching the raw text) means double quotes that
    # occur inside string literals are never mistaken for a comment.
    comments = []
    try:
        # Create a Lark parser with the SOL25 grammar.
        parser = Lark(SOL25_GRAMMAR, start="start", parser="lalr", lexer="basic",
                      lexer_callbacks={"COMMENT": comments.append})
        # Parse the raw input to generate a parse tree.
        tree = parser.parse(raw)
        # The first comment, without its surrounding quotes, serves as the
        # description in the XML output.
        comment = str(comments[0])[1:-1] if comments else None
        # Transform the parse tree into an AST.
        ast = flatten(SOL25Transformer().transform(tree))
        # Perform static semantic analysis on the AST.
        semantic_check_program(ast)
        # Generate the XML representation of the AST.
        xml_root = generate_xml(ast, comment)
        xml_str = ET.tostring(xml_root, encoding="utf-8").decode("utf-8")
        # Pretty-print the XML output. Entity text stored in attribute values
        # was escaped a second time during serialisation; undo that here.
        dom = xml.dom.minidom.parseString(xml_str)
        pretty = dom.toprettyxml(indent=" ").replace("&amp;nbsp;", "&nbsp;")\
            .replace("&amp;apos;", "&apos;").replace("&amp;lt;", "&lt;")\
            .replace("&amp;#10;", "&#10;")
        # Replace the default XML declaration by one that states the encoding.
        if pretty.startswith('<?xml version="1.0" ?>'):
            pretty = '<?xml version="1.0" encoding="UTF-8"?>' + pretty[len('<?xml version="1.0" ?>'):]
        sys.stdout.write(pretty)
        sys.exit(0)
    except UnexpectedToken:
        error_exit(22, "Syntactic error")
    except UnexpectedCharacters:
        error_exit(21, "Lexical error")
    except SystemExit:
        # Exit requests raised by error_exit / sys.exit must pass through.
        raise
    except Exception as e:
        error_exit(99, f"Internal error: {str(e)}")
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()