From 4e95bc05a3db4524a815b1aa7dd1e5f1cc731a3e Mon Sep 17 00:00:00 2001 From: Nilstrieb <48135649+Nilstrieb@users.noreply.github.com> Date: Sun, 23 Jul 2023 14:02:53 +0200 Subject: [PATCH] parser --- jest.config.js | 6 +- src/ast.ts | 121 ++++++++++++++++--- src/error.ts | 7 ++ src/index.ts | 16 ++- src/lexer.test.ts | 18 +-- src/lexer.ts | 213 ++++++++++++++++++++++----------- src/parser.ts | 293 +++++++++++++++++++++++++++++++++++++++++----- src/printer.ts | 96 +++++++++++++++ tsconfig.json | 2 +- 9 files changed, 640 insertions(+), 132 deletions(-) create mode 100644 src/printer.ts diff --git a/jest.config.js b/jest.config.js index b413e10..3abcbd9 100644 --- a/jest.config.js +++ b/jest.config.js @@ -1,5 +1,5 @@ /** @type {import('ts-jest').JestConfigWithTsJest} */ module.exports = { - preset: 'ts-jest', - testEnvironment: 'node', -}; \ No newline at end of file + preset: "ts-jest", + testEnvironment: "node", +}; diff --git a/src/ast.ts b/src/ast.ts index f52f088..7fe0e36 100644 --- a/src/ast.ts +++ b/src/ast.ts @@ -1,33 +1,116 @@ import { Span } from "./error"; export type ItemKind = { - kind: "function", - node: FunctionDef, + kind: "function"; + node: FunctionDef; }; export type Item = ItemKind & { - span: Span, -} + span: Span; +}; export type FunctionDef = { - name: string, - args: FunctionArg[], - body: Expr, -} + name: string; + args: FunctionArg[]; + body: Expr; +}; export type FunctionArg = { - name: string, - span: Span, -} + name: string; + span: Span; +}; -export type ExprKind = { - kind: "lit_string", - value: string, -} | { - kind: "ident", - value: string, -} +export type ExprKind = + | { kind: "empty" } + | { kind: "let"; name: string; rhs: Expr; after: Expr } + | { kind: "block"; exprs: Expr[] } + | { + kind: "literal"; + value: Literal; + } + | { + kind: "ident"; + value: string; + } + | { + kind: "binary"; + binaryKind: BinaryKind; + lhs: Expr; + rhs: Expr; + } + | { + kind: "unary", + unaryKind: UnaryKind, + rhs: Expr, + } + | { + kind: "call", + lhs: Expr, + args: Expr[], + }; export type Expr = ExprKind & { - span: Span, + span: Span; +}; + +export type Literal = + | { + kind: "str"; + value: string; + } + | { + kind: "int"; + value: number; + }; + +export type BinaryKind = + | "+" + | "-" + | "*" + | "/" + | "&" + | "|" + | "<" + | ">" + | "==" + | "<=" + | ">=" + | "!="; + +export const COMPARISON_KINDS: BinaryKind[] = [ + ">", + "<", + "==", + "<=", + ">=", + "!=", +]; +export const LOGICAL_KINDS: BinaryKind[] = ["&", "|"]; +export const ARITH_TERM_KINDS: BinaryKind[] = ["+", "-"]; +export const ARITH_FACTOR_KINDS: BinaryKind[] = ["*", "/"]; + +const BINARY_KIND_PREC_CLASS = new Map([ + ["+", 0], + ["-", 0], + ["*", 0], + ["/", 0], + ["&", 1], + ["|", 2], + ["<", 3], + [">", 4], + ["==", 5], + ["<=", 6], + [">=", 7], + ["!=", 8], +]); + +export function binaryExprPrecedenceClass(k: BinaryKind): number { + const cls = BINARY_KIND_PREC_CLASS.get(k); + if (!cls) { + throw new Error(`Invalid binary kind: ${k}`); + } + return cls; } + +export type UnaryKind = '!' | '-'; +export const UNARY_KINDS: UnaryKind[] = ['!', '-']; \ No newline at end of file diff --git a/src/error.ts b/src/error.ts index e344cc1..8754d3f 100644 --- a/src/error.ts +++ b/src/error.ts @@ -3,6 +3,13 @@ export type Span = { end: number; }; +export function spanMerge(a: Span, b: Span): Span { + return { + start: Math.min(a.start, b.start), + end: Math.max(a.end, b.end), + }; +} + export class CompilerError extends Error { msg: string; span: Span; diff --git a/src/index.ts b/src/index.ts index 83174d3..4cd946d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,18 +1,30 @@ import { withErrorHandler } from "./error"; import { tokenize } from "./lexer"; import { parse } from "./parser"; +import { printAst } from "./printer"; const input = ` -function hello() {} +function main() = ( + print("Hello, world!"); + "uwu"; +); `; function main() { withErrorHandler(input, () => { const tokens = tokenize(input); + console.log("-----TOKENS---"); console.log(tokens); const ast = parse(tokens); - console.log(ast); + console.log("-----AST------"); + + console.dir(ast, { depth: 10 }); + + const printed = printAst(ast); + console.log("-----AST pretty------"); + console.log(printed); + }); } diff --git a/src/lexer.test.ts b/src/lexer.test.ts index 319c46e..086ca7b 100644 --- a/src/lexer.test.ts +++ b/src/lexer.test.ts @@ -1,17 +1,17 @@ import { tokenize } from "./lexer"; -it('should tokenize an emtpy function', () => { - const input = `function hello() {}`; +it("should tokenize an emtpy function", () => { + const input = `function hello() = {}`; - const tokens = tokenize(input); + const tokens = tokenize(input); - expect(tokens).toMatchSnapshot(); + expect(tokens).toMatchSnapshot(); }); -it('should tokenize hello world', () => { - const input = `print("hello world")`; +it("should tokenize hello world", () => { + const input = `print("hello world")`; - const tokens = tokenize(input); + const tokens = tokenize(input); - expect(tokens).toMatchSnapshot(); -}); \ No newline at end of file + expect(tokens).toMatchSnapshot(); +}); diff --git a/src/lexer.ts b/src/lexer.ts index 061b795..7731466 100644 --- a/src/lexer.ts +++ b/src/lexer.ts @@ -1,23 +1,62 @@ import { CompilerError, Span } from "./error"; export type DatalessToken = - | "kw_function" - | "kw_let" - | "p_popen" - | "p_pclose" - | "p_bopen" - | "p_bclose" - | "p_semi"; + | "function" + | "let" + | "in" + | "(" + | ")" + | ";" + | "," + | "=" + | "+" + | "-" + | "*" + | "/" + | "&" + | "|" + | "!" + | "<" + | ">" + | "==" + | "<=" + | ">=" + | "!=" + | "!"; -export type TokenKind = - | { kind: DatalessToken } - | { kind: "identifier"; ident: string } - | { kind: "lit_string"; value: string }; +export type TokenIdent = { kind: "identifier"; ident: string }; + +export type TokenLit = + | { + kind: "lit_string"; + value: string; + } + | { + kind: "lit_int"; + value: number; + }; + +export type TokenKind = { kind: DatalessToken } | TokenIdent | TokenLit; export type Token = TokenKind & { span: Span; }; +export type BaseToken = { kind: Token["kind"] }; + +const SINGLE_PUNCT: string[] = [ + "(", + ")", + ";", + ",", + "+", + "-", + "*", + "/", + "&", + "|", +]; + export function tokenize(input: string): Token[] { const tokens: Token[] = []; let i = 0; @@ -25,65 +64,102 @@ export function tokenize(input: string): Token[] { finish: while (i < input.length) { const next = input[i]; const span: Span = { start: i, end: i + 1 }; - switch (next) { - case undefined: { - break finish; - } - case "(": { - tokens.push({ kind: "p_popen", span }); - break; - } - case ")": { - tokens.push({ kind: "p_pclose", span }); - break; - } - case "{": { - tokens.push({ kind: "p_bopen", span }); - break; - } - case "}": { - tokens.push({ kind: "p_bclose", span }); - break; - } - case ";": { - tokens.push({ kind: "p_semi", span }); - break; - } - case '"': { - while (true) { - const next = input[i + 1]; - span.end++; - i++; - if (next === '"') { - break; - } - if (next === undefined) { - throw new CompilerError(`Unterminated string literal`, span); - } + + if (SINGLE_PUNCT.includes(next)) { + tokens.push({ kind: next as DatalessToken, span }); + } else { + switch (next) { + case undefined: { + break finish; } - const value = input.slice(span.start + 1, span.end - 1); - tokens.push({ kind: "lit_string", span, value }); - break; - } - default: { - if (isDigit(next)) { - throw new Error("digit"); - } else if (isIdentStart(next)) { - while (isIdentContinue(input[i + 1])) { + case "=": { + if (input[i + 1] === "=") { span.end++; i++; - } - const ident = input.slice(span.start, span.end); - let kw = isKeyword(ident); - if (kw) { - tokens.push({ kind: kw, span }); + tokens.push({ kind: "==", span }); } else { - tokens.push({ kind: "identifier", span, ident: ident }); + tokens.push({ kind: "=", span }); + } + break; + } + case ">": { + if (input[i + 1] === "=") { + span.end++; + i++; + tokens.push({ kind: ">=", span }); + } else { + tokens.push({ kind: ">", span }); + } + break; + } + case "<": { + if (input[i + 1] === "=") { + span.end++; + i++; + tokens.push({ kind: "<=", span }); + } else { + tokens.push({ kind: "<", span }); + } + break; + } + case "!": { + if (input[i + 1] === "=") { + span.end++; + i++; + tokens.push({ kind: "!=", span }); + } else { + tokens.push({ kind: "!", span }); + } + break; + } + case '"': { + while (true) { + const next = input[i + 1]; + span.end++; + i++; + if (next === '"') { + break; + } + if (next === undefined) { + throw new CompilerError(`Unterminated string literal`, span); + } + } + const value = input.slice(span.start + 1, span.end - 1); + tokens.push({ kind: "lit_string", span, value }); + break; + } + default: { + if (isDigit(next)) { + while (isDigit(input[i + 1])) { + span.end++; + i++; + } + const digit = input.slice(span.start, span.end); + const int = parseInt(digit, 10); + if (Number.isNaN(int)) { + throw new Error( + `\`${digit}\` was tokenized to a number even though it is not` + ); + } + + tokens.push({ kind: "lit_int", value: int, span }); + } else if (isIdentStart(next)) { + while (isIdentContinue(input[i + 1])) { + span.end++; + i++; + } + const ident = input.slice(span.start, span.end); + let kw = isKeyword(ident); + if (kw) { + tokens.push({ kind: kw, span }); + } else { + tokens.push({ kind: "identifier", span, ident: ident }); + } + } else if (isWhitespace(next)) { + // ignore + } else { + throw new CompilerError(`Invalid character: \`${next}\``, span); } - } else if (isWhitespace(next)) { - // ignore - } else { - throw new CompilerError(`Invalid character: \`${next}\``, span); } } } @@ -117,10 +193,7 @@ function isWhitespace(char: string): boolean { return char === " " || char === "\t" || char === "\n" || char === "\r"; } -const keywords = new Map([ - ["function", "kw_function"], - ["let", "kw_let"], -]); +const keywords = new Set(["function", "let", "in"]); function isKeyword(kw: string): DatalessToken | undefined { - return keywords.get(kw); + return keywords.has(kw) ? (kw as DatalessToken) : undefined; } diff --git a/src/parser.ts b/src/parser.ts index 4cf2a26..df74686 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -1,45 +1,282 @@ -import { FunctionDef, Item } from "./ast"; +import { + ARITH_FACTOR_KINDS, + ARITH_TERM_KINDS, + BinaryKind, + COMPARISON_KINDS, + Expr, + FunctionDef, + Item, + LOGICAL_KINDS, + UNARY_KINDS, + UnaryKind, +} from "./ast"; import { CompilerError, todo } from "./error"; -import { Token } from "./lexer"; +import { BaseToken, Token, TokenIdent } from "./lexer"; + +type Parser = (t: Token[]) => [Token[], T]; export function parse(t: Token[]): Item[] { - const items: Item[] = []; + const items: Item[] = []; - while (t.length > 0) { - let item; - [t, item] = parseItem(t); - items.push(item); - } + while (t.length > 0) { + let item; + [t, item] = parseItem(t); + items.push(item); + } - return items; + return items; } function parseItem(t: Token[]): [Token[], Item] { - let next; - [t, next] = nextT(t); - if (next.kind === "kw_function") { + let tok; + [t, tok] = next(t); + if (tok.kind === "function") { + let name; + [t, name] = expectNext(t, "identifier"); - const def: FunctionDef = { - name: "", - args: [], - body: todo("todo", next.span) - } + [t] = expectNext(t, "("); + [t] = expectNext(t, ")"); + [t] = expectNext(t, "="); - return [t, {kind: "function", node: def, span: next.span}] - } else { - unexpectedToken(next); - } + let body; + [t, body] = parseExpr(t); + + [t] = expectNext(t, ";"); + + const def: FunctionDef = { + name: name.ident, + args: [], + body, + }; + + return [t, { kind: "function", node: def, span: tok.span }]; + } else { + unexpectedToken(tok); + } } -function nextT(t: Token[]): [Token[], Token] { - const next = t[0]; - if (!next) { - throw new CompilerError("unexpected end of file", {start: Number.MAX_SAFE_INTEGER, end: Number.MAX_SAFE_INTEGER}) +function parseExpr(t: Token[]): [Token[], Expr] { + /* + EXPR = { "let" NAME "=" EXPR "in" EXPR | COMPARISON } + + // The precende here is pretty arbitrary since we forbid mixing of operators + // with different precedence classes anyways. + COMPARISON = LOGICAL { ( ">" | "<" | "==" | "<=" | ">=" | "!=" ) COMPARISON } + LOGICAL = ARITH_TERM { ( "&" | "|" ) LOGICAL } + + // Here it matters though. + ARITH_TERM = ATOM { ( "+" | "-" ) ARITH_TERM } + ARITH_FACTOR = UNARY { ( "*" | "/" ) ARITH_FACTOR } + + UNARY = { "!" | "-" } CALL + + CALL = ATOM { "(" EXPR_LIST ")" } + + ATOM = "(" { EXPR ";" } EXPR ")" | IDENT | LITERAL | EMPTY + EMPTY = + EXPR_LIST = { EXPR { "," EXPR } { "," } } + */ + const [, peak] = next(t); + + if (peak.kind === "let") { + [t] = next(t); + let name; + [t, name] = expectNext(t, "identifier"); + expectNext(t, "="); + let rhs; + [t, rhs] = parseExpr(t); + expectNext(t, "in"); + let after; + [t, after] = parseExpr(t); + + return [t, { kind: "let", name: name.ident, rhs, after, span: t[0].span }]; + } + + return parseExprComparison(t); +} + +function mkParserExprBinary( + lower: Parser, + kinds: string[] +): Parser { + function parser(t: Token[]): [Token[], Expr] { + let lhs; + [t, lhs] = lower(t); + + const [, peak] = next(t); + if (kinds.includes(peak.kind)) { + [t] = next(t); + let rhs; + [t, rhs] = parser(t); + const span = peak.span; + return [ + t, + { kind: "binary", binaryKind: peak.kind as BinaryKind, lhs, rhs, span }, + ]; } - const rest = t.slice(1); - return [rest, next]; + + return [t, lhs]; + } + + return parser; +} + +const parseExprArithFactor = mkParserExprBinary( + parseExprUnary, + ARITH_FACTOR_KINDS +); + +const parseExprArithTerm = mkParserExprBinary( + parseExprArithFactor, + ARITH_TERM_KINDS +); + +const parseExprLogical = mkParserExprBinary(parseExprArithTerm, LOGICAL_KINDS); + +const parseExprComparison = mkParserExprBinary( + parseExprLogical, + COMPARISON_KINDS +); + +function parseExprUnary(t: Token[]): [Token[], Expr] { + const [, peak] = next(t); + if (peak.kind in UNARY_KINDS) { + let rhs; + [t, rhs] = parseExprUnary(t); + return [ + t, + { + kind: "unary", + unaryKind: peak.kind as UnaryKind, + rhs, + span: peak.span, + }, + ]; + } + + return parseExprCall(t); +} +function parseExprCall(t: Token[]): [Token[], Expr] { + let lhs: Expr; + [t, lhs] = parseExprAtom(t); + + while (next(t)[1].kind === "(") { + let popen; + [t, popen] = next(t); + const args = []; + while (next(t)[1].kind !== ")") { + let arg; + [t, arg] = parseExpr(t); + args.push(arg); + // TODO i think this is incorrect + [t] = eat(t, ","); + } + [t] = expectNext(t, ")"); + + lhs = { kind: "call", span: popen.span, lhs, args }; + } + + return [t, lhs]; +} + +function parseExprAtom(startT: Token[]): [Token[], Expr] { + let [t, tok] = next(startT); + + if (tok.kind === "(") { + let expr: Expr; + [t, expr] = parseExpr(t); + + const exprs = [expr]; + while (next(t)[1].kind !== ")") { + [t] = expectNext(t, ";"); + [t, expr] = parseExpr(t); + exprs.push(expr); + } + [t] = expectNext(t, ")"); + + return [t, { kind: "block", span: tok.span, exprs }]; + } + + if (tok.kind === "lit_string") { + return [ + t, + { + kind: "literal", + span: tok.span, + value: { kind: "str", value: tok.value }, + }, + ]; + } + + if (tok.kind === "lit_int") { + return [ + t, + { + kind: "literal", + span: tok.span, + value: { kind: "int", value: tok.value }, + }, + ]; + } + + if (tok.kind === "identifier") { + return [t, { kind: "ident", span: tok.span, value: tok.ident }]; + } + + // Parse nothing at all. + return [startT, { kind: "empty", span: tok.span }]; +} + +// helpers + +function eat( + t: Token[], + kind: T["kind"] +): [Token[], T | undefined] { + const [tnext, tok] = next(t); + if (tok.kind === kind) { + return [tnext, tok as unknown as T]; + } + return [t, undefined]; +} + +function expectNext( + t: Token[], + kind: T["kind"] +): [Token[], T] { + let tok; + [t, tok] = next(t); + const token = expectToken(kind, tok); + return [t, token]; +} + +function next(t: Token[]): [Token[], Token] { + const [rest, next] = maybeNextT(t); + if (!next) { + throw new CompilerError("unexpected end of file", { + start: Number.MAX_SAFE_INTEGER, + end: Number.MAX_SAFE_INTEGER, + }); + } + return [rest, next]; +} + +function maybeNextT(t: Token[]): [Token[], Token | undefined] { + const next = t[0]; + const rest = t.slice(1); + return [rest, next]; } function unexpectedToken(token: Token): never { - throw new CompilerError("unexpected token", token.span); + throw new CompilerError("unexpected token", token.span); +} + +function expectToken(kind: T["kind"], token: Token): T { + if (token.kind !== kind) { + throw new CompilerError( + `expected ${kind}, found ${token.kind}`, + token.span + ); + } + return token as unknown as T; } diff --git a/src/printer.ts b/src/printer.ts new file mode 100644 index 0000000..1b23411 --- /dev/null +++ b/src/printer.ts @@ -0,0 +1,96 @@ +import { Expr, FunctionDef, Item } from "./ast"; + +export function printAst(ast: Item[]): string { + return ast.map(printItem).join("\n"); +} + +function printItem(item: Item): string { + switch (item.kind) { + case "function": { + return printFunction(item.node); + } + } +} + +function printFunction(func: FunctionDef): string { + const args = func.args.map(({ name }) => name).join(", "); + return `function ${func.name}(${args}) = ${printExpr(func.body, 0)}`; +} + +function printExpr(expr: Expr, indent: number): string { + switch (expr.kind) { + case "empty": { + return ""; + } + case "let": { + return `let ${expr.name} = ${printExpr(expr.rhs, 1)} in${linebreak( + indent + )}`; + } + case "block": { + const exprs = expr.exprs.map((expr) => printExpr(expr, indent + 1)); + + if (exprs.length === 1) { + return `(${exprs[0]})`; + } + const shortExprs = + exprs.map((s) => s.length).reduce((a, b) => a + b, 0) < 40; + if (shortExprs) { + const alreadyHasTrailingSpace = expr.exprs[exprs.length - 1]?.kind === "empty"; + const trailingSpace = alreadyHasTrailingSpace ? "" : " "; + return `( ${exprs.join("; ")}${trailingSpace})`; + } else { + const joiner = `;${linebreak(indent + 1)}`; + return ( + `(${linebreak(indent + 1)}` + + `${exprs.join(joiner)}` + + `${linebreak(indent)})` + ); + } + } + case "literal": { + switch (expr.value.kind) { + case "str": { + return `"${expr.value.value}"`; + } + case "int": { + return `${expr.value.value}`; + } + } + } + case "ident": { + return expr.value; + } + case "binary": { + return `${printExpr(expr.lhs, indent)} ${expr.binaryKind} ${printExpr( + expr.rhs, + indent + )}`; + } + case "unary": { + return `${expr.unaryKind}${printExpr(expr.rhs, indent)}`; + } + case "call": { + const args = expr.args.map((arg) => printExpr(arg, indent + 1)); + const shortArgs = + args.map((s) => s.length).reduce((a, b) => a + b, 0) < 40; + if (shortArgs) { + return `${printExpr(expr.lhs, indent)}(${args.join(", ")})`; + } else { + return ( + `${printExpr(expr.lhs, indent)}(${linebreak(indent + 1)}` + + `${args.join(linebreak(indent + 1))}` + + `${linebreak(indent)})` + ); + } + } + } +} + +function linebreak(indent: number): string { + return `\n${ind(indent)}`; +} + +function ind(indent: number): string { + return " ".repeat(indent * 2); +} diff --git a/tsconfig.json b/tsconfig.json index 7ee82d7..b520cbf 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -95,7 +95,7 @@ // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ - "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ + "noFallthroughCasesInSwitch": true /* Enable error reporting for fallthrough cases in switch statements. */, // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */