parser

2026-03-14 20:26:06 +01:00 · 2023-07-23 14:02:53 +02:00 · 2023-07-23 14:02:53 +02:00 · 4e95bc05a3
commit 4e95bc05a3
parent 91b183c002
9 changed files with 640 additions and 132 deletions
--- a/jest.config.js
+++ b/jest.config.js
@ -1,5 +1,5 @@
 /** @type {import('ts-jest').JestConfigWithTsJest} */
 module.exports = {
-  preset: 'ts-jest',
+  preset: "ts-jest",
-  testEnvironment: 'node',
+  testEnvironment: "node",
-};
+};
--- a/src/ast.ts
+++ b/src/ast.ts
@ -1,33 +1,116 @@
 import { Span } from "./error";
 export type ItemKind = {
-    kind: "function",
+  kind: "function";
-    node: FunctionDef,
+  node: FunctionDef;
 };
 export type Item = ItemKind & {
-    span: Span,
+  span: Span;
-}
+};
 export type FunctionDef = {
-    name: string,
+  name: string;
-    args: FunctionArg[], 
+  args: FunctionArg[];
-    body: Expr,
+  body: Expr;
-}
+};
 export type FunctionArg = {
-    name: string,
+  name: string;
-    span: Span,
+  span: Span;
-}
+};
-export type ExprKind = {
+export type ExprKind =
-    kind: "lit_string",
+  | { kind: "empty" }
-    value: string,
+  | { kind: "let"; name: string; rhs: Expr; after: Expr }
-} | {
+  | { kind: "block"; exprs: Expr[] }
-    kind: "ident",
+  | {
-    value: string,
+      kind: "literal";
-}
+      value: Literal;
    }
  | {
      kind: "ident";
      value: string;
    }
  | {
      kind: "binary";
      binaryKind: BinaryKind;
      lhs: Expr;
      rhs: Expr;
    }
  | {
    kind: "unary",
    unaryKind: UnaryKind,
    rhs: Expr,
  }
  | {
    kind: "call",
    lhs: Expr,
    args: Expr[],
  };
 export type Expr = ExprKind & {
-    span: Span,
+  span: Span;
 };
 /** A literal value: a string tagged `str` or a number tagged `int`. */
 export type Literal =
   | { kind: "str"; value: string }
   | { kind: "int"; value: number };
 /** Every binary operator that a `binary` expression node can carry. */
 export type BinaryKind =
   | "+"
   | "-"
   | "*"
   | "/"
   | "&"
   | "|"
   | "<"
   | ">"
   | "=="
   | "<="
   | ">="
   | "!=";
 /** Comparison operators. */
 export const COMPARISON_KINDS: BinaryKind[] = [
   ">",
   "<",
   "==",
   "<=",
   ">=",
   "!=",
 ];
 /** Logical operators. */
 export const LOGICAL_KINDS: BinaryKind[] = ["&", "|"];
 /** Additive arithmetic operators. */
 export const ARITH_TERM_KINDS: BinaryKind[] = ["+", "-"];
 /** Multiplicative arithmetic operators. */
 export const ARITH_FACTOR_KINDS: BinaryKind[] = ["*", "/"];
 // Maps each binary operator to its precedence class number. All four
 // arithmetic operators share class 0; every other operator has its own class.
 const BINARY_KIND_PREC_CLASS = new Map<BinaryKind, number>([
   ["+", 0],
   ["-", 0],
   ["*", 0],
   ["/", 0],
   ["&", 1],
   ["|", 2],
   ["<", 3],
   [">", 4],
   ["==", 5],
   ["<=", 6],
   [">=", 7],
   ["!=", 8],
 ]);
 /**
  * Looks up the precedence class of a binary operator.
  *
  * @param k - The operator to look up.
  * @returns The class number stored in the precedence table (0 for arithmetic).
  * @throws Error if `k` has no entry in the precedence table.
  */
 export function binaryExprPrecedenceClass(k: BinaryKind): number {
   const cls = BINARY_KIND_PREC_CLASS.get(k);
   // Compare against `undefined` explicitly: class 0 is a valid result but is
   // falsy, so a truthiness check would reject every arithmetic operator.
   if (cls === undefined) {
     throw new Error(`Invalid binary kind: ${k}`);
   }
   return cls;
 }
 /** Operators that a `unary` expression node can carry. */
 export type UnaryKind = "!" | "-";
 /** All unary operators, as a runtime list. */
 export const UNARY_KINDS: UnaryKind[] = ["!", "-"];
--- a/src/error.ts
+++ b/src/error.ts
@ -3,6 +3,13 @@ export type Span = {
  end: number;
 };
 /**
  * Builds the smallest span that contains both `a` and `b`: it starts at the
  * lower of the two starts and ends at the higher of the two ends.
  */
 export function spanMerge(a: Span, b: Span): Span {
   const start = Math.min(a.start, b.start);
   const end = Math.max(a.end, b.end);
   return { start, end };
 }
 export class CompilerError extends Error {
  msg: string;
  span: Span;
--- a/src/index.ts
+++ b/src/index.ts
@ -1,18 +1,30 @@
 import { withErrorHandler } from "./error";
 import { tokenize } from "./lexer";
 import { parse } from "./parser";
 import { printAst } from "./printer";
 const input = `
-function hello() {}
+function main() = (
  print("Hello, world!");
  "uwu";
 );
 `;
 function main() {
  withErrorHandler(input, () => {
    const tokens = tokenize(input);
    console.log("-----TOKENS---");
    console.log(tokens);
    const ast = parse(tokens);
-    console.log(ast);
+    console.log("-----AST------");
    console.dir(ast, { depth: 10 });
    const printed = printAst(ast);
    console.log("-----AST pretty------");
    console.log(printed);
  });
 }
--- a/src/lexer.test.ts
+++ b/src/lexer.test.ts
@ -1,17 +1,17 @@
 import { tokenize } from "./lexer";
-it('should tokenize an emtpy function', () => {
+it("should tokenize an emtpy function", () => {
-    const input = `function hello() {}`;
+  const input = `function hello() = {}`;
-    const tokens = tokenize(input);
+  const tokens = tokenize(input);
-    expect(tokens).toMatchSnapshot();
+  expect(tokens).toMatchSnapshot();
 });
-it('should tokenize hello world', () => {
+it("should tokenize hello world", () => {
-    const input = `print("hello world")`;
+  const input = `print("hello world")`;
-    const tokens = tokenize(input);
+  const tokens = tokenize(input);
-    expect(tokens).toMatchSnapshot();
+  expect(tokens).toMatchSnapshot();
-});
+});
--- a/src/lexer.ts
+++ b/src/lexer.ts
@ -1,23 +1,62 @@
 import { CompilerError, Span } from "./error";
 export type DatalessToken =
-  | "kw_function"
+  | "function"
-  | "kw_let"
+  | "let"
-  | "p_popen"
+  | "in"
-  | "p_pclose"
+  | "("
-  | "p_bopen"
+  | ")"
-  | "p_bclose"
+  | ";"
-  | "p_semi";
+  | ","
  | "="
  | "+"
  | "-"
  | "*"
  | "/"
  | "&"
  | "|"
  | "!"
  | "<"
  | ">"
  | "=="
  | "<="
  | ">="
  | "!="
  | "!";
-export type TokenKind =
+export type TokenIdent = { kind: "identifier"; ident: string };
-  | { kind: DatalessToken }
+
-  | { kind: "identifier"; ident: string }
+export type TokenLit =
-  | { kind: "lit_string"; value: string };
+  | {
      kind: "lit_string";
      value: string;
    }
  | {
      kind: "lit_int";
      value: number;
    };
 /** Any token payload: a dataless token, an identifier, or a literal. */
 export type TokenKind = { kind: DatalessToken } | TokenIdent | TokenLit;
 /** A token payload together with the span attached to it. */
 export type Token = TokenKind & {
   span: Span;
 };
 /** Minimal shape carrying only a token's `kind` discriminant. */
 export type BaseToken = { kind: Token["kind"] };
 // Characters that are emitted as a token on their own; each character of the
 // string below becomes one entry of the list.
 const SINGLE_PUNCT: string[] = [..."();,+-*/&|"];
 export function tokenize(input: string): Token[] {
  const tokens: Token[] = [];
  let i = 0;
@ -25,65 +64,102 @@ export function tokenize(input: string): Token[] {
  finish: while (i < input.length) {
    const next = input[i];
    const span: Span = { start: i, end: i + 1 };
-    switch (next) {
+
-      case undefined: {
+    if (SINGLE_PUNCT.includes(next)) {
-        break finish;
+      tokens.push({ kind: next as DatalessToken, span });
-      }
+    } else {
-      case "(": {
+      switch (next) {
-        tokens.push({ kind: "p_popen", span });
+        case undefined: {
-        break;
+          break finish;
      }
      case ")": {
        tokens.push({ kind: "p_pclose", span });
        break;
      }
      case "{": {
        tokens.push({ kind: "p_bopen", span });
        break;
      }
      case "}": {
        tokens.push({ kind: "p_bclose", span });
        break;
      }
      case ";": {
        tokens.push({ kind: "p_semi", span });
        break;
      }
      case '"': {
        while (true) {
          const next = input[i + 1];
          span.end++;
          i++;
          if (next === '"') {
            break;
          }
          if (next === undefined) {
            throw new CompilerError(`Unterminated string literal`, span);
          }
        }
-        const value = input.slice(span.start + 1, span.end - 1);
+        case "=": {
-        tokens.push({ kind: "lit_string", span, value });
+          if (input[i + 1] === "=") {
        break;
      }
      default: {
        if (isDigit(next)) {
          throw new Error("digit");
        } else if (isIdentStart(next)) {
          while (isIdentContinue(input[i + 1])) {
            span.end++;
            i++;
-          }
+            tokens.push({ kind: "==", span });
          const ident = input.slice(span.start, span.end);
          let kw = isKeyword(ident);
          if (kw) {
            tokens.push({ kind: kw, span });
          } else {
-            tokens.push({ kind: "identifier", span, ident: ident });
+            tokens.push({ kind: "=", span });
          }
          break;
        }
        case ">": {
          if (input[i + 1] === "=") {
            span.end++;
            i++;
            tokens.push({ kind: ">=", span });
          } else {
            tokens.push({ kind: ">", span });
          }
          break;
        }
        case "<": {
          if (input[i + 1] === "=") {
            span.end++;
            i++;
            tokens.push({ kind: "<=", span });
          } else {
            tokens.push({ kind: "<", span });
          }
          break;
        }
        case "!": {
          if (input[i + 1] === "=") {
            span.end++;
            i++;
            tokens.push({ kind: "!=", span });
          } else {
            tokens.push({ kind: "!", span });
          }
          break;
        }
        case '"': {
          while (true) {
            const next = input[i + 1];
            span.end++;
            i++;
            if (next === '"') {
              break;
            }
            if (next === undefined) {
              throw new CompilerError(`Unterminated string literal`, span);
            }
          }
          const value = input.slice(span.start + 1, span.end - 1);
          tokens.push({ kind: "lit_string", span, value });
          break;
        }
        default: {
          if (isDigit(next)) {
            while (isDigit(input[i + 1])) {
              span.end++;
              i++;
            }
            const digit = input.slice(span.start, span.end);
            const int = parseInt(digit, 10);
            if (Number.isNaN(int)) {
              throw new Error(
                `\`${digit}\` was tokenized to a number even though it is not`
              );
            }
            tokens.push({ kind: "lit_int", value: int, span });
          } else if (isIdentStart(next)) {
            while (isIdentContinue(input[i + 1])) {
              span.end++;
              i++;
            }
            const ident = input.slice(span.start, span.end);
            let kw = isKeyword(ident);
            if (kw) {
              tokens.push({ kind: kw, span });
            } else {
              tokens.push({ kind: "identifier", span, ident: ident });
            }
          } else if (isWhitespace(next)) {
            // ignore
          } else {
            throw new CompilerError(`Invalid character: \`${next}\``, span);
          }
        } else if (isWhitespace(next)) {
          // ignore
        } else {
          throw new CompilerError(`Invalid character: \`${next}\``, span);
        }
      }
    }
@ -117,10 +193,7 @@ function isWhitespace(char: string): boolean {
  return char === " " || char === "\t" || char === "\n" || char === "\r";
 }
-const keywords = new Map<string, DatalessToken>([
+const keywords = new Set<string>(["function", "let", "in"]);
  ["function", "kw_function"],
  ["let", "kw_let"],
 ]);
 function isKeyword(kw: string): DatalessToken | undefined {
-  return keywords.get(kw);
+  return keywords.has(kw) ? (kw as DatalessToken) : undefined;
 }
--- a/src/parser.ts
+++ b/src/parser.ts
@ -1,45 +1,282 @@
-import { FunctionDef, Item } from "./ast";
+import {
  ARITH_FACTOR_KINDS,
  ARITH_TERM_KINDS,
  BinaryKind,
  COMPARISON_KINDS,
  Expr,
  FunctionDef,
  Item,
  LOGICAL_KINDS,
  UNARY_KINDS,
  UnaryKind,
 } from "./ast";
 import { CompilerError, todo } from "./error";
-import { Token } from "./lexer";
+import { BaseToken, Token, TokenIdent } from "./lexer";
 /** A parsing function: takes a token list and returns a token list paired with the parsed value. */
 type Parser<T> = (t: Token[]) => [Token[], T];
 export function parse(t: Token[]): Item[] {
-    const items: Item[] = [];
+  const items: Item[] = [];
-    while (t.length > 0) {
+  while (t.length > 0) {
-        let item;
+    let item;
-        [t, item] = parseItem(t);
+    [t, item] = parseItem(t);
-        items.push(item);
+    items.push(item);
-    }
+  }
-    return items;
+  return items;
 }
 function parseItem(t: Token[]): [Token[], Item] {
-    let next;
+  let tok;
-    [t, next] = nextT(t);
+  [t, tok] = next(t);
-    if (next.kind === "kw_function") {
+  if (tok.kind === "function") {
    let name;
    [t, name] = expectNext<TokenIdent>(t, "identifier");
-        const def: FunctionDef = {
+    [t] = expectNext(t, "(");
-            name: "",
+    [t] = expectNext(t, ")");
-            args: [],
+    [t] = expectNext(t, "=");
            body: todo("todo", next.span)
        }
-        return [t, {kind: "function", node: def, span: next.span}]
+    let body;
-    } else {
+    [t, body] = parseExpr(t);
-        unexpectedToken(next);
+
-    }
+    [t] = expectNext(t, ";");
    const def: FunctionDef = {
      name: name.ident,
      args: [],
      body,
    };
    return [t, { kind: "function", node: def, span: tok.span }];
  } else {
    unexpectedToken(tok);
  }
 }
-function nextT(t: Token[]): [Token[], Token] {
+function parseExpr(t: Token[]): [Token[], Expr] {
-    const next = t[0];
+  /*
-    if (!next) {
+  EXPR = { "let" NAME "=" EXPR "in" EXPR | COMPARISON }
-        throw new CompilerError("unexpected end of file", {start: Number.MAX_SAFE_INTEGER, end: Number.MAX_SAFE_INTEGER})
+
  // The precedence here is pretty arbitrary since we forbid mixing of operators
  // with different precedence classes anyway.
  COMPARISON = LOGICAL { ( ">" | "<" | "==" | "<=" | ">=" | "!=" ) COMPARISON }
  LOGICAL = ARITH_TERM { ( "&" | "|" ) LOGICAL }
  // Here it matters though.
  ARITH_TERM = ARITH_FACTOR { ( "+" | "-" ) ARITH_TERM }
  ARITH_FACTOR = UNARY { ( "*" | "/" ) ARITH_FACTOR }
  UNARY = { "!" | "-" } CALL
  CALL = ATOM { "(" EXPR_LIST ")" }
  ATOM = "(" { EXPR ";" } EXPR ")" | IDENT | LITERAL | EMPTY
  EMPTY =
  EXPR_LIST = { EXPR { "," EXPR } { "," } }
  */
  const [, peak] = next(t);
  if (peak.kind === "let") {
    [t] = next(t);
    let name;
    [t, name] = expectNext<TokenIdent>(t, "identifier");
    expectNext(t, "=");
    let rhs;
    [t, rhs] = parseExpr(t);
    expectNext(t, "in");
    let after;
    [t, after] = parseExpr(t);
    return [t, { kind: "let", name: name.ident, rhs, after, span: t[0].span }];
  }
  return parseExprComparison(t);
 }
 function mkParserExprBinary(
  lower: Parser<Expr>,
  kinds: string[]
 ): Parser<Expr> {
  function parser(t: Token[]): [Token[], Expr] {
    let lhs;
    [t, lhs] = lower(t);
    const [, peak] = next(t);
    if (kinds.includes(peak.kind)) {
      [t] = next(t);
      let rhs;
      [t, rhs] = parser(t);
      const span = peak.span;
      return [
        t,
        { kind: "binary", binaryKind: peak.kind as BinaryKind, lhs, rhs, span },
      ];
    }
-    const rest = t.slice(1);
+
-    return [rest, next];
+    return [t, lhs];
  }
  return parser;
 }
 // Chain of binary-expression parsers built with `mkParserExprBinary`. Each
 // parser handles its own operator list and uses the parser declared before it
 // as the `lower` parser for its operands.

 // `*` and `/`, with unary expressions as operands.
 const parseExprArithFactor = mkParserExprBinary(
   parseExprUnary,
   ARITH_FACTOR_KINDS
 );
 // `+` and `-`, with arithmetic factors as operands.
 const parseExprArithTerm = mkParserExprBinary(
   parseExprArithFactor,
   ARITH_TERM_KINDS
 );
 // `&` and `|`, with arithmetic terms as operands.
 const parseExprLogical = mkParserExprBinary(parseExprArithTerm, LOGICAL_KINDS);
 // Comparison operators, with logical expressions as operands.
 const parseExprComparison = mkParserExprBinary(
   parseExprLogical,
   COMPARISON_KINDS
 );
 /**
  * Parses a prefix unary expression (`UNARY = { "!" | "-" } CALL`).
  *
  * If the next token is one of `UNARY_KINDS`, that token is consumed and the
  * operand is parsed recursively, so operators may be stacked (`!!x`, `-!x`).
  * Otherwise parsing is delegated to `parseExprCall`.
  *
  * @param t - Remaining tokens.
  * @returns The tokens left after the expression and the parsed expression.
  *   The span of a `unary` node is the span of its operator token.
  */
 function parseExprUnary(t: Token[]): [Token[], Expr] {
   const [afterOperator, peak] = next(t);
   // Membership must be tested on the array's values; the `in` operator would
   // test its indices ("0", "1", ...) and never match an operator.
   const unaryKinds: string[] = UNARY_KINDS;
   if (unaryKinds.includes(peak.kind)) {
     let rhs;
     // Continue after the operator token; re-parsing from `t` would see the
     // same operator again and recurse without end.
     [t, rhs] = parseExprUnary(afterOperator);
     return [
       t,
       {
         kind: "unary",
         unaryKind: peak.kind as UnaryKind,
         rhs,
         span: peak.span,
       },
     ];
   }
   return parseExprCall(t);
 }
 /**
  * Parses an atom followed by any number of call suffixes
  * (`CALL = ATOM { "(" EXPR_LIST ")" }`).
  *
  * Arguments are separated by `,` and a trailing comma is allowed. Each call
  * wraps the expression parsed so far, so `f(a)(b)` yields nested `call` nodes.
  *
  * @param t - Remaining tokens.
  * @returns The tokens left after the expression and the parsed expression.
  *   The span of a `call` node is the span of its opening parenthesis.
  * @throws CompilerError (via `expectNext`) when an argument is followed by
  *   anything other than `,` or `)`.
  */
 function parseExprCall(t: Token[]): [Token[], Expr] {
   let lhs: Expr;
   [t, lhs] = parseExprAtom(t);
   while (next(t)[1].kind === "(") {
     let popen;
     [t, popen] = next(t);
     const args: Expr[] = [];
     while (next(t)[1].kind !== ")") {
       let arg;
       [t, arg] = parseExpr(t);
       args.push(arg);
       let comma;
       [t, comma] = eat(t, ",");
       if (!comma) {
         // Without a separator the list must end here. Stopping also keeps the
         // loop from spinning forever when `parseExpr` produced an `empty`
         // expression without consuming any token.
         break;
       }
     }
     [t] = expectNext(t, ")");
     lhs = { kind: "call", span: popen.span, lhs, args };
   }
   return [t, lhs];
 }
 /**
  * Parses an atom: a parenthesised block of `;`-separated expressions, a string
  * or integer literal, or an identifier. When the next token starts none of
  * these, an `empty` expression is returned and no token is consumed.
  */
 function parseExprAtom(startT: Token[]): [Token[], Expr] {
   const [afterTok, tok] = next(startT);
   const span = tok.span;
   switch (tok.kind) {
     case "(": {
       let t = afterTok;
       let current: Expr;
       [t, current] = parseExpr(t);
       const exprs = [current];
       for (;;) {
         if (next(t)[1].kind === ")") {
           break;
         }
         [t] = expectNext(t, ";");
         [t, current] = parseExpr(t);
         exprs.push(current);
       }
       [t] = expectNext(t, ")");
       return [t, { kind: "block", span, exprs }];
     }
     case "lit_string": {
       const value: Expr = {
         kind: "literal",
         span,
         value: { kind: "str", value: tok.value },
       };
       return [afterTok, value];
     }
     case "lit_int": {
       const value: Expr = {
         kind: "literal",
         span,
         value: { kind: "int", value: tok.value },
       };
       return [afterTok, value];
     }
     case "identifier": {
       return [afterTok, { kind: "ident", span, value: tok.ident }];
     }
     default: {
       // Nothing matched: hand back the untouched token list.
       return [startT, { kind: "empty", span }];
     }
   }
 }
 // helpers
 /**
  * Consumes the next token only if it has the given kind. On a match, returns
  * the remaining tokens and the token; otherwise returns the original token
  * list and `undefined`.
  */
 function eat<T extends BaseToken>(
   t: Token[],
   kind: T["kind"]
 ): [Token[], T | undefined] {
   const [remaining, candidate] = next(t);
   return candidate.kind === kind
     ? [remaining, candidate as unknown as T]
     : [t, undefined];
 }
 /**
  * Consumes the next token and checks it with `expectToken`. Returns the
  * remaining tokens together with the checked token.
  */
 function expectNext<T extends BaseToken>(
   t: Token[],
   kind: T["kind"]
 ): [Token[], T] {
   const [remaining, tok] = next(t);
   return [remaining, expectToken<T>(kind, tok)];
 }
 /**
  * Splits off the first token. Throws a `CompilerError` reporting an
  * unexpected end of file when there is no token left.
  */
 function next(t: Token[]): [Token[], Token] {
   const [remaining, tok] = maybeNextT(t);
   if (tok) {
     return [remaining, tok];
   }
   // No token is available to take a span from, so the largest safe integer
   // is used for both ends of the reported span.
   const eof = Number.MAX_SAFE_INTEGER;
   throw new CompilerError("unexpected end of file", { start: eof, end: eof });
 }
 /**
  * Splits a token list into everything after the first element and the first
  * element itself, which is `undefined` for an empty list.
  */
 function maybeNextT(t: Token[]): [Token[], Token | undefined] {
   const head = t[0];
   return [t.slice(1), head];
 }
 function unexpectedToken(token: Token): never {
-    throw new CompilerError("unexpected token", token.span);
+  throw new CompilerError("unexpected token", token.span);
 }
 /**
  * Returns `token` typed as `T` when its kind equals `kind`; otherwise throws
  * a `CompilerError` at the token's span naming the expected and found kinds.
  */
 function expectToken<T extends BaseToken>(kind: T["kind"], token: Token): T {
   if (token.kind === kind) {
     return token as unknown as T;
   }
   const message = `expected ${kind}, found ${token.kind}`;
   throw new CompilerError(message, token.span);
 }
--- a/src/printer.ts
+++ b/src/printer.ts
@ -0,0 +1,96 @@
 import { Expr, FunctionDef, Item } from "./ast";
 /** Renders every item with `printItem` and joins the results with newlines. */
 export function printAst(ast: Item[]): string {
   const rendered: string[] = [];
   for (const item of ast) {
     rendered.push(printItem(item));
   }
   return rendered.join("\n");
 }
 /** Renders a single item by dispatching on its kind. */
 function printItem(item: Item): string {
   switch (item.kind) {
     case "function":
       return printFunction(item.node);
   }
 }
 /**
  * Renders a function definition as `function NAME(ARGS) = BODY`, with the
  * argument names comma-separated and the body printed at indent level 0.
  */
 function printFunction(func: FunctionDef): string {
   const argNames = func.args.map((arg) => arg.name);
   const body = printExpr(func.body, 0);
   return "function " + func.name + "(" + argNames.join(", ") + ") = " + body;
 }
 /**
  * Renders an expression as source text.
  *
  * Blocks and call argument lists stay on one line when the combined length of
  * their rendered parts is below 40 characters; otherwise each part goes on its
  * own line, one indent level deeper.
  *
  * @param expr - The expression to render.
  * @param indent - Current indent level, used for the line breaks emitted here.
  * @returns The rendered text; an `empty` expression renders as `""`.
  */
 function printExpr(expr: Expr, indent: number): string {
   switch (expr.kind) {
     case "empty": {
       return "";
     }
     case "let": {
       // The expression the binding is visible in follows `in` on the next
       // line; leaving it out would drop the rest of the program from the output.
       return `let ${expr.name} = ${printExpr(expr.rhs, 1)} in${linebreak(
         indent
       )}${printExpr(expr.after, indent)}`;
     }
     case "block": {
       const exprs = expr.exprs.map((expr) => printExpr(expr, indent + 1));
       if (exprs.length === 1) {
         return `(${exprs[0]})`;
       }
       const shortExprs =
         exprs.map((s) => s.length).reduce((a, b) => a + b, 0) < 40;
       if (shortExprs) {
         // A trailing `empty` expression renders as "", so the "; " joiner
         // already leaves a space before the closing parenthesis.
         const alreadyHasTrailingSpace =
           expr.exprs[exprs.length - 1]?.kind === "empty";
         const trailingSpace = alreadyHasTrailingSpace ? "" : " ";
         return `( ${exprs.join("; ")}${trailingSpace})`;
       } else {
         const joiner = `;${linebreak(indent + 1)}`;
         return (
           `(${linebreak(indent + 1)}` +
           `${exprs.join(joiner)}` +
           `${linebreak(indent)})`
         );
       }
     }
     case "literal": {
       switch (expr.value.kind) {
         case "str": {
           return `"${expr.value.value}"`;
         }
         case "int": {
           return `${expr.value.value}`;
         }
       }
     }
     case "ident": {
       return expr.value;
     }
     case "binary": {
       return `${printExpr(expr.lhs, indent)} ${expr.binaryKind} ${printExpr(
         expr.rhs,
         indent
       )}`;
     }
     case "unary": {
       return `${expr.unaryKind}${printExpr(expr.rhs, indent)}`;
     }
     case "call": {
       const args = expr.args.map((arg) => printExpr(arg, indent + 1));
       const shortArgs =
         args.map((s) => s.length).reduce((a, b) => a + b, 0) < 40;
       if (shortArgs) {
         return `${printExpr(expr.lhs, indent)}(${args.join(", ")})`;
       } else {
         // Keep the comma separators in the multi-line form as well, so the
         // output is still a valid argument list.
         const joiner = `,${linebreak(indent + 1)}`;
         return (
           `${printExpr(expr.lhs, indent)}(${linebreak(indent + 1)}` +
           `${args.join(joiner)}` +
           `${linebreak(indent)})`
         );
       }
     }
   }
 }
 /** Returns a newline followed by the indentation for the given level. */
 function linebreak(indent: number): string {
   return "\n" + ind(indent);
 }
 /**
  * Returns the indentation string for a level: the two-space unit repeated
  * `indent * 2` times, i.e. four spaces per level.
  */
 function ind(indent: number): string {
   const unit = "  ";
   const count = indent * 2;
   return unit.repeat(count);
 }
--- a/tsconfig.json
+++ b/tsconfig.json
@ -95,7 +95,7 @@
    // "noUnusedParameters": true,                       /* Raise an error when a function parameter isn't read. */
    // "exactOptionalPropertyTypes": true,               /* Interpret optional property types as written, rather than adding 'undefined'. */
    // "noImplicitReturns": true,                        /* Enable error reporting for codepaths that do not explicitly return in a function. */
-    "noFallthroughCasesInSwitch": true,                  /* Enable error reporting for fallthrough cases in switch statements. */
+    "noFallthroughCasesInSwitch": true /* Enable error reporting for fallthrough cases in switch statements. */,
    // "noUncheckedIndexedAccess": true,                 /* Add 'undefined' to a type when accessed using an index. */
    // "noImplicitOverride": true,                       /* Ensure overriding members in derived classes are marked with an override modifier. */
    // "noPropertyAccessFromIndexSignature": true,       /* Enforces using indexed accessors for keys declared using an indexed type. */