riverdelta/src/lexer.ts

333 lines
7.2 KiB
TypeScript

import { CompilerError, LoadedFile, Span } from "./error";
export type DatalessToken =
| "function"
| "let"
| "if"
| "then"
| "else"
| "type"
| "loop"
| "break"
| "import"
| "extern"
| "mod"
| "global"
| "struct"
| "("
| ")"
| "{"
| "}"
| "["
| "]"
| ";"
| ":"
| "."
| ","
| "="
| "+"
| "-"
| "*"
| "/"
| "%"
| "&"
| "|"
| "!"
| "<"
| ">"
| "=="
| "<="
| ">="
| "!="
| "end of file";
export type TokenIdent = { kind: "identifier"; ident: string };
export type TokenLitString = {
kind: "lit_string";
value: string;
};
export type LitIntType = "Int" | "I32";
export type TokenLit =
| TokenLitString
| {
kind: "lit_int";
value: number;
type: LitIntType;
};
export type TokenKind = { kind: DatalessToken } | TokenIdent | TokenLit;
export type Token = TokenKind & {
span: Span;
};
export type BaseToken = { kind: Token["kind"] };
const SINGLE_PUNCT: string[] = [
"(",
")",
"}",
"{",
"[",
"]",
";",
":",
".",
",",
"+",
"-",
"*",
"/",
"&",
"|",
"%",
];
export function tokenize(file: LoadedFile): Token[] {
const { content: input } = file;
const tokens: Token[] = [];
let i = 0;
finish: while (i < input.length) {
const next = input[i];
const span: Span = new Span(i, i + 1, file);
if (next === "/" && input[i + 1] === "/") {
while (input[i] !== "\n") {
i++;
}
continue;
}
if (next === "/" && input[i + 1] === "*") {
i++;
i++;
while (input[i] !== "*" && input[i + 1] !== "/") {
i++;
if (input[i] === undefined) {
throw new CompilerError("unterminated block comment", span);
}
}
i++;
i++;
continue;
}
if (SINGLE_PUNCT.includes(next)) {
tokens.push({ kind: next as DatalessToken, span });
} else {
switch (next) {
case undefined: {
break finish;
}
case "=": {
if (input[i + 1] === "=") {
span.end++;
i++;
tokens.push({ kind: "==", span });
} else {
tokens.push({ kind: "=", span });
}
break;
}
case ">": {
if (input[i + 1] === "=") {
span.end++;
i++;
tokens.push({ kind: ">=", span });
} else {
tokens.push({ kind: ">", span });
}
break;
}
case "<": {
if (input[i + 1] === "=") {
span.end++;
i++;
tokens.push({ kind: "<=", span });
} else {
tokens.push({ kind: "<", span });
}
break;
}
case "!": {
if (input[i + 1] === "=") {
span.end++;
i++;
tokens.push({ kind: "!=", span });
} else {
tokens.push({ kind: "!", span });
}
break;
}
case '"': {
const result = [];
while (true) {
const next = input[i + 1];
span.end++;
i++;
if (next === '"') {
break;
}
if (next === "\\") {
span.end++;
i++;
switch (input[i]) {
case "\\":
result.push("\\");
break;
case '"':
result.push('"');
break;
case "n":
result.push("\n");
break;
case "r":
result.push("\r");
break;
case "t":
result.push("\t");
break;
case "a":
result.push("\x07");
break;
case "3":
// device control 3 for callie's big project
result.push("\x13");
break;
case "M":
// end of medium for callie's big project
result.push("\x19");
break;
default:
throw new CompilerError(
`invalid escape character: ${input[i]}`,
new Span(span.end - 1, span.end, file),
);
}
continue;
}
result.push(next);
if (next === undefined) {
throw new CompilerError(`Unterminated string literal`, span);
}
}
const value = result.join("");
tokens.push({ kind: "lit_string", span, value });
break;
}
default: {
if (isDigit(next)) {
while (isDigit(input[i + 1])) {
span.end++;
i++;
}
const digit = input.slice(span.start, span.end);
const int = parseInt(digit, 10);
if (Number.isNaN(int)) {
throw new Error(
`\`${digit}\` was tokenized to a number even though it is not`,
);
}
let type: LitIntType = "Int";
if (input[i + 1] === "_" && isIdentStart(input[i + 2])) {
if (input.slice(i + 2, i + 5) === "Int") {
i += 4;
type = "Int";
} else if (input.slice(i + 2, i + 5) === "I32") {
i += 4;
type = "I32";
}
}
tokens.push({ kind: "lit_int", value: int, span, type });
} else if (isIdentStart(next)) {
while (isIdentContinue(input[i + 1])) {
span.end++;
i++;
}
const ident = input.slice(span.start, span.end);
const kw = isKeyword(ident);
if (kw) {
tokens.push({ kind: kw, span });
} else {
tokens.push({ kind: "identifier", span, ident: ident });
}
} else if (isWhitespace(next)) {
// ignore
} else {
throw new CompilerError(`invalid character: \`${next}\``, span);
}
}
}
}
i++;
}
return tokens;
}
export function isValidIdent(ident: string): boolean {
if (!isIdentStart(ident[0])) {
return false;
}
for (let i = 1; i < ident.length; i++) {
const char = ident[i];
if (!isIdentContinue(char)) {
return false;
}
}
return true;
}
function isIdentStart(char: string): boolean {
return (
(char <= "Z" && char >= "A") || (char <= "z" && char >= "a") || char === "_"
);
}
function isIdentContinue(char: string): boolean {
return (
(char <= "Z" && char >= "A") ||
(char <= "z" && char >= "a") ||
char === "_" ||
isDigit(char)
);
}
function isDigit(char: string): boolean {
return !Number.isNaN(parseInt(char, 10));
}
function isWhitespace(char: string): boolean {
return char === " " || char === "\t" || char === "\n" || char === "\r";
}
const KEYOWRDS: DatalessToken[] = [
"function",
"let",
"if",
"then",
"else",
"type",
"loop",
"break",
"import",
"extern",
"mod",
"global",
"struct",
];
const KEYWORD_SET = new Set<string>(KEYOWRDS);
function isKeyword(kw: string): DatalessToken | undefined {
return KEYWORD_SET.has(kw) ? (kw as DatalessToken) : undefined;
}