Create a simple compiler in javascript


Compilers are a very interesting topic. They are used in many different areas, from programming languages to compilers for other languages. In this post we will create a compiler that will take a simple arithmetic expression and convert it into a javascript program that will evaluate the expression.

(add 2 (subtract 4 2))

And generate this output:

add(2, subtract(4, 2));

Note: This post assumes that you have a basic understanding of javascript.

The Steps

Compiler diagram

Our compiler will have 4 main parts:

  • Tokenizer: This will take the input string and will split it into tokens.
  • Parser: This will take the tokens and will generate an AST (Abstract Syntax Tree).
  • Transformer: This will take the AST and will generate a new AST.
  • Code Generator: This will take the new AST and will generate the output string.

Table of contents

Lexer

We will start by creating a lexer that will take a string and return an array of tokens.

lexer.js:

function lexer(input) {
  // current position in the input
  let current = 0;

  // array of tokens
  let tokens = [];

  // loop through the input
  while (current < input.length) {
    let char = input[current];

    // if the character is a parenthesis, add it to the tokens array
    if (char === "(") {
      tokens.push({
        type: "paren",
        value: "(",
      });

      current++;

      continue;
    }

    if (char === ")") {
      tokens.push({
        type: "paren",
        value: ")",
      });

      current++;

      continue;
    }

    // if the character is a space, skip it
    let WHITESPACE = /\s/;
    if (WHITESPACE.test(char)) {
      current++;

      continue;
    }

    // if the character is a number, add it to the tokens array and iterate through the number to get the full number value (e.g. 123) instead of just the first digit (e.g. 1)
    let NUMBERS = /[0-9]/;
    if (NUMBERS.test(char)) {
      let value = "";

      while (NUMBERS.test(char)) {
        value += char;
        char = input[++current];
      }

      tokens.push({
        type: "number",
        value,
      });

      continue;
    }

    let LETTERS = /[a-z]/i;
    if (LETTERS.test(char)) {
      let value = "";

      while (LETTERS.test(char)) {
        value += char;
        char = input[++current];
      }

      tokens.push({
        type: "name",
        value,
      });

      continue;
    }

    if (char === '"') {
      let value = "";

      char = input[++current];

      while (char !== '"') {
        value += char;
        char = input[++current];
      }

      char = input[++current];

      tokens.push({
        type: "string",
        value,
      });

      continue;
    }

    if (char === "+") {
      tokens.push({
        type: "plus",
        value: "+",
      });

      current++;

      continue;
    }

    if (char === "-") {
      tokens.push({
        type: "minus",
        value: "-",
      });

      current++;

      continue;
    }

    if (char === "*") {
      tokens.push({
        type: "times",
        value: "*",
      });

      current++;

      continue;
    }

    if (char === "/") {
      tokens.push({
        type: "divide",
        value: "/",
      });

      current++;

      continue;
    }

    throw new TypeError("I dont know what this character is: " + char);
  }

  return tokens;
}

module.exports = lexer;

Parser

The parser is a program that takes a sequence of tokens as input, and produces a tree as output. The tree is a data structure that represents the structure of the source code. The parser is also responsible for syntax analysis, which is the process of checking the tokens for any syntax errors.

parser.js:

function parser(tokens) {
  let current = 0;

  // recursive function that will iterate through the tokens array and return a node
  function walk() {
    let token = tokens[current];

    if (token.type === "number") {
      current++;

      return {
        type: "NumberLiteral",
        value: token.value,
      };
    }

    if (token.type === "string") {
      current++;

      return {
        type: "StringLiteral",
        value: token.value,
      };
    }

    if (token.type === "paren" && token.value === "(") {
      token = tokens[++current];

      let node = {
        type: "CallExpression",
        name: token.value,
        params: [],
      };

      token = tokens[++current];

      while (
        token.type !== "paren" ||
        (token.type === "paren" && token.value !== ")")
      ) {
        node.params.push(walk());
        token = tokens[current];
      }

      current++;

      return node;
    }

    throw new TypeError(token.type);
  }

  let ast = {
    type: "Program",
    body: [],
  };

  while (current < tokens.length) {
    ast.body.push(walk());
  }

  return ast;
}

module.exports = parser;

Traverser

The traverser is a program that takes an AST and an object containing visitor methods, and traverses the AST recursively while calling the specified methods.

traverser.js:

function traverser(ast, visitor) {
  // recursive function that will iterate through the AST and call the visitor methods
  function traverseArray(array, parent) {
    array.forEach((child) => {
      traverseNode(child, parent);
    });
  }

  // recursive function that will iterate through the AST and call the visitor methods
  function traverseNode(node, parent) {
    let methods = visitor[node.type];

    if (methods && methods.enter) {
      methods.enter(node, parent);
    }

    switch (node.type) {
      case "Program":
        traverseArray(node.body, node);
        break;

      case "CallExpression":
        traverseArray(node.params, node);
        break;

      case "NumberLiteral":
      case "StringLiteral":
        break;

      default:
        throw new TypeError(node.type);
    }

    if (methods && methods.exit) {
      methods.exit(node, parent);
    }
  }

  // start traversing the AST
  traverseNode(ast, null);
}

module.exports = traverser;

Transformer

The transformer is a program that takes an AST and returns a new AST.

transformer.js:

const traverser = require("./traverse");

function transformer(ast) {
  let newAst = {
    type: "Program",
    body: [],
  };

  ast._context = newAst.body;

  traverser(ast, {
    NumberLiteral: {
      enter(node, parent) {
        parent._context.push({
          type: "NumberLiteral",
          value: node.value,
        });
      },
    },

    StringLiteral: {
      enter(node, parent) {
        parent._context.push({
          type: "StringLiteral",
          value: node.value,
        });
      },
    },

    CallExpression: {
      enter(node, parent) {
        let expression = {
          type: "CallExpression",
          callee: {
            type: "Identifier",
            name: node.name,
          },
          arguments: [],
        };

        node._context = expression.arguments;

        if (parent.type !== "CallExpression") {
          expression = {
            type: "ExpressionStatement",
            expression: expression,
          };
        }

        parent._context.push(expression);
      },
    },
  });

  return newAst;
}

module.exports = transformer;

Code Generator

The code generator is a program that takes an AST and returns a string.

codegenerator.js:

function codeGenerator(node) {
  switch (node.type) {
    case "Program":
      return node.body.map(codeGenerator).join("");

    case "ExpressionStatement":
      return codeGenerator(node.expression) + ";";

    case "CallExpression":
      return (
        codeGenerator(node.callee) +
        "(" +
        node.arguments.map(codeGenerator).join(", ") +
        ")"
      );

    case "Identifier":
      return node.name;

    case "NumberLiteral":
      return node.value;

    case "StringLiteral":
      return '"' + node.value + '"';

    default:
      throw new TypeError(node.type);
  }
}

module.exports = codeGenerator;

Compiler

The compiler is a program that takes a string of JavaScript code as input and returns a string of JavaScript code as output.

compiler.js:

const codeGenerator = require("./codegenerator");
const lexer = require("./lexer");
const parser = require("./parser");
const transformer = require("./transformer");

function compiler(input) {
  // tokenize the input
  let tokens = tokenizer(input);

  // parse the tokens
  let ast = parser(tokens);

  // transform the AST
  let newAst = transformer(ast);

  // generate the output
  let output = codeGenerator(newAst);

  return output;
}

module.exports = compiler;

Running the Compiler

const compiler = require("./compiler");

let input = "(add 2 (subtract 4 2))";

let output = compiler(input);

console.log(output);

The Result

add(2, subtract(4, 2));

Further Reading

Compiler Explorer
Writing a Compiler in Go
How to Write a Compiler
The Super Tiny Compiler