当前位置：首页 > 技术 > 正文内容

如何将文本输入解析为可交互的语法树结构

访客技术 2026年6月3日 81

在开发类似 Excel 的表格编辑器时，一个核心需求是实现智能的单元格公式输入框。用户不仅能够输入普通文本和数值，还能编写包含函数（如 IF、VLOOKUP）、引用其他单元格（如 A1）或跨表区域（如 sheet1!$A$1:$C$50）的复杂表达式。为了支持对这些元素的点击高亮与交互响应，系统必须理解输入内容的语义结构。

关键在于将原始字符串转换为结构化的数据表示——即抽象语法树（AST）。一旦获得 AST，就可以遍历它生成带有语义类名的 HTML 片段，并为不同类型的节点绑定事件处理器，从而实现语法高亮、悬停提示和跳转等功能。

目标结构示例

考虑如下公式：

IF(VLOOKUP(A1,"sheet1"!$A$1:$C$50,1,2) > 3, 4, 5)

理想情况下，应将其解析成具有层级关系的对象树，例如：

{
  type: 'FunctionCall',
  name: 'IF',
  args: [
    {
      type: 'BinaryExpression',
      operator: '>',
      left: {
        type: 'FunctionCall',
        name: 'VLOOKUP',
        args: [
          { type: 'CellReference', value: 'A1' },
          { type: 'RangeReference', sheet: 'sheet1', range: '$A$1:$C$50' },
          { type: 'NumericLiteral', value: '1' },
          { type: 'NumericLiteral', value: '2' }
        ]
      },
      right: { type: 'NumericLiteral', value: '3' }
    },
    { type: 'NumericLiteral', value: '4' },
    { type: 'NumericLiteral', value: '5' }
  ]
}

基于此结构，可以轻松映射为带样式的 HTML 元素：

<span class="fn">IF</span>
<span>(</span>
  <span class="expr">
    <span class="fn">VLOOKUP</span>
    <span>(</span>
    <span class="cell">A1</span>
    <span>,</span>
    <span class="ref-sheet">sheet1!</span>
    <span class="range">$A$1:$C$50</span>
    ...
  </span>
<span>)</span>

四阶段解析流程

整个解析过程可分为四个主要阶段：词法分析、语法分析、树变换和代码生成。这种分层设计广泛应用于现代编译器和 DSL 处理工具中。

1. 词法分析器（Tokenizer）

逐字符扫描输入流，识别出有意义的最小单位——"词法单元"（Token），并忽略空白符等无关字符。

function tokenize(input) {
  let pos = 0;
  const tokens = [];

  while (pos < input.length) {
    let ch = input[pos];

    // 括号处理
    if (ch === '(' || ch === ')') {
      tokens.push({ type: 'bracket', value: ch });
      pos++;
      continue;
    }

    // 空白跳过
    if (/\s/.test(ch)) {
      pos++;
      continue;
    }

    // 数字匹配
    if (/[0-9]/.test(ch)) {
      let num = '';
      while (/[0-9]/.test(input[pos])) {
        num += input[pos++];
      }
      tokens.push({ type: 'number', value: num });
      continue;
    }

    // 字符串字面量
    if (ch === '"') {
      let str = '';
      pos++; // 跳过起始引号
      while (input[pos] !== '"' && pos < input.length) {
        str += input[pos++];
      }
      pos++; // 跳过结束引号
      tokens.push({ type: 'string', value: str });
      continue;
    }

    // 标识符（函数名、变量名）
    if (/[a-zA-Z]/.test(ch)) {
      let ident = '';
      while (/[a-zA-Z0-9]/.test(input[pos])) {
        ident += input[pos++];
      }
      tokens.push({ type: 'identifier', value: ident });
      continue;
    }

    throw new Error(`未知字符: ${ch}`);
  }

  return tokens;
}

2. 语法分析器（Parser）

将线性 Token 流构造成树形结构。通过递归下降的方式构建嵌套的表达式节点。

function parse(tokens) {
  let index = 0;

  function walk() {
    let token = tokens[index];

    // 数值节点
    if (token.type === 'number') {
      index++;
      return {
        type: 'NumericLiteral',
        value: token.value
      };
    }

    // 字符串节点
    if (token.type === 'string') {
      index++;
      return {
        type: 'StringLiteral',
        value: token.value
      };
    }

    // 函数调用：以括号开头的标识符序列
    if (
      token.type === 'identifier' &&
      tokens[index + 1] &&
      tokens[index + 1].type === 'bracket' &&
      tokens[index + 1].value === '('
    ) {
      token = tokens[index++]; // 当前标识符
      index++; // 跳过 '('

      const node = {
        type: 'FunctionCall',
        name: token.value,
        args: []
      };

      // 解析参数直到遇到 ')'
      while (
        tokens[index] &&
        !(tokens[index].type === 'bracket' && tokens[index].value === ')')
      ) {
        // 忽略逗号
        if (
          tokens[index].type === 'comma' ||
          (tokens[index].type === 'bracket' && tokens[index].value === ',')
        ) {
          index++;
          continue;
        }
        node.args.push(walk());
      }

      index++; // 跳过 ')'
      return node;
    }

    throw new Error(`无法解析 token: ${token.type}`);
  }

  const ast = {
    type: 'FormulaRoot',
    body: []
  };

  while (index < tokens.length) {
    ast.body.push(walk());
  }

  return ast;
}

3. 树遍历器（Tree Traverser）

提供一种通用机制来访问 AST 中的每个节点，允许在进入和退出时执行自定义逻辑。

function traverse(ast, visitor) {
  function visit(node, parent) {
    const methods = visitor[node.type];

    if (methods && methods.enter) {
      methods.enter(node, parent);
    }

    switch (node.type) {
      case 'FormulaRoot':
        traverseArray(node.body, node);
        break;
      case 'FunctionCall':
        traverseArray(node.args, node);
        break;
      case 'NumericLiteral':
      case 'StringLiteral':
        break;
      default:
        throw new Error(`不支持的节点类型: ${node.type}`);
    }

    if (methods && methods.exit) {
      methods.exit(node, parent);
    }
  }

  function traverseArray(arr, parent) {
    arr.forEach(node => visit(node, parent));
  }

  visit(ast, null);
}

4. 转换器与代码生成器

结合使用 traverser 对 AST 进行改写，并最终输出格式化结果或渲染标记。

function transform(ast) {
  const newAst = {
    type: 'FormattedOutput',
    elements: []
  };

  ast._context = newAst.elements;

  traverse(ast, {
    FunctionCall: {
      enter(node, parent) {
        const callNode = {
          type: 'function',
          name: node.name,
          children: []
        };
        node._context = callNode.children;
        parent._context.push(callNode);
      }
    },
    NumericLiteral: {
      enter(node, parent) {
        parent._context.push({
          type: 'literal-number',
          text: node.value
        });
      }
    },
    StringLiteral: {
      enter(node, parent) {
        parent._context.push({
          type: 'literal-string',
          text: `"${node.value}"`
        });
      }
    }
  });

  return newAst;
}

function generateHtml(nodes) {
  return nodes.map(node => {
    switch (node.type) {
      case 'function':
        const argsHtml = node.children.map(generateHtml).join('');
        return `<span class="func">${node.name}</span>(${argsHtml})`;
      case 'literal-number':
      case 'literal-string':
        return `<span class="${node.type}">${node.text}</span>`;
      default:
        return '';
    }
  }).join('');
}

最终整合为完整的处理管道：

function compile(formula) {
  const tokens = tokenize(formula);
  const ast = parse(tokens);
  const transformed = transform(ast);
  return generateHtml(transformed.elements);
}

该模式虽简化，但完整展现了从文本到结构化数据再到可视化的全过程。类似原理被用于 Vue 模板编译、Babel 转译、SQL 解析器等实际工程场景中。

标签: AST 词法分析语法分析

返回列表

上一篇：Python SQLAlchemy ORM 数据库操作指南

下一篇：Tarjan算法及其应用

老程序员博客

如何将文本输入解析为可交互的语法树结构

目标结构示例

四阶段解析流程

1. 词法分析器（Tokenizer）

2. 语法分析器（Parser）

3. 树遍历器（Tree Traverser）

4. 转换器与代码生成器

相关文章

Linux crontab 详解

富文本里可以允许的 HTML 属性

Mac 安装 Node.js 指南

Dom\HTML_NO_DEFAULT_NS 的副作用：自动加闭合标签

自定义域名解析神器 dnsmasq

linux screen 用法详情 (nohup 的替代方案)

发表评论

Copyright © agingcoder.cn

Powered By Z-BlogPHP. Theme by TOYEAN.

老程序员博客

如何将文本输入解析为可交互的语法树结构

目标结构示例

四阶段解析流程

1. 词法分析器（Tokenizer）

2. 语法分析器（Parser）

3. 树遍历器（Tree Traverser）

4. 转换器与代码生成器

相关文章

Linux crontab 详解

富文本里可以允许的 HTML 属性

Mac 安装 Node.js 指南

Dom\HTML_NO_DEFAULT_NS 的副作用：自动加闭合标签

自定义域名解析神器 dnsmasq

linux screen 用法详情 (nohup 的替代方案)

发表评论取消回复

Copyright © agingcoder.cn

Powered By Z-BlogPHP. Theme by TOYEAN.

发表评论