每日一报

magic-string 的原理

背景

是一个用于操作字符串和生成源映射的小型快速实用程序。适用于做一些轻微的源代码修改，如插入、删除、替换等操作同时最后依然可以快速生成 sourcemap( 源映射 )。相比于 recast （将 JavaScript 生成为 AST，同时可以操纵它，并重新打印带有源映射的内容而不会丢失注释和格式），能力很全面，但对于一般开发者来说太过局限和太重（或者可能源代码不是 JavaScript）。

各个工具的使用情况

在Vite中，magic-string不光处理插件中的代码重写，还会构建sourcemap，供开发者工具识别并调试。
在Rollup中，magic-string被用作实现Tree shaking的工具。值得注意的是，Rollup内部处理源码时，并非直接操作实际的字符串，而是使用magic-string的实例来表示源码。

使用方式

区间选择遵循"左闭右开"的概念，也就是选择范围包含左边的值，但不包括右边的值。
支持链式调用。
区间会自动加入offset计算，比如已经使用了prepend和append给开头和结尾加入了多个字符，之后使用update变更字符串，依然可以使用原始区间，这样省略了自己去计算offset的步骤。
可以生成v3版本的sourcemap。

源码解析

概括

MagicString 针对每一次修改均以 chunk 来进行表示，所生成的 chunk 均以双向链表来进行连接，每个 chunk 内部均包含 intro（头部内容，即在当前 chunk 头部添加新的内容）、content（当前 chunk 包含的内容）、outro（尾部内容，即在当前 chunk 尾部添加新的内容）三部分内容组成。最后生成的新字符串是通过所有的 chunk 拼接而成，即如下样例：

text

firstChunk <=> chunk <=> lastChunk

newString =
    firstChunk.intro + firstChunk.content + firstChunk.outro +
    chunk.intro + chunk.content + chunk.outro +
    lastChunk.intro + lastChunk.content + lastChunk.outro;

原理分析

MagicString 的实现

实例化

export class MagicString {
  constructor(string, options = {}) {
    // 初始源代码字符串的 chunk 实例
    const chunk = new Chunk(0, string.length, string);

    Object.defineProperties(this, {
      // 原始字符串, 后续操作不做变更。
      original: { writable: true, value: string },
      // 尾部字符串（将在原始字符串后追加的内容）
      outro: { writable: true, value: '' },
      // 头部字符串（将在原始字符串前追加的内容）
      intro: { writable: true, value: '' },
      // 第一个 chunk 实例
      firstChunk: { writable: true, value: chunk },
      // 最后一个 chunk 实例
      lastChunk: { writable: true, value: chunk },
      // 最后搜索的 chunk 实例，用于优化 chunk 的检索操作。
      lastSearchedChunk: { writable: true, value: chunk },
      // 获取以 index 索引开始的 chunk 实例。
      byStart: { writable: true, value: {} },
      // 获取以 index 索引结束的 chunk 实例。
      byEnd: { writable: true, value: {} },
      // 文件名
      filename: { writable: true, value: options.filename },
      // 缩进排除范围
      indentExclusionRanges: {
        writable: true,
        value: options.indentExclusionRanges,
      },
      // sourcemap的位置信息
      sourcemapLocations: { writable: true, value: new BitSet() },
      // 存储的名称
      storedNames: { writable: true, value: {} },
      // 缩进字符串
      indentStr: { writable: true, value: undefined },
      ignoreList: { writable: true, value: options.ignoreList },
    });

    if (DEBUG) {
      Object.defineProperty(this, 'stats', { value: new Stats() });
    }
    // 初始化首个 chunk 的位置信息。
    this.byStart[0] = chunk;
    this.byEnd[string.length] = chunk;
  }
}

MagicString 中包含了很多方法，我们举一个 update 的实现来做具体分析，其他方法类似。

MagicString.update

function update(start, end, content, options) {
  // 确保替换内容为字符串。
  if (typeof content !== 'string')
    throw new TypeError('replacement content must be a string');

  // 处理负数的起始位置和结束位置，确保均为正数。
  while (start < 0) start += this.original.length;
  while (end < 0) end += this.original.length;

  // 确保结束索引在范围之内，若结束索引符合要求则启始位置肯定也符合预期。
  if (end > this.original.length) throw new Error('end is out of bounds');
  // 检查更新的范围区间是否为零长度，是则直接跳过。
  if (start === end)
    throw new Error(
      'Cannot overwrite a zero-length range – use appendLeft or prependRight instead',
    );

  if (DEBUG) this.stats.time('overwrite');

  /**
   * start 和 end 位置进行拆分新 chunk，
   * 确保存在以 start 为起点和以 end 为结束的 chunk，
   * 为后续的操作做准备。
   */
  this._split(start);
  this._split(end);

  // 处理选项参数。
  if (options === true) {
    if (!warned.storeName) {
      console.warn(
        'The final argument to magicString.overwrite(...) should be an options object. See https://github.com/rich-harris/magic-string',
      );
      warned.storeName = true;
    }

    options = { storeName: true };
  }

  // 获取 storeName 和 overwrite 选项。
  const storeName = options !== undefined ? options.storeName : false;
  const overwrite = options !== undefined ? options.overwrite : false;

  // 如果需要存储名称，将原始内容存储到 storedNames 中。
  if (storeName) {
    const original = this.original.slice(start, end);
    Object.defineProperty(this.storedNames, original, {
      writable: true,
      value: true,
      enumerable: true,
    });
  }
  // 获取要修改的启始 chunk 和结尾 chunk。
  const first = this.byStart[start];
  const last = this.byEnd[end];

  // 若 first 存在的话，那么将 start 到 end 区间仅 first chunk 保存要修改的内容，其他的所有 chunk 均修改为空字符串。
  if (first) {
    let chunk = first;
    while (chunk !== last) {
      if (chunk.next !== this.byStart[chunk.end]) {
        throw new Error('Cannot overwrite across a split point');
      }
      chunk = chunk.next;
      chunk.edit('', false);
    }

    first.edit(content, storeName, !overwrite);
  } else {
    // must be inserting at the end
    const newChunk = new Chunk(start, end, '').edit(content, storeName);

    // TODO last chunk in the array may not be the last chunk, if it's moved...
    last.next = newChunk;
    newChunk.previous = last;
  }

  if (DEBUG) this.stats.timeEnd('overwrite');
  return this;
}

可以看到 update 的操作的时候会对 start 和 end 索引做拆分 chunk 的处理，这也就是 magic-string 包的核心所在，即完整的源码均以单个或多个 chunk 组成，每一次操作都会确保索引位置存在已拆分的 chunk。那么我们继续看一下对于 chunk 的拆分流程是如何进行的吧。

MagicString._split

function _split(index) {
  // 若索引位置存在 chunk 实例，则无需拆分，直接返回已拆分的 chunk。
  if (this.byStart[index] || this.byEnd[index]) return;

  if (DEBUG) this.stats.time('_split');

  // 标记 lastSearchedChunk 来记忆化检索目标 chunk 的位置。
  let chunk = this.lastSearchedChunk;
  // 确定新的划分 chunk 是在当前标记 chunk 的后面还是前面。
  const searchForward = index > chunk.end;

  while (chunk) {
    // 若要划分的索引在当前 chunk 中，则执行 chunk 划分流程。
    if (chunk.contains(index)) return this._splitChunk(chunk, index);
    // 若目标 chunk 是在 lastSearchedChunk chunk 的后面则晚后检索，相反若在前面则往前检索。
    chunk = searchForward
      ? this.byStart[chunk.end]
      : this.byEnd[chunk.start];
  }
}

可以看到拆分的步骤在 this._splitChunk(chunk, index) 中，继续深入分析。

MagicString._splitChunk

function _splitChunk(chunk, index) {
  // 若当前 chunk 已经被修改过了且包含了内容，那么停止当前流程。
  if (chunk.edited && chunk.content.length) {
    // zero-length edited chunks are a special case (overlapping replacements)
    const loc = getLocator(this.original)(index);
    throw new Error(
      `Cannot split a chunk that has already been edited (${loc.line}:${loc.column} – "${chunk.original}")`,
    );
  }
  /**
   * 在 chunk 中以 index 为索引进行拆分，返回以 index 为起点的新 chunk。
   * 即 newChunk.start === index。
   */
  const newChunk = chunk.split(index);
  // 更新原 chunk 的位置
  this.byEnd[index] = chunk;
  // 更新新 chunk 的位置
  this.byStart[index] = newChunk;
  this.byEnd[newChunk.end] = newChunk;

  // 若原 chunk 为最后一个 chunk，那么拆分后最后一个 chunk 即为新的 chunk。
  if (chunk === this.lastChunk) this.lastChunk = newChunk;
  // 保存当前操作的 chunk，即 newChunk.previous。作为下一次检索的启始位置，优化检索步骤。
  this.lastSearchedChunk = chunk;
  if (DEBUG) this.stats.timeEnd('_split');
  return true;
}

chunk.split

对原 chunk 执行拆分操作。创建区间为 [index, this.end] 的新 chunk，更新原 chunk 的区间为 [this.start, index] 和 content 等信息，将新 chunk 拼接在原 chunk 的后面，形成双向链表。

function split(index) {
  const sliceIndex = index - this.start;
  // 拆分内容。
  const originalBefore = this.original.slice(0, sliceIndex);
  const originalAfter = this.original.slice(sliceIndex);
  // 更新原 chunk 的内容。
  this.original = originalBefore;
  // 实例化新的 chunk，区间为 [index, this.end]。
  const newChunk = new Chunk(index, this.end, originalAfter);
  newChunk.outro = this.outro;
  this.outro = '';
  // 更新原 chunk 的区间为 [this.start, index]。
  this.end = index;

  // 原 chunk 已经被编辑过了且字符串长度为 0。
  if (this.edited) {
    // after split we should save the edit content record into the correct chunk
    // to make sure sourcemap correct
    // For example:
    // '  test'.trim()
    //     split   -> '  ' + 'test'
    //   ✔️ edit    -> '' + 'test'
    //   ✖️ edit    -> 'test' + ''
    // TODO is this block necessary?...
    newChunk.edit('', false);
    this.content = '';
  } else {
    this.content = originalBefore;
  }

  // 将 newChunk 拼接在原 chunk 的后面，形成双向链表。
  newChunk.next = this.next;
  if (newChunk.next) newChunk.next.previous = newChunk;
  newChunk.previous = this;
  this.next = newChunk;

  // 返回新创建的 chunk。
  return newChunk;
}

分析完 MagicString.update 的实现流程，最后的输出是 toString 的方法来实现。

MagicString.toString 实现源码如下：

export default class MagicString {
  toString() {
    let str = this.intro;

    let chunk = this.firstChunk;
    while (chunk) {
      str += chunk.toString();
      chunk = chunk.next;
    }

    return str + this.outro;
  }
}

export default class Chunk {
  toString() {
    return this.intro + this.content + this.outro;
  }
}

可以很清晰的看到 toString 的实现就是将所有 chunk 中的 intro、 content + outro 的内容相加。

MagicString 源码映射原理

概述：

处理新源码，解析每一个 chunk 信息 content 信息，获取 raw 数组存储的数据如下：

raw = [ ["新源码 content 的行坐标"]: [ "新源码 content 的列坐标", "源码开始的索引信息，默认均为0", "对应源码的行坐标", "对应源码的列坐标" ], ... ]

然后最后将 raw 的数据喂给 @jridgewell/sourcemap-codec 执行 VLQ base 64 编码操作处理。

import { encode } from '@jridgewell/sourcemap-codec';
encode(raw);

来获取新代码的 sourceMap 信息。

具体实现：

那么我们具体来分析一下实现原理，我们可以了解到是通过 generateDecodedMap 的方法来生成源码映射，那么我们就来深入分析一下 generateDecodedMap 到底做了什么。

function generateDecodedMap(options) {
  options = options || {};

  const sourceIndex = 0;
  const names = Object.keys(this.storedNames);
  const mappings = new Mappings(options.hires);
  // 通过 locate 可以以log复杂度快速获取索引对应源码的 [行列] 信息。
  const locate = getLocator(this.original);

  if (this.intro) {
    mappings.advance(this.intro);
  }
  // 以 this.firstChunk 为起点遍历后续的每一个 chunk。
  this.firstChunk.eachNext((chunk) => {
    // 获取 chunk.start 为索引所对应的源码 [行列] 信息。
    const loc = locate(chunk.start);

    if (chunk.intro.length) mappings.advance(chunk.intro);

    if (chunk.edited) {
      mappings.addEdit(
        sourceIndex,
        chunk.content,
        loc,
        chunk.storeName ? names.indexOf(chunk.original) : -1,
      );
    } else {
      mappings.addUneditedChunk(
        sourceIndex,
        chunk,
        this.original,
        loc,
        this.sourcemapLocations,
      );
    }

    if (chunk.outro.length) mappings.advance(chunk.outro);
  });

  return {
    file: options.file ? options.file.split(/[/\\]/).pop() : undefined,
    sources: [
      options.source
        ? getRelativePath(options.file || '', options.source)
        : options.file || '',
    ],
    sourcesContent: options.includeContent ? [this.original] : undefined,
    names,
    mappings: mappings.raw,
    x_google_ignoreList: this.ignoreList ? [sourceIndex] : undefined,
  };
}

function getLocator(source) {
  const originalLines = source.split('\n');
  const lineOffsets = [];
  // 根据源码中的 \n， 将源码拆分成多行，lineOffsets 中记录的是每一行中的其实字符位置。
  for (let i = 0, pos = 0; i < originalLines.length; i++) {
    lineOffsets.push(pos);
    pos += originalLines[i].length + 1;
  }

  // 通过二分方法以 log 的复杂度获取 index 字符索引对应的代码行列位置。
  return function locate(index) {
    let i = 0;
    let j = lineOffsets.length;
    while (i < j) {
      const m = (i + j) >> 1;
      if (index < lineOffsets[m]) {
        j = m;
      } else {
        i = m + 1;
      }
    }
    const line = i - 1;
    const column = index - lineOffsets[line];
    return { line, column };
  };
}

从上述源码中可以看到，处理 chunk 的 intro 和 outro 内容时使用的是 Mappings.advance 方法；当 chunk 被编辑过调用 Mappings.addEdit，而没有编辑过则调用 Mappings.addUneditedChunk 方法。那我们先看一下 Mappings.advance 究竟做了什么：

export default class Mappings {
  constructor(hires) {
    this.hires = hires;
    this.generatedCodeLine = 0;
    this.generatedCodeColumn = 0;
    this.raw = [];
    this.rawSegments = this.raw[this.generatedCodeLine] = [];
    this.pending = null;
  }
  advance(str) {
    if (!str) return;

    const lines = str.split('\n');

    if (lines.length > 1) {
      for (let i = 0; i < lines.length - 1; i++) {
        this.generatedCodeLine++;
        this.raw[this.generatedCodeLine] = this.rawSegments = [];
      }
      this.generatedCodeColumn = 0;
    }

    this.generatedCodeColumn += lines[lines.length - 1].length;
  }
}

值得注意的是对于 chunk 的 intro 及 outro 并没有记录对应源码的 [行列] 坐标，在 advance 函数更新 this.generatedCodeLine (即 content 的行信息) 和 this.generatedCodeColumn （即 content 的列信息），自身并没有做任何映射处理，仅确保获取新的 content 的 [行列] 坐标的准确性。

若 chunk 被编辑过了，则我们可以看到 Mappings.addEdit 具体做了什么。

export default class Mapping {
  addEdit(sourceIndex, content, loc, nameIndex) {
    if (content.length) {
      let contentLineEnd = content.indexOf('\n', 0);
      let previousContentLineEnd = -1;
      while (contentLineEnd >= 0) {
        const segment = [
          this.generatedCodeColumn,
          sourceIndex,
          loc.line,
          loc.column,
        ];
        if (nameIndex >= 0) {
          segment.push(nameIndex);
        }
        this.rawSegments.push(segment);

        this.generatedCodeLine += 1;
        this.raw[this.generatedCodeLine] = this.rawSegments = [];
        this.generatedCodeColumn = 0;

        previousContentLineEnd = contentLineEnd;
        contentLineEnd = content.indexOf('\n', contentLineEnd + 1);
      }

      const segment = [
        this.generatedCodeColumn,
        sourceIndex,
        loc.line,
        loc.column,
      ];
      if (nameIndex >= 0) {
        segment.push(nameIndex);
      }
      this.rawSegments.push(segment);

      this.advance(content.slice(previousContentLineEnd + 1));
    } else if (this.pending) {
      this.rawSegments.push(this.pending);
      this.advance(content);
    }

    this.pending = null;
  }
}

可以很清晰的看到只是做了一层新 content 对应源码位置的坐标映射处理。那么我们来对比若 content 没有被编辑过与其区别。

function addUneditedChunk(
  sourceIndex,
  chunk,
  original,
  loc,
  sourcemapLocations,
) {
  let originalCharIndex = chunk.start;
  let first = true;
  // when iterating each char, check if it's in a word boundary
  let charInHiresBoundary = false;

  while (originalCharIndex < chunk.end) {
    if (this.hires || first || sourcemapLocations.has(originalCharIndex)) {
      const segment = [
        this.generatedCodeColumn,
        sourceIndex,
        loc.line,
        loc.column,
      ];

      if (this.hires === 'boundary') {
        // in hires "boundary", group segments per word boundary than per char
        if (wordRegex.test(original[originalCharIndex])) {
          // for first char in the boundary found, start the boundary by pushing a segment
          if (!charInHiresBoundary) {
            this.rawSegments.push(segment);
            charInHiresBoundary = true;
          }
        } else {
          // for non-word char, end the boundary by pushing a segment
          this.rawSegments.push(segment);
          charInHiresBoundary = false;
        }
      } else {
        this.rawSegments.push(segment);
      }
    }

    if (original[originalCharIndex] === '\n') {
      loc.line += 1;
      loc.column = 0;
      this.generatedCodeLine += 1;
      this.raw[this.generatedCodeLine] = this.rawSegments = [];
      this.generatedCodeColumn = 0;
      first = true;
    } else {
      loc.column += 1;
      this.generatedCodeColumn += 1;
      first = false;
    }

    originalCharIndex += 1;
  }

  this.pending = null;
}

可以看到原理还是和上述被编辑过的 chunk 的原理类似，均是获取新 chunk 的 content 对应源码的映射。不一样的点是没有编辑过的 chunk 会做一些额外的操作，比如存在 hire 和 sourcemapLocations 时映射的精确性处理。

每日一报 ​

magic-string 的原理 ​

背景 ​

各个工具的使用情况 ​

使用方式 ​

源码解析 ​

概括 ​

原理分析 ​