199 lines
6.7 KiB
TypeScript
199 lines
6.7 KiB
TypeScript
/*
 * serializer.ts — DOM to markdown serializer.
 *
 * Converts an HTML DOM tree back to markdown by walking the tree and
 * producing a typed token stream. Text tokens are escaped during final
 * serialization; delimiter tokens pass through verbatim. This separation
 * is what makes round-trip correctness possible — the serializer always
 * knows which characters are structural and which are literal.
 *
 *   const serializer = new MarkdownSerializer(tagMap, delimiterChars);
 *   serializer.serialize(document.getElementById('content'))
 *   // '**bold** and *italic*'
 */
|
|
|
|
import type { InlineToken } from './tokenizer';
|
|
|
|
/**
 * Describes how one HTML element is turned back into markdown.
 *
 * Both members are optional. When an entry provides both, `serialize`
 * wins: the serializer checks for it first and never consults
 * `delimiter` for that element. An entry with neither member behaves
 * like an unmapped element — only its children are serialized.
 */
export interface SerializerTagDef {
  /**
   * Canonical delimiter for a simple wrapping element (e.g. `'**'` for
   * bold). The same string is emitted verbatim before and after the
   * element's serialized children.
   */
  delimiter?: string;

  /**
   * Custom serializer for elements that are not a plain delimiter wrap
   * (e.g. links, code blocks, headings).
   *
   * @param element - The element being serialized.
   * @param children - Thunk that serializes the element's children to a
   *   markdown string (literal text already escaped) each time it is
   *   called.
   * @returns The complete markdown for the element and its children. The
   *   result is emitted as-is and is not escaped again.
   */
  serialize?: (
    element: HTMLElement,
    children: () => string,
  ) => string;
}
|
|
|
|
/**
 * Converts a DOM tree to markdown in two passes: the tree is first
 * flattened into a stream of inline tokens, then the token stream is
 * rendered to a string. Only tokens whose role is `'text'` are escaped
 * while rendering; every other token is written out verbatim, so
 * structural markdown characters are never confused with literal ones.
 *
 *   const serializer = new MarkdownSerializer(tagMap, new Set(['*', '`', '~', '[', '_']));
 *   const markdown = serializer.serialize(containerElement);
 */
export class MarkdownSerializer {
  /**
   * DOM `nodeType` of a text node. Kept as a local constant instead of
   * reading `Node.TEXT_NODE` so the class does not need a `Node` global
   * at runtime.
   */
  private static readonly TEXT_NODE = 3;

  /** DOM `nodeType` of an element node. */
  private static readonly ELEMENT_NODE = 1;

  /** Characters escaped in literal text regardless of `delimiterChars`. */
  private static readonly ALWAYS_ESCAPED: ReadonlySet<string> = new Set(['\\', '_', '[']);

  /**
   * Characters that, directly after `<`, let the pair open an inline HTML
   * construct on re-parse: a letter (open tag or autolink), `/` (closing
   * tag), `!` (comment, declaration, CDATA) or `?` (processing
   * instruction). Hoisted so the pattern is built once, not per character.
   */
  private static readonly HTML_OPENER = /[a-zA-Z/!?]/;

  private tagMap: Map<string, SerializerTagDef>;
  private delimiterChars: Set<string>;

  /**
   * @param tagMap - Serialization rules keyed by element name. Lookups
   *   use `element.nodeName` verbatim, so the keys must use the same
   *   casing the DOM reports (upper-case for HTML elements in an HTML
   *   document).
   * @param delimiterChars - Single characters that must be
   *   backslash-escaped whenever they occur in literal text.
   */
  constructor(
    tagMap: Map<string, SerializerTagDef>,
    delimiterChars: Set<string>,
  ) {
    this.tagMap = tagMap;
    this.delimiterChars = delimiterChars;
  }

  /**
   * Serialize a DOM tree to a markdown string.
   *
   *   serializer.serialize(document.querySelector('article'))
   *
   * @param node - Root of the tree to serialize. Text and element nodes
   *   contribute output; any other node type yields an empty string.
   * @returns The markdown for `node` and its descendants.
   */
  serialize(node: Node): string {
    return this.tokensToString(this.nodeToTokens(node));
  }

  /**
   * Convert a DOM node to a stream of inline tokens.
   *
   * - Text nodes become a single `'text'` token.
   * - Elements whose tag definition has `serialize` become a single
   *   `'html'` token holding the custom serializer's output.
   * - Elements whose tag definition has `delimiter` become
   *   open + children + close, or nothing when the children produce no
   *   output.
   * - Any other element contributes only its children's tokens.
   * - Every other node type (comments, etc.) contributes nothing.
   */
  private nodeToTokens(node: Node): InlineToken[] {
    if (node.nodeType === MarkdownSerializer.TEXT_NODE) {
      return [{ role: 'text', value: node.textContent ?? '' }];
    }
    if (node.nodeType !== MarkdownSerializer.ELEMENT_NODE) {
      return [];
    }

    const element = node as HTMLElement;
    const tagDef = this.tagMap.get(element.nodeName);

    // A custom serializer owns the whole element, children included.
    if (tagDef?.serialize) {
      const markdown = tagDef.serialize(element, () => this.serializeChildren(element));
      // The returned string is finished markdown. Emitting it with the
      // 'html' role makes tokensToString pass it through unescaped.
      return [{ role: 'html', value: markdown }];
    }

    // Delimiter-based element: open + children + close.
    if (tagDef?.delimiter) {
      const delimiter = tagDef.delimiter;
      const inner = this.childrenToTokens(element);
      // An element with no output (e.g. <strong></strong>) must not emit
      // back-to-back delimiters such as '****': on re-parse they are not
      // an empty emphasis span but literal characters (or a thematic
      // break) and can pair up with neighbouring delimiters.
      if (inner.every((token) => token.value === '')) {
        return [];
      }
      return [
        { role: 'open', value: delimiter, delimiter },
        ...inner,
        { role: 'close', value: delimiter, delimiter },
      ];
    }

    // Unmapped element: transparent, only its children matter.
    return this.childrenToTokens(element);
  }

  /**
   * Collect the tokens of every child node of `element`, in document
   * order.
   */
  private childrenToTokens(element: HTMLElement): InlineToken[] {
    const tokens: InlineToken[] = [];
    for (const child of Array.from(element.childNodes)) {
      tokens.push(...this.nodeToTokens(child));
    }
    return tokens;
  }

  /**
   * Serialize an element's children directly to a markdown string.
   * Handed to custom serializers (links, headings, etc.) that need the
   * children as text rather than as tokens.
   */
  private serializeChildren(element: HTMLElement): string {
    return this.tokensToString(this.childrenToTokens(element));
  }

  /**
   * Render a token stream to a markdown string. This is the only place
   * escaping happens: `'text'` tokens go through `escapeText`; tokens of
   * any other role are structural and are appended verbatim.
   */
  private tokensToString(tokens: InlineToken[]): string {
    let markdown = '';
    for (const token of tokens) {
      markdown += token.role === 'text' ? this.escapeText(token.value) : token.value;
    }
    return markdown;
  }

  /**
   * Backslash-escape the characters of literal text that would be read
   * as markdown syntax on re-parse:
   *
   * - `\`, `_` and `[`, always;
   * - `<` when the next character could open inline HTML or an autolink
   *   (a letter, `/`, `!` or `?`);
   * - every character registered in `delimiterChars`.
   *
   * All other characters, including a `<` that cannot start a tag (for
   * example in `1 < 2` or at the very end of the text), are copied
   * unchanged.
   */
  private escapeText(text: string): string {
    let escaped = '';
    for (let index = 0; index < text.length; index++) {
      const character = text.charAt(index);
      const opensHtml =
        character === '<' &&
        index + 1 < text.length &&
        MarkdownSerializer.HTML_OPENER.test(text.charAt(index + 1));
      const needsEscape =
        MarkdownSerializer.ALWAYS_ESCAPED.has(character) ||
        opensHtml ||
        this.delimiterChars.has(character);
      escaped += needsEscape ? '\\' + character : character;
    }
    return escaped;
  }
}
|