/*
 * serializer.ts — DOM to markdown serializer.
 *
 * Converts an HTML DOM tree back to markdown by walking the tree and
 * producing a typed token stream. Text tokens are escaped during final
 * serialization; delimiter tokens pass through verbatim. This separation
 * is what makes round-trip correctness possible — the serializer always
 * knows which characters are structural and which are literal.
 *
 *   const serializer = new MarkdownSerializer(tagMap, delimiterChars);
 *   serializer.serialize(document.getElementById('content'))
 *   // '**bold** and *italic*'
 */

import type { InlineToken } from './tokenizer';

/** DOM `nodeType` of a text node (same value as `Node.TEXT_NODE`). */
const TEXT_NODE = 3;

/** DOM `nodeType` of an element node (same value as `Node.ELEMENT_NODE`). */
const ELEMENT_NODE = 1;

/**
 * Characters that, directly after `<`, could start raw inline HTML on
 * re-parse: a letter (open tag or autolink scheme), `/` (closing tag),
 * `!` (comment, declaration, CDATA) or `?` (processing instruction).
 * Hoisted so the regex is not re-created per character.
 */
const HTML_START_AFTER_LT = /[a-zA-Z/!?]/;

/**
 * Maps HTML element names to their markdown serialization.
 * Each entry defines how to convert an element back to markdown tokens.
 */
export interface SerializerTagDef {
  /** The canonical delimiter (e.g. '**' for bold). */
  delimiter?: string;

  /**
   * Custom serializer for elements that aren't simple delimiter wraps
   * (e.g. links, code blocks, headings). Returns the full markdown
   * string for the element and its children.
   *
   * `children` is lazy: calling it serializes the element's child nodes
   * (with text escaping already applied) and returns the result.
   */
  serialize?: (element: HTMLElement, children: () => string) => string;
}

/**
 * Converts a DOM tree to markdown. Walks the tree producing inline
 * tokens, then serializes the token stream to a string with correct
 * escaping.
 *
 *   const serializer = new MarkdownSerializer(tagMap, new Set(['*', '`', '~', '[', '_']));
 *   const markdown = serializer.serialize(containerElement);
 */
export class MarkdownSerializer {
  private readonly tagMap: Map<string, SerializerTagDef>;
  private readonly delimiterChars: Set<string>;

  /**
   * @param tagMap - Serialization rules keyed by the element's `nodeName`.
   *   The lookup is an exact, case-sensitive match. NOTE(review): the DOM
   *   reports HTML element names in upper case (`'STRONG'`) inside HTML
   *   documents — confirm that callers key the map accordingly.
   * @param delimiterChars - Single characters that must be
   *   backslash-escaped when they occur in literal text.
   */
  constructor(
    tagMap: Map<string, SerializerTagDef>,
    delimiterChars: Set<string>,
  ) {
    this.tagMap = tagMap;
    this.delimiterChars = delimiterChars;
  }

  /**
   * Serialize a DOM tree to a markdown string.
   *
   *   serializer.serialize(document.querySelector('article'))
   *
   * @param node - Root of the subtree to convert.
   * @returns The markdown for `node` and all of its descendants.
   */
  serialize(node: Node): string {
    return this.tokensToString(this.nodeToTokens(node));
  }

  /**
   * Convert a DOM node to a stream of inline tokens.
   * Text nodes become text tokens; elements with known tags
   * become delimiter-wrapped token sequences; unknown elements
   * recurse into their children. Any other node type (comments,
   * etc.) yields no tokens.
   */
  private nodeToTokens(node: Node): InlineToken[] {
    if (node.nodeType === TEXT_NODE) {
      return [{ role: 'text', value: node.textContent || '' }];
    }
    if (node.nodeType !== ELEMENT_NODE) {
      return [];
    }

    const element = node as HTMLElement;
    const tagDef = this.tagMap.get(element.nodeName);

    // A custom serializer takes precedence over `delimiter` and handles
    // the entire element.
    if (tagDef?.serialize) {
      const markdown = tagDef.serialize(element, () =>
        this.serializeChildren(element),
      );
      // The returned string is already finished markdown, so it is
      // carried in an 'html' token, which tokensToString emits verbatim
      // (never escaped).
      return [{ role: 'html', value: markdown }];
    }

    // Delimiter-based element: emit open + children + close.
    if (tagDef?.delimiter) {
      const delimiter = tagDef.delimiter;
      return [
        { role: 'open', value: delimiter, delimiter },
        ...this.childrenToTokens(element),
        { role: 'close', value: delimiter, delimiter },
      ];
    }

    // Unknown element: transparent, just recurse into children.
    return this.childrenToTokens(element);
  }

  /**
   * Collect tokens from all child nodes of an element, in document order.
   */
  private childrenToTokens(element: HTMLElement): InlineToken[] {
    const tokens: InlineToken[] = [];
    for (const child of Array.from(element.childNodes)) {
      tokens.push(...this.nodeToTokens(child));
    }
    return tokens;
  }

  /**
   * Serialize an element's children directly to a markdown string.
   * Used by custom serializers (links, headings, etc.) that need
   * the children as a string, not as tokens.
   */
  private serializeChildren(element: HTMLElement): string {
    return this.tokensToString(this.childrenToTokens(element));
  }

  /**
   * Convert a token stream to a markdown string. This is where
   * escaping happens: text tokens have their delimiter characters
   * backslash-escaped; all other token types pass through verbatim.
   */
  private tokensToString(tokens: InlineToken[]): string {
    let result = '';
    for (const token of tokens) {
      // Only literal text is escaped; every structural role (open, close,
      // html, break, code, link, autolink, ...) is emitted as-is.
      result +=
        token.role === 'text' ? this.escapeText(token.value) : token.value;
    }
    return result;
  }

  /**
   * Escape characters in literal text that would be misinterpreted
   * as markdown syntax on re-parse. Escapes `\`, `_` and `[`
   * unconditionally, every registered delimiter character, and `<`
   * when the next character could start raw HTML or an autolink
   * (a letter, `/`, `!` or `?`). A `<` followed by anything else —
   * e.g. `1 < 2` — is left untouched.
   */
  private escapeText(text: string): string {
    let result = '';
    for (let position = 0; position < text.length; position++) {
      const character = text.charAt(position);
      if (character === '\\' || character === '_' || character === '[') {
        result += '\\' + character;
      } else if (
        character === '<' &&
        // charAt returns '' past the end, which never matches.
        HTML_START_AFTER_LT.test(text.charAt(position + 1))
      ) {
        result += '\\<';
      } else if (this.delimiterChars.has(character)) {
        result += '\\' + character;
      } else {
        result += character;
      }
    }
    return result;
  }
}