// ribbit/src/ts/serializer.ts

/*
 * serializer.ts — DOM to markdown serializer.
 *
 * Converts an HTML DOM tree back to markdown by walking the tree and
 * producing a typed token stream. Text tokens are escaped during final
 * serialization; delimiter tokens pass through verbatim. This separation
 * is what makes round-trip correctness possible — the serializer always
 * knows which characters are structural and which are literal.
 *
 *   const serializer = new MarkdownSerializer(tagMap, delimiterChars);
 *   serializer.serialize(document.getElementById('content'))
 *   // '**bold** and *italic*'
 */

import type { InlineToken } from './tokenizer';

/**
 * Describes how one kind of HTML element is converted back to markdown.
 *
 * Instances are stored in the tag map handed to `MarkdownSerializer`,
 * which looks entries up by `element.nodeName`, so the map keys must use
 * the same spelling and casing that `nodeName` reports.
 *
 * Both fields are optional. When both are set, `serialize` takes
 * precedence and `delimiter` is ignored. When neither is set (or
 * `delimiter` is the empty string) the element is treated like an
 * unknown tag: only its children are serialized.
 */
export interface SerializerTagDef {
    /**
     * The canonical delimiter (e.g. '**' for bold). It is emitted
     * verbatim, once before and once after the element's serialized
     * children.
     */
    delimiter?: string;
    /**
     * Custom serializer for elements that aren't simple delimiter wraps
     * (e.g. links, code blocks, headings).
     *
     * Receives the element and a `children` callback that, when called,
     * returns the element's children already serialized (and escaped) as
     * a markdown string. Returns the full markdown string for the element
     * and its children; that string is emitted as-is and is NOT escaped
     * again, so the callback is responsible for producing valid markdown.
     */
    serialize?: (element: HTMLElement, children: () => string) => string;
}

/**
 * Converts a DOM tree to markdown. Walks the tree producing inline
 * tokens, then serializes the token stream to a string with correct
 * escaping.
 *
 * Only text tokens are escaped; delimiter tokens and the output of
 * custom tag serializers pass through verbatim.
 *
 *   const serializer = new MarkdownSerializer(tagMap, new Set(['*', '`', '~', '[', '_']));
 *   const markdown = serializer.serialize(containerElement);
 */
export class MarkdownSerializer {
    // Numeric `nodeType` values (Node.ELEMENT_NODE / Node.TEXT_NODE).
    // Kept as local constants so the class does not depend on a global
    // `Node` constructor being present at runtime.
    private static readonly ELEMENT_NODE = 1;
    private static readonly TEXT_NODE = 3;

    // Matches a character that, directly after '<', would start an HTML
    // open or close tag. Hoisted so it is not rebuilt for every character
    // of every text node. Not global/sticky, so `.test()` is stateless.
    private static readonly TAG_START = /[a-zA-Z/]/;

    /** Serialization rules, looked up by `element.nodeName`. */
    private readonly tagMap: Map<string, SerializerTagDef>;
    /** Single characters that must be backslash-escaped in literal text. */
    private readonly delimiterChars: Set<string>;

    /**
     * @param tagMap Serialization rules keyed by the value `element.nodeName`
     *   reports for the element.
     * @param delimiterChars Characters to backslash-escape in text nodes, in
     *   addition to the always-escaped `\`, `_`, `[` and tag-opening `<`.
     */
    constructor(
        tagMap: Map<string, SerializerTagDef>,
        delimiterChars: Set<string>,
    ) {
        this.tagMap = tagMap;
        this.delimiterChars = delimiterChars;
    }

    /**
     * Serialize a DOM tree to a markdown string.
     *
     *   serializer.serialize(document.querySelector('article'))
     *
     * @param node Root of the subtree to serialize. Text nodes yield their
     *   escaped text, element nodes are handled per the tag map, and every
     *   other node type yields the empty string.
     * @returns The markdown for `node` and all of its descendants.
     */
    serialize(node: Node): string {
        return this.tokensToString(this.nodeToTokens(node));
    }

    /**
     * Convert a DOM node to a stream of inline tokens.
     * Text nodes become text tokens; elements with known tags
     * become delimiter-wrapped token sequences; unknown elements
     * recurse into their children.
     */
    private nodeToTokens(node: Node): InlineToken[] {
        const tokens: InlineToken[] = [];
        this.appendNodeTokens(node, tokens);
        return tokens;
    }

    /**
     * Collect tokens from all child nodes of an element.
     */
    private childrenToTokens(element: HTMLElement): InlineToken[] {
        const tokens: InlineToken[] = [];
        this.appendChildTokens(element, tokens);
        return tokens;
    }

    /**
     * Append the tokens for `node` (and its descendants) to `out`.
     *
     * Tokens are pushed one at a time into a single shared array. This
     * avoids `push(...bigArray)`, which passes every element as a call
     * argument and throws a RangeError once a subtree yields more tokens
     * than the engine allows arguments, and it avoids re-copying the
     * descendants' tokens at every nesting level.
     */
    private appendNodeTokens(node: Node, out: InlineToken[]): void {
        if (node.nodeType === MarkdownSerializer.TEXT_NODE) {
            out.push({
                role: 'text',
                value: node.textContent ?? '',
            });
            return;
        }
        if (node.nodeType !== MarkdownSerializer.ELEMENT_NODE) {
            // Comments, processing instructions, etc. produce no markdown.
            return;
        }

        const element = node as HTMLElement;
        const tagDef = this.tagMap.get(element.nodeName);

        // A custom serializer handles the entire element, children included.
        if (tagDef?.serialize) {
            const markdown = tagDef.serialize(
                element,
                () => this.serializeChildren(element),
            );
            // The returned string is already finished markdown, so it is
            // carried in an 'html' token, which tokensToString emits
            // verbatim instead of escaping it a second time.
            out.push({
                role: 'html',
                value: markdown,
            });
            return;
        }

        // Delimiter-based element: open + children + close. An empty
        // delimiter is falsy and falls through to the unknown-element case.
        const delimiter = tagDef?.delimiter;
        if (delimiter) {
            out.push({
                role: 'open',
                value: delimiter,
                delimiter,
            });
            this.appendChildTokens(element, out);
            out.push({
                role: 'close',
                value: delimiter,
                delimiter,
            });
            return;
        }

        // Unknown element: contributes nothing itself, only its children.
        this.appendChildTokens(element, out);
    }

    /**
     * Append the tokens of every child of `element` to `out`, in document
     * order. The child list is snapshotted first, as a live NodeList could
     * change if a custom serializer touches the DOM.
     */
    private appendChildTokens(element: HTMLElement, out: InlineToken[]): void {
        for (const child of Array.from(element.childNodes)) {
            this.appendNodeTokens(child, out);
        }
    }

    /**
     * Serialize an element's children directly to a markdown string.
     * Used by custom serializers (links, headings, etc.) that need
     * the children as a string, not as tokens.
     */
    private serializeChildren(element: HTMLElement): string {
        return this.tokensToString(this.childrenToTokens(element));
    }

    /**
     * Convert a token stream to a markdown string. This is where
     * escaping happens: text tokens have their delimiter characters
     * backslash-escaped; every other token role is structural or
     * pre-formatted and passes through verbatim.
     */
    private tokensToString(tokens: InlineToken[]): string {
        let result = '';
        for (const token of tokens) {
            result += token.role === 'text'
                ? this.escapeText(token.value)
                : token.value;
        }
        return result;
    }

    /**
     * Escape characters in literal text that would be misinterpreted
     * as markdown syntax on re-parse. A backslash is inserted before:
     *
     *   - `\`, `_` and `[` (always);
     *   - `<` when the next character is an ASCII letter or `/`, i.e. when
     *     it would start an HTML tag (a lone `<`, or one at the very end
     *     of the text, is left alone);
     *   - any character registered in `delimiterChars`.
     *
     * The text is examined one UTF-16 code unit at a time.
     */
    private escapeText(text: string): string {
        let result = '';
        for (let position = 0; position < text.length; position++) {
            const character = text.charAt(position);
            const startsHtmlTag =
                character === '<' &&
                position + 1 < text.length &&
                MarkdownSerializer.TAG_START.test(text.charAt(position + 1));
            const mustEscape =
                character === '\\' ||
                character === '_' ||
                character === '[' ||
                startsHtmlTag ||
                this.delimiterChars.has(character);
            result += mustEscape ? '\\' + character : character;
        }
        return result;
    }
}