# Source code for arborist.wikitext

"""Wikitext → base prose conversion.

Arborist stores raw MediaWiki wikitext in ``chunks.content`` so the link
graph and original markup are recoverable from any page on demand. For
LLM context and post-LLM faithfulness verification we need *prose* — a
deterministic plain-text projection of the same chunk.

This module provides that projection. ``to_base(raw)`` is a pure function
of its input plus ``BASE_VERSION``: same wikitext → same prose, forever,
as long as ``BASE_VERSION`` is unchanged.

Versioning protocol
-------------------
Bump ``BASE_VERSION`` whenever the algorithm changes. Callers fold
``BASE_VERSION`` into ``governance_policy_hash`` (via ``policy["base_version"]``
in ``arborist.qa.runner`` / ``arborist.qa.query``) so a bump invalidates every
prior providence-cache record's 8-dim cache_key on the next lookup. No
schema migration; the next ``ask`` re-derives against fresh prose.

Algorithm (wikitext-base-v1)
----------------------------
1. Parse with ``mwparserfromhell`` (handles nested templates, complex
   tables, and edge cases that pure regex mangles).
2. Drop ``<ref>...</ref>`` and self-closing ``<ref ... />`` tags. Citations
   are not quotable claims about the topic.
3. Drop namespace-prefixed wikilinks: ``[[File:...]]``, ``[[Image:...]]``,
   ``[[Category:...]]``. Image params (``thumb|250px|...``) and category
   tags are not prose; they're metadata.
4. ``strip_code(normalize=True, collapse=True)`` — converts surviving
   templates to empty, wikilinks to their display text, headers to bare
   text, bold/italic markers to plain text, HTML tags to inner text,
   HTML entities to characters, external links to anchor text.
5. Whitespace pass: collapse runs of spaces/tabs, drop trailing space on
   lines, collapse 3+ newlines to 2.

Optional dependency. Install with ``pip install arborist[wikitext]``.
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING

try:
    import mwparserfromhell as _mw
except ImportError as e:  # pragma: no cover
    raise ImportError(
        "wikitext base conversion requires extras: "
        "pip install 'arborist[wikitext]'"
    ) from e

if TYPE_CHECKING:
    from arborist.sources.loss_report import LossCollector


# Identifier of the conversion algorithm. Stamped onto every loss event as
# ``canonicalization_version``; per the module docstring it must be bumped
# whenever the algorithm's output could change.
BASE_VERSION = "wikitext-base-v1"
# Human-readable adapter label. Not referenced elsewhere in this module —
# presumably consumed by callers; verify before renaming.
ADAPTER_NAME = "WikitextBase"

# Namespaces whose links carry no prose. ``File`` and ``Image`` are the
# same target type (image inclusion); MediaWiki accepts both prefixes.
# ``Category`` tags categorize a page but don't render as readable prose
# in the article body.
# Entries are lowercase: ``to_base`` lowercases the link prefix before the
# membership test.
_DROP_NAMESPACES = frozenset({"file", "image", "category"})

# Whitespace normalization patterns, applied by ``to_base`` in this order.
# One or more spaces/tabs (newlines excluded) — replaced by a single space.
_WS_RUN = re.compile(r"[ \t]+")
# Spaces immediately before a newline. Matches spaces only; tabs have
# already been turned into spaces by the ``_WS_RUN`` pass that runs first.
_TRAILING_WS = re.compile(r" +\n")
# Three or more consecutive newlines — replaced by exactly two, so a
# paragraph break survives but taller gaps do not.
_BLANK_LINES = re.compile(r"\n{3,}")



[docs]
def to_base(
    raw: str,
    *,
    loss_collector: "LossCollector | None" = None,
) -> str:
    """Project raw wikitext onto plain prose.

    Steps, in order: parse with ``mwparserfromhell``; remove ``<ref>``
    tags (paired and self-closing, any letter case); remove wikilinks
    whose namespace prefix is in ``_DROP_NAMESPACES``; call
    ``strip_code(normalize=True, collapse=True)``; normalize whitespace
    (space/tab runs → one space, trailing spaces removed, 3+ newlines →
    2, outer whitespace stripped).

    Args:
        raw: Wikitext to convert. Empty or whitespace-only input yields
            ``""`` without invoking the parser.
        loss_collector: Optional sink for loss bookkeeping. When given,
            each removed node is reported through ``add(...)``, the byte
            deltas of the strip and whitespace phases through
            ``record_delta(...)``, and the overall input/output sizes
            through ``set_lengths(...)``. The collector is only ever
            written to; the returned text does not depend on it.

    Returns:
        The prose projection of ``raw``.

    NOTE(review): the module contract describes this conversion as
    idempotent; that property rests on ``strip_code`` behaviour and is
    not enforced here — confirm with a property test.
    """
    if not (raw and raw.strip()):
        return ""

    tracking = loss_collector is not None

    def _utf8_len(text: str) -> int:
        # Byte length as stored; ``surrogatepass`` keeps lone surrogates
        # from raising during measurement.
        return len(text.encode("utf-8", errors="surrogatepass"))

    def _note_drop(kind: str, dropped_text: str) -> None:
        # A node was removed outright; log its original markup.
        loss_collector.add(
            stage="wikitext_base",
            canonicalization_version=BASE_VERSION,
            loss_kind=kind,
            loss_mode="pure_drop",
            dropped=dropped_text,
        )

    def _note_delta(kind: str, mode: str, delta: int) -> None:
        # A phase rewrote text; log how many bytes it shed.
        loss_collector.record_delta(
            stage="wikitext_base",
            canonicalization_version=BASE_VERSION,
            loss_kind=kind,
            loss_mode=mode,
            bytes_delta=delta,
        )

    wikicode = _mw.parse(raw)

    # --- Phase 1: citations -------------------------------------------
    # Snapshot the node list first; we mutate the tree while walking it.
    for node in list(wikicode.filter_tags()):
        if str(node.tag).strip().lower() != "ref":
            continue
        markup = str(node)
        try:
            wikicode.remove(node)
        except ValueError:
            # Already gone because an enclosing node was removed;
            # mwparserfromhell raises instead of ignoring the request.
            continue
        if tracking:
            _note_drop(
                "self_closing_ref_tag"
                if getattr(node, "self_closing", False)
                else "ref_tag",
                markup,
            )

    # --- Phase 2: File / Image / Category links -----------------------
    # Captions can hold prose, but the surrounding image parameters
    # pollute the text stream, so the whole link is discarded.
    drop_kinds = {
        "file": "file_link",
        "image": "image_link",
        "category": "category_link",
    }
    for node in list(wikicode.filter_wikilinks()):
        prefix, colon, _rest = str(node.title).strip().partition(":")
        if not colon:
            continue
        namespace = prefix.strip().lower()
        if namespace not in _DROP_NAMESPACES:
            continue
        markup = str(node)
        try:
            wikicode.remove(node)
        except ValueError:
            continue
        if tracking:
            _note_drop(drop_kinds[namespace], markup)

    # --- Phase 3: markup → text ---------------------------------------
    if tracking:
        size_before_strip = _utf8_len(str(wikicode))
    prose = wikicode.strip_code(normalize=True, collapse=True)
    if tracking:
        # Surviving links/templates/HTML become display text: the bytes
        # change but the content is kept, hence 'transform' (excluded
        # from the byte-conservation property test).
        _note_delta(
            "strip_code_transform",
            "transform",
            size_before_strip - _utf8_len(prose),
        )

    # --- Phase 4: whitespace ------------------------------------------
    # Order matters: runs are collapsed before trailing spaces are cut.
    size_before_ws = _utf8_len(prose) if tracking else 0
    for pattern, replacement in (
        (_WS_RUN, " "),
        (_TRAILING_WS, "\n"),
        (_BLANK_LINES, "\n\n"),
    ):
        prose = pattern.sub(replacement, prose)
    prose = prose.strip()

    if tracking:
        final_size = _utf8_len(prose)
        _note_delta("whitespace_run", "normalize", size_before_ws - final_size)
        loss_collector.set_lengths(
            input_bytes=_utf8_len(raw),
            output_bytes=final_size,
        )
    return prose




[docs]
def extract_wikilinks(raw: str) -> list[tuple[str, str | None]]:
    """List the ``(target, display)`` pair of each wikilink found in ``raw``.

    Args:
        raw: Wikitext to scan. Empty or whitespace-only input yields
            ``[]`` without invoking the parser.

    Returns:
        One tuple per link, in the order the parser reports them;
        repeated links are not de-duplicated here.

        * ``target`` — the link title, whitespace-trimmed, with
          everything from the first ``#`` onward removed. Links whose
          title is empty after that (e.g. pure ``[[#section]]`` links)
          are skipped. No further normalization is applied: case,
          underscores and a leading ``:`` are returned as written.
        * ``display`` — the trimmed text after the pipe in
          ``[[Target|Display]]``, or ``None`` when there is no pipe or
          the text is empty.

    Unlike ``to_base``, File / Image / Category links are kept: they are
    part of the page's out-link set, which is what the link graph wants.
    """
    if not (raw and raw.strip()):
        return []

    pairs: list[tuple[str, str | None]] = []
    for node in _mw.parse(raw).filter_wikilinks():
        # Cut the ``#section`` fragment so "Foo#bar" and "Foo#baz" both
        # resolve to the page "Foo"; sections are rarely the meaningful
        # unit downstream.
        target = str(node.title).strip().partition("#")[0].strip()
        if not target:
            continue

        label = node.text
        # Treat a missing label and an empty one identically.
        display = None if label is None else (str(label).strip() or None)
        pairs.append((target, display))
    return pairs