# Source code for arborist.wikitext

"""Wikitext → base prose conversion.

Arborist stores raw MediaWiki wikitext in ``chunks.content`` so the link
graph and original markup are recoverable from any page on demand. For
LLM context and post-LLM faithfulness verification we need *prose* — a
deterministic plain-text projection of the same chunk.

This module provides that projection. ``to_base(raw)`` is a pure function
of its input plus ``BASE_VERSION``: same wikitext → same prose, forever,
as long as ``BASE_VERSION`` is unchanged.

Versioning protocol
-------------------
Bump ``BASE_VERSION`` whenever the algorithm changes. Callers fold
``BASE_VERSION`` into ``governance_policy_hash`` (via ``policy["base_version"]``
in ``arborist.qa.runner`` / ``arborist.qa.query``) so a bump invalidates every
prior providence-cache record's 8-dim cache_key on the next lookup. No
schema migration; the next ``ask`` re-derives against fresh prose.

Algorithm (wikitext-base-v1)
----------------------------
1. Parse with ``mwparserfromhell`` (handles nested templates, complex
   tables, and edge cases that pure regex mangles).
2. Drop ``<ref>...</ref>`` and self-closing ``<ref ... />`` tags. Citations
   are not quotable claims about the topic.
3. Drop namespace-prefixed wikilinks: ``[[File:...]]``, ``[[Image:...]]``,
   ``[[Category:...]]``. Image params (``thumb|250px|...``) and category
   tags are not prose; they're metadata.
4. ``strip_code(normalize=True, collapse=True)`` — converts surviving
   templates to empty, wikilinks to their display text, headers to bare
   text, bold/italic markers to plain text, HTML tags to inner text,
   HTML entities to characters, external links to anchor text.
5. Whitespace pass: collapse runs of spaces/tabs to a single space, drop
   trailing spaces on lines, collapse 3+ newlines to 2, then strip leading
   and trailing whitespace from the whole result.

Optional dependency. Install with ``pip install arborist[wikitext]``.
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING

try:
    import mwparserfromhell as _mw
except ImportError as e:  # pragma: no cover
    raise ImportError(
        "wikitext base conversion requires extras: "
        "pip install 'arborist[wikitext]'"
    ) from e

if TYPE_CHECKING:
    from arborist.sources.loss_report import LossCollector


# Identifier of the conversion algorithm implemented in this module. Per the
# module docstring it must be bumped whenever the algorithm changes; it is
# also passed as ``canonicalization_version`` on every loss record emitted
# by ``to_base``.
BASE_VERSION = "wikitext-base-v1"
# Name of this adapter. NOTE(review): not referenced anywhere in the visible
# part of this module — presumably consumed by callers; confirm before
# renaming.
ADAPTER_NAME = "WikitextBase"

# Namespaces whose links carry no prose. ``File`` and ``Image`` are the
# same target type (image inclusion); MediaWiki accepts both prefixes.
# ``Category`` tags categorize a page but don't render as readable prose
# in the article body.
# Entries are lowercase: ``to_base`` lowercases the link's namespace prefix
# before testing membership.
_DROP_NAMESPACES = frozenset({"file", "image", "category"})

# Patterns for the final whitespace pass in ``to_base``, applied in this order.
# One or more consecutive spaces/tabs; each run is replaced by a single space.
_WS_RUN = re.compile(r"[ \t]+")
# One or more spaces immediately before a newline; replaced by the bare
# newline. Tabs are not matched here — ``_WS_RUN`` has already turned them
# into spaces by the time this runs.
_TRAILING_WS = re.compile(r" +\n")
# Three or more consecutive newlines; replaced by exactly two, so at most
# one blank line separates paragraphs.
_BLANK_LINES = re.compile(r"\n{3,}")


# Maps the lowercased namespace of a dropped wikilink to the ``loss_kind``
# recorded for it. Keys mirror ``_DROP_NAMESPACES``; a namespace added there
# without an entry here raises ``KeyError`` when a collector is attached.
_LINK_LOSS_KINDS = {
    "file": "file_link",
    "image": "image_link",
    "category": "category_link",
}


def _utf8_len(text: str) -> int:
    """Return the UTF-8 byte length of *text* (lone surrogates are encoded, not rejected)."""
    return len(text.encode("utf-8", errors="surrogatepass"))


def _drop_ref_tags(
    code: "_mw.wikicode.Wikicode",
    loss_collector: "LossCollector | None",
) -> None:
    """Remove ``<ref>...</ref>`` and self-closing ``<ref ... />`` tags from *code* in place.

    Each successful removal is reported to ``loss_collector`` (when given) as
    a ``pure_drop`` carrying the tag's original markup.
    """
    # Snapshot the matches first: ``code`` is mutated while we walk them.
    for tag in list(code.filter_tags()):
        # Match on the tag name case-insensitively so <REF>, <Ref>, etc. all go.
        if str(tag.tag).strip().lower() != "ref":
            continue
        tag_text = str(tag)  # captured before removal, for the loss record
        try:
            code.remove(tag)
        except ValueError:
            # Tag was already removed via a parent node. mwparserfromhell
            # raises rather than no-op'ing; there is nothing left to drop
            # or record.
            continue
        if loss_collector is not None:
            if getattr(tag, "self_closing", False):
                kind = "self_closing_ref_tag"
            else:
                kind = "ref_tag"
            loss_collector.add(
                stage="wikitext_base",
                canonicalization_version=BASE_VERSION,
                loss_kind=kind,
                loss_mode="pure_drop",
                dropped=tag_text,
            )


def _drop_namespace_links(
    code: "_mw.wikicode.Wikicode",
    loss_collector: "LossCollector | None",
) -> None:
    """Remove ``[[File:...]]`` / ``[[Image:...]]`` / ``[[Category:...]]`` links from *code* in place.

    Image captions sometimes contain useful prose ("thumb|250px|<caption>")
    but the technical parameters dominate and corrupt the prose stream, so
    the whole link is dropped.
    """
    for link in list(code.filter_wikilinks()):
        title = str(link.title).strip()
        if ":" not in title:
            continue
        # Text before the first colon is the namespace. A leading-colon
        # title such as ":Category:Foo" yields an empty prefix and is kept.
        ns = title.split(":", 1)[0].strip().lower()
        if ns not in _DROP_NAMESPACES:
            continue
        link_text = str(link)  # captured before removal, for the loss record
        try:
            code.remove(link)
        except ValueError:
            # Already gone together with an enclosing node removed earlier.
            continue
        if loss_collector is not None:
            loss_collector.add(
                stage="wikitext_base",
                canonicalization_version=BASE_VERSION,
                loss_kind=_LINK_LOSS_KINDS[ns],
                loss_mode="pure_drop",
                dropped=link_text,
            )


def _normalize_whitespace(text: str) -> str:
    """Collapse space/tab runs, trailing spaces and 3+ newlines, then strip the ends."""
    text = _WS_RUN.sub(" ", text)
    text = _TRAILING_WS.sub("\n", text)
    text = _BLANK_LINES.sub("\n\n", text)
    return text.strip()


def to_base(
    raw: str,
    *,
    loss_collector: "LossCollector | None" = None,
) -> str:
    """Convert raw wikitext to base prose.

    Deterministic. Idempotent. Empty / whitespace-only input returns ``""``.
    Non-wikitext input (already-clean prose) round-trips unchanged modulo
    whitespace collapsing.

    Steps, in order: parse with ``mwparserfromhell``; drop ``<ref>`` tags;
    drop File/Image/Category wikilinks; ``strip_code(normalize=True,
    collapse=True)``; whitespace normalization.

    When ``loss_collector`` is provided, every drop / transform / normalize
    step records a ``LossEvent`` against the collector. The output bytes
    remain bit-identical to the no-collector path — LossReport is additive
    metadata, never a gate. Default ``None`` keeps the LLM-call path
    (verifier + query + runner) unchanged.

    Args:
        raw: Raw MediaWiki wikitext.
        loss_collector: Optional collector that receives ``add`` calls for
            dropped nodes, ``record_delta`` calls for the strip and
            whitespace steps, and a final ``set_lengths`` call.

    Returns:
        The plain-text projection of ``raw``; ``""`` for empty or
        whitespace-only input.
    """
    if not raw or not raw.strip():
        return ""

    code = _mw.parse(raw)
    _drop_ref_tags(code, loss_collector)
    _drop_namespace_links(code, loss_collector)

    # Byte lengths are bookkeeping only, so skip the encoding work entirely
    # when nobody is collecting.
    pre_strip_len = _utf8_len(str(code)) if loss_collector is not None else 0

    base = code.strip_code(normalize=True, collapse=True)

    post_strip_len = 0
    if loss_collector is not None:
        # strip_code rewrites surviving wikilinks/templates/HTML to their
        # display text — bytes change, content is preserved as text.
        # Recorded as 'transform' (excluded from byte-conservation
        # property test).
        post_strip_len = _utf8_len(base)
        loss_collector.record_delta(
            stage="wikitext_base",
            canonicalization_version=BASE_VERSION,
            loss_kind="strip_code_transform",
            loss_mode="transform",
            bytes_delta=pre_strip_len - post_strip_len,
        )

    # Whitespace normalization — keeps paragraph breaks, drops runs.
    base = _normalize_whitespace(base)

    if loss_collector is not None:
        # The text is unchanged between the strip step and the whitespace
        # pass, so ``post_strip_len`` doubles as the pre-whitespace length;
        # likewise the post-whitespace length is the final output length.
        output_len = _utf8_len(base)
        loss_collector.record_delta(
            stage="wikitext_base",
            canonicalization_version=BASE_VERSION,
            loss_kind="whitespace_run",
            loss_mode="normalize",
            bytes_delta=post_strip_len - output_len,
        )
        loss_collector.set_lengths(
            input_bytes=_utf8_len(raw),
            output_bytes=output_len,
        )

    return base