"""Wikitext → base prose conversion.
Arborist stores raw MediaWiki wikitext in ``chunks.content`` so the link
graph and original markup are recoverable from any page on demand. For
LLM context and post-LLM faithfulness verification we need *prose* — a
deterministic plain-text projection of the same chunk.
This module provides that projection. ``to_base(raw)`` is a pure function
of its input plus ``BASE_VERSION``: same wikitext → same prose, forever,
as long as ``BASE_VERSION`` is unchanged.
Versioning protocol
-------------------
Bump ``BASE_VERSION`` whenever the algorithm changes. Callers fold
``BASE_VERSION`` into ``governance_policy_hash`` (via ``policy["base_version"]``
in ``arborist.qa.runner`` / ``arborist.qa.query``) so a bump invalidates every
prior providence-cache record's 8-dim cache_key on the next lookup. No
schema migration; the next ``ask`` re-derives against fresh prose.
Algorithm (wikitext-base-v1)
----------------------------
1. Parse with ``mwparserfromhell`` (handles nested templates, complex
tables, and edge cases that pure regex mangles).
2. Drop ``<ref>...</ref>`` and self-closing ``<ref ... />`` tags. Citations
are not quotable claims about the topic.
3. Drop namespace-prefixed wikilinks: ``[[File:...]]``, ``[[Image:...]]``,
``[[Category:...]]``. Image params (``thumb|250px|...``) and category
tags are not prose; they're metadata.
4. ``strip_code(normalize=True, collapse=True)`` — removes surviving
templates and reduces wikilinks to their display text, headers to bare
text, bold/italic markers to plain text, HTML tags to their inner text,
HTML entities to characters, and external links to their anchor text.
5. Whitespace pass: collapse runs of spaces/tabs, drop trailing space on
lines, collapse 3+ newlines to 2.
Optional dependency. Install with ``pip install arborist[wikitext]``.
"""
from __future__ import annotations
import re
from typing import TYPE_CHECKING
try:
import mwparserfromhell as _mw
except ImportError as e: # pragma: no cover
raise ImportError(
"wikitext base conversion requires extras: "
"pip install 'arborist[wikitext]'"
) from e
if TYPE_CHECKING:
from arborist.sources.loss_report import LossCollector
# Identifier of the projection algorithm implemented in this module. It is
# passed as ``canonicalization_version`` on every loss-collector record, and
# per the module docstring it must be bumped whenever the algorithm changes.
BASE_VERSION = "wikitext-base-v1"
# NOTE(review): not referenced in the visible part of this module —
# presumably a label read by callers; confirm where it is consumed.
ADAPTER_NAME = "WikitextBase"
# Namespaces whose links carry no prose. ``File`` and ``Image`` are the
# same target type (image inclusion); MediaWiki accepts both prefixes.
# ``Category`` tags categorize a page but don't render as readable prose
# in the article body.
# Entries must be lowercase: ``to_base`` compares them against the
# lowercased text that precedes the first ":" of a link title.
_DROP_NAMESPACES = frozenset({"file", "image", "category"})
# Whitespace normalization patterns, applied by ``to_base`` in the order
# they are defined here.
# A run of spaces and/or tabs; replaced by a single space.
_WS_RUN = re.compile(r"[ \t]+")
# Spaces directly before a newline. Tabs need no handling here because
# ``_WS_RUN`` has already turned them into spaces by the time this runs.
_TRAILING_WS = re.compile(r" +\n")
# Three or more consecutive newlines; replaced by exactly two, i.e. a
# single blank line between paragraphs.
_BLANK_LINES = re.compile(r"\n{3,}")
def to_base(
    raw: str,
    *,
    loss_collector: "LossCollector | None" = None,
) -> str:
    """Project raw wikitext onto plain prose.

    Four passes run in a fixed order: remove ``<ref>`` tags, remove
    ``File:`` / ``Image:`` / ``Category:`` wikilinks, flatten the remaining
    markup with ``strip_code``, and finally normalize whitespace. Any change
    to the text produced here has to be accompanied by a ``BASE_VERSION``
    bump, as described in the module docstring.

    Empty or whitespace-only ``raw`` yields ``""`` immediately; in that
    case ``loss_collector`` is not touched at all.

    When ``loss_collector`` is given it receives one ``add`` call per
    removed tag or link, one ``record_delta`` call each for the
    ``strip_code`` pass and the whitespace pass, and a closing
    ``set_lengths`` call with the UTF-8 sizes of input and output. The
    collector is only ever written to, so the returned text is the same
    with or without it.
    """
    if not (raw and raw.strip()):
        return ""

    def utf8_size(text: str) -> int:
        # Size in UTF-8 bytes; ``surrogatepass`` lets lone surrogates be
        # encoded instead of raising.
        return len(text.encode("utf-8", errors="surrogatepass"))

    wikicode = _mw.parse(raw)

    # Pass 1 — citations. The tag name is compared case-insensitively so
    # that <REF>, <Ref>, ... are caught as well. We walk a snapshot of the
    # tag list because the tree is mutated inside the loop.
    for node in list(wikicode.filter_tags()):
        if str(node.tag).strip().lower() != "ref":
            continue
        removed_markup = str(node)
        try:
            wikicode.remove(node)
        except ValueError:
            # The node already went away together with an enclosing node
            # removed earlier; mwparserfromhell raises rather than
            # ignoring the request.
            continue
        if loss_collector is None:
            continue
        if getattr(node, "self_closing", False):
            ref_kind = "self_closing_ref_tag"
        else:
            ref_kind = "ref_tag"
        loss_collector.add(
            stage="wikitext_base",
            canonicalization_version=BASE_VERSION,
            loss_kind=ref_kind,
            loss_mode="pure_drop",
            dropped=removed_markup,
        )

    # Pass 2 — File: / Image: / Category: links. Image captions can hold
    # real prose, but the technical parameters around them (thumb, sizes)
    # would pollute the output, so the whole link is dropped.
    link_kinds = {
        "file": "file_link",
        "image": "image_link",
        "category": "category_link",
    }
    for node in list(wikicode.filter_wikilinks()):
        prefix, colon, _ = str(node.title).strip().partition(":")
        if not colon:
            # No namespace prefix: an ordinary article link, keep it.
            continue
        namespace = prefix.strip().lower()
        if namespace not in _DROP_NAMESPACES:
            continue
        removed_markup = str(node)
        try:
            wikicode.remove(node)
        except ValueError:
            # Same situation as for tags: removed along with a parent.
            continue
        if loss_collector is not None:
            loss_collector.add(
                stage="wikitext_base",
                canonicalization_version=BASE_VERSION,
                loss_kind=link_kinds[namespace],
                loss_mode="pure_drop",
                dropped=removed_markup,
            )

    # Pass 3 — flatten whatever markup is left. This rewrites rather than
    # drops content, so only the byte difference is reported, under the
    # 'transform' mode.
    markup_bytes = 0
    if loss_collector is not None:
        markup_bytes = utf8_size(str(wikicode))
    prose = wikicode.strip_code(normalize=True, collapse=True)
    stripped_bytes = 0
    if loss_collector is not None:
        stripped_bytes = utf8_size(prose)
        loss_collector.record_delta(
            stage="wikitext_base",
            canonicalization_version=BASE_VERSION,
            loss_kind="strip_code_transform",
            loss_mode="transform",
            bytes_delta=markup_bytes - stripped_bytes,
        )

    # Pass 4 — whitespace. The order matters: tabs must become spaces
    # before trailing spaces are trimmed, and trimming can expose new runs
    # of blank lines that the last pattern then collapses.
    for pattern, replacement in (
        (_WS_RUN, " "),
        (_TRAILING_WS, "\n"),
        (_BLANK_LINES, "\n\n"),
    ):
        prose = pattern.sub(replacement, prose)
    prose = prose.strip()

    if loss_collector is not None:
        final_bytes = utf8_size(prose)
        # ``prose`` was not modified between the strip_code measurement
        # and the whitespace pass, so that measurement is the "before"
        # size here.
        loss_collector.record_delta(
            stage="wikitext_base",
            canonicalization_version=BASE_VERSION,
            loss_kind="whitespace_run",
            loss_mode="normalize",
            bytes_delta=stripped_bytes - final_bytes,
        )
        loss_collector.set_lengths(
            input_bytes=utf8_size(raw),
            output_bytes=final_bytes,
        )
    return prose