Source code for arborist.qa.keys

"""The 8-dim Merkle-AGI v9.8 cache_key.

v9.8 invariant: no answer is reused unless all eight match and the
record is live (not failed/stale/quarantined):

    1. source_root              — content fingerprint of the document
    2. question_hash             — SHA-256 of normalized question text
    3. model_profile_hash        — model_id + revision + quantization
    4. conversation_hash         — full canonical OpenAI messages array
    5. governance_policy_hash    — sampling/policy parameters dict
    6. schema_version            — arborist DB schema version
    7. canonicalization_version  — text normalization rules
    8. chunking_version          — chunker name & parameters

Bumping ANY of these eight dimensions yields a distinct cache_key, so
prior records cannot be served. This is the runtime drift detection
the providence whitepaper compresses into "cache_key = source_root +
':' + question_hash" — that's a simplification; the rigorous form is
all eight dimensions hashed together. ``cache_key`` additionally accepts
an optional ninth dimension, ``verifier_policy_hash``; omitting it
reproduces the eight-dimension key described above.
"""

from __future__ import annotations

import hashlib
import json

from arborist.document import canonicalize


def _sha256(s: str) -> str:
    """Return the lowercase hex SHA-256 digest of ``s`` encoded as UTF-8.

    The ``surrogatepass`` error handler lets lone UTF-16 surrogates (seen
    in model output) be encoded instead of raising ``UnicodeEncodeError``;
    same rationale as ``arborist.qa.dag._sha256_hex``.
    """
    payload = s.encode("utf-8", errors="surrogatepass")
    digest = hashlib.sha256(payload)
    return digest.hexdigest()


def _canonical_json(obj) -> str:
    """Serialize ``obj`` to a deterministic, compact JSON string.

    Keys are sorted and separators carry no whitespace, so equal objects
    always produce the same text. Non-ASCII characters are emitted as-is
    rather than as ``\\uXXXX`` escapes.
    """
    compact_separators = (",", ":")
    return json.dumps(
        obj,
        ensure_ascii=False,
        separators=compact_separators,
        sort_keys=True,
    )


# Write-time question dedup: selects how a question is canonicalized
# before hashing, and therefore under which `cache_key` a record lands.
QUESTION_DEDUP_MODES = (
    "strict",
    "equivalence_class",
)
DEFAULT_QUESTION_DEDUP = "equivalence_class"

# Lookup-time fidelity. Independent of the write-time `question_dedup`
# setting: writing decides which `cache_key` a record is stored under,
# fidelity decides which `cache_key`s a lookup is allowed to probe.
#
#   strict             probe only the cache_key that matches the agent's
#                      own policy. No fallback. Audit-grade behavior.
#   equivalence_class  probe the primary cache_key first; on a miss, if
#                      the OTHER dedup mode yields a different cache_key,
#                      probe that alternate as well. This lets a
#                      fast-cache agent reuse records written under
#                      either mode.
FIDELITY_MODES = (
    "strict",
    "equivalence_class",
)
DEFAULT_FIDELITY = "equivalence_class"


def canonical_question(
    question: str, *, mode: str = DEFAULT_QUESTION_DEDUP
) -> str:
    """Return the canonical form of ``question`` under dedup ``mode``.

    Modes:

    - ``"equivalence_class"`` (default): four steps — ``canonicalize()``
      (documented in this module as NFC + whitespace collapse + end
      strip), lowercase, strip trailing punctuation found in
      ``_QUESTION_TRAILING_STRIP``, then drop standalone article tokens
      found in ``_QUESTION_ARTICLE_STRIP`` (``the``, ``a``, ``an``).
      Variants such as "Who is THE Batman?" / "who is batman" collapse
      to one form. Intended for chat-style agents that prefer fast cache
      hits.

    - ``"strict"``: ``canonicalize()`` only. Case-, punctuation- and
      article-sensitive, so every distinct phrasing keeps its own
      answer. Intended for audit-grade agents.

    This is exposed separately from ``question_hash`` so callers can
    dedup BEFORE hashing — e.g. put the canonical form into the user
    message that feeds ``conversation_hash`` while still sending the
    verbatim question to the LLM. Without that split, phrasings that
    share a ``question_hash`` under equivalence_class would still differ
    on ``conversation_hash`` and miss the cache.

    The chosen mode flows through the ``question_dedup`` policy field
    into ``governance_policy_hash``, so agents running different modes
    write under different ``cache_key`` values and never collide.

    Args:
        question: Raw question text.
        mode: One of ``QUESTION_DEDUP_MODES``.

    Returns:
        The canonicalized question string.

    Raises:
        ValueError: If ``mode`` is not in ``QUESTION_DEDUP_MODES``.
    """
    if mode not in QUESTION_DEDUP_MODES:
        raise ValueError(
            f"question dedup mode must be one of {QUESTION_DEDUP_MODES}, got {mode!r}"
        )

    normalized = canonicalize(question)
    if mode != "strict":
        # Equivalence-class path: fold case and trailing punctuation
        # first, then filter whole-token articles (exact match only, so
        # words that merely contain an article are untouched).
        folded = normalized.lower().rstrip(_QUESTION_TRAILING_STRIP)
        normalized = " ".join(
            word for word in folded.split() if word not in _QUESTION_ARTICLE_STRIP
        )
    return normalized
def question_hash(
    question: str, *, mode: str = DEFAULT_QUESTION_DEDUP
) -> str:
    """Return the SHA-256 of the dedup-mode-canonicalized question.

    ``canonical_question`` defines what each mode does; this function
    hashes its result.

    Changing ``_QUESTION_TRAILING_STRIP`` or ``_QUESTION_ARTICLE_STRIP``
    (the equivalence-class strip sets) orphans prior cache records whose
    canonical question contained newly-stripped tokens: they remain as
    history but are not re-hit on lookup.

    Equivalence class examples (``mode="equivalence_class"``)::

        "who is X"       |
        "who is X?"      |
        "Who Is X."      |
        "who is the X"   |  -> same question_hash
        "who is a X"     |
        "who is an X?"   |
        "who is X" + CJK question mark (U+FF1F)

    Strict mode (``mode="strict"``) distinguishes all of those.

    The trailing-strip set is documented as: ``.?!,;:`` (ASCII), the CJK
    full-width question/exclamation marks and ideographic full stop and
    comma, and the horizontal ellipsis. Paired marks such as quotes,
    ``)``, ``]`` and ``}`` are NOT stripped — naive one-sided stripping
    breaks balance. Apostrophes are not stripped either — ``X's`` is a
    different question from ``X``.

    Args:
        question: Raw question text.
        mode: One of ``QUESTION_DEDUP_MODES``; passed through to
            ``canonical_question``.

    Returns:
        Hex SHA-256 digest of the canonical question.
    """
    canonical_form = canonical_question(question, mode=mode)
    return _sha256(canonical_form)
# Trailing punctuation that carries no semantic difference at the end
# of a question. Order doesn't matter (rstrip walks char-by-char from
# the right). Repeats are handled trivially: ``X???`` -> ``X``.
#
# The non-ASCII members are written as explicit ``\u`` escapes so the set
# cannot be silently degraded by an editor or pipeline that normalizes
# full-width punctuation to its ASCII look-alike (which would leave the
# full-width forms un-stripped).
_QUESTION_TRAILING_STRIP = (
    ".?!,;:"    # ASCII
    "\uff1f"    # U+FF1F full-width question mark
    "\uff01"    # U+FF01 full-width exclamation mark
    "\u3002"    # U+3002 ideographic full stop
    "\u3001"    # U+3001 ideographic comma
    "\u2026"    # U+2026 horizontal ellipsis
)

# English articles stripped as standalone tokens after lowercasing. The
# question equivalence class treats "the foo" and "foo" as the same
# question — fox's 2026-04-29 catch: `who is the batman` & `who is
# batman` produced different cache records under earlier rules. Tokens
# are matched as EXACT lowercase strings, so words like "thesis"
# (contains "the") stay untouched.
#
# Conservative on purpose: only ASCII English articles. "El", "la",
# "los", "le", "les", "der", "die", "das" etc. are not stripped today.
# Adding them when needed flows through the same equivalence-class
# expansion the trailing-punctuation set went through.
_QUESTION_ARTICLE_STRIP = frozenset({"the", "a", "an"})
def model_profile_hash(
    model_id: str,
    revision: str = "",
    quantization: str = "",
) -> str:
    """Return the SHA-256 of the model identity.

    The three fields are joined with ``|`` in the order
    ``model_id|revision|quantization``; changing any of them changes the
    hash and therefore the cache key.

    Args:
        model_id: Model identifier.
        revision: Model revision; defaults to the empty string.
        quantization: Quantization label; defaults to the empty string.

    Returns:
        Hex SHA-256 digest of the joined identity string.
    """
    identity = f"{model_id}|{revision}|{quantization}"
    return _sha256(identity)
def conversation_hash(messages: list[dict]) -> str:
    """Return the SHA-256 of the canonical JSON of the full messages array.

    ``messages`` is the OpenAI-style messages list. List order is part
    of the serialized form, so a multi-turn dialogue that ends on the
    same final question hashes differently from a single-turn ask.

    Args:
        messages: The complete messages array for the request.

    Returns:
        Hex SHA-256 digest of the canonical JSON serialization.
    """
    serialized = _canonical_json(messages)
    return _sha256(serialized)
def governance_policy_hash(policy: dict) -> str:
    """Return the SHA-256 of the canonical JSON of the sampling/policy dict.

    The whole dict is hashed. Per this module's documentation it carries
    temperature, top_p, max_tokens and the system prompt; a change to any
    entry means the answer is governed differently, so the cache must
    miss.

    Args:
        policy: The sampling/policy parameters dict.

    Returns:
        Hex SHA-256 digest of the canonical JSON serialization.
    """
    serialized = _canonical_json(policy)
    return _sha256(serialized)
# Verifier-policy fields: the subset of `policy` keys that describe what
# the deterministic verifier does. Kept apart from the broader
# `governance_policy_hash` so an auditor can answer "did the verifier
# rules change?" from a single hash diff instead of scanning the whole
# policy. See docs/cti-architecture.md §6 + the de-novo synthesis
# (2026-05-01) on verifier-policy identity.
#
# Adding a field here bumps `verifier_policy_hash` for every cached
# record on next lookup; removing a field does the same. Reordering
# does not — membership is what matters, not position.
_VERIFIER_POLICY_FIELDS = frozenset((
    # --- Mode + parser identity ---
    "answer_mode",

    # --- Pointer-mode hard checks ---
    "claim_lattice_max_pointers_per_claim",
    "claim_lattice_min_citation_coverage",
    "claim_lattice_min_claim_content_tokens",
    "claim_lattice_lazy_anchor_demote_threshold",
    "claim_lattice_lazy_anchor_demote_min_pairs",
    "claim_lattice_allowed_source_roles",

    # --- Retrieval-side knob with verifier consequences ---
    "claim_lattice_max_chunks_per_source",

    # --- JSON variant identity ---
    "claim_lattice_use_guided_json",
    "claim_lattice_json_stop_sequences",

    # --- Warrant-lite (relation-question hard check, Ticket H,
    #     2026-05-01) ---
    "claim_lattice_warrant_check_enabled",
    "claim_lattice_deflection_check_enabled",
    "claim_lattice_format_collapse_check_enabled",

    # --- Subject-tokens-absent / premise-parroting (Ticket #000006
    #     amend 2026-05-02b, Rule 9) ---
    # Threshold of question∩claim content tokens absent from the cited
    # evidence union that demotes STRICT → HYBRID.
    "claim_lattice_subject_tokens_absent_threshold",

    # --- Ticket #000008 Phase 2-4: quantifier preflight guard ---
    # These seven fields together control whether broad-quantifier
    # classification affects the per-call claim cap, which modes are
    # gated, whether the reminder fires, and whether the reject-broad
    # early return takes over. (#000010 adds six more for
    # metacognition; see the next group.)
    #
    # Adding a model to model_profiles.py PROFILES doesn't bump
    # governance_policy_hash on its own (that dict isn't policy), but
    # flipping any of these knobs DOES bump the hash, which invalidates
    # prior cache records on lookup — exactly the invalidation we want
    # when a guard knob changes.
    "quantifier_guard_enabled",
    "quantifier_guard_apply_caps",
    "quantifier_apply_caps_modes",
    "quantifier_caps_by_intensity",
    "quantifier_guard_modes",
    "quantifier_reminder_enabled",
    "quantifier_reject_broad",

    # --- Ticket #000010: Meta-Cognition Preflight Guard ---
    # These six fields together control whether preflight runs and
    # which detectors fire. Flipping any of them invalidates prior
    # cache records on lookup — same governance discipline as #000008.
    "metacognition_enabled",
    "metacognition_temporal_check",
    "metacognition_contradiction_check",
    "metacognition_false_premise_check",
    "metacognition_out_of_corpus_check",
    "metacognition_block_on_contradiction",

    # --- Quote-mode entity policy ---
    "entity_policy",
    "entity_proximity_n",
    "entity_proximity_window",

    # --- Wikitext base-prose pinning (changes verifier surface) ---
    "base_version",

    # --- Verifier content-token rules version (#000053) ---
    # Bumping the value (e.g. adding a token class) invalidates prior
    # cached records — the verifier's TITLE_MISMATCH /
    # subject-tokens-absent / spotlight decisions depend on which
    # tokens count as content.
    "content_token_rules",
))
def verifier_policy_hash(policy: dict) -> str:
    """Return the SHA-256 of the verifier-relevant subset of ``policy``.

    Only keys listed in ``_VERIFIER_POLICY_FIELDS`` are kept; the
    filtered dict is serialized as canonical JSON and hashed. A policy
    with no verifier keys (including an empty dict) therefore yields the
    constant ``sha256("{}")``.

    The result is folded into ``cache_key`` as a 9th dimension so a
    verifier-policy change is observable from the cache_key alone,
    separately from ``governance_policy_hash`` (which also folds in
    temperature / top_p / prompts).

    The two hashes overlap on purpose: verifier fields live in the
    broader policy dict too, so changing a verifier rule bumps BOTH
    dimensions, whereas changing a non-verifier field (e.g. temperature)
    bumps ONLY ``governance_policy_hash``. That asymmetry is what makes
    the audit legible — which dimension moved answers a question that
    scanning the whole policy dict cannot.

    Args:
        policy: The full sampling/policy dict.

    Returns:
        Hex SHA-256 digest of the canonical JSON of the filtered dict.
    """
    verifier_view = {
        field: value
        for field, value in policy.items()
        if field in _VERIFIER_POLICY_FIELDS
    }
    serialized = _canonical_json(verifier_view)
    return _sha256(serialized)
def cache_key(
    source_root: str,
    question_hash_value: str,
    model_profile_hash_value: str,
    conversation_hash_value: str,
    governance_policy_hash_value: str,
    schema_version: str,
    canonicalization_version: str,
    chunking_version: str,
    verifier_policy_hash_value: str | None = None,
) -> str:
    """Return the SHA-256 of the cache-identity dimensions joined with '|'.

    8-dim form (legacy): omit ``verifier_policy_hash_value`` (or pass
    ``None``). The result matches pre-2026-05-01 cache identity and
    stays backward compatible with records written before the 9th
    dimension landed.

    9-dim form: pass ``verifier_policy_hash_value`` explicitly. Records
    written this way bind to the verifier-policy identity, so a lookup
    with a different verifier_policy_hash misses. The 9th dimension is
    the explicit "did the verifier rules change?" gate.

    Any drift in any dimension produces a distinct cache_key.

    Args:
        source_root: Content fingerprint of the document.
        question_hash_value: Output of ``question_hash``.
        model_profile_hash_value: Output of ``model_profile_hash``.
        conversation_hash_value: Output of ``conversation_hash``.
        governance_policy_hash_value: Output of ``governance_policy_hash``.
        schema_version: Arborist DB schema version.
        canonicalization_version: Text normalization rules version.
        chunking_version: Chunker name & parameters version.
        verifier_policy_hash_value: Optional output of
            ``verifier_policy_hash``; ``None`` selects the 8-dim form.

    Returns:
        Hex SHA-256 digest of the '|'-joined dimensions.
    """
    dimensions = (
        source_root,
        question_hash_value,
        model_profile_hash_value,
        conversation_hash_value,
        governance_policy_hash_value,
        schema_version,
        canonicalization_version,
        chunking_version,
    )
    if verifier_policy_hash_value is None:
        # Legacy 8-dim identity: nothing is appended, so the key is
        # unchanged for callers that do not supply the 9th dimension.
        joined = "|".join(dimensions)
    else:
        joined = "|".join((*dimensions, verifier_policy_hash_value))
    return _sha256(joined)