"""The 8-dim Merkle-AGI v9.8 cache_key.
v9.8 invariant: no answer is reused unless all eight match and the
record is live (not failed/stale/quarantined):
1. source_root — content fingerprint of the document
2. question_hash — SHA-256 of normalized question text
3. model_profile_hash — model_id + revision + quantization
4. conversation_hash — full canonical OpenAI messages array
5. governance_policy_hash — sampling/policy parameters dict
6. schema_version — arborist DB schema version
7. canonicalization_version — text normalization rules
8. chunking_version — chunker name & parameters
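Bumping ANY of these eight dimensions yields a distinct cache_key, so
prior records cannot be served. This is the runtime drift detection
the providence whitepaper compresses into "cache_key = source_root +
':' + question_hash" — that's a simplification; the rigorous form is
all eight dimensions hashed together. ``cache_key()`` additionally
accepts an optional ninth dimension, ``verifier_policy_hash``; when it
is omitted the key is the legacy eight-dimension form described above.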
"""
from __future__ import annotations
import hashlib
import json
from arborist.document import canonicalize
def _sha256(s: str) -> str:
# ``errors='surrogatepass'`` survives lone UTF-16 surrogates from
# model output; same rationale as ``arborist.qa.dag._sha256_hex``.
return hashlib.sha256(s.encode("utf-8", errors="surrogatepass")).hexdigest()
def _canonical_json(obj) -> str:
return json.dumps(obj, sort_keys=True, separators=(",", ":"), ensure_ascii=False)
# Write-time question dedup: the modes ``canonical_question`` accepts,
# and the one used when the caller does not choose.
QUESTION_DEDUP_MODES = (
    "strict",
    "equivalence_class",
)
DEFAULT_QUESTION_DEDUP = "equivalence_class"

# Lookup-time fidelity. Deliberately decoupled from the write-time
# `question_dedup` setting: the write side decides under which
# `cache_key` a record is stored, fidelity decides which `cache_key`s
# a lookup is allowed to probe.
#
#   strict             probe only the cache_key that matches the
#                      agent's own policy. No fallback. Audit-grade.
#   equivalence_class  probe the primary cache_key first; on a miss,
#                      if the OTHER dedup mode yields a different
#                      cache_key, probe that one too. This lets a
#                      fast-cache agent reuse records written under
#                      either mode.
FIDELITY_MODES = (
    "strict",
    "equivalence_class",
)
DEFAULT_FIDELITY = "equivalence_class"
[docs]
def canonical_question(
    question: str, *, mode: str = DEFAULT_QUESTION_DEDUP
) -> str:
    """Return the canonical text of ``question`` under dedup ``mode``.

    Modes:

    - ``"strict"``: the result of ``canonicalize()`` and nothing else
      (described by this module as NFC + whitespace collapse + end
      strip). Case, punctuation and articles are all preserved, so
      every distinct phrasing keeps its own identity. Intended for
      audit-grade agents.
    - ``"equivalence_class"`` (default): ``canonicalize()``, then
      lowercase, then strip trailing characters listed in
      ``_QUESTION_TRAILING_STRIP``, then drop whitespace-separated
      tokens that are exactly one of ``_QUESTION_ARTICLE_STRIP``
      (``the``, ``a``, ``an``) and re-join with single spaces. So
      "Who is THE Batman?", "who is batman" and "who is batman."
      all collapse to one form. Intended for chat-style agents that
      prefer fast cache hits.

    This is a public function so callers can dedup BEFORE hashing —
    for instance, place the canonical form in the user message that
    feeds ``conversation_hash`` while still sending the verbatim
    question to the LLM. Otherwise phrasings that agree on
    ``question_hash`` would still differ on ``conversation_hash`` and
    miss the cache.

    The chosen mode is expected to travel in the ``question_dedup``
    policy field and therefore into ``governance_policy_hash``, so
    agents using different modes write under different ``cache_key``
    values and never collide.

    Args:
        question: Raw question text.
        mode: One of ``QUESTION_DEDUP_MODES``.

    Returns:
        The canonical question string. It can be empty, e.g. when the
        question consists only of articles in equivalence-class mode.

    Raises:
        ValueError: If ``mode`` is not in ``QUESTION_DEDUP_MODES``.
    """
    # Validate first so an unknown mode never reaches canonicalize().
    if mode not in QUESTION_DEDUP_MODES:
        raise ValueError(
            f"question dedup mode must be one of {QUESTION_DEDUP_MODES}, got {mode!r}"
        )

    normalized = canonicalize(question)
    if mode != "strict":
        folded = normalized.lower()
        trimmed = folded.rstrip(_QUESTION_TRAILING_STRIP)
        # split() with no argument also swallows any whitespace that the
        # punctuation strip left dangling at the end.
        normalized = " ".join(
            word for word in trimmed.split() if word not in _QUESTION_ARTICLE_STRIP
        )
    return normalized
[docs]
def question_hash(
    question: str, *, mode: str = DEFAULT_QUESTION_DEDUP
) -> str:
    """Hash a question after dedup-mode canonicalization.

    The value is the SHA-256 hex digest of
    ``canonical_question(question, mode=mode)``; see that function for
    what each mode does and for the ``ValueError`` raised on an
    unknown mode.

    Changing ``_QUESTION_TRAILING_STRIP`` or ``_QUESTION_ARTICLE_STRIP``
    (the equivalence-class strip sets) orphans earlier cache records
    whose canonical question contained the newly stripped tokens: they
    remain as history but are no longer hit on lookup.

    Phrasings that share one hash under ``mode="equivalence_class"``::

        "who is X"
        "who is X?"
        "Who Is X."
        "who is the X"
        "who is a X"
        "who is an X?"

    ``mode="strict"`` keeps all of them distinct.

    ``_QUESTION_TRAILING_STRIP`` is the authoritative trailing-strip
    set. It is meant to cover ASCII ``.?!,;:``, the CJK full-width
    question/exclamation marks (U+FF1F, U+FF01), the ideographic full
    stop and comma (U+3002, U+3001) and the horizontal ellipsis
    (U+2026). Paired closers such as quotes, ``)``, ``]`` and ``}`` are
    deliberately NOT stripped (one-sided stripping would unbalance
    them), and neither are apostrophes — ``X's`` is a different
    question from ``X``.
    """
    canonical_text = canonical_question(question, mode=mode)
    return _sha256(canonical_text)
# Trailing punctuation that carries no semantic difference at the end
# of a question. Order doesn't matter (rstrip walks char-by-char from
# the right). Repeats handled trivially: ``X???`` → ``X``.
#
# ASCII: . ? ! , ; :
# CJK: ? U+FF1F full-width question mark
# ! U+FF01 full-width exclamation
# 。 U+3002 ideographic full stop
# 、 U+3001 ideographic comma
# Other: … U+2026 horizontal ellipsis
_QUESTION_TRAILING_STRIP = ".?!,;:?!。、…"
# English articles stripped as standalone tokens after lowercasing. The
# question equivalence class treats "the foo" and "foo" as the same
# question — fox's 2026-04-29 catch: `who is the batman` & `who is
# batman` produced different cache records under earlier rules. Tokens
# are matched as EXACT lowercase strings, so substrings like "thesis"
# (contains "the") stay untouched.
#
# Conservative on purpose: only ASCII English articles. "El", "la",
# "los", "le", "les", "der", "die", "das" etc. are not stripped today.
# Adding them when needed flows through the same equivalence-class
# expansion the trailing-punctuation set went through.
_QUESTION_ARTICLE_STRIP = frozenset({"the", "a", "an"})
[docs]
def model_profile_hash(
    model_id: str, revision: str = "", quantization: str = ""
) -> str:
    """Hash the model identity triple.

    The three fields are joined as ``model_id|revision|quantization``
    and the SHA-256 hex digest of that text is returned, so changing
    any one of them changes the hash (and with it the cache key).

    Args:
        model_id: Model identifier.
        revision: Model revision; defaults to the empty string.
        quantization: Quantization label; defaults to the empty string.
    """
    identity = f"{model_id}|{revision}|{quantization}"
    return _sha256(identity)
[docs]
def conversation_hash(messages: list[dict]) -> str:
    """Hash the full OpenAI-style messages array.

    The list is serialized with ``_canonical_json`` (sorted keys,
    compact separators) and the SHA-256 hex digest of that text is
    returned. Keys inside each message are order-insensitive, but the
    order of the messages themselves is significant: a 6-turn dialogue
    that ends on the same final question hashes differently from a
    single-turn ask.
    """
    serialized = _canonical_json(messages)
    return _sha256(serialized)
[docs]
def governance_policy_hash(policy: dict) -> str:
    """Hash the whole sampling/policy dict.

    Every key/value pair in ``policy`` contributes: the dict is
    serialized with ``_canonical_json`` and the SHA-256 hex digest of
    that text is returned. Callers are expected to put temperature,
    top_p, max_tokens and the system prompt in here — a change to any
    of them means the answer is governed differently, so the cache
    must miss.
    """
    serialized = _canonical_json(policy)
    return _sha256(serialized)
# Verifier-policy fields — the subset of `policy` that names what
# the deterministic verifier does. Separate from the broader
# `governance_policy_hash` so an auditor can answer "did the verifier
# rules change?" with a single hash diff rather than scanning the
# whole policy. See docs/cti-architecture.md §6 + the de-novo
# synthesis (2026-05-01) on verifier-policy identity.
#
# Adding a field here bumps `verifier_policy_hash` for every cached
# record on next lookup. Removing a field does the same. Reordering
# does not (set membership, not list ordering).
_VERIFIER_POLICY_FIELDS = frozenset({
# Mode + parser identity
"answer_mode",
# Pointer-mode hard checks
"claim_lattice_max_pointers_per_claim",
"claim_lattice_min_citation_coverage",
"claim_lattice_min_claim_content_tokens",
"claim_lattice_lazy_anchor_demote_threshold",
"claim_lattice_lazy_anchor_demote_min_pairs",
"claim_lattice_allowed_source_roles",
# Retrieval-side knob with verifier consequences
"claim_lattice_max_chunks_per_source",
# JSON variant identity
"claim_lattice_use_guided_json",
"claim_lattice_json_stop_sequences",
# Warrant-lite (relation-question hard check, Ticket H, 2026-05-01)
"claim_lattice_warrant_check_enabled",
"claim_lattice_deflection_check_enabled",
"claim_lattice_format_collapse_check_enabled",
# Subject-tokens-absent / premise-parroting (Ticket #000006 amend
# 2026-05-02b, Rule 9). Threshold of question∩claim content tokens
# absent from cited evidence union that demotes STRICT → HYBRID.
"claim_lattice_subject_tokens_absent_threshold",
# Ticket #000008 Phase 2-4 — quantifier preflight guard. The seven
# fields together control whether broad-quantifier classification
# affects the per-call claim cap, which modes are gated, whether
# the reminder fires, and whether reject-broad early-return takes
# over. Plus #000010 adds six more for metacognition (see below).
# Adding a model to model_profiles.py PROFILES doesn't bump
# governance_policy_hash on its own (the dict isn't policy);
# but flipping any of these knobs DOES bump the hash, which
# invalidates prior cache records on lookup — exactly the
# invalidation we want when a guard knob changes.
"quantifier_guard_enabled",
"quantifier_guard_apply_caps",
"quantifier_apply_caps_modes",
"quantifier_caps_by_intensity",
"quantifier_guard_modes",
"quantifier_reminder_enabled",
"quantifier_reject_broad",
# Ticket #000010 — Meta-Cognition Preflight Guard. The six
# fields together control whether preflight runs and which
# detectors fire. Flipping any of them invalidates prior cache
# records on lookup — same governance discipline as #000008.
"metacognition_enabled",
"metacognition_temporal_check",
"metacognition_contradiction_check",
"metacognition_false_premise_check",
"metacognition_out_of_corpus_check",
"metacognition_block_on_contradiction",
# Quote-mode entity policy
"entity_policy",
"entity_proximity_n",
"entity_proximity_window",
# Wikitext base-prose pinning (changes verifier surface)
"base_version",
# Verifier content-token rules version (#000053). Bumping the value
# (e.g. adding a token class) invalidates prior cached records —
# the verifier's TITLE_MISMATCH / subject-tokens-absent / spotlight
# decisions depend on which tokens count as content.
"content_token_rules",
})
[docs]
def verifier_policy_hash(policy: dict) -> str:
    """Hash only the verifier-relevant part of ``policy``.

    Keys of ``policy`` that appear in ``_VERIFIER_POLICY_FIELDS`` are
    copied into a sub-dict, which is serialized with
    ``_canonical_json`` and hashed with SHA-256. A policy with none of
    those keys (including an empty dict) therefore yields the constant
    ``sha256("{}")``.

    ``cache_key`` can take this value as a 9th dimension, which makes
    a verifier-policy change visible from the cache_key alone,
    separately from ``governance_policy_hash`` (which also folds in
    temperature / top_p / prompts).

    The overlap between the two hashes is intentional: verifier fields
    live in the broader policy dict, so bumping a verifier rule bumps
    BOTH hashes, while bumping a non-verifier field (e.g. temperature)
    bumps ONLY ``governance_policy_hash``. That asymmetry is what
    keeps the audit legible — which dimension moved answers a question
    that scanning the whole policy dict cannot.
    """
    # Key order is irrelevant here: _canonical_json sorts keys.
    relevant_keys = _VERIFIER_POLICY_FIELDS.intersection(policy)
    verifier_subset = {name: policy[name] for name in relevant_keys}
    serialized = _canonical_json(verifier_subset)
    return _sha256(serialized)
[docs]
def cache_key(
    source_root: str,
    question_hash_value: str,
    model_profile_hash_value: str,
    conversation_hash_value: str,
    governance_policy_hash_value: str,
    schema_version: str,
    canonicalization_version: str,
    chunking_version: str,
    verifier_policy_hash_value: str | None = None,
) -> str:
    """Combine the cache-identity dimensions into one SHA-256 key.

    The dimensions are joined with ``'|'`` in the order of the
    parameters and the joined text is hashed.

    8-dim form (legacy): leave ``verifier_policy_hash_value`` out (or
    pass ``None``). Nothing is appended for it, so the result matches
    the pre-2026-05-01 cache identity and stays compatible with
    records written before the 9th dimension landed.

    9-dim form: pass ``verifier_policy_hash_value`` explicitly. The
    record is then bound to that verifier-policy identity, and a
    lookup with a different verifier_policy_hash misses. This 9th
    dimension is the explicit "did the verifier rules change?" gate.

    Any drift in any dimension produces a distinct cache_key.

    Returns:
        Hex SHA-256 digest of the joined dimensions.
    """
    dimensions = (
        source_root,
        question_hash_value,
        model_profile_hash_value,
        conversation_hash_value,
        governance_policy_hash_value,
        schema_version,
        canonicalization_version,
        chunking_version,
    )
    # Append the 9th dimension only when supplied, so the legacy 8-dim
    # key text is unchanged.
    if verifier_policy_hash_value is not None:
        dimensions += (verifier_policy_hash_value,)
    joined = "|".join(dimensions)
    return _sha256(joined)