Source code for arborist.qa.keys

"""The 8-dim Merkle-AGI v9.8 cache_key.

v9.8 invariant: no answer is reused unless all eight match and the
record is live (not failed/stale/quarantined):

    1. source_root              — content fingerprint of the document
    2. question_hash             — SHA-256 of normalized question text
    3. model_profile_hash        — model_id + revision + quantization
    4. conversation_hash         — full canonical OpenAI messages array
    5. governance_policy_hash    — sampling/policy parameters dict
    6. schema_version            — arborist DB schema version
    7. canonicalization_version  — text normalization rules
    8. chunking_version          — chunker name & parameters

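Bumping ANY of these eight dimensions yields a distinct cache_key, so
prior records cannot be served. This is the runtime drift detection
the providence whitepaper compresses into "cache_key = source_root +
':' + question_hash" — that's a simplification; the rigorous form is
all eight dimensions hashed together.

``cache_key`` additionally accepts an optional ninth dimension,
``verifier_policy_hash``. When it is supplied it is appended after the
eight dimensions above; when it is omitted the legacy 8-dim key is
produced unchanged.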
"""

from __future__ import annotations

import hashlib
import json

from arborist.document import canonicalize


def _sha256(s: str) -> str:
    """Return the lowercase hex SHA-256 digest of ``s``.

    The text is encoded as UTF-8 with ``errors='surrogatepass'`` so a
    lone UTF-16 surrogate (which can show up in model output) is
    encoded instead of raising ``UnicodeEncodeError``. Same rationale
    as ``arborist.qa.dag._sha256_hex``.
    """
    payload = s.encode("utf-8", errors="surrogatepass")
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()


def _canonical_json(obj) -> str:
    """Serialize ``obj`` to a deterministic, compact JSON string.

    Keys are sorted and the separators carry no whitespace, so two
    dicts that compare equal serialize identically regardless of
    insertion order. Non-ASCII characters are emitted verbatim rather
    than as ``\\uXXXX`` escapes.

    NOTE(review): numeric types are not unified — ``1`` and ``1.0``
    serialize differently and therefore hash differently.
    """
    compact_separators = (",", ":")
    return json.dumps(
        obj,
        ensure_ascii=False,
        separators=compact_separators,
        sort_keys=True,
    )


# Write-time question dedup modes. These are the only values
# ``canonical_question`` / ``question_hash`` accept for ``mode``; any
# other value raises ``ValueError``. The default is the coarser
# ``equivalence_class`` form.
QUESTION_DEDUP_MODES = ("strict", "equivalence_class")
DEFAULT_QUESTION_DEDUP = "equivalence_class"

# Lookup-time fidelity. Decoupled from write-time `question_dedup`:
# write determines under which `cache_key` a record lands; fidelity
# determines which `cache_key`s a lookup will check.
#
#   strict             only the cache_key matching the agent's policy
#                      is checked. No fallback. Audit-grade behavior.
#   equivalence_class  primary cache_key checked first; if miss AND the
#                      OTHER dedup mode produces a different cache_key,
#                      the alternate is also checked. Lets a fast-cache
#                      agent reuse records written under either mode.
#
# NOTE(review): neither constant below is read by the functions in this
# part of the module; the lookup logic that consumes them presumably
# lives elsewhere — confirm against the caller.
FIDELITY_MODES = ("strict", "equivalence_class")
DEFAULT_FIDELITY = "equivalence_class"



[docs]
def canonical_question(
    question: str, *, mode: str = DEFAULT_QUESTION_DEDUP
) -> str:
    """Return the canonical text of ``question`` under dedup ``mode``.

    Modes:

    - ``"strict"``: the result of ``canonicalize()`` only (described in
      this module as NFC + whitespace collapse + strip ends). Case,
      punctuation and articles all stay significant, so every distinct
      phrasing keeps its own identity. Intended for audit-grade agents.

    - ``"equivalence_class"`` (default): ``canonicalize()``, then
      lowercase, then strip trailing punctuation
      (``_QUESTION_TRAILING_STRIP``), then drop standalone article
      tokens (``_QUESTION_ARTICLE_STRIP``) and re-join the remaining
      tokens with single spaces. "Who is THE Batman?", "who is batman"
      and "who is batman." all collapse to one form. Intended for
      chat-style agents that prefer fast cache hits.

    This is a separate function (rather than being folded into
    ``question_hash``) so a caller can dedup BEFORE hashing — for
    example by placing the canonical form in the user message that
    feeds ``conversation_hash`` while still sending the verbatim
    question to the LLM. Otherwise phrasings that agree on
    ``question_hash`` would still differ on ``conversation_hash`` and
    miss the cache.

    Per the module's design, the chosen mode is expected to travel in
    the ``question_dedup`` policy field and so reach
    ``governance_policy_hash``; agents on different modes then write
    under different ``cache_key`` values and never collide.

    Raises:
        ValueError: if ``mode`` is not in ``QUESTION_DEDUP_MODES``.

    NOTE(review): the trailing strip stops at the first character that
    is not in the strip set, so punctuation separated by whitespace
    (``"x? !"``) is only partly removed — the ``?`` stays attached to
    ``x``.
    """
    if mode not in QUESTION_DEDUP_MODES:
        raise ValueError(
            f"question dedup mode must be one of {QUESTION_DEDUP_MODES}, got {mode!r}"
        )

    text = canonicalize(question)
    if mode != "strict":
        lowered = text.lower()
        trimmed = lowered.rstrip(_QUESTION_TRAILING_STRIP)
        # split() with no argument also swallows any whitespace the
        # punctuation strip left dangling at the end.
        text = " ".join(
            word for word in trimmed.split() if word not in _QUESTION_ARTICLE_STRIP
        )
    return text




[docs]
def question_hash(
    question: str, *, mode: str = DEFAULT_QUESTION_DEDUP
) -> str:
    """Return the SHA-256 hex digest of the canonicalized question.

    The question is first reduced by ``canonical_question`` under
    ``mode`` (see that function for the exact steps) and the resulting
    text is hashed. A ``ValueError`` from ``canonical_question`` for an
    unknown ``mode`` propagates unchanged.

    Changing ``_QUESTION_TRAILING_STRIP`` or ``_QUESTION_ARTICLE_STRIP``
    changes the canonical form of some questions, so prior cache
    records for those questions are orphaned: they remain as history
    but are no longer hit on lookup.

    With ``mode="equivalence_class"`` these all share one hash::

        "who is X"
        "who is X?"
        "Who Is X."
        "who is the X"
        "who is a X"
        "who is an X？"      (CJK question mark)

    With ``mode="strict"`` each of them hashes differently.

    Trailing characters that ARE stripped: ``.?!,;:`` (ASCII),
    ``？！。、`` (CJK full-width) and ``…`` (ellipsis). Paired marks
    such as ``"`` ``'`` ``)`` ``]`` ``}`` are NOT stripped, because
    removing only the closing half would unbalance the text.
    Apostrophes are kept as well — ``X's`` is a different question
    from ``X``.
    """
    canonical_text = canonical_question(question, mode=mode)
    return _sha256(canonical_text)



# Trailing punctuation that carries no semantic difference at the end
# of a question. Order doesn't matter (rstrip walks char-by-char from
# the right). Repeats handled trivially: ``X???`` → ``X``.
#
# ASCII:  . ? ! , ; :
# CJK:    ？ U+FF1F  full-width question mark
#         ！ U+FF01  full-width exclamation
#         。 U+3002  ideographic full stop
#         、 U+3001  ideographic comma
# Other:  … U+2026  horizontal ellipsis
#
# Defined below ``canonical_question``, which reads it; that works
# because the name is resolved when the function is called, not when
# it is defined.
_QUESTION_TRAILING_STRIP = ".?!,;:？！。、…"

# English articles stripped as standalone tokens after lowercasing. The
# question equivalence class treats "the foo" and "foo" as the same
# question — fox's 2026-04-29 catch: `who is the batman` & `who is
# batman` produced different cache records under earlier rules. Tokens
# are matched as EXACT lowercase strings, so substrings like "thesis"
# (contains "the") stay untouched.
#
# Conservative on purpose: only ASCII English articles. "El", "la",
# "los", "le", "les", "der", "die", "das" etc. are not stripped today.
# Adding them when needed flows through the same equivalence-class
# expansion the trailing-punctuation set went through.
#
# NOTE(review): because matching happens after lowercasing, a
# standalone "A" used as a name or grade is dropped too — "hepatitis A"
# and "hepatitis" land in the same equivalence class. Use strict mode
# where that distinction matters.
_QUESTION_ARTICLE_STRIP = frozenset({"the", "a", "an"})



[docs]
def model_profile_hash(
    model_id: str, revision: str = "", quantization: str = ""
) -> str:
    """Return the SHA-256 hex digest of the model identity triple.

    The three fields are joined as ``model_id|revision|quantization``
    and hashed, so changing any one of them changes the result (and
    with it the cache key).

    NOTE(review): the fields are not escaped, so a literal ``|`` inside
    one of them is indistinguishable from the delimiter.
    """
    identity = "{}|{}|{}".format(model_id, revision, quantization)
    return _sha256(identity)




[docs]
def conversation_hash(messages: list[dict]) -> str:
    """Return the SHA-256 hex digest of the full messages array.

    ``messages`` (the OpenAI-style list of message dicts) is serialized
    with ``_canonical_json`` and the resulting text is hashed. Dict keys
    are sorted by the serializer, but list order is preserved, so the
    sequence of turns matters: a 6-turn dialogue that ends on the same
    question hashes differently from a single-turn ask.
    """
    serialized = _canonical_json(messages)
    return _sha256(serialized)




[docs]
def governance_policy_hash(policy: dict) -> str:
    """Return the SHA-256 hex digest of the sampling/policy dict.

    The whole ``policy`` dict is serialized with ``_canonical_json``
    and hashed, so every key it carries participates. Callers are
    expected to include temperature, top_p, max_tokens and the system
    prompt: a change to any of them means the answer is governed
    differently and the cache must miss.
    """
    serialized = _canonical_json(policy)
    return _sha256(serialized)



# Verifier-policy fields — the subset of `policy` that names what
# the deterministic verifier does. Separate from the broader
# `governance_policy_hash` so an auditor can answer "did the verifier
# rules change?" with a single hash diff rather than scanning the
# whole policy. See docs/cti-architecture.md §6 + the de-novo
# synthesis (2026-05-01) on verifier-policy identity.
#
# Adding a field here changes `verifier_policy_hash` for every policy
# that actually carries that key (keys absent from the policy dict are
# simply not part of the hashed subset), which invalidates the matching
# cached records on next lookup. Removing a field does the same.
# Reordering does not (set membership, not list ordering).
_VERIFIER_POLICY_FIELDS = frozenset({
    # Mode + parser identity
    "answer_mode",
    # Pointer-mode hard checks
    "claim_lattice_max_pointers_per_claim",
    "claim_lattice_min_citation_coverage",
    "claim_lattice_min_claim_content_tokens",
    "claim_lattice_lazy_anchor_demote_threshold",
    "claim_lattice_lazy_anchor_demote_min_pairs",
    "claim_lattice_allowed_source_roles",
    # Retrieval-side knob with verifier consequences
    "claim_lattice_max_chunks_per_source",
    # JSON variant identity
    "claim_lattice_use_guided_json",
    "claim_lattice_json_stop_sequences",
    # Warrant-lite (relation-question hard check, Ticket H, 2026-05-01)
    "claim_lattice_warrant_check_enabled",
    "claim_lattice_deflection_check_enabled",
    "claim_lattice_format_collapse_check_enabled",
    # Subject-tokens-absent / premise-parroting (Ticket #000006 amend
    # 2026-05-02b, Rule 9). Threshold of question∩claim content tokens
    # absent from cited evidence union that demotes STRICT → HYBRID.
    "claim_lattice_subject_tokens_absent_threshold",
    # Ticket #000008 Phase 2-4 — quantifier preflight guard. The seven
    # fields together control whether broad-quantifier classification
    # affects the per-call claim cap, which modes are gated, whether
    # the reminder fires, and whether reject-broad early-return takes
    # over. Ticket #000010 adds six more for metacognition (listed
    # after these seven).
    # Adding a model to model_profiles.py PROFILES doesn't bump
    # governance_policy_hash on its own (the dict isn't policy);
    # but flipping any of these knobs DOES bump the hash, which
    # invalidates prior cache records on lookup — exactly the
    # invalidation we want when a guard knob changes.
    "quantifier_guard_enabled",
    "quantifier_guard_apply_caps",
    "quantifier_apply_caps_modes",
    "quantifier_caps_by_intensity",
    "quantifier_guard_modes",
    "quantifier_reminder_enabled",
    "quantifier_reject_broad",
    # Ticket #000010 — Meta-Cognition Preflight Guard. The six
    # fields together control whether preflight runs and which
    # detectors fire. Flipping any of them invalidates prior cache
    # records on lookup — same governance discipline as #000008.
    "metacognition_enabled",
    "metacognition_temporal_check",
    "metacognition_contradiction_check",
    "metacognition_false_premise_check",
    "metacognition_out_of_corpus_check",
    "metacognition_block_on_contradiction",
    # Quote-mode entity policy
    "entity_policy",
    "entity_proximity_n",
    "entity_proximity_window",
    # Wikitext base-prose pinning (changes verifier surface)
    "base_version",
    # Verifier content-token rules version (#000053). Bumping the value
    # (e.g. adding a token class) invalidates prior cached records —
    # the verifier's TITLE_MISMATCH / subject-tokens-absent / spotlight
    # decisions depend on which tokens count as content.
    "content_token_rules",
})



[docs]
def verifier_policy_hash(policy: dict) -> str:
    """Return the SHA-256 hex digest of the verifier-relevant policy subset.

    Only the entries of ``policy`` whose key is listed in
    ``_VERIFIER_POLICY_FIELDS`` are kept; that subset is serialized
    with ``_canonical_json`` and hashed. A policy with no such keys
    (including an empty dict) yields the constant ``sha256("{}")``.

    The result is meant to be passed to ``cache_key`` as its 9th
    dimension, so a verifier-policy change is visible from the cache
    key alone, separately from ``governance_policy_hash`` (which also
    folds in temperature / top_p / prompts).

    The two hashes overlap on purpose: verifier fields are part of the
    broader policy dict, so changing a verifier rule changes BOTH
    hashes, while changing a non-verifier field (e.g. temperature)
    changes ONLY ``governance_policy_hash``. That asymmetry is what
    lets an auditor tell which kind of change happened without
    scanning the whole policy dict.
    """
    verifier_subset = {}
    for field_name, field_value in policy.items():
        if field_name in _VERIFIER_POLICY_FIELDS:
            verifier_subset[field_name] = field_value
    serialized = _canonical_json(verifier_subset)
    return _sha256(serialized)




[docs]
def cache_key(
    source_root: str,
    question_hash_value: str,
    model_profile_hash_value: str,
    conversation_hash_value: str,
    governance_policy_hash_value: str,
    schema_version: str,
    canonicalization_version: str,
    chunking_version: str,
    verifier_policy_hash_value: str | None = None,
) -> str:
    """Return the SHA-256 hex digest of the cache-identity dimensions.

    The dimensions are joined with ``'|'`` in parameter order and the
    joined text is hashed. A change in any dimension therefore yields a
    different cache key.

    8-dim form (legacy): leave ``verifier_policy_hash_value`` as
    ``None``. Only the first eight dimensions are joined, which keeps
    the key identical to the pre-2026-05-01 cache identity, so records
    written before the 9th dimension existed still match.

    9-dim form: pass ``verifier_policy_hash_value``. It is appended as
    the last dimension, binding the record to the verifier-policy
    identity; a lookup with a different verifier hash misses. Note that
    an empty string counts as "supplied" — only ``None`` selects the
    8-dim form.

    All arguments must be ``str``; ``str.join`` raises ``TypeError``
    otherwise.

    NOTE(review): the parts are not escaped. A ``'|'`` inside a version
    string would make an 8-dim call and a 9-dim call produce the same
    joined text; hex digests cannot contain it, so this only concerns
    the free-form version arguments.
    """
    base_dimensions = (
        source_root,
        question_hash_value,
        model_profile_hash_value,
        conversation_hash_value,
        governance_policy_hash_value,
        schema_version,
        canonicalization_version,
        chunking_version,
    )
    if verifier_policy_hash_value is None:
        material = "|".join(base_dimensions)
    else:
        material = "|".join(base_dimensions + (verifier_policy_hash_value,))
    return _sha256(material)