Source code for arborist.qa.runner

"""Q&A runner: cache-first lookup -> inference fallback -> provable record.

Implements the v9.8 admissibility invariant: no record is reused unless
all cache_key dimensions match (8 per v9.8; ``ask`` currently passes
nine inputs to ``cache_key``) AND its state is 'live' (not
failed/stale/quarantined).

- Cache hit  -> persisted audit_mode (STRICT/HYBRID/UNGROUNDED).
- Cache miss -> call ChatClient, run faithfulness check, classify,
  store record, audit event.
"""

from __future__ import annotations

import json
import sqlite3
import time

from arborist import (
    CANONICALIZATION_VERSION,
    SCHEMA_VERSION,
)
from arborist.compress import unpack_chunk
from arborist.merkle import MerkleTree, proof_to_dict
from arborist.qa.client import ChatClient
from arborist.qa.prompts import (
    CLAIM_LATTICE_GROUNDING_REMINDER,
    CLAIM_LATTICE_JSON_GROUNDING_REMINDER,
    CLAIM_LATTICE_JSON_SYSTEM_PROMPT,
    CLAIM_LATTICE_SYSTEM_PROMPT,
)
from arborist.qa.keys import (
    DEFAULT_FIDELITY,
    DEFAULT_QUESTION_DEDUP,
    FIDELITY_MODES,
    QUESTION_DEDUP_MODES,
    cache_key,
    canonical_question,
    conversation_hash,
    governance_policy_hash,
    model_profile_hash,
    question_hash,
    verifier_policy_hash,
)
from arborist.qa.dag import build_run_dag
from arborist.qa.evidence import (
    build_evidence_map,
    evidence_map_root,
    render_evidence_map,
    render_evidence_map_for_json,
)
from arborist.qa.repair import mechanical_repair, reprompt_repair
from arborist.qa.verify import (
    ANSWER_MODES,
    CLAIM_LATTICE_JSON_SCHEMA,
    DEFAULT_ANSWER_MODE,
    verify_claim_lattice,
    verify_claim_lattice_json,
    verify_quotes,
)
from arborist.store import append_audit, transaction

try:
    from arborist.wikitext import BASE_VERSION as _WIKITEXT_BASE_VERSION
    from arborist.wikitext import to_base as _wikitext_to_base
except ImportError:  # pragma: no cover
    _WIKITEXT_BASE_VERSION = None
    _wikitext_to_base = None


# Default Q&A policy. ``ask`` uses this dict when the caller passes no
# policy (``policy or DEFAULT_POLICY`` — so an empty dict selects it too),
# and it uses this module-level object directly, without copying it.
#
# A caller-supplied policy replaces this dict wholesale; it is NOT merged
# with these defaults. Keys missing from a custom policy therefore fall
# back to the per-call ``policy.get(key, fallback)`` values inside ``ask``,
# which can differ from the values here (e.g.
# ``quantifier_reminder_enabled`` is True here, but the call site reads it
# with ``.get(..., False)``). ``system_prompt`` and
# ``claim_lattice_system_prompt`` are read with ``policy[...]`` for their
# respective answer modes, so a custom policy must supply them.
DEFAULT_POLICY = {
    # Ticket #000007 — query-layer hyphen-fold marker. See
    # arborist/qa/query.py:DEFAULT_QUERY_POLICY for rationale.
    "hyphen_fold_v1": True,
    # Ticket #000006 amend 2026-05-02b (Rule 9). See
    # arborist/qa/query.py:DEFAULT_QUERY_POLICY for full rationale.
    "claim_lattice_subject_tokens_absent_threshold": 3,
    "system_prompt": (
        "Answer the user's question based ONLY on the document below. "
        "For EVERY factual claim, include a verbatim quote from the "
        "document enclosed in double quotes (\"...\"). The quoted span "
        "must appear word-for-word. Make a claim only when a verbatim "
        "quote in the document directly supports it. "
        "If the answer is in the document, write it. "
        "If the answer is absent from the document, say 'I don't know "
        "based on the provided document.' and stop there. "
        "Stay inside the document at all times."
    ),
    # Restated rule fired as a user message right before the document +
    # question arrive. See arborist/qa/query.py for the rationale (recent
    # user-turn instructions outweigh decayed system-turn rules in 8B
    # instruction-tuned models).
    "grounding_reminder": (
        "REMINDER: wrap every factual claim in double quotes (\"...\") "
        "and the quoted span must appear word-for-word in the "
        "document. Each claim earns a verbatim quote. "
        "Now answer the question on the next message."
    ),
    "temperature": 0.1,
    "top_p": 1.0,
    "max_tokens": 512,
    "entity_policy": "proximity",
    "entity_proximity_n": 3,
    "entity_proximity_window": 300,
    # Mechanical answer repair after first verify. Off by default; see
    # arborist/qa/query.py for semantics.
    "repair_enabled": False,
    "repair_max_reprompts": 0,
    # Strip wikitext markup before the LLM ever sees the context. Lets
    # Hermes quote prose verbatim and shrinks token bills (~43% on
    # Wikipedia chunks). Bumps governance_policy_hash so prior cached
    # answers under raw-wikitext policy stay distinct on lookup. Set
    # via the wikitext extras; no-op if mwparserfromhell isn't installed.
    # (When ``arborist.wikitext`` fails to import, the module-level
    # fallback sets this value to None.)
    "base_version": _WIKITEXT_BASE_VERSION,
    # G0 / CTI — claim-lattice-pointer answer mode. "quote" (default):
    # existing behavior, model writes prose with verbatim quotes inline.
    # "claim_lattice_pointer": runtime builds an evidence map and shows
    # the model short pointer ids (E1, E2, …); model writes natural
    # prose with bracket pointer tags ("Claim. [E12]") instead of
    # quoting source text. Renderer interpolates literal spans at
    # display time. Synthetic-elision-by-construction-impossible: the
    # model never types the quote string. Two-layer id discipline keeps
    # the cache & run-DAG keyed on content-addressed evidence_ids.
    # Folds into governance_policy_hash so two modes write under
    # different cache_keys and never alias. No iterative repair in
    # pointer mode (one-shot benchmark discipline).
    "answer_mode": DEFAULT_ANSWER_MODE,
    "claim_lattice_system_prompt": CLAIM_LATTICE_SYSTEM_PROMPT,
    "claim_lattice_grounding_reminder": CLAIM_LATTICE_GROUNDING_REMINDER,
    # Allowed source roles for claim-lattice verification. Roles outside
    # this set get classified SOURCE_ROLE_BLOCKED and downgrade the
    # verdict. Mirrors arborist.qa.verify.DEFAULT_ALLOWED_SOURCE_ROLES;
    # noisy_background_source / sequel_background_source are excluded by
    # default. Folds into governance_policy_hash on change.
    "claim_lattice_allowed_source_roles": [
        "primary_answer_source",
        "secondary_context_source",
        "background_source",
        "unclassified",
        # Self-promoted providence records (`arborist://providence/`
        # URI scheme). Trusted-as-fact substrate per the
        # self-reference design — STRICT live records past the
        # kindergarten window. See
        # docs/self-reference-design.md for the
        # falsification trust model.
        "self_reference_source",
    ],
    # Hard cap on pointer ids per claim line — mirrors prompt Rule 9.
    # Lines exceeding this cap classify as SCHEMA_INVALID and the
    # verdict can no longer reach STRICT. Folds into
    # governance_policy_hash so changing the cap invalidates prior
    # cached records.
    "claim_lattice_max_pointers_per_claim": 2,
    # Minimum claim-token coverage required for the citation-overlap
    # check (Rule 6) to pass. Pre-2026-04-30 the threshold was implicit
    # at "≥1 shared token", which let through lazy-anchored claims
    # whose only overlap was a single topical word (e.g. "Yale
    # University... [E9]" cited to a highway-data span containing only
    # "Connecticut"). 0.30 means a 10-token claim needs ≥3 of its
    # content tokens to appear in the cited span. Short claims (≤3
    # content tokens) keep the old ≥1-token floor so narrow factoids
    # like "Steve Jobs co-founded Apple" still pass. Folds into
    # governance_policy_hash on change.
    "claim_lattice_min_citation_coverage": 0.30,
    # Bare-name claim guard. A claim with fewer than this many content
    # tokens (>=4 chars, post-spotlight-stopword) is rejected as
    # SCHEMA_INVALID. Catches the JP-dinosaurs lazy-anchor where
    # "Triceratops. [E16]" passes lexical overlap on a single token
    # even when E16 is a video-game tie-in chunk rather than the film
    # article. Default 2: bare-entity-name claims (one content token
    # after stopword strip) fail; sentence-shape claims pass. Note
    # ``_content_tokens`` already filters "appears", "shown", etc. so
    # "Trex appears" → 1 content token (filtered), "Trex appears in
    # the film" → 2 content tokens (passes). Folds into
    # governance_policy_hash on change.
    "claim_lattice_min_claim_content_tokens": 2,
    # Lazy-anchor smell auto-demote. When >= threshold of verified
    # pointer-pairs cite a single pointer AND there are >= min_pairs
    # total, cap audit_mode at HYBRID. The smell sidecar was advisory
    # only pre-2026-04-30; now it's load-bearing. STRICT requires
    # diverse anchoring across pointers.
    "claim_lattice_lazy_anchor_demote_threshold": 0.5,
    "claim_lattice_lazy_anchor_demote_min_pairs": 3,
    # Warrant-lite — relation-question hard check (Ticket H from
    # feedback-3, 2026-05-01). Detects relation-shape questions
    # ("who is X's boss?", "who founded Y?") and requires the cited
    # span to contain at least one named answer entity (proper-noun
    # phrase) from the claim. Catches the Homer-Simpson lazy-anchor
    # case where claim asserts "Mr. Burns" but cited span is the
    # voice-actor bio. WARRANT_MISSING violations cap audit_mode
    # at HYBRID. See arborist/qa/warrant.py.
    "claim_lattice_warrant_check_enabled": True,
    "claim_lattice_deflection_check_enabled": True,
    # Format-collapse check (pointer-mode only): when the model emits
    # ≥5 meaningful prose lines with zero `[E\d+]` pointer tags, it
    # abandoned the claim_lattice_pointer protocol entirely. Soft-demote
    # so audit display surfaces "format collapsed" vs "graceful per-
    # claim refusal" — different failure shapes, same UNGROUNDED rung.
    # JSON-mode collapse already shows up as SCHEMA_INVALID so this
    # check is redundant there. Surfaced 2026-05-02 by fox's "Winners
    # of all major sports?" case where Hermes dumped 50+ free-form
    # sentences.
    "claim_lattice_format_collapse_check_enabled": True,
    # Quantifier preflight guard (Ticket #000008 Phase 2). Per-call
    # claim cap derived from the question's quantifier intensity and
    # the configured model profile (arborist/qa/model_profiles.py).
    # Phase 2 lands the lookup wiring with apply_caps=False per
    # §10.11.3 dry-run discipline — claim_cap_applied is computed
    # and reported on the result dict, but the verifier still uses
    # claim_lattice_max_claims_per_answer as the actual cap.
    # Operator flips quantifier_guard_apply_caps=True after dry-run
    # bench review confirms classifier output across the full
    # question set.
    #
    # Six-level disable hierarchy (§10.11.2):
    # NOTE(review): only four levels are listed below — confirm the
    # remaining two against §10.11.2 (possibly
    # quantifier_apply_caps_modes and quantifier_reminder_enabled,
    # which are documented separately further down).
    #   - quantifier_guard_enabled: master kill (False = no
    #     classifier output, no cap lookup, no telemetry).
    #   - quantifier_guard_apply_caps: dry-run gate (True = cap
    #     applied; False = cap reported but not applied).
    #   - quantifier_caps_by_intensity: per-call override dict;
    #     wins over the model_profiles.py table when present.
    #   - quantifier_guard_modes: list of answer_modes the guard
    #     applies to. Quote mode opts out by default — already
    #     stable HYBRID 0.455 on baseline, different failure shape.
    "quantifier_guard_enabled": True,
    "quantifier_guard_apply_caps": False,
    # When apply_caps flips True, this allowlist gates which modes
    # actually have caps applied. n=5 verification 2026-05-03 (#000008
    # §12.10): cap on claim_lattice (JSON) wins +14pp on STRICT-rate;
    # cap on claim_lattice_pointer fires TOO_MANY_CLAIMS 20× without
    # moving the verdict floor (still 0 STRICT). Default to JSON only
    # so flipping the master switch doesn't add wasted cap-noise on
    # pointer mode. Empty list / None = honor quantifier_guard_modes
    # (legacy fallback).
    "quantifier_apply_caps_modes": ["claim_lattice"],
    "quantifier_caps_by_intensity": {},
    "quantifier_guard_modes": ["claim_lattice_pointer", "claim_lattice"],
    # Phase 3 — broad-quantifier reminder injection. Default ON for
    # lattice modes (gated via quantifier_guard_modes) per the
    # 2026-05-03 bench A/B (#000008 §12). Reminder eliminates
    # FORMAT_COLLAPSED (2→0), reduces NO_EVIDENCE_POINTER 33%,
    # boosts mean ratio +17pp on pointer / +21pp on JSON, and
    # rescues JSON UNGROUNDED 7→1. n=5 verification 2026-05-03
    # confirms the compound effect with cap survives at higher
    # sample size. Quote mode is mode-gated off (different failure
    # shape; paraphrase verifier doesn't need pointer-tag reminders).
    "quantifier_reminder_enabled": True,
    # Phase 4 — strict reject for broad-unbounded queries. When True
    # AND intensity ∈ {ALL, COMPREHENSIVE, OPEN_REQUEST} AND
    # scope_bound_hint == "unbounded", query()/ask() return UNGROUNDED
    # before the LLM call with a BROAD_QUANTIFIER_REJECTED violation.
    # Saves the ~10-15s LLM call on rejected runs. Default OFF — opt-in
    # via --reject-broad CLI flag or per-call policy override.
    # Bounded universals (e.g. all members of the Beatles, year-anchored
    # questions) are NOT rejected per §10.1.
    "quantifier_reject_broad": False,
    # Ticket #000010 — Meta-Cognition Preflight Guard (M0 / MCTL).
    # Pure deterministic detectors (temporal, contradiction,
    # false-premise-lite, out-of-corpus) wrap the #000008 quantifier
    # classifier and surface a QuestionState on the result dict.
    # Master switch ON by default — detectors are pure-on-question
    # so cost is negligible. Each sub-detector has its own enable
    # switch for granular A/B. block_on_contradiction defaults False
    # (label-only by default; opt-in to hard-block — false-positive
    # rate not yet bench-validated).
    "metacognition_enabled": True,
    "metacognition_temporal_check": True,
    "metacognition_contradiction_check": True,
    "metacognition_false_premise_check": True,
    "metacognition_out_of_corpus_check": True,
    "metacognition_block_on_contradiction": False,
    # Ticket #000011 — soft preflight sidecar. Default OFF —
    # adds one short LLM round-trip (~200ms median) so cost is
    # operator-opt-in only. NEVER enters the verifier proof path
    # (D1); produces only SOFT_* labels that surface as advisory
    # hints alongside the deterministic detector output.
    "soft_preflight_enabled": False,
    # Claim-count ceiling. Bench finding (2026-04-30 york-england):
    # "tell me all there is to know about X" prompted Hermes to spam
    # 26-59 encyclopedic claims sourced from training, only 2-4 of
    # which grounded in retrieval. Atomic-claim prompt rule (b5925c8)
    # cut this to ~10, but a hard structural cap is defense in depth.
    # Cap of 12 admits typical entity-list questions (5-7 dinosaurs,
    # Simpsons + pets) while flagging the runaway shape. Folds into
    # governance_policy_hash on change.
    "claim_lattice_max_claims_per_answer": 12,
    # JSON variant — `answer_mode="claim_lattice"`. Mirrors the
    # multi-source query path. Pairs with grammar-constrained inference
    # (vLLM guided_json, Claude/GPT-4 native JSON, Qwen 3.6 reasoner).
    # Lenient pre-parser in verify_claim_lattice_json keeps the path
    # survivable on inference paths without grammar guidance.
    "claim_lattice_json_system_prompt": CLAIM_LATTICE_JSON_SYSTEM_PROMPT,
    "claim_lattice_json_grounding_reminder": CLAIM_LATTICE_JSON_GROUNDING_REMINDER,
    "claim_lattice_use_guided_json": True,
    # JSON-mode stop sequences. Hermes-3-8B sometimes spams whitespace
    # / newlines after the closing brace on broad-descriptive shapes
    # ("plot of X", "tell me about Y") — the response runs out the
    # max_tokens budget and the lenient parser sees truncated JSON.
    # Stopping on a blank line cuts the runaway. JSON-mode output
    # never legitimately contains a blank line (single object, single
    # line) so this is a safe filter. Folds into
    # governance_policy_hash on change.
    "claim_lattice_json_stop_sequences": ["\n\n"],
    # Verifier content-token rules version (#000053). "v2-acronym-aware"
    # = `arborist.qa.evidence._content_tokens` keeps all-caps 2-3-char
    # acronyms (CPU/GPU/DNA/FBI…) as content tokens; pre-#000053 dropped
    # every <4-char token, so a CPU/GPU claim cited to a "CPU foo" /
    # "GPU bar" article tripped TITLE_MISMATCH spuriously. A pure
    # marker — it doesn't gate code (the tokenizer change is
    # unconditional), it exists so the change folds into
    # `verifier_policy_hash` and prior cache records orphan on lookup.
    "content_token_rules": "v2-acronym-aware",
}


#: Mapping from verdict audit_mode → Δ5F utility signal for the
#: advisory controller step. Same shape the dry-run simulator uses
#: (``bench/scripts/prometheus_sigma_sweep_dryrun.py``) so the
#: live-runtime advisory log and the offline sweep dry-run agree on
#: cost-class economics. Keys are upper-case audit_mode names; the
#: lookup in ``_emit_qa_controller_advisory`` falls back to 0.0 for
#: modes not listed here.
_AUDIT_MODE_DELTA_5F: dict[str, float] = {
    "STRICT": 0.05,
    "CANONICAL_PROJECTION": 0.10,
    "HYBRID": 0.00,
    "UNGROUNDED": -0.10,
}
#: Mapping from verdict audit_mode → ``capital_cost`` passed to the
#: advisory ``ControllerBranch``. Same key set as
#: ``_AUDIT_MODE_DELTA_5F``; the lookup in
#: ``_emit_qa_controller_advisory`` falls back to 1.0 for modes not
#: listed here.
_AUDIT_MODE_CAPITAL_COST: dict[str, float] = {
    "STRICT": 1.0,
    "CANONICAL_PROJECTION": 0.05,
    "HYBRID": 0.8,
    "UNGROUNDED": 0.4,
}


def _emit_qa_controller_advisory(
    conn: sqlite3.Connection, cache_key: str, verdict: dict
) -> None:
    """Record an advisory controller decision for one QA verdict.

    #000037 Phase 2. Builds a single-branch ``ControllerInput`` from
    *verdict*, passes it to ``controller_decide`` and hands the resulting
    decision to ``emit_controller_events`` together with the
    ``qa:<cache_key>`` organism root.

    Inputs derived from *verdict*:

    - ``audit_mode`` — upper-cased; missing or empty counts as
      ``"UNGROUNDED"``. Selects ``delta_5f`` (0.0 for unknown modes) and
      ``capital_cost`` (1.0 for unknown modes) from the module-level
      tables.
    - ``witness_divergence`` — number of unverified quotes divided by
      ``n_quotes``, or 0.0 when ``n_quotes`` is not positive.
      ``unverified_quotes`` may be a list or a JSON-encoded string;
      an unparsable string or any non-list value counts as zero.

    Per the #000037 design notes the rows land in the sibling
    ``controller_events`` table rather than the ``audit_events.event_hash``
    preimage, and retries are deduplicated by
    ``UNIQUE(event_kind, body_hash)``. Those guarantees are provided by
    ``arborist.substrate.prometheus_audit``, not by this function. The
    output is advisory only: it does not feed back into the QA result.

    The substrate modules are imported inside the function so callers
    that never reach this helper do not pay their load cost.

    Args:
        conn: SQLite connection handed through to ``emit_controller_events``.
        cache_key: Cache key of the QA record; its first 16 characters
            form the branch id. (Inside this function the parameter
            shadows the ``cache_key`` function imported at module level.)
        verdict: Verdict mapping; reads ``audit_mode``, ``n_quotes`` and
            ``unverified_quotes``.
    """
    from arborist.substrate.prometheus import (
        BatteryDeltas,
        ControllerBranch,
        ControllerInput,
        controller_decide,
        safe_weights,
    )
    from arborist.substrate.prometheus_audit import emit_controller_events

    mode = (verdict.get("audit_mode") or "UNGROUNDED").upper()
    quote_total = int(verdict.get("n_quotes") or 0)

    # unverified_quotes may arrive already decoded (list) or still as the
    # JSON text; normalise to something we can count.
    pending = verdict.get("unverified_quotes") or []
    if isinstance(pending, str):
        try:
            pending = json.loads(pending)
        except (TypeError, ValueError):
            pending = []
    unverified_total = len(pending) if isinstance(pending, list) else 0

    if quote_total > 0:
        divergence = unverified_total / quote_total
    else:
        divergence = 0.0

    branch_id = f"qa:{cache_key[:16]}"
    deltas = BatteryDeltas(
        delta_5s=0.0,
        delta_5t=0.0,
        delta_5f=_AUDIT_MODE_DELTA_5F.get(mode, 0.0),
        delta_5r=0.0,
    )
    advisory_branch = ControllerBranch(
        branch_id=branch_id,
        deltas=deltas,
        witness_divergence=divergence,
        capital_cost=_AUDIT_MODE_CAPITAL_COST.get(mode, 1.0),
        payoff_b=1.0,
    )

    root = f"qa:{cache_key}"
    controller_input = ControllerInput(
        organism_root=root,
        branches=(advisory_branch,),
        budget=1,
        hermes_utilization=0,
        weights=safe_weights(),
        difficulty=1.0,
        divergence_rate=divergence,
    )
    emit_controller_events(
        conn, controller_decide(controller_input), organism_root=root
    )


def _ms_since(t: float) -> float:
    """Return the milliseconds elapsed since *t*, rounded to 0.1 ms.

    *t* is a timestamp previously taken with ``time.monotonic()``.
    """
    elapsed_seconds = time.monotonic() - t
    return round(elapsed_seconds * 1000, 1)


[docs] def ask( conn: sqlite3.Connection, *, document_root: str, question: str, client: ChatClient, model_id: str, revision: str = "", quantization: str = "", policy: dict | None = None, chain: str = "private", fidelity: str | None = None, ) -> dict: """Look up cached answer or run inference. Returns a result dict. See ``arborist.qa.query.query`` for `fidelity` semantics — it controls lookup tolerance: ``"strict"`` only checks the cache_key matching the call's ``policy["question_dedup"]``; the default ``"equivalence_class"`` falls back to the alternate dedup mode's cache_key on miss so a fast-cache agent can reuse records written under either mode. Result includes ``lookup_path``. """ policy = policy or DEFAULT_POLICY if fidelity is None: fidelity = policy.get("fidelity", DEFAULT_FIDELITY) if fidelity not in FIDELITY_MODES: raise ValueError( f"fidelity must be one of {FIDELITY_MODES}, got {fidelity!r}" ) # Quantifier preflight (Ticket #000008 Phase 1+2). Same wiring # as query() — see arborist/qa/query.py for the rationale and # disable hierarchy. 
from arborist.qa.model_profiles import cap_for_intensity from arborist.qa.quantifier import classify_question_quantifier answer_mode_for_guard = policy.get("answer_mode", "quote") quantifier_guard_on = bool(policy.get("quantifier_guard_enabled", True)) quantifier_guard_modes = policy.get( "quantifier_guard_modes", ["claim_lattice_pointer", "claim_lattice"], ) quantifier_mode_gated = answer_mode_for_guard in (quantifier_guard_modes or []) if quantifier_guard_on: quantifier = classify_question_quantifier(question) else: quantifier = { "intensity": None, "matched_token": None, "explicit_count": None, "is_broad": False, "operational_shape": None, "scope_bound_hint": "unknown", "classifier_version": None, } if quantifier_guard_on and quantifier_mode_gated and quantifier["intensity"]: claim_cap_lookup = cap_for_intensity( model_profile_id=model_id, intensity=quantifier["intensity"], explicit_count=quantifier["explicit_count"], policy_overrides=policy.get("quantifier_caps_by_intensity") or None, ) else: claim_cap_lookup = None quantifier_apply_caps = bool(policy.get("quantifier_guard_apply_caps", False)) quantifier_apply_caps_modes = policy.get( "quantifier_apply_caps_modes", quantifier_guard_modes, # legacy fallback ) or quantifier_guard_modes quantifier_caps_mode_gated = answer_mode_for_guard in ( quantifier_apply_caps_modes or [] ) _policy_max_claims = int(policy.get("claim_lattice_max_claims_per_answer", 12)) if ( quantifier_apply_caps and quantifier_caps_mode_gated and claim_cap_lookup is not None ): effective_max_claims = int(claim_cap_lookup) else: effective_max_claims = _policy_max_claims # Ticket #000010 — meta-cognition preflight (mirror of query()). 
from arborist.qa.metacognition import preflight_question question_state = preflight_question( question, model_profile_id=model_id, reference_frames=(), policy=policy, ) t_start = time.monotonic() doc = conn.execute( "SELECT document_uri, chunking_version FROM documents " "WHERE document_root = ?", (document_root,), ).fetchone() if doc is None: return {"status": "unknown_document"} chunk_rows = conn.execute( "SELECT idx, leaf_hash, content FROM chunks " "WHERE document_root = ? ORDER BY idx ASC", (document_root,), ).fetchall() if not chunk_rows: return {"status": "unknown_document"} if any(r["content"] is None for r in chunk_rows): return {"status": "source_cold", "msg": "rehydrate before asking"} answer_mode = policy.get("answer_mode", DEFAULT_ANSWER_MODE) if answer_mode not in ANSWER_MODES: raise ValueError( f"policy['answer_mode'] must be one of {ANSWER_MODES}, got {answer_mode!r}" ) chunk_texts = [unpack_chunk(r["content"]) for r in chunk_rows] document_text = "\n\n".join(chunk_texts) # Wikitext → prose before the LLM sees it. The model can then quote # verbatim against the prose form; the verifier compares like-against- # like. Idempotent if context is already plain prose. Gated on # policy["base_version"] so this is part of governance_policy_hash. if policy.get("base_version") and _wikitext_to_base is not None: document_text = _wikitext_to_base(document_text) chunk_texts = [_wikitext_to_base(t) for t in chunk_texts] evidence_map = [] if answer_mode == "claim_lattice_pointer": # Quote-by-pointer: one evidence object per chunk. The model sees # the literal spans labeled with content-addressed IDs and is # instructed to reference IDs, not type quote text. Synthetic # elision is impossible by construction — the model never produces # the quote string. 
chunks_for_map = [ { "source_root": document_root, "document_uri": doc["document_uri"], "title": None, "chunk_idx": r["idx"], "chunk_root": r["leaf_hash"], "span": chunk_texts[i], "source_role": "primary_answer_source", } for i, r in enumerate(chunk_rows) ] evidence_map = build_evidence_map(chunks_for_map) sys_prompt = policy["claim_lattice_system_prompt"] grounding_reminder = policy.get("claim_lattice_grounding_reminder") rendered_evidence = render_evidence_map(evidence_map) def _user_payload(q: str) -> str: return f"EVIDENCE:\n\n{rendered_evidence}\n\n---\n\nQUESTION: {q}" elif answer_mode == "claim_lattice": # JSON variant — same per-chunk evidence map as pointer mode, # blocks labeled with content-addressed evidence_id (long hex) # since the model emits IDs in JSON. Pairs with grammar- # constrained inference; lenient pre-parser handles drift. chunks_for_map = [ { "source_root": document_root, "document_uri": doc["document_uri"], "title": None, "chunk_idx": r["idx"], "chunk_root": r["leaf_hash"], "span": chunk_texts[i], "source_role": "primary_answer_source", } for i, r in enumerate(chunk_rows) ] evidence_map = build_evidence_map(chunks_for_map) sys_prompt = policy.get( "claim_lattice_json_system_prompt", policy["claim_lattice_system_prompt"], ) grounding_reminder = policy.get( "claim_lattice_json_grounding_reminder", policy.get("claim_lattice_grounding_reminder"), ) rendered_evidence = render_evidence_map_for_json(evidence_map) def _user_payload(q: str) -> str: return f"EVIDENCE:\n\n{rendered_evidence}\n\n---\n\nQUESTION: {q}" else: sys_prompt = policy["system_prompt"] grounding_reminder = policy.get("grounding_reminder") def _user_payload(q: str) -> str: return f"Document:\n\n{document_text}\n\n---\n\nQuestion: {q}" # System sets the policy; a user-turn reminder restates the rule one # message before the payload arrives. Payload (document or evidence # map + question) lands last as the most-recent tokens before # generation. 
messages = [{"role": "system", "content": sys_prompt}] if grounding_reminder: messages.append({"role": "user", "content": grounding_reminder}) # Quantifier-specific reminder (Ticket #000008 Phase 3, default # off). When the question is broad (ALL/COMPREHENSIVE/OPEN_ # REQUEST) and the operator opted in via # quantifier_reminder_enabled=True, append a one-line reminder # restating the cap and the no-prior-enumeration rule. # quantifier_reminder_enabled defaults to False because Hermes-3-8B # already ignores parts of the existing reminder under enumeration # pressure (§3 Option B con); empirical effect requires bench # measurement before flipping default-on (§10.8 decision tree). if ( quantifier_guard_on and quantifier_mode_gated and quantifier.get("is_broad") and bool(policy.get("quantifier_reminder_enabled", False)) ): from arborist.qa.quantifier_reminder import broad_quantifier_reminder broad = broad_quantifier_reminder( intensity=quantifier["intensity"], cap=effective_max_claims, scope_bound_hint=quantifier["scope_bound_hint"], ) if broad: messages.append({"role": "user", "content": broad}) messages.append({"role": "user", "content": _user_payload(question)}) mhash = model_profile_hash(model_id, revision, quantization) # Dedup-mode-aware cache_key. See arborist/qa/query.py for rationale — # policy_variant matches the alternate mode so governance_policy_hash # agrees with what an agent under that mode would have written, # enabling cross-silo fallback. 
def _ckey_for_mode(mode: str) -> str:
    """Return the cache key for this question under dedup mode *mode*.

    The question is canonicalized for *mode*, the conversation is rebuilt
    from every message except the last plus a payload carrying the
    canonical question, and ``question_dedup`` is overridden in a copy of
    the policy so the governance and verifier policy hashes are computed
    for that mode.
    """
    canon_q = canonical_question(question, mode=mode)
    # Swap only the final (payload) message; earlier messages are reused.
    canon_msgs = list(messages[:-1]) + [
        {"role": "user", "content": _user_payload(canon_q)},
    ]
    # dict(policy, ...) copies, so the caller's policy is not mutated.
    policy_variant = dict(policy, question_dedup=mode)
    return cache_key(
        document_root,
        question_hash(question, mode=mode),
        mhash,
        conversation_hash(canon_msgs),
        governance_policy_hash(policy_variant),
        SCHEMA_VERSION,
        CANONICALIZATION_VERSION,
        doc["chunking_version"],
        verifier_policy_hash(policy_variant),
    )

ghash = governance_policy_hash(policy)  # for the legacy INSERT below

primary_dedup = policy.get("question_dedup", DEFAULT_QUESTION_DEDUP)
# Reject unknown dedup modes before any key is derived from them.
if primary_dedup not in QUESTION_DEDUP_MODES:
    raise ValueError(
        f"policy['question_dedup'] must be one of {QUESTION_DEDUP_MODES}, "
        f"got {primary_dedup!r}"
    )
# Re-derive the per-mode hashes for use in the INSERT below. _ckey_for_mode
# already builds them, but the legacy INSERT references qhash/chash by name.
qhash = question_hash(question, mode=primary_dedup)
canonical_q_primary = canonical_question(question, mode=primary_dedup)
canonical_messages_primary = list(messages[:-1]) + [
    {"role": "user", "content": _user_payload(canonical_q_primary)},
]
chash = conversation_hash(canonical_messages_primary)

primary_ckey = _ckey_for_mode(primary_dedup)
ckey = primary_ckey  # legacy name for the rest of the function

# Primary lookup. Only rows whose falsification_state is 'live' are
# eligible for reuse.
t_lookup = time.monotonic()
cached = conn.execute(
    "SELECT * FROM providence_cache "
    "WHERE cache_key = ? AND falsification_state = 'live'",
    (primary_ckey,),
).fetchone()
hit_ckey = primary_ckey
lookup_path = primary_dedup if cached is not None else None
# Cross-mode fallback: only when the primary lookup missed and fidelity is
# "equivalence_class". The key is recomputed under the other dedup mode.
if cached is None and fidelity == "equivalence_class":
    other_mode = (
        "equivalence_class" if primary_dedup == "strict" else "strict"
    )
    other_ckey = _ckey_for_mode(other_mode)
    # Identical keys would just repeat the query that already missed.
    if other_ckey != primary_ckey:
        cached = conn.execute(
            "SELECT * FROM providence_cache "
            "WHERE cache_key = ? AND falsification_state = 'live'",
            (other_ckey,),
        ).fetchone()
        if cached is not None:
            hit_ckey = other_ckey
            lookup_path = f"{other_mode}_fallback"
cache_lookup_ms = _ms_since(t_lookup)

# Cache hit: bump the hit counters for the row that matched, then return
# the persisted record without calling the model.
if cached is not None:
    with transaction(conn):
        now = int(time.time())
        conn.execute(
            "UPDATE providence_cache "
            "SET hit_count = hit_count + 1, last_hit_at = ? "
            "WHERE cache_key = ?",
            (now, hit_ckey),
        )
    return {
        "status": "cache_hit",
        "audit_mode": cached["audit_mode"],
        "cache_key": hit_ckey,
        "lookup_path": lookup_path,
        "source_root": document_root,
        "answer_text": cached["answer_text"],
        "merkle_proof": json.loads(cached["merkle_proof"]),
        "n_quotes": cached["n_quotes"],
        "n_verified": cached["n_verified"],
        "verifier_method": cached["verifier_method"],
        # Stored as a JSON string or NULL/empty; normalize to a list.
        "unverified_quotes": (
            json.loads(cached["unverified_quotes"])
            if cached["unverified_quotes"] else []
        ),
        # Always empty on the hit path; no cached column is read for it.
        "partially_verified_quotes": [],
        # Quantifier preflight (Ticket #000008 Phase 1+2). Pure
        # on the question string, so cache hits re-classify
        # cheaply and carry the same schema as miss-path rows.
        "quantifier_intensity": quantifier["intensity"],
        "quantifier_matched_token": quantifier["matched_token"],
        "scope_bound_hint": quantifier["scope_bound_hint"],
        "quantifier_explicit_count": quantifier["explicit_count"],
        "claim_cap_applied": claim_cap_lookup,
        # Ticket #000010 — meta-cognition QuestionState.
        "question_state": question_state.to_dict(),
        "timings": {
            "cache_lookup_ms": cache_lookup_ms,
            "llm_ms": None,  # no model call on a cache hit
            "total_ms": _ms_since(t_start),
        },
    }

# ---- Cache miss: call the model. ----
t_llm = time.monotonic()
# JSON mode: pass structured-output extras under all three engine
# conventions (vLLM `guided_json`, llama.cpp `json_schema`, OpenAI-spec
# `response_format`) so the same call site enforces the schema
# whether the model is on vLLM, llama.cpp, or OpenAI-spec. Engines
# silently drop unknown keys. Pre-2026-05-19 this was vLLM-only
# (`guided_json` alone); llama.cpp Qwen got NO enforcement and
# relied on the lenient pre-parser to clean drift — the parse-tolerant
# fallback still handles whatever drift remains.
from arborist.qa.verify import claim_lattice_structured_output_extras
extra_body: dict | None = None
stop_seqs: list[str] | None = None
# Structured-output extras are sent only in JSON mode and only while the
# policy flag (default True) leaves guided JSON enabled.
if answer_mode == "claim_lattice" and policy.get(
    "claim_lattice_use_guided_json", True
):
    extra_body = claim_lattice_structured_output_extras()
if answer_mode == "claim_lattice":
    # JSON-mode token-runaway guard. On broad-descriptive /
    # comparison questions Hermes-3-8B sometimes spams whitespace
    # / newlines after the closing brace until max_tokens
    # exhausts; the resulting truncated payload won't parse and
    # the run lands UNGROUNDED 0/0 at 12-15s instead of 2-4s.
    # Stopping on a blank line (\n\n) cuts the runaway —
    # well-formed JSON-mode output never contains a blank line
    # since the model emits a single object on one line (or
    # with simple internal newlines).
    stop_seqs = list(policy.get(
        "claim_lattice_json_stop_sequences", ["\n\n"]
    ))
raw_answer = client.chat_completion(
    messages,
    model=model_id,
    temperature=policy["temperature"],
    max_tokens=policy["max_tokens"],
    top_p=policy.get("top_p", 1.0),
    extra_body=extra_body,
    stop=stop_seqs,
)
llm_ms = _ms_since(t_llm)

# Repair bookkeeping; both stay at these initial values unless a repair
# step later in the function replaces the answer.
repair_changes: list[dict] = []
pre_repair_verdict: dict | None = None

# Phase 3 of #000031: load the warrant-chain core_root set once
# from the conn's shards directory. The verifier consults this
# set to suppress WARRANT_MISSING when the cited chunk's document
# has a warrant-resolver derivation row (Merkle-bound primary-source
# backing). Empty set if the shard has no derivations
# rows yet — fully backward-compatible.
_warrant_chain_roots: frozenset[str] = frozenset() try: from pathlib import Path as _Path from arborist.qa.warrant_chain import warrant_chain_lookup as _wcl _db_path = conn.execute("PRAGMA database_list").fetchall() # PRAGMA database_list rows: (seq, name, file). Main DB is # the first row with name='main'. _main_row = next((r for r in _db_path if r[1] == "main"), None) if _main_row and _main_row[2]: _warrant_chain_roots = _wcl(_Path(_main_row[2]).parent) except Exception: # Fail-closed: empty set means no suppression, behavior # identical to pre-Phase-3. _warrant_chain_roots = frozenset() if answer_mode == "claim_lattice_pointer": verdict = verify_claim_lattice( raw_answer, evidence_map, allowed_source_roles=tuple( policy.get( "claim_lattice_allowed_source_roles", [ "primary_answer_source", "secondary_context_source", "background_source", "unclassified", ], ) ), max_pointers_per_claim=int(policy.get( "claim_lattice_max_pointers_per_claim", 2 )), min_citation_coverage=float(policy.get( "claim_lattice_min_citation_coverage", 0.30 )), min_claim_content_tokens=int(policy.get( "claim_lattice_min_claim_content_tokens", 3 )), lazy_anchor_demote_threshold=float(policy.get( "claim_lattice_lazy_anchor_demote_threshold", 0.5 )), lazy_anchor_demote_min_pairs=int(policy.get( "claim_lattice_lazy_anchor_demote_min_pairs", 3 )), max_claims_per_answer=effective_max_claims, subject_tokens_absent_threshold=int(policy.get( "claim_lattice_subject_tokens_absent_threshold", 3 )), question=question, warrant_check_enabled=bool(policy.get( "claim_lattice_warrant_check_enabled", True )), deflection_check_enabled=bool(policy.get( "claim_lattice_deflection_check_enabled", True )), format_collapse_check_enabled=bool(policy.get( "claim_lattice_format_collapse_check_enabled", True )), warrant_chain_roots=_warrant_chain_roots, ) # Rendered prose (literal spans interpolated) is the user-facing # answer text — never the model's raw pointer-line output. 
If # rendering produced nothing (no valid claims), persist the raw # output so an operator can see what the model actually said. rendered = verdict["rendered_text"] answer_text = rendered if rendered else raw_answer elif answer_mode == "claim_lattice": verdict = verify_claim_lattice_json( raw_answer, evidence_map, allowed_source_roles=tuple( policy.get( "claim_lattice_allowed_source_roles", [ "primary_answer_source", "secondary_context_source", "background_source", "unclassified", ], ) ), max_evidence_per_claim=int(policy.get( "claim_lattice_max_pointers_per_claim", 2 )), min_citation_coverage=float(policy.get( "claim_lattice_min_citation_coverage", 0.30 )), max_claims_per_answer=effective_max_claims, subject_tokens_absent_threshold=int(policy.get( "claim_lattice_subject_tokens_absent_threshold", 3 )), question=question, warrant_check_enabled=bool(policy.get( "claim_lattice_warrant_check_enabled", True )), deflection_check_enabled=bool(policy.get( "claim_lattice_deflection_check_enabled", True )), warrant_chain_roots=_warrant_chain_roots, ) rendered = verdict["rendered_text"] answer_text = rendered if rendered else raw_answer else: answer_text = raw_answer verdict = verify_quotes( answer_text, document_text, entity_policy=policy.get("entity_policy", "hybrid"), proximity_n=policy.get("entity_proximity_n", 3), proximity_window=policy.get("entity_proximity_window", 300), ) def _verify(text: str) -> dict: return verify_quotes( text, document_text, entity_policy=policy.get("entity_policy", "hybrid"), proximity_n=policy.get("entity_proximity_n", 3), proximity_window=policy.get("entity_proximity_window", 300), ) if ( policy.get("repair_enabled") and verdict["audit_mode"] != "STRICT" and verdict.get("unverified_quotes") ): repair_result = mechanical_repair( answer_text, verdict["unverified_quotes"], document_text ) if repair_result["changes"]: new_verdict = _verify(repair_result["repaired_text"]) if new_verdict["n_verified"] >= verdict["n_verified"]: pre_repair_verdict = 
verdict answer_text = repair_result["repaired_text"] verdict = new_verdict repair_changes = list(repair_result["changes"]) max_reprompts = int(policy.get("repair_max_reprompts", 0)) for _ in range(max_reprompts): if ( verdict["audit_mode"] == "STRICT" or not verdict.get("unverified_quotes") ): break new_text = reprompt_repair( chat_client=client, model_id=model_id, original_messages=messages, original_answer=answer_text, failed_quotes=verdict["unverified_quotes"], policy=policy, ) if not new_text: break new_verdict = _verify(new_text) if new_verdict["n_verified"] > verdict["n_verified"]: if pre_repair_verdict is None: pre_repair_verdict = verdict answer_text = new_text verdict = new_verdict repair_changes.append({ "action": "reprompt_rewrite", "diagnosis": "model_feedback_loop", }) else: break unverified_blob = ( json.dumps(verdict["unverified_quotes"], separators=(",", ":")) if verdict["unverified_quotes"] else None ) leaves = [bytes.fromhex(r["leaf_hash"]) for r in chunk_rows] tree = MerkleTree.build(leaves) proof_obj = { "document_root": document_root, "chunk_0_proof": proof_to_dict(tree.proof(0)), } proof_blob = json.dumps(proof_obj, separators=(",", ":")) # Per-run Merkle-DAG (see arborist/qa/dag.py). Single-doc shape: # the only "source" is document_root. Quote mode: 7 stages base # (8 with #000009 preflight). Pointer mode: 9 stages base (10 with # preflight); context drops out and answer splits into raw_answer # / parsed_claim_lattice / render. ev_root = evidence_map_root(evidence_map) if evidence_map else None parsed_lattice = None is_lattice_mode = answer_mode in ("claim_lattice_pointer", "claim_lattice") if is_lattice_mode: # Per-claim list of {claim_text, content-addressed evidence_ids} # for the parsed_claim_lattice node hash. Pointer ids are # run-dependent; evidence_ids are content-addressed → the run- # DAG hashes the run-stable form. Same shape for JSON and # pointer; verifier already returns evidence_id_pairs. 
evidence_id_pairs = verdict.get("evidence_id_pairs") or [] parsed_lattice = [ { "claim_text": cs.get("text", ""), "evidence_ids": evidence_id_pairs[i] if i < len(evidence_id_pairs) else [], } for i, cs in enumerate(verdict.get("claim_statuses") or []) ] # Ticket #000009 — preflight node binding (mirror of query(); # nested CTI clauses per ticket §8 / 2026-05-04 feedback). from arborist.qa.dag import preflight_node_hash # verifier_policy_hash + model_profile_hash imported at module # top; do NOT re-import locally (free-variable shadowing). ghash_for_dag = verifier_policy_hash(policy) claim_cap_actually_applied = ( claim_cap_lookup if (quantifier_apply_caps and quantifier_caps_mode_gated and claim_cap_lookup is not None) else None ) reminder_eligible = ( quantifier_guard_on and quantifier_mode_gated and quantifier.get("is_broad", False) ) reminder_enabled = bool(policy.get("quantifier_reminder_enabled", False)) reminder_injected = reminder_eligible and reminder_enabled reminder_template_id = None if reminder_injected: reminder_template_id = ( "broad-quantifier-bounded-v1" if quantifier.get("scope_bound_hint") == "bounded" else "broad-quantifier-unbounded-v1" ) # Build payload + hash separately so we can persist both into # run_dag_blob (Ticket #000009 §7.2 — `arborist providence # --show-preflight` renders the full clause set). 
# Preflight node: build the payload, hash its canonical JSON form, and hand
# both the payload and the hash to build_run_dag below.
from arborist.qa.dag import (
    _canonical_json as _runner_canon,
    _sha256_hex as _runner_sha,
    build_preflight_node_payload as _runner_build_payload,
)
_runner_preflight_payload = _runner_build_payload(
    question_state=question_state.to_dict(),
    quantifier=quantifier,
    answer_contract={
        "guard_enabled": quantifier_guard_on,
        "mode_gated": quantifier_mode_gated,
        "apply_caps_active": quantifier_apply_caps,
        "apply_caps_mode_gated": quantifier_caps_mode_gated,
        # "resolved" is the looked-up cap; "applied" is the value computed
        # earlier in this function that is None when caps are not active.
        "claim_cap_resolved": claim_cap_lookup,
        "claim_cap_applied": claim_cap_actually_applied,
        "manual_quotes_allowed": False,
        "evidence_pointer_required": is_lattice_mode,
        "allow_unbounded_enumeration": False,
        "reject_broad_active": bool(
            policy.get("quantifier_reject_broad", False)
        ),
        "metacognition_enabled": bool(
            policy.get("metacognition_enabled", True)
        ),
        "block_on_contradiction": bool(
            policy.get("metacognition_block_on_contradiction", False)
        ),
    },
    prompt_contract={
        "reminder_enabled": reminder_enabled,
        "reminder_injected": reminder_injected,
        "reminder_template_id": reminder_template_id,
    },
    evidence_contract={
        "max_evidence_ids_exposed": int(policy.get(
            "claim_lattice_max_pointers_per_claim", 2
        )),
        "one_claim_per_line": is_lattice_mode,
    },
    policy_refs={
        # NOTE(review): ghash_for_dag is assigned from
        # verifier_policy_hash(policy) earlier in this function, yet it is
        # stored under the "governance_policy_hash" key — confirm the
        # labeling is intended before changing it (it feeds preflight_hash).
        "governance_policy_hash": ghash_for_dag,
        "model_profile_hash": mhash,
        "answer_mode": answer_mode,
    },
)
preflight_hash = _runner_sha(_runner_canon(_runner_preflight_payload))

# Per-run DAG for a single-document ask: the only source is document_root,
# which is also passed as the context root.
run_dag = build_run_dag(
    question_hash=qhash,
    sources=[{
        "document_root": document_root,
        "source_role": "primary_answer_source",
        "score": None,
        "chunk_idx": None,
    }],
    context_root=document_root,
    conversation_hash=chash,
    answer_text=answer_text,
    audit_mode=verdict["audit_mode"],
    verifier_method=verdict["verifier_method"],
    n_quotes=verdict["n_quotes"],
    n_verified=verdict["n_verified"],
    claim_statuses=verdict.get("claim_statuses", []),
    # Hard-coded: this code is only reached after the cache lookup missed.
    lookup_path="miss",
    evidence_map_root=ev_root,
    # Quote mode is passed as None rather than the string "quote".
    answer_mode=answer_mode if answer_mode != "quote" else None,
    violations=verdict.get("violations"),
    # Raw and rendered text are supplied separately only in lattice modes.
    raw_answer_text=raw_answer if is_lattice_mode else None,
    parsed_lattice=parsed_lattice,
    rendered_text=answer_text if is_lattice_mode else None,
    preflight_hash=preflight_hash,
    preflight_payload=_runner_preflight_payload,
)
run_dag_blob = json.dumps(run_dag, separators=(",", ":"))

# One transaction covers the optional repair audit event, the write audit
# event, the capital-ledger record, and the cache INSERT. All of them share
# the same timestamp.
now = int(time.time())
with transaction(conn):
    # Repair event is written only when a repair actually replaced the
    # answer (changes recorded and a pre-repair verdict captured).
    if repair_changes and pre_repair_verdict is not None:
        append_audit(
            conn,
            event_type="providence_repair",
            subject_root=ckey,
            body={
                # NOTE(review): always "mechanical", although the reprompt
                # loop in this function also appends "reprompt_rewrite"
                # entries to repair_changes — confirm this is intended.
                "kind": "mechanical",
                "n_changes": len(repair_changes),
                "changes": repair_changes,
                "pre_audit_mode": pre_repair_verdict["audit_mode"],
                "post_audit_mode": verdict["audit_mode"],
                "pre_n_verified": pre_repair_verdict["n_verified"],
                "post_n_verified": verdict["n_verified"],
            },
            ts=now,
        )
    # The returned event hash links the capital record and the cache row
    # back to this audit event.
    event_hash = append_audit(
        conn,
        event_type="providence_write",
        subject_root=ckey,
        body={
            "source_root": document_root,
            "model_id": model_id,
            "revision": revision,
            "quantization": quantization,
            "chunks_in_context": len(chunk_rows),
            "answer_chars": len(answer_text),
            "audit_mode": verdict["audit_mode"],
            "n_quotes": verdict["n_quotes"],
            "n_verified": verdict["n_verified"],
            "verifier_method": verdict["verifier_method"],
        },
        ts=now,
    )
    # Capital ledger (ticket #000020). Sibling table; advisory.
    from arborist.capital import profile_for_op, record as capital_record
    capital_profile, capital_inputs = profile_for_op(
        "qa",
        {
            "cache_hit": False,
            "answer_chars": len(answer_text),
            "llm_seconds": llm_ms / 1000.0,  # llm_ms is in milliseconds
        },
    )
    capital_record(
        conn,
        audit_event_hash=event_hash,
        op_type="qa",
        profile=capital_profile,
        estimator_inputs=capital_inputs,
        ts=now,
    )
    conn.execute(
        # ON CONFLICT(cache_key) DO NOTHING: the cache lookup above runs
        # outside this transaction, so two concurrent ask()s on the same
        # cache_key can both miss and both reach this INSERT — the loser
        # no-ops instead of raising UNIQUE constraint failed (its answer is
        # equivalent: same question/model/policy ⇒ same cache_key).
        # NOTE(review): when the INSERT no-ops, the providence_write audit
        # event and capital record above have still been appended in this
        # transaction — confirm that is acceptable.
        "INSERT INTO providence_cache "
        "(cache_key, source_root, document_uri, question_hash, question_text, "
        " answer_text, merkle_proof, model_profile_hash, conversation_hash, "
        " governance_policy_hash, schema_version, canonicalization_version, "
        " chunking_version, falsification_state, chain, audit_event_hash, "
        " created_at, hit_count, audit_mode, n_quotes, n_verified, "
        " unverified_quotes, verifier_method, run_dag_root, run_dag_blob) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'live', ?, ?, ?, 0, "
        " ?, ?, ?, ?, ?, ?, ?) "
        "ON CONFLICT(cache_key) DO NOTHING",
        # Positional parameters in the column order listed above;
        # falsification_state ('live') and hit_count (0) are SQL literals,
        # so they have no entry in this tuple.
        (
            ckey, document_root, doc["document_uri"], qhash, question,
            answer_text, proof_blob, mhash, chash, ghash,
            SCHEMA_VERSION, CANONICALIZATION_VERSION, doc["chunking_version"],
            chain, event_hash, now,
            verdict["audit_mode"], verdict["n_quotes"], verdict["n_verified"],
            unverified_blob, verdict["verifier_method"],
            run_dag["root"], run_dag_blob,
        ),
    )

# #000037 Phase 2 — emit advisory controller_events for this QA
# cycle. Single-branch decision, byte-cheap. Wrapped so any
# controller-emit error never blocks the QA return. Audit-only.
# Best-effort advisory emit: any exception is swallowed on purpose so the
# answer computed above is still returned to the caller.
try:
    _emit_qa_controller_advisory(conn, ckey, verdict)
except Exception:  # pragma: no cover — advisory must not break QA
    pass

# Derive a failure-stage label from the final verdict counts.
from arborist.qa.dag import localize_failure as _localize
failure_stage = _localize(
    audit_mode=verdict["audit_mode"],
    n_sources=1,  # ask() runs against one document
    n_quotes=verdict["n_quotes"],
    n_verified=verdict["n_verified"],
)

# Miss-path result. Shares the quantifier / question_state / timings keys
# with the cache-hit return and adds run-DAG, failure-stage, and repair
# details that only exist when the model was actually called.
return {
    "status": "cache_miss_then_written",
    "audit_mode": verdict["audit_mode"],
    "cache_key": ckey,
    "run_dag_root": run_dag["root"],
    "lookup_path": "miss",
    "failure_stage": failure_stage,
    "repair_changes": repair_changes,
    # None when no repair step replaced the original verdict.
    "pre_repair_audit_mode": (
        pre_repair_verdict["audit_mode"] if pre_repair_verdict else None
    ),
    "source_root": document_root,
    "answer_text": answer_text,
    # Returned as the in-memory object, not the serialized proof_blob.
    "merkle_proof": proof_obj,
    "n_quotes": verdict["n_quotes"],
    "n_verified": verdict["n_verified"],
    "verifier_method": verdict["verifier_method"],
    "unverified_quotes": verdict["unverified_quotes"],
    # Missing or falsy values are normalized to an empty list.
    "partially_verified_quotes": verdict.get("partially_verified_quotes") or [],
    # Quantifier preflight (Ticket #000008 Phase 1+2). See query.py
    # for full rationale; runner.ask carries the same schema for
    # CLI-side `arborist ask` parity with `arborist query`.
    "quantifier_intensity": quantifier["intensity"],
    "quantifier_matched_token": quantifier["matched_token"],
    "scope_bound_hint": quantifier["scope_bound_hint"],
    "quantifier_explicit_count": quantifier["explicit_count"],
    # NOTE(review): this reports claim_cap_lookup, whereas the preflight
    # payload built earlier stores claim_cap_lookup as "claim_cap_resolved"
    # and a separate, possibly-None value as "claim_cap_applied" — confirm
    # which one callers of this key expect.
    "claim_cap_applied": claim_cap_lookup,
    # Ticket #000010 — meta-cognition QuestionState.
    "question_state": question_state.to_dict(),
    # Sidecar smell signals (claim_lattice mode only) — render-layer;
    # never persisted, never in run_dag_root.
    "pointer_id_distribution": verdict.get("pointer_id_distribution"),
    "lazy_anchor_ratio": verdict.get("lazy_anchor_ratio"),
    "timings": {
        "cache_lookup_ms": cache_lookup_ms,
        "llm_ms": llm_ms,
        "total_ms": _ms_since(t_start),
    },
}