"""Q&A runner: cache-first lookup -> inference fallback -> provable record.
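Implements the v9.8 admissibility invariant: no cached record is reused
unless all 8 cache_key dimensions match AND its falsification_state is
'live' (not failed/stale/quarantined).
NOTE(review): ask() currently passes nine inputs to cache_key() (the
verifier policy hash is the ninth) — confirm whether "8" is still the
right count.
- Cache hit  -> return the stored answer with its persisted audit_mode
  (STRICT/HYBRID/UNGROUNDED).
- Cache miss -> call ChatClient, run the faithfulness check, classify,
  store the record, and emit an audit event.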
"""
from __future__ import annotations
import json
import sqlite3
import time
from arborist import (
CANONICALIZATION_VERSION,
SCHEMA_VERSION,
)
from arborist.compress import unpack_chunk
from arborist.merkle import MerkleTree, proof_to_dict
from arborist.qa.client import ChatClient
from arborist.qa.prompts import (
CLAIM_LATTICE_GROUNDING_REMINDER,
CLAIM_LATTICE_JSON_GROUNDING_REMINDER,
CLAIM_LATTICE_JSON_SYSTEM_PROMPT,
CLAIM_LATTICE_SYSTEM_PROMPT,
)
from arborist.qa.keys import (
DEFAULT_FIDELITY,
DEFAULT_QUESTION_DEDUP,
FIDELITY_MODES,
QUESTION_DEDUP_MODES,
cache_key,
canonical_question,
conversation_hash,
governance_policy_hash,
model_profile_hash,
question_hash,
verifier_policy_hash,
)
from arborist.qa.dag import build_run_dag
from arborist.qa.evidence import (
build_evidence_map,
evidence_map_root,
render_evidence_map,
render_evidence_map_for_json,
)
from arborist.qa.repair import mechanical_repair, reprompt_repair
from arborist.qa.verify import (
ANSWER_MODES,
CLAIM_LATTICE_JSON_SCHEMA,
DEFAULT_ANSWER_MODE,
verify_claim_lattice,
verify_claim_lattice_json,
verify_quotes,
)
from arborist.store import append_audit, transaction
try:
from arborist.wikitext import BASE_VERSION as _WIKITEXT_BASE_VERSION
from arborist.wikitext import to_base as _wikitext_to_base
except ImportError: # pragma: no cover
_WIKITEXT_BASE_VERSION = None
_wikitext_to_base = None
# Default policy for ask(). ask() falls back to this dict whenever the
# caller passes no policy (``policy = policy or DEFAULT_POLICY``, so an
# empty dict is also replaced). Per the per-key notes below, many entries
# fold into governance_policy_hash / verifier_policy_hash: changing a value
# makes new records land under a different cache_key than prior ones.
# NOTE(review): how those hashes canonicalize the dict (e.g. whether key
# order matters) is not visible in this module — keep key order stable
# unless confirmed safe.
DEFAULT_POLICY = {
    # Ticket #000007 — query-layer hyphen-fold marker. See
    # arborist/qa/query.py:DEFAULT_QUERY_POLICY for rationale.
    "hyphen_fold_v1": True,
    # Ticket #000006 amend 2026-05-02b (Rule 9). See
    # arborist/qa/query.py:DEFAULT_QUERY_POLICY for full rationale.
    # Forwarded to verify_claim_lattice() as
    # subject_tokens_absent_threshold in pointer mode.
    "claim_lattice_subject_tokens_absent_threshold": 3,
    # System prompt used by the default ("quote") answer mode.
    "system_prompt": (
        "Answer the user's question based ONLY on the document below. "
        "For EVERY factual claim, include a verbatim quote from the "
        "document enclosed in double quotes (\"...\"). The quoted span "
        "must appear word-for-word. Make a claim only when a verbatim "
        "quote in the document directly supports it. "
        "If the answer is in the document, write it. "
        "If the answer is absent from the document, say 'I don't know "
        "based on the provided document.' and stop there. "
        "Stay inside the document at all times."
    ),
    # Restated rule fired as a user message right before the document +
    # question arrive. See arborist/qa/query.py for the rationale (recent
    # user-turn instructions outweigh decayed system-turn rules in 8B
    # instruction-tuned models).
    "grounding_reminder": (
        "REMINDER: wrap every factual claim in double quotes (\"...\") "
        "and the quoted span must appear word-for-word in the "
        "document. Each claim earns a verbatim quote. "
        "Now answer the question on the next message."
    ),
    # Sampling parameters; ask() forwards these to
    # client.chat_completion() on a cache miss.
    "temperature": 0.1,
    "top_p": 1.0,
    "max_tokens": 512,
    # Entity-matching knobs. Not read in the visible part of this module —
    # presumably consumed by the quote verifier (TODO confirm).
    "entity_policy": "proximity",
    "entity_proximity_n": 3,
    "entity_proximity_window": 300,
    # Mechanical answer repair after first verify. Off by default; see
    # arborist/qa/query.py for semantics.
    "repair_enabled": False,
    "repair_max_reprompts": 0,
    # Strip wikitext markup before the LLM ever sees the context. Lets
    # Hermes quote prose verbatim and shrinks token bills (~43% on
    # Wikipedia chunks). Bumps governance_policy_hash so prior cached
    # answers under raw-wikitext policy stay distinct on lookup. Set
    # via the wikitext extras; no-op if mwparserfromhell isn't installed.
    # The value is None when the arborist.wikitext import at the top of
    # this module fails, which also disables the conversion in ask().
    "base_version": _WIKITEXT_BASE_VERSION,
    # G0 / CTI — claim-lattice-pointer answer mode. "quote" (default):
    # existing behavior, model writes prose with verbatim quotes inline.
    # "claim_lattice_pointer": runtime builds an evidence map and shows
    # the model short pointer ids (E1, E2, …); model writes natural
    # prose with bracket pointer tags ("Claim. [E12]") instead of
    # quoting source text. Renderer interpolates literal spans at
    # display time. Synthetic-elision-by-construction-impossible: the
    # model never types the quote string. Two-layer id discipline keeps
    # the cache & run-DAG keyed on content-addressed evidence_ids.
    # Folds into governance_policy_hash so two modes write under
    # different cache_keys and never alias. No iterative repair in
    # pointer mode (one-shot benchmark discipline).
    "answer_mode": DEFAULT_ANSWER_MODE,
    # Prompt + user-turn reminder used when answer_mode is
    # "claim_lattice_pointer"; also the fallback for JSON mode when the
    # JSON-specific keys are absent from the policy.
    "claim_lattice_system_prompt": CLAIM_LATTICE_SYSTEM_PROMPT,
    "claim_lattice_grounding_reminder": CLAIM_LATTICE_GROUNDING_REMINDER,
    # Allowed source roles for claim-lattice verification. Roles outside
    # this set get classified SOURCE_ROLE_BLOCKED and downgrade the
    # verdict. Mirrors arborist.qa.verify.DEFAULT_ALLOWED_SOURCE_ROLES;
    # noisy_background_source / sequel_background_source are excluded by
    # default. Folds into governance_policy_hash on change.
    # NOTE(review): the in-code fallback list in ask() (used when a
    # caller-supplied policy omits this key) does not include
    # "self_reference_source" — confirm that difference is intended.
    "claim_lattice_allowed_source_roles": [
        "primary_answer_source",
        "secondary_context_source",
        "background_source",
        "unclassified",
        # Self-promoted providence records (`arborist://providence/`
        # URI scheme). Trusted-as-fact substrate per the
        # self-reference design — STRICT live records past the
        # kindergarten window. See
        # docs/self-reference-design.md for the
        # falsification trust model.
        "self_reference_source",
    ],
    # Hard cap on pointer ids per claim line — mirrors prompt Rule 9.
    # Lines exceeding this cap classify as SCHEMA_INVALID and the
    # verdict can no longer reach STRICT. Folds into
    # governance_policy_hash so changing the cap invalidates prior
    # cached records.
    "claim_lattice_max_pointers_per_claim": 2,
    # Minimum claim-token coverage required for the citation-overlap
    # check (Rule 6) to pass. Pre-2026-04-30 the threshold was implicit
    # at "≥1 shared token", which let through lazy-anchored claims
    # whose only overlap was a single topical word (e.g. "Yale
    # University... [E9]" cited to a highway-data span containing only
    # "Connecticut"). 0.30 means a 10-token claim needs ≥3 of its
    # content tokens to appear in the cited span. Short claims (≤3
    # content tokens) keep the old ≥1-token floor so narrow factoids
    # like "Steve Jobs co-founded Apple" still pass. Folds into
    # governance_policy_hash on change.
    "claim_lattice_min_citation_coverage": 0.30,
    # Bare-name claim guard. A claim with fewer than this many content
    # tokens (>=4 chars, post-spotlight-stopword) is rejected as
    # SCHEMA_INVALID. Catches the JP-dinosaurs lazy-anchor where
    # "Triceratops. [E16]" passes lexical overlap on a single token
    # even when E16 is a video-game tie-in chunk rather than the film
    # article. Default 2: bare-entity-name claims (one content token
    # after stopword strip) fail; sentence-shape claims pass. Note
    # ``_content_tokens`` already filters "appears", "shown", etc. so
    # "Trex appears" → 1 content token (filtered), "Trex appears in
    # the film" → 2 content tokens (passes). Folds into
    # governance_policy_hash on change.
    # NOTE(review): ask() falls back to 3 (not 2) for this key when a
    # caller-supplied policy omits it — confirm which value is intended.
    "claim_lattice_min_claim_content_tokens": 2,
    # Lazy-anchor smell auto-demote. When >= threshold of verified
    # pointer-pairs cite a single pointer AND there are >= min_pairs
    # total, cap audit_mode at HYBRID. The smell sidecar was advisory
    # only pre-2026-04-30; now it's load-bearing. STRICT requires
    # diverse anchoring across pointers.
    "claim_lattice_lazy_anchor_demote_threshold": 0.5,
    "claim_lattice_lazy_anchor_demote_min_pairs": 3,
    # Warrant-lite — relation-question hard check (Ticket H from
    # feedback-3, 2026-05-01). Detects relation-shape questions
    # ("who is X's boss?", "who founded Y?") and requires the cited
    # span to contain at least one named answer entity (proper-noun
    # phrase) from the claim. Catches the Homer-Simpson lazy-anchor
    # case where claim asserts "Mr. Burns" but cited span is the
    # voice-actor bio. WARRANT_MISSING violations cap audit_mode
    # at HYBRID. See arborist/qa/warrant.py.
    "claim_lattice_warrant_check_enabled": True,
    # Forwarded to verify_claim_lattice() as deflection_check_enabled;
    # the check itself is defined in the verifier, not in this module.
    "claim_lattice_deflection_check_enabled": True,
    # Format-collapse check (pointer-mode only): when the model emits
    # ≥5 meaningful prose lines with zero `[E\d+]` pointer tags, it
    # abandoned the claim_lattice_pointer protocol entirely. Soft-demote
    # so audit display surfaces "format collapsed" vs "graceful per-
    # claim refusal" — different failure shapes, same UNGROUNDED rung.
    # JSON-mode collapse already shows up as SCHEMA_INVALID so this
    # check is redundant there. Surfaced 2026-05-02 by fox's "Winners
    # of all major sports?" case where Hermes dumped 50+ free-form
    # sentences.
    "claim_lattice_format_collapse_check_enabled": True,
    # Quantifier preflight guard (Ticket #000008 Phase 2). Per-call
    # claim cap derived from the question's quantifier intensity and
    # the configured model profile (arborist/qa/model_profiles.py).
    # Phase 2 lands the lookup wiring with apply_caps=False per
    # §10.11.3 dry-run discipline — claim_cap_applied is computed
    # and reported on the result dict, but the verifier still uses
    # claim_lattice_max_claims_per_answer as the actual cap.
    # Operator flips quantifier_guard_apply_caps=True after dry-run
    # bench review confirms classifier output across the full
    # question set.
    #
    # Six-level disable hierarchy (§10.11.2):
    # - quantifier_guard_enabled: master kill (False = no
    #   classifier output, no cap lookup, no telemetry).
    # - quantifier_guard_apply_caps: dry-run gate (True = cap
    #   applied; False = cap reported but not applied).
    # - quantifier_caps_by_intensity: per-call override dict;
    #   wins over the model_profiles.py table when present.
    # - quantifier_guard_modes: list of answer_modes the guard
    #   applies to. Quote mode opts out by default — already
    #   stable HYBRID 0.455 on baseline, different failure shape.
    # NOTE(review): only four of the six levels are itemized here —
    # confirm the remaining two against §10.11.2.
    "quantifier_guard_enabled": True,
    "quantifier_guard_apply_caps": False,
    # When apply_caps flips True, this allowlist gates which modes
    # actually have caps applied. n=5 verification 2026-05-03 (#000008
    # §12.10): cap on claim_lattice (JSON) wins +14pp on STRICT-rate;
    # cap on claim_lattice_pointer fires TOO_MANY_CLAIMS 20× without
    # moving the verdict floor (still 0 STRICT). Default to JSON only
    # so flipping the master switch doesn't add wasted cap-noise on
    # pointer mode. Empty list / None = honor quantifier_guard_modes
    # (legacy fallback).
    "quantifier_apply_caps_modes": ["claim_lattice"],
    "quantifier_caps_by_intensity": {},
    "quantifier_guard_modes": ["claim_lattice_pointer", "claim_lattice"],
    # Phase 3 — broad-quantifier reminder injection. Default ON for
    # lattice modes (gated via quantifier_guard_modes) per the
    # 2026-05-03 bench A/B (#000008 §12). Reminder eliminates
    # FORMAT_COLLAPSED (2→0), reduces NO_EVIDENCE_POINTER 33%,
    # boosts mean ratio +17pp on pointer / +21pp on JSON, and
    # rescues JSON UNGROUNDED 7→1. n=5 verification 2026-05-03
    # confirms the compound effect with cap survives at higher
    # sample size. Quote mode is mode-gated off (different failure
    # shape; paraphrase verifier doesn't need pointer-tag reminders).
    # NOTE(review): ask() reads this key with a fallback of False, so a
    # caller-supplied policy that omits it gets no reminder.
    "quantifier_reminder_enabled": True,
    # Phase 4 — strict reject for broad-unbounded queries. When True
    # AND intensity ∈ {ALL, COMPREHENSIVE, OPEN_REQUEST} AND
    # scope_bound_hint == "unbounded", query()/ask() return UNGROUNDED
    # before the LLM call with a BROAD_QUANTIFIER_REJECTED violation.
    # Saves the ~10-15s LLM call on rejected runs. Default OFF — opt-in
    # via --reject-broad CLI flag or per-call policy override.
    # Bounded universals (e.g. all members of the Beatles, year-anchored
    # questions) are NOT rejected per §10.1.
    "quantifier_reject_broad": False,
    # Ticket #000010 — Meta-Cognition Preflight Guard (M0 / MCTL).
    # Pure deterministic detectors (temporal, contradiction,
    # false-premise-lite, out-of-corpus) wrap the #000008 quantifier
    # classifier and surface a QuestionState on the result dict.
    # Master switch ON by default — detectors are pure-on-question
    # so cost is negligible. Each sub-detector has its own enable
    # switch for granular A/B. block_on_contradiction defaults False
    # (label-only by default; opt-in to hard-block — false-positive
    # rate not yet bench-validated).
    "metacognition_enabled": True,
    "metacognition_temporal_check": True,
    "metacognition_contradiction_check": True,
    "metacognition_false_premise_check": True,
    "metacognition_out_of_corpus_check": True,
    "metacognition_block_on_contradiction": False,
    # Ticket #000011 — soft preflight sidecar. Default OFF —
    # adds one short LLM round-trip (~200ms median) so cost is
    # operator-opt-in only. NEVER enters the verifier proof path
    # (D1); produces only SOFT_* labels that surface as advisory
    # hints alongside the deterministic detector output.
    "soft_preflight_enabled": False,
    # Claim-count ceiling. Bench finding (2026-04-30 york-england):
    # "tell me all there is to know about X" prompted Hermes to spam
    # 26-59 encyclopedic claims sourced from training, only 2-4 of
    # which grounded in retrieval. Atomic-claim prompt rule (b5925c8)
    # cut this to ~10, but a hard structural cap is defense in depth.
    # Cap of 12 admits typical entity-list questions (5-7 dinosaurs,
    # Simpsons + pets) while flagging the runaway shape. Folds into
    # governance_policy_hash on change.
    "claim_lattice_max_claims_per_answer": 12,
    # JSON variant — `answer_mode="claim_lattice"`. Mirrors the
    # multi-source query path. Pairs with grammar-constrained inference
    # (vLLM guided_json, Claude/GPT-4 native JSON, Qwen 3.6 reasoner).
    # Lenient pre-parser in verify_claim_lattice_json keeps the path
    # survivable on inference paths without grammar guidance.
    "claim_lattice_json_system_prompt": CLAIM_LATTICE_JSON_SYSTEM_PROMPT,
    "claim_lattice_json_grounding_reminder": CLAIM_LATTICE_JSON_GROUNDING_REMINDER,
    # When truthy in JSON mode, ask() attaches the structured-output
    # extras (claim_lattice_structured_output_extras()) as extra_body.
    "claim_lattice_use_guided_json": True,
    # JSON-mode stop sequences. Hermes-3-8B sometimes spams whitespace
    # / newlines after the closing brace on broad-descriptive shapes
    # ("plot of X", "tell me about Y") — the response runs out the
    # max_tokens budget and the lenient parser sees truncated JSON.
    # Stopping on a blank line cuts the runaway. JSON-mode output
    # never legitimately contains a blank line (single object, single
    # line) so this is a safe filter. Folds into
    # governance_policy_hash on change.
    "claim_lattice_json_stop_sequences": ["\n\n"],
    # Verifier content-token rules version (#000053). "v2-acronym-aware"
    # = `arborist.qa.evidence._content_tokens` keeps all-caps 2-3-char
    # acronyms (CPU/GPU/DNA/FBI…) as content tokens; pre-#000053 dropped
    # every <4-char token, so a CPU/GPU claim cited to a "CPU foo" /
    # "GPU bar" article tripped TITLE_MISMATCH spuriously. A pure
    # marker — it doesn't gate code (the tokenizer change is
    # unconditional), it exists so the change folds into
    # `verifier_policy_hash` and prior cache records orphan on lookup.
    "content_token_rules": "v2-acronym-aware",
}
#: Per-audit_mode economics used by the advisory controller step, one row
#: per mode: (audit_mode, Δ5F utility signal, capital cost). Same shape the
#: dry-run simulator uses
#: (``bench/scripts/prometheus_sigma_sweep_dryrun.py``) so the live-runtime
#: advisory log and the offline sweep dry-run agree on cost-class economics.
_AUDIT_MODE_ECONOMICS = (
    ("STRICT", 0.05, 1.0),
    ("CANONICAL_PROJECTION", 0.10, 0.05),
    ("HYBRID", 0.00, 0.8),
    ("UNGROUNDED", -0.10, 0.4),
)
#: Mapping from verdict audit_mode → Δ5F utility signal.
_AUDIT_MODE_DELTA_5F = {
    mode: delta_5f for mode, delta_5f, _cost in _AUDIT_MODE_ECONOMICS
}
#: Mapping from verdict audit_mode → capital cost charged to the branch.
_AUDIT_MODE_CAPITAL_COST = {
    mode: cost for mode, _delta_5f, cost in _AUDIT_MODE_ECONOMICS
}
def _emit_qa_controller_advisory(
    conn: sqlite3.Connection, cache_key: str, verdict: dict
) -> None:
    """#000037 Phase 2 — persist an advisory controller decision for a verdict.

    Builds a single-branch ``ControllerInput`` from *verdict*, runs
    ``controller_decide`` on it, and hands the resulting decision to
    ``emit_controller_events`` together with the ``qa:<cache_key>``
    organism root.

    Design notes carried over from the ticket (not verifiable from this
    module alone): the controller is a pure function (no LLM call, no
    I/O); rows land in the sibling ``controller_events`` table and do NOT
    enter the ``audit_events.event_hash`` preimage, so chain integrity is
    unchanged; writes are idempotent under retry
    (``UNIQUE(event_kind, body_hash)``); the rows are purely advisory, so
    downstream consumers may prune or aggregate them without affecting
    the QA result.

    NOTE(review): earlier documentation said every QA cycle emits a row
    "regardless of cache hit/miss state" while also listing cache hits
    among the calls that never reach this helper — confirm against the
    call sites which statement holds.

    Args:
        conn: SQLite connection passed through to ``emit_controller_events``.
        cache_key: Cache key of the QA record; its first 16 characters
            label the branch and the full key labels the organism root.
        verdict: Verifier verdict. Reads ``audit_mode`` (defaults to
            ``"UNGROUNDED"``), ``n_quotes`` (defaults to 0) and
            ``unverified_quotes`` (a list, or a JSON-encoded string).

    The substrate modules are imported inside the function so that calls
    which never reach this helper do not pay their import cost.
    """
    from arborist.substrate.prometheus import (
        BatteryDeltas,
        ControllerBranch,
        ControllerInput,
        controller_decide,
        safe_weights,
    )
    from arborist.substrate.prometheus_audit import emit_controller_events

    mode = (verdict.get("audit_mode") or "UNGROUNDED").upper()
    quote_total = int(verdict.get("n_quotes") or 0)

    # unverified_quotes may arrive as a list or as a JSON string;
    # anything that does not decode to a list counts as zero failures.
    failed_quotes = verdict.get("unverified_quotes") or []
    if isinstance(failed_quotes, str):
        try:
            failed_quotes = json.loads(failed_quotes)
        except (TypeError, ValueError):
            failed_quotes = []
    failed_total = len(failed_quotes) if isinstance(failed_quotes, list) else 0

    # Share of quotes that failed verification; 0.0 when there were none.
    if quote_total > 0:
        divergence = failed_total / quote_total
    else:
        divergence = 0.0

    # Only the Δ5F battery carries signal here; the other deltas stay zero.
    deltas = BatteryDeltas(
        delta_5s=0.0,
        delta_5t=0.0,
        delta_5f=_AUDIT_MODE_DELTA_5F.get(mode, 0.0),
        delta_5r=0.0,
    )
    qa_branch = ControllerBranch(
        branch_id=f"qa:{cache_key[:16]}",
        deltas=deltas,
        witness_divergence=divergence,
        # Unknown audit modes are charged the highest listed cost (1.0).
        capital_cost=_AUDIT_MODE_CAPITAL_COST.get(mode, 1.0),
        payoff_b=1.0,
    )

    organism_root = f"qa:{cache_key}"
    controller_input = ControllerInput(
        organism_root=organism_root,
        branches=(qa_branch,),
        budget=1,
        hermes_utilization=0,
        weights=safe_weights(),
        difficulty=1.0,
        divergence_rate=divergence,
    )
    decision = controller_decide(controller_input)
    emit_controller_events(conn, decision, organism_root=organism_root)
def _ms_since(t: float) -> float:
    """Return the milliseconds elapsed since *t*, rounded to 0.1 ms.

    *t* must be a reading taken earlier from ``time.monotonic()``.
    """
    elapsed_seconds = time.monotonic() - t
    return round(elapsed_seconds * 1000, 1)
[docs]
def ask(
conn: sqlite3.Connection,
*,
document_root: str,
question: str,
client: ChatClient,
model_id: str,
revision: str = "",
quantization: str = "",
policy: dict | None = None,
chain: str = "private",
fidelity: str | None = None,
) -> dict:
"""Look up cached answer or run inference. Returns a result dict.
See ``arborist.qa.query.query`` for `fidelity` semantics — it
controls lookup tolerance: ``"strict"`` only checks the cache_key
matching the call's ``policy["question_dedup"]``; the default
``"equivalence_class"`` falls back to the alternate dedup mode's
cache_key on miss so a fast-cache agent can reuse records written
under either mode. Result includes ``lookup_path``.
"""
policy = policy or DEFAULT_POLICY
if fidelity is None:
fidelity = policy.get("fidelity", DEFAULT_FIDELITY)
if fidelity not in FIDELITY_MODES:
raise ValueError(
f"fidelity must be one of {FIDELITY_MODES}, got {fidelity!r}"
)
# Quantifier preflight (Ticket #000008 Phase 1+2). Same wiring
# as query() — see arborist/qa/query.py for the rationale and
# disable hierarchy.
from arborist.qa.model_profiles import cap_for_intensity
from arborist.qa.quantifier import classify_question_quantifier
answer_mode_for_guard = policy.get("answer_mode", "quote")
quantifier_guard_on = bool(policy.get("quantifier_guard_enabled", True))
quantifier_guard_modes = policy.get(
"quantifier_guard_modes",
["claim_lattice_pointer", "claim_lattice"],
)
quantifier_mode_gated = answer_mode_for_guard in (quantifier_guard_modes or [])
if quantifier_guard_on:
quantifier = classify_question_quantifier(question)
else:
quantifier = {
"intensity": None,
"matched_token": None,
"explicit_count": None,
"is_broad": False,
"operational_shape": None,
"scope_bound_hint": "unknown",
"classifier_version": None,
}
if quantifier_guard_on and quantifier_mode_gated and quantifier["intensity"]:
claim_cap_lookup = cap_for_intensity(
model_profile_id=model_id,
intensity=quantifier["intensity"],
explicit_count=quantifier["explicit_count"],
policy_overrides=policy.get("quantifier_caps_by_intensity") or None,
)
else:
claim_cap_lookup = None
quantifier_apply_caps = bool(policy.get("quantifier_guard_apply_caps", False))
quantifier_apply_caps_modes = policy.get(
"quantifier_apply_caps_modes",
quantifier_guard_modes, # legacy fallback
) or quantifier_guard_modes
quantifier_caps_mode_gated = answer_mode_for_guard in (
quantifier_apply_caps_modes or []
)
_policy_max_claims = int(policy.get("claim_lattice_max_claims_per_answer", 12))
if (
quantifier_apply_caps
and quantifier_caps_mode_gated
and claim_cap_lookup is not None
):
effective_max_claims = int(claim_cap_lookup)
else:
effective_max_claims = _policy_max_claims
# Ticket #000010 — meta-cognition preflight (mirror of query()).
from arborist.qa.metacognition import preflight_question
question_state = preflight_question(
question,
model_profile_id=model_id,
reference_frames=(),
policy=policy,
)
t_start = time.monotonic()
doc = conn.execute(
"SELECT document_uri, chunking_version FROM documents "
"WHERE document_root = ?",
(document_root,),
).fetchone()
if doc is None:
return {"status": "unknown_document"}
chunk_rows = conn.execute(
"SELECT idx, leaf_hash, content FROM chunks "
"WHERE document_root = ? ORDER BY idx ASC",
(document_root,),
).fetchall()
if not chunk_rows:
return {"status": "unknown_document"}
if any(r["content"] is None for r in chunk_rows):
return {"status": "source_cold", "msg": "rehydrate before asking"}
answer_mode = policy.get("answer_mode", DEFAULT_ANSWER_MODE)
if answer_mode not in ANSWER_MODES:
raise ValueError(
f"policy['answer_mode'] must be one of {ANSWER_MODES}, got {answer_mode!r}"
)
chunk_texts = [unpack_chunk(r["content"]) for r in chunk_rows]
document_text = "\n\n".join(chunk_texts)
# Wikitext → prose before the LLM sees it. The model can then quote
# verbatim against the prose form; the verifier compares like-against-
# like. Idempotent if context is already plain prose. Gated on
# policy["base_version"] so this is part of governance_policy_hash.
if policy.get("base_version") and _wikitext_to_base is not None:
document_text = _wikitext_to_base(document_text)
chunk_texts = [_wikitext_to_base(t) for t in chunk_texts]
evidence_map = []
if answer_mode == "claim_lattice_pointer":
# Quote-by-pointer: one evidence object per chunk. The model sees
# the literal spans labeled with content-addressed IDs and is
# instructed to reference IDs, not type quote text. Synthetic
# elision is impossible by construction — the model never produces
# the quote string.
chunks_for_map = [
{
"source_root": document_root,
"document_uri": doc["document_uri"],
"title": None,
"chunk_idx": r["idx"],
"chunk_root": r["leaf_hash"],
"span": chunk_texts[i],
"source_role": "primary_answer_source",
}
for i, r in enumerate(chunk_rows)
]
evidence_map = build_evidence_map(chunks_for_map)
sys_prompt = policy["claim_lattice_system_prompt"]
grounding_reminder = policy.get("claim_lattice_grounding_reminder")
rendered_evidence = render_evidence_map(evidence_map)
def _user_payload(q: str) -> str:
return f"EVIDENCE:\n\n{rendered_evidence}\n\n---\n\nQUESTION: {q}"
elif answer_mode == "claim_lattice":
# JSON variant — same per-chunk evidence map as pointer mode,
# blocks labeled with content-addressed evidence_id (long hex)
# since the model emits IDs in JSON. Pairs with grammar-
# constrained inference; lenient pre-parser handles drift.
chunks_for_map = [
{
"source_root": document_root,
"document_uri": doc["document_uri"],
"title": None,
"chunk_idx": r["idx"],
"chunk_root": r["leaf_hash"],
"span": chunk_texts[i],
"source_role": "primary_answer_source",
}
for i, r in enumerate(chunk_rows)
]
evidence_map = build_evidence_map(chunks_for_map)
sys_prompt = policy.get(
"claim_lattice_json_system_prompt",
policy["claim_lattice_system_prompt"],
)
grounding_reminder = policy.get(
"claim_lattice_json_grounding_reminder",
policy.get("claim_lattice_grounding_reminder"),
)
rendered_evidence = render_evidence_map_for_json(evidence_map)
def _user_payload(q: str) -> str:
return f"EVIDENCE:\n\n{rendered_evidence}\n\n---\n\nQUESTION: {q}"
else:
sys_prompt = policy["system_prompt"]
grounding_reminder = policy.get("grounding_reminder")
def _user_payload(q: str) -> str:
return f"Document:\n\n{document_text}\n\n---\n\nQuestion: {q}"
# System sets the policy; a user-turn reminder restates the rule one
# message before the payload arrives. Payload (document or evidence
# map + question) lands last as the most-recent tokens before
# generation.
messages = [{"role": "system", "content": sys_prompt}]
if grounding_reminder:
messages.append({"role": "user", "content": grounding_reminder})
# Quantifier-specific reminder (Ticket #000008 Phase 3). When the
# question is broad (ALL/COMPREHENSIVE/OPEN_REQUEST), the guard applies
# to this answer_mode, and quantifier_reminder_enabled is truthy,
# append a one-line reminder restating the cap and the
# no-prior-enumeration rule. DEFAULT_POLICY ships the flag as True;
# the .get() fallback below is False, so a caller-supplied policy that
# omits the key gets no reminder. Background for the conservative
# fallback: Hermes-3-8B already ignores parts of the existing reminder
# under enumeration pressure (§3 Option B con), so the effect had to be
# bench-measured before the default was flipped on (§10.8 decision tree).
if (
quantifier_guard_on
and quantifier_mode_gated
and quantifier.get("is_broad")
and bool(policy.get("quantifier_reminder_enabled", False))
):
from arborist.qa.quantifier_reminder import broad_quantifier_reminder
broad = broad_quantifier_reminder(
intensity=quantifier["intensity"],
cap=effective_max_claims,
scope_bound_hint=quantifier["scope_bound_hint"],
)
if broad:
messages.append({"role": "user", "content": broad})
messages.append({"role": "user", "content": _user_payload(question)})
mhash = model_profile_hash(model_id, revision, quantization)
# Dedup-mode-aware cache_key. See arborist/qa/query.py for rationale —
# policy_variant matches the alternate mode so governance_policy_hash
# agrees with what an agent under that mode would have written,
# enabling cross-silo fallback.
def _ckey_for_mode(mode: str) -> str:
canon_q = canonical_question(question, mode=mode)
canon_msgs = list(messages[:-1]) + [
{"role": "user", "content": _user_payload(canon_q)},
]
policy_variant = dict(policy, question_dedup=mode)
return cache_key(
document_root,
question_hash(question, mode=mode),
mhash,
conversation_hash(canon_msgs),
governance_policy_hash(policy_variant),
SCHEMA_VERSION,
CANONICALIZATION_VERSION,
doc["chunking_version"],
verifier_policy_hash(policy_variant),
)
ghash = governance_policy_hash(policy) # for the legacy INSERT below
primary_dedup = policy.get("question_dedup", DEFAULT_QUESTION_DEDUP)
if primary_dedup not in QUESTION_DEDUP_MODES:
raise ValueError(
f"policy['question_dedup'] must be one of {QUESTION_DEDUP_MODES}, "
f"got {primary_dedup!r}"
)
# Re-derive the per-mode hashes for use in the INSERT below. _ckey_for_mode
# already builds them, but the legacy INSERT references qhash/chash by name.
qhash = question_hash(question, mode=primary_dedup)
canonical_q_primary = canonical_question(question, mode=primary_dedup)
canonical_messages_primary = list(messages[:-1]) + [
{"role": "user", "content": _user_payload(canonical_q_primary)},
]
chash = conversation_hash(canonical_messages_primary)
primary_ckey = _ckey_for_mode(primary_dedup)
ckey = primary_ckey # legacy name for the rest of the function
t_lookup = time.monotonic()
cached = conn.execute(
"SELECT * FROM providence_cache "
"WHERE cache_key = ? AND falsification_state = 'live'",
(primary_ckey,),
).fetchone()
hit_ckey = primary_ckey
lookup_path = primary_dedup if cached is not None else None
if cached is None and fidelity == "equivalence_class":
other_mode = (
"equivalence_class" if primary_dedup == "strict" else "strict"
)
other_ckey = _ckey_for_mode(other_mode)
if other_ckey != primary_ckey:
cached = conn.execute(
"SELECT * FROM providence_cache "
"WHERE cache_key = ? AND falsification_state = 'live'",
(other_ckey,),
).fetchone()
if cached is not None:
hit_ckey = other_ckey
lookup_path = f"{other_mode}_fallback"
cache_lookup_ms = _ms_since(t_lookup)
if cached is not None:
with transaction(conn):
now = int(time.time())
conn.execute(
"UPDATE providence_cache "
"SET hit_count = hit_count + 1, last_hit_at = ? "
"WHERE cache_key = ?",
(now, hit_ckey),
)
return {
"status": "cache_hit",
"audit_mode": cached["audit_mode"],
"cache_key": hit_ckey,
"lookup_path": lookup_path,
"source_root": document_root,
"answer_text": cached["answer_text"],
"merkle_proof": json.loads(cached["merkle_proof"]),
"n_quotes": cached["n_quotes"],
"n_verified": cached["n_verified"],
"verifier_method": cached["verifier_method"],
"unverified_quotes": (
json.loads(cached["unverified_quotes"])
if cached["unverified_quotes"]
else []
),
"partially_verified_quotes": [],
# Quantifier preflight (Ticket #000008 Phase 1+2). Pure
# on the question string, so cache hits re-classify
# cheaply and carry the same schema as miss-path rows.
"quantifier_intensity": quantifier["intensity"],
"quantifier_matched_token": quantifier["matched_token"],
"scope_bound_hint": quantifier["scope_bound_hint"],
"quantifier_explicit_count": quantifier["explicit_count"],
"claim_cap_applied": claim_cap_lookup,
# Ticket #000010 — meta-cognition QuestionState.
"question_state": question_state.to_dict(),
"timings": {
"cache_lookup_ms": cache_lookup_ms,
"llm_ms": None,
"total_ms": _ms_since(t_start),
},
}
t_llm = time.monotonic()
# JSON mode: pass structured-output extras under all three engine
# conventions (vLLM `guided_json`, llama.cpp `json_schema`, OpenAI-
# spec `response_format`) so the same call site enforces the schema
# whether the model is on vLLM, llama.cpp, or OpenAI-spec. Engines
# silently drop unknown keys. Pre-2026-05-19 this was vLLM-only
# (`guided_json` alone); llama.cpp Qwen got NO enforcement and
# relied on the lenient pre-parser to clean drift — the parse-
# tolerant fallback still handles whatever drift remains.
from arborist.qa.verify import claim_lattice_structured_output_extras
extra_body: dict | None = None
stop_seqs: list[str] | None = None
if answer_mode == "claim_lattice" and policy.get(
"claim_lattice_use_guided_json", True
):
extra_body = claim_lattice_structured_output_extras()
if answer_mode == "claim_lattice":
# JSON-mode token-runaway guard. On broad-descriptive /
# comparison questions Hermes-3-8B sometimes spams whitespace
# / newlines after the closing brace until max_tokens
# exhausts; the resulting truncated payload won't parse and
# the run lands UNGROUNDED 0/0 at 12-15s instead of 2-4s.
# Stopping on a blank line (\n\n) cuts the runaway —
# well-formed JSON-mode output never contains a blank line
# since the model emits a single object on one line (or
# with simple internal newlines).
stop_seqs = list(policy.get(
"claim_lattice_json_stop_sequences", ["\n\n"]
))
raw_answer = client.chat_completion(
messages,
model=model_id,
temperature=policy["temperature"],
max_tokens=policy["max_tokens"],
top_p=policy.get("top_p", 1.0),
extra_body=extra_body,
stop=stop_seqs,
)
llm_ms = _ms_since(t_llm)
repair_changes: list[dict] = []
pre_repair_verdict: dict | None = None
# Phase 3 of #000031: load the warrant-chain core_root set once
# from the conn's shards directory. The verifier consults this
# set to suppress WARRANT_MISSING when the cited chunk's document
# has a warrant-resolver derivation row (Merkle-bound primary-
# source backing). Empty set if the shard has no derivations
# rows yet — fully backward-compatible.
_warrant_chain_roots: frozenset[str] = frozenset()
try:
from pathlib import Path as _Path
from arborist.qa.warrant_chain import warrant_chain_lookup as _wcl
_db_path = conn.execute("PRAGMA database_list").fetchall()
# PRAGMA database_list rows: (seq, name, file). Main DB is
# the first row with name='main'.
_main_row = next((r for r in _db_path if r[1] == "main"), None)
if _main_row and _main_row[2]:
_warrant_chain_roots = _wcl(_Path(_main_row[2]).parent)
except Exception:
# Fail-closed: empty set means no suppression, behavior
# identical to pre-Phase-3.
_warrant_chain_roots = frozenset()
if answer_mode == "claim_lattice_pointer":
verdict = verify_claim_lattice(
raw_answer,
evidence_map,
allowed_source_roles=tuple(
policy.get(
"claim_lattice_allowed_source_roles",
[
"primary_answer_source",
"secondary_context_source",
"background_source",
"unclassified",
],
)
),
max_pointers_per_claim=int(policy.get(
"claim_lattice_max_pointers_per_claim", 2
)),
min_citation_coverage=float(policy.get(
"claim_lattice_min_citation_coverage", 0.30
)),
min_claim_content_tokens=int(policy.get(
"claim_lattice_min_claim_content_tokens", 3
)),
lazy_anchor_demote_threshold=float(policy.get(
"claim_lattice_lazy_anchor_demote_threshold", 0.5
)),
lazy_anchor_demote_min_pairs=int(policy.get(
"claim_lattice_lazy_anchor_demote_min_pairs", 3
)),
max_claims_per_answer=effective_max_claims,
subject_tokens_absent_threshold=int(policy.get(
"claim_lattice_subject_tokens_absent_threshold", 3
)),
question=question,
warrant_check_enabled=bool(policy.get(
"claim_lattice_warrant_check_enabled", True
)),
deflection_check_enabled=bool(policy.get(
"claim_lattice_deflection_check_enabled", True
)),
format_collapse_check_enabled=bool(policy.get(
"claim_lattice_format_collapse_check_enabled", True
)),
warrant_chain_roots=_warrant_chain_roots,
)
# Rendered prose (literal spans interpolated) is the user-facing
# answer text — never the model's raw pointer-line output. If
# rendering produced nothing (no valid claims), persist the raw
# output so an operator can see what the model actually said.
rendered = verdict["rendered_text"]
answer_text = rendered if rendered else raw_answer
elif answer_mode == "claim_lattice":
verdict = verify_claim_lattice_json(
raw_answer,
evidence_map,
allowed_source_roles=tuple(
policy.get(
"claim_lattice_allowed_source_roles",
[
"primary_answer_source",
"secondary_context_source",
"background_source",
"unclassified",
],
)
),
max_evidence_per_claim=int(policy.get(
"claim_lattice_max_pointers_per_claim", 2
)),
min_citation_coverage=float(policy.get(
"claim_lattice_min_citation_coverage", 0.30
)),
max_claims_per_answer=effective_max_claims,
subject_tokens_absent_threshold=int(policy.get(
"claim_lattice_subject_tokens_absent_threshold", 3
)),
question=question,
warrant_check_enabled=bool(policy.get(
"claim_lattice_warrant_check_enabled", True
)),
deflection_check_enabled=bool(policy.get(
"claim_lattice_deflection_check_enabled", True
)),
warrant_chain_roots=_warrant_chain_roots,
)
rendered = verdict["rendered_text"]
answer_text = rendered if rendered else raw_answer
else:
answer_text = raw_answer
verdict = verify_quotes(
answer_text,
document_text,
entity_policy=policy.get("entity_policy", "hybrid"),
proximity_n=policy.get("entity_proximity_n", 3),
proximity_window=policy.get("entity_proximity_window", 300),
)
def _verify(text: str) -> dict:
    """Re-run quote verification on *text* for the repair passes.

    Checks *text* against the enclosing run's ``document_text`` with
    ``verify_quotes``, reading the entity settings from ``policy`` at
    call time (defaults: ``"hybrid"``, ``3``, ``300``). Returns the
    verifier's verdict dict unchanged.
    """
    entity_settings = {
        "entity_policy": policy.get("entity_policy", "hybrid"),
        "proximity_n": policy.get("entity_proximity_n", 3),
        "proximity_window": policy.get("entity_proximity_window", 300),
    }
    return verify_quotes(text, document_text, **entity_settings)
if (
policy.get("repair_enabled")
and verdict["audit_mode"] != "STRICT"
and verdict.get("unverified_quotes")
):
repair_result = mechanical_repair(
answer_text, verdict["unverified_quotes"], document_text
)
if repair_result["changes"]:
new_verdict = _verify(repair_result["repaired_text"])
if new_verdict["n_verified"] >= verdict["n_verified"]:
pre_repair_verdict = verdict
answer_text = repair_result["repaired_text"]
verdict = new_verdict
repair_changes = list(repair_result["changes"])
max_reprompts = int(policy.get("repair_max_reprompts", 0))
for _ in range(max_reprompts):
if (
verdict["audit_mode"] == "STRICT"
or not verdict.get("unverified_quotes")
):
break
new_text = reprompt_repair(
chat_client=client,
model_id=model_id,
original_messages=messages,
original_answer=answer_text,
failed_quotes=verdict["unverified_quotes"],
policy=policy,
)
if not new_text:
break
new_verdict = _verify(new_text)
if new_verdict["n_verified"] > verdict["n_verified"]:
if pre_repair_verdict is None:
pre_repair_verdict = verdict
answer_text = new_text
verdict = new_verdict
repair_changes.append({
"action": "reprompt_rewrite",
"diagnosis": "model_feedback_loop",
})
else:
break
unverified_blob = (
json.dumps(verdict["unverified_quotes"], separators=(",", ":"))
if verdict["unverified_quotes"]
else None
)
leaves = [bytes.fromhex(r["leaf_hash"]) for r in chunk_rows]
tree = MerkleTree.build(leaves)
proof_obj = {
"document_root": document_root,
"chunk_0_proof": proof_to_dict(tree.proof(0)),
}
proof_blob = json.dumps(proof_obj, separators=(",", ":"))
# Per-run Merkle-DAG (see arborist/qa/dag.py). Single-doc shape:
# the only "source" is document_root. Quote mode: 7 stages base
# (8 with #000009 preflight). Pointer mode: 9 stages base (10 with
# preflight); context drops out and answer splits into raw_answer
# / parsed_claim_lattice / render.
ev_root = evidence_map_root(evidence_map) if evidence_map else None
parsed_lattice = None
is_lattice_mode = answer_mode in ("claim_lattice_pointer", "claim_lattice")
if is_lattice_mode:
# Per-claim list of {claim_text, content-addressed evidence_ids}
# for the parsed_claim_lattice node hash. Pointer ids are
# run-dependent; evidence_ids are content-addressed → the run-
# DAG hashes the run-stable form. Same shape for JSON and
# pointer; verifier already returns evidence_id_pairs.
evidence_id_pairs = verdict.get("evidence_id_pairs") or []
parsed_lattice = [
{
"claim_text": cs.get("text", ""),
"evidence_ids": evidence_id_pairs[i] if i < len(evidence_id_pairs) else [],
}
for i, cs in enumerate(verdict.get("claim_statuses") or [])
]
# Ticket #000009 — preflight node binding (mirror of query();
# nested CTI clauses per ticket §8 / 2026-05-04 feedback).
from arborist.qa.dag import preflight_node_hash
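# verifier_policy_hash + model_profile_hash are imported at module
# top; do NOT re-import them locally — a function-local import makes
# the name local to the whole function, so any use that runs before
# that import line raises UnboundLocalError.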
ghash_for_dag = verifier_policy_hash(policy)
claim_cap_actually_applied = (
claim_cap_lookup
if (quantifier_apply_caps
and quantifier_caps_mode_gated
and claim_cap_lookup is not None)
else None
)
reminder_eligible = (
quantifier_guard_on
and quantifier_mode_gated
and quantifier.get("is_broad", False)
)
reminder_enabled = bool(policy.get("quantifier_reminder_enabled", False))
reminder_injected = reminder_eligible and reminder_enabled
reminder_template_id = None
if reminder_injected:
reminder_template_id = (
"broad-quantifier-bounded-v1"
if quantifier.get("scope_bound_hint") == "bounded"
else "broad-quantifier-unbounded-v1"
)
# Build payload + hash separately so we can persist both into
# run_dag_blob (Ticket #000009 §7.2 — `arborist providence
# --show-preflight` renders the full clause set).
from arborist.qa.dag import (
_canonical_json as _runner_canon,
_sha256_hex as _runner_sha,
build_preflight_node_payload as _runner_build_payload,
)
_runner_preflight_payload = _runner_build_payload(
question_state=question_state.to_dict(),
quantifier=quantifier,
answer_contract={
"guard_enabled": quantifier_guard_on,
"mode_gated": quantifier_mode_gated,
"apply_caps_active": quantifier_apply_caps,
"apply_caps_mode_gated": quantifier_caps_mode_gated,
"claim_cap_resolved": claim_cap_lookup,
"claim_cap_applied": claim_cap_actually_applied,
"manual_quotes_allowed": False,
"evidence_pointer_required": is_lattice_mode,
"allow_unbounded_enumeration": False,
"reject_broad_active": bool(
policy.get("quantifier_reject_broad", False)
),
"metacognition_enabled": bool(
policy.get("metacognition_enabled", True)
),
"block_on_contradiction": bool(
policy.get("metacognition_block_on_contradiction", False)
),
},
prompt_contract={
"reminder_enabled": reminder_enabled,
"reminder_injected": reminder_injected,
"reminder_template_id": reminder_template_id,
},
evidence_contract={
"max_evidence_ids_exposed": int(policy.get(
"claim_lattice_max_pointers_per_claim", 2
)),
"one_claim_per_line": is_lattice_mode,
},
policy_refs={
"governance_policy_hash": ghash_for_dag,
"model_profile_hash": mhash,
"answer_mode": answer_mode,
},
)
preflight_hash = _runner_sha(_runner_canon(_runner_preflight_payload))
run_dag = build_run_dag(
question_hash=qhash,
sources=[{
"document_root": document_root,
"source_role": "primary_answer_source",
"score": None,
"chunk_idx": None,
}],
context_root=document_root,
conversation_hash=chash,
answer_text=answer_text,
audit_mode=verdict["audit_mode"],
verifier_method=verdict["verifier_method"],
n_quotes=verdict["n_quotes"],
n_verified=verdict["n_verified"],
claim_statuses=verdict.get("claim_statuses", []),
lookup_path="miss",
evidence_map_root=ev_root,
answer_mode=answer_mode if answer_mode != "quote" else None,
violations=verdict.get("violations"),
raw_answer_text=raw_answer if is_lattice_mode else None,
parsed_lattice=parsed_lattice,
rendered_text=answer_text if is_lattice_mode else None,
preflight_hash=preflight_hash,
preflight_payload=_runner_preflight_payload,
)
run_dag_blob = json.dumps(run_dag, separators=(",", ":"))
now = int(time.time())
with transaction(conn):
if repair_changes and pre_repair_verdict is not None:
append_audit(
conn,
event_type="providence_repair",
subject_root=ckey,
body={
"kind": "mechanical",
"n_changes": len(repair_changes),
"changes": repair_changes,
"pre_audit_mode": pre_repair_verdict["audit_mode"],
"post_audit_mode": verdict["audit_mode"],
"pre_n_verified": pre_repair_verdict["n_verified"],
"post_n_verified": verdict["n_verified"],
},
ts=now,
)
event_hash = append_audit(
conn,
event_type="providence_write",
subject_root=ckey,
body={
"source_root": document_root,
"model_id": model_id,
"revision": revision,
"quantization": quantization,
"chunks_in_context": len(chunk_rows),
"answer_chars": len(answer_text),
"audit_mode": verdict["audit_mode"],
"n_quotes": verdict["n_quotes"],
"n_verified": verdict["n_verified"],
"verifier_method": verdict["verifier_method"],
},
ts=now,
)
# Capital ledger (ticket #000020). Sibling table; advisory.
from arborist.capital import profile_for_op, record as capital_record
capital_profile, capital_inputs = profile_for_op(
"qa",
{
"cache_hit": False,
"answer_chars": len(answer_text),
"llm_seconds": llm_ms / 1000.0,
},
)
capital_record(
conn,
audit_event_hash=event_hash,
op_type="qa",
profile=capital_profile,
estimator_inputs=capital_inputs,
ts=now,
)
conn.execute(
# ON CONFLICT(cache_key) DO NOTHING: the cache lookup above runs
# outside this transaction, so two concurrent ask()s on the same
# cache_key can both miss and both reach this INSERT — the loser
# no-ops instead of raising UNIQUE constraint failed (its answer is
# equivalent: same question/model/policy ⇒ same cache_key).
"INSERT INTO providence_cache "
"(cache_key, source_root, document_uri, question_hash, question_text, "
" answer_text, merkle_proof, model_profile_hash, conversation_hash, "
" governance_policy_hash, schema_version, canonicalization_version, "
" chunking_version, falsification_state, chain, audit_event_hash, "
" created_at, hit_count, audit_mode, n_quotes, n_verified, "
" unverified_quotes, verifier_method, run_dag_root, run_dag_blob) "
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'live', ?, ?, ?, 0, "
" ?, ?, ?, ?, ?, ?, ?) "
"ON CONFLICT(cache_key) DO NOTHING",
(
ckey,
document_root,
doc["document_uri"],
qhash,
question,
answer_text,
proof_blob,
mhash,
chash,
ghash,
SCHEMA_VERSION,
CANONICALIZATION_VERSION,
doc["chunking_version"],
chain,
event_hash,
now,
verdict["audit_mode"],
verdict["n_quotes"],
verdict["n_verified"],
unverified_blob,
verdict["verifier_method"],
run_dag["root"],
run_dag_blob,
),
)
# #000037 Phase 2 — emit advisory controller_events for this QA
# cycle. Single-branch decision, byte-cheap. Wrapped so any
# controller-emit error never blocks the QA return. Audit-only.
try:
_emit_qa_controller_advisory(conn, ckey, verdict)
except Exception: # pragma: no cover — advisory must not break QA
pass
from arborist.qa.dag import localize_failure as _localize
failure_stage = _localize(
audit_mode=verdict["audit_mode"],
n_sources=1, # ask() runs against one document
n_quotes=verdict["n_quotes"],
n_verified=verdict["n_verified"],
)
return {
"status": "cache_miss_then_written",
"audit_mode": verdict["audit_mode"],
"cache_key": ckey,
"run_dag_root": run_dag["root"],
"lookup_path": "miss",
"failure_stage": failure_stage,
"repair_changes": repair_changes,
"pre_repair_audit_mode": (
pre_repair_verdict["audit_mode"] if pre_repair_verdict else None
),
"source_root": document_root,
"answer_text": answer_text,
"merkle_proof": proof_obj,
"n_quotes": verdict["n_quotes"],
"n_verified": verdict["n_verified"],
"verifier_method": verdict["verifier_method"],
"unverified_quotes": verdict["unverified_quotes"],
"partially_verified_quotes": verdict.get("partially_verified_quotes") or [],
# Quantifier preflight (Ticket #000008 Phase 1+2). See query.py
# for full rationale; runner.ask carries the same schema for
# CLI-side `arborist ask` parity with `arborist query`.
"quantifier_intensity": quantifier["intensity"],
"quantifier_matched_token": quantifier["matched_token"],
"scope_bound_hint": quantifier["scope_bound_hint"],
"quantifier_explicit_count": quantifier["explicit_count"],
"claim_cap_applied": claim_cap_lookup,
# Ticket #000010 — meta-cognition QuestionState.
"question_state": question_state.to_dict(),
# Sidecar smell signals (claim_lattice mode only) — render-
# layer; never persisted, never in run_dag_root.
"pointer_id_distribution": verdict.get("pointer_id_distribution"),
"lazy_anchor_ratio": verdict.get("lazy_anchor_ratio"),
"timings": {
"cache_lookup_ms": cache_lookup_ms,
"llm_ms": llm_ms,
"total_ms": _ms_since(t_start),
},
}