"""Pure quantifier preflight classifier — Ticket #000008 Phase 1.
Maps a question string onto the ten-rung intensity ladder defined in
``docs/tickets/ticket-000008-broad-quantifier-preflight-guard.md`` §2.
The classifier exists to estimate **expected number of claims in the
answer** and **format-discipline risk on small models**. It is not
formal-semantics quantifier theory; the operational axis is what
matters.
Pure function. No I/O. No model call. No retrieval call. Folds into
``governance_policy_hash`` via ``classifier_version`` (added to
``arborist.qa.keys._VERIFIER_POLICY_FIELDS`` in Phase 2).
Intensity rungs (highest wins for multi-quantifier questions)::
1. ABSENT universal-negation, single-claim shape
2. SINGULAR one-fact wh / definite reference
3. PROPORTIONAL descriptive fraction (`most`, `half`)
4. SMALL_NUM_EXPLICIT bounded by digit/word (`top 3`, `seven X`)
5. COMPARATIVE_BOUND bounded by inequality (`at least 5`)
6. FEW small set, vague (`some`, `a few`)
7. MANY medium set, vague (`many`, `numerous`)
8. ALL universal quantifier (`all`, `every`)
9. COMPREHENSIVE exhaustive request (`complete list of`,
`tell me everything`)
10. OPEN_REQUEST verb-driven enumeration (`tell me about`,
`describe`, `explain`)
Returns a dict with::
intensity one of the ten rungs (or "SINGULAR" by default)
matched_token the lexical surface form that triggered the rung
explicit_count int when SMALL_NUM_EXPLICIT or COMPARATIVE_BOUND;
None otherwise
is_broad True for ALL / COMPREHENSIVE / OPEN_REQUEST
operational_shape mnemonic for downstream policy (e.g.
"universal_enumeration", "exhaustive_request")
scope_bound_hint "bounded" | "unbounded" | "unknown"
(see ticket §10.1 -- bounded != unbounded
universals; "unknown" for non-broad intensities,
"bounded" when a broad intensity comes with a
domain anchor, "unbounded" when it has none)
classifier_version the ``CLASSIFIER_VERSION`` string
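Highest-intensity-wins arbitration: when a question contains
overlapping markers (e.g. "tell me about all the planets"), pick the
rung farther from SINGULAR. The order in ``_RUNG_PRIORITY`` codifies
this — later rungs win. Note that ``_RUNG_PRIORITY`` is the arbitration
order, not the ladder numbering above: there ABSENT sits between MANY
and ALL, and COMPREHENSIVE outranks OPEN_REQUEST.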
"""
from __future__ import annotations
import re
# Version tag stamped into every result dict under "classifier_version".
CLASSIFIER_VERSION = "quantifier-v0.1"
# ---------------------------------------------------------------- intensities
# Arbitration order, weakest first: when several rungs match one question,
# the classifier keeps the match whose rung appears latest in this tuple.
_RUNG_PRIORITY = (
    "SINGULAR",
    "PROPORTIONAL",
    "SMALL_NUM_EXPLICIT",
    "COMPARATIVE_BOUND",
    "FEW",
    "MANY",
    "ABSENT",
    "ALL",
    # OPEN_REQUEST sits below COMPREHENSIVE: "tell me everything
    # about X" is BOTH OPEN_REQUEST-shape ("tell me about") AND
    # COMPREHENSIVE-shape ("everything"). COMPREHENSIVE wins because
    # it carries the explicit exhaustive request — see §2.2 of
    # ticket #000008 ("COMPREHENSIVE strictly stronger than ALL").
    # On the cap table, COMPREHENSIVE caps higher than OPEN_REQUEST
    # for large models because the operator is asking for depth
    # explicitly; both clamp at 5 on small models.
    "OPEN_REQUEST",
    "COMPREHENSIVE",
)
# Rungs reported with ``is_broad=True``. They are also the only rungs for
# which ``_scope_bound_hint`` returns anything other than "unknown".
_BROAD_RUNGS = frozenset({"ALL", "COMPREHENSIVE", "OPEN_REQUEST"})
# Lexical patterns per rung. Patterns are compiled with re.IGNORECASE
# at module load. Order within a rung affects only which surface form
# is reported as matched_token (the first matching pattern wins); rung
# selection itself is by priority (later in _RUNG_PRIORITY = higher
# intensity, see classify_question_quantifier).
# OPEN_REQUEST — verb-driven enumeration without explicit quantifier.
# Entries are uncompiled regex sources; ``_OPEN_REQUEST_RE`` holds the
# compiled, case-insensitive forms.
_OPEN_REQUEST_PATTERNS = [
    r"\btell me (?:about|all about)\b",
    # Also listed under COMPREHENSIVE, which sits later in _RUNG_PRIORITY,
    # so a question containing this phrase is arbitrated to COMPREHENSIVE.
    r"\btell me everything\b",
    r"\bdescribe\b",
    r"\bexplain\b",
    r"\bsummari[sz]e\b",  # both "summarise" and "summarize"
    r"\bgive me (?:an? )?(?:overview|summary)\b",
    r"\bwalk me through\b",
    r"\bdiscuss\b",
    r"\belaborate on\b",
    r"\bexpound on\b",
    r"\bwhat about\b",
]
# COMPREHENSIVE — exhaustive request, strictly stronger than ALL.
# Entries are uncompiled regex sources; ``_COMPREHENSIVE_RE`` holds the
# compiled, case-insensitive forms.
_COMPREHENSIVE_PATTERNS = [
    r"\bcomprehensive\b",
    r"\bcomplete (?:list|inventory|set|enumeration|account)\b",
    r"\bexhaustive\b",
    r"\bdefinitive\b",
    r"\beverything (?:you know|there is)\b",
    r"\btell me everything\b",
    # "the whole ..." with one of these nouns; the bare "the whole" is an
    # ALL pattern, which COMPREHENSIVE outranks.
    r"\bthe whole (?:story|picture|thing)\b",
    r"\bthe full (?:story|picture|account)\b",
    r"\bfrom a to z\b",
    r"\ball there is to know\b",
]
# ALL — universal quantifier.
# NOTE(review): the lookbehind guards below change which rung "not many"
# and "hardly any" questions land on (FEW instead of MANY / ALL) —
# consider bumping CLASSIFIER_VERSION alongside this change.
_ALL_PATTERNS = [
    r"\ball\b",
    r"\bevery\b",
    # No trailing space required, so "each" before punctuation or at the
    # end of the question still matches.
    r"\beach(?: and every)?\b",
    r"\bevery single\b",
    r"\bthe whole\b",
    r"\bthe entirety of\b",
    r"\bthe totality of\b",
    # Universal use ("any X is Y"). "hardly any" is a FEW phrase; without
    # the guard ALL would also match and always outrank FEW.
    r"\b(?<!hardly )any\b",
    r"\bwhatever\b",
    r"\bwhoever\b",
]
# MANY — medium set, vague.
_MANY_PATTERNS = [
    # "not many" is a FEW phrase; without the guard MANY would also match
    # and always outrank FEW.
    r"\b(?<!not )many\b",
    r"\bvarious\b",
    r"\bmultiple\b",
    r"\bnumerous\b",
    r"\ba number of\b",
    r"\blots of\b",
    r"\bplenty of\b",
    r"\ba great many\b",
    r"\bmultitudes\b",
    r"\bseveral dozen\b",
]
# FEW — small set, vague.
_FEW_PATTERNS = [
    r"\bsome\b",
    r"\ba few\b",
    r"\bseveral\b",
    r"\ba couple\b",
    r"\ba handful\b",
    r"\ba small number of\b",
    r"\ba smattering of\b",
    # These two only classify as FEW because the MANY / ALL patterns
    # above refuse to match "many" after "not " and "any" after "hardly ".
    r"\bnot many\b",
    r"\bhardly any\b",
]
# SMALL_NUM_EXPLICIT — bounded by digit or numeric word.
# Matched token reports the count phrase; explicit_count populated
# from the digit / lexicon below.
#
# Three alternatives, one capture group each ("top N", "N <superlative>",
# "first/last/top N"), so exactly one group is non-None on a match. The
# third alternative repeats "top", which the first already covers.
_SMALL_NUM_DIGIT = re.compile(
    r"\btop (\d+)\b|\b(\d+) (?:biggest|smallest|largest|most|best|worst)\b"
    r"|\b(?:first|last|top) (\d+)\b",
    re.IGNORECASE,
)
# Number word -> count. "dozen" is treated as 12.
_NUMBER_WORDS = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
    "eleven": 11, "twelve": 12, "dozen": 12,
}
# One uncompiled regex source per number word, in ``_NUMBER_WORDS``
# iteration order, each allowing an optional "top "/"first "/"last " prefix.
_SMALL_NUM_WORD_PATTERNS = [
    rf"\b(?:top |first |last )?({w})\b" for w in _NUMBER_WORDS
]
# Fixed phrases the classifier maps to explicit counts (2 and 5).
_PAIR_OF = re.compile(r"\bpair of\b", re.IGNORECASE)
_HANDFUL_OF = re.compile(r"\ba handful of\b", re.IGNORECASE)
# COMPARATIVE_BOUND — a count bounded by an inequality phrase:
# at least / at most / (no) more than / fewer than / less than /
# under / over / up to / between A and B.
# The classifier stops at the first pattern that hits, so list order
# decides which phrase is reported ("no more than" precedes "more than").
_COMPARATIVE_PATTERNS = [
    re.compile(source, re.IGNORECASE)
    for source in (
        r"\bat least (\d+)\b",
        r"\bat most (\d+)\b",
        r"\bno more than (\d+)\b",
        r"\bmore than (\d+)\b",
        r"\bfewer than (\d+)\b",
        r"\bless than (\d+)\b",
        r"\bunder (\d+)\b",
        r"\bover (\d+)\b",
        r"\bup to (\d+)\b",
        # Two capture groups: lower and upper bound.
        r"\bbetween (\d+) and (\d+)\b",
    )
]
# PROPORTIONAL — descriptive fraction.
# Entries are uncompiled regex sources; ``_PROPORTIONAL_RE`` holds the
# compiled, case-insensitive forms.
_PROPORTIONAL_PATTERNS = [
    r"\bmost\b",
    r"\bmajority of\b",
    r"\bminority of\b",
    r"\bhalf (?:of|the)\b",
    r"\ba third of\b",
    r"\ba quarter of\b",
    r"\b\d+%\s*of\b",  # e.g. "40% of", "40 %of" is not matched
    r"\bthe bulk of\b",
    r"\bthe lion's share of\b",
]
# Count-question short-circuit. `how many` / `how much` at the
# start of a question (optionally after a leading "and", "but" or
# "so", e.g. "and how many...") is asking for a single numeric
# answer, not enumeration. Without this short-circuit the `many`
# pattern in ``_MANY_PATTERNS`` misfires.
#
# Anchored so we only short-circuit when the question is a count-
# question SHAPE — `how many` further into the question (e.g.
# "list the states; how many are there?") doesn't take precedence
# over the rest of the question's quantifier markers.
_COUNT_QUESTION_RE = re.compile(
    r"^\s*(?:and\s+|but\s+|so\s+)?how (?:many|much)\b",
    re.IGNORECASE,
)
# ABSENT — universal-negation.
# Entries are uncompiled regex sources; ``_ABSENT_RE`` holds the
# compiled, case-insensitive forms.
_ABSENT_PATTERNS = [
    r"\bnone\b",
    r"\bno (?:one|body|where)\b",  # spaced forms: "no one", "no body", "no where"
    r"\bnothing\b",
    r"\bnobody\b",
    r"\bnowhere\b",
    r"\bneither\b",
    r"\bnever\b",
    r"\bnot a single\b",
    r"\bzero\b",
    # Negative wh-shape: "which X is/are/do/does/did/has/have not Y".
    # Catches both copular ("which X is not Y") and auxiliary
    # ("which X do not Y") forms — both express universal negation
    # over the X universe.
    # NOTE(review): only the "which" form also accepts "were"/"was";
    # the "who"/"what" forms do not — confirm whether that is intended.
    r"\bwhich \w+ (?:is |are |do |does |did |has |have |were |was )?not\b",
    r"\bwho (?:is |are |do |does |did |has |have )?not\b",
    r"\bwhat (?:is |are |do |does |did |has |have )?not\b",
]
# Compile each rung's pattern sources once, at import time.
def _compile_rung(sources: list[str]) -> list[re.Pattern]:
    """Compile every regex source in *sources* with ``re.IGNORECASE``."""
    return [re.compile(source, re.IGNORECASE) for source in sources]


_OPEN_REQUEST_RE = _compile_rung(_OPEN_REQUEST_PATTERNS)
_COMPREHENSIVE_RE = _compile_rung(_COMPREHENSIVE_PATTERNS)
_ALL_RE = _compile_rung(_ALL_PATTERNS)
_MANY_RE = _compile_rung(_MANY_PATTERNS)
_FEW_RE = _compile_rung(_FEW_PATTERNS)
_PROPORTIONAL_RE = _compile_rung(_PROPORTIONAL_PATTERNS)
_ABSENT_RE = _compile_rung(_ABSENT_PATTERNS)
# ---------------------------------------------------------------- shape mnemonics
# Rung name -> ``operational_shape`` string returned in the result dict.
# The classifier indexes this dict directly with the winning rung, so
# every rung it can return needs an entry here.
_OPERATIONAL_SHAPE = {
    "ABSENT": "universal_negation",
    "SINGULAR": "single_fact",
    "PROPORTIONAL": "descriptive_fraction",
    "SMALL_NUM_EXPLICIT": "bounded_count",
    "COMPARATIVE_BOUND": "bounded_inequality",
    "FEW": "small_set_vague",
    "MANY": "medium_set_vague",
    "ALL": "universal_enumeration",
    "COMPREHENSIVE": "exhaustive_request",
    "OPEN_REQUEST": "verb_driven_enumeration",
}
# ---------------------------------------------------------------- scope hint
# Lexical anchors that hint a bounded universe. ``_scope_bound_hint``
# consults them only for broad intensities: any hit yields "bounded",
# no hit yields "unbounded". This is a heuristic; the corpus arity
# check (§10.1) is left for a future refinement.
# NOTE(review): the two widened anchors below change scope_bound_hint
# for some questions — consider bumping CLASSIFIER_VERSION.
_BOUNDED_DOMAIN_ANCHORS = (
    re.compile(r"\bthe (?:beatles|fab four)\b", re.IGNORECASE),
    # Singular forms are accepted too: "every"/"each" take a singular
    # noun ("every US state").
    re.compile(
        r"\b(?:US|U\.S\.|united states) (?:states?|presidents?)\b",
        re.IGNORECASE,
    ),
    # "planets" anchors on its own (end of question, before punctuation,
    # or followed by any word); the solar-system suffix is optional.
    re.compile(
        r"\bplanets\b(?: in (?:the|our) solar system\b)?",
        re.IGNORECASE,
    ),
    re.compile(r"\bfounding fathers\b", re.IGNORECASE),
    re.compile(r"\bcontinents\b", re.IGNORECASE),
    re.compile(r"\boceans\b", re.IGNORECASE),
    # Year/season-anchored questions tend to bound the universe to one
    # event — "winners of the 2024 World Series" is bounded.
    re.compile(r"\b(?:19|20)\d{2}\b"),
    re.compile(
        r"\b(?:season|year|championship|tournament|event) (?:of|for)\b",
        re.IGNORECASE,
    ),
)
def _scope_bound_hint(question: str, intensity: str) -> str:
    """Guess whether the answer universe of ``question`` is bounded.

    A bounded universal (``"all members of the Beatles"``, ``"all the US
    states"``) has a finite, corpus-known answer set; an unbounded one
    (``"winners of all major sports"``) has no defined set under the
    current scope.

    Returns:
        ``"unknown"`` when ``intensity`` is not one of ``_BROAD_RUNGS``
        (there is no universal pressure to gate); otherwise ``"bounded"``
        if any ``_BOUNDED_DOMAIN_ANCHORS`` pattern is found in
        ``question``, else ``"unbounded"``.

    Heuristic only. The corpus arity check (§10.1 future refinement)
    will sharpen this when implemented.
    """
    if intensity in _BROAD_RUNGS:
        has_anchor = any(rx.search(question) for rx in _BOUNDED_DOMAIN_ANCHORS)
        return "bounded" if has_anchor else "unbounded"
    return "unknown"
# ---------------------------------------------------------------- classifier
# Public entry point.
def _quantifier_result(
    question: str,
    intensity: str,
    matched_token: str | None,
    explicit_count: int | None,
) -> dict:
    """Assemble the result dict shared by every return path."""
    return {
        "intensity": intensity,
        "matched_token": matched_token,
        "explicit_count": explicit_count,
        "is_broad": intensity in _BROAD_RUNGS,
        "operational_shape": _OPERATIONAL_SHAPE[intensity],
        # Always "unknown" for non-broad rungs such as SINGULAR.
        "scope_bound_hint": _scope_bound_hint(question, intensity),
        "classifier_version": CLASSIFIER_VERSION,
    }


def _quantifier_candidates(question: str) -> list[tuple[str, str, int | None]]:
    """Collect ``(rung, matched_token, explicit_count)`` for each rung that fires."""
    candidates: list[tuple[str, str, int | None]] = []

    # Purely lexical rungs: at most one candidate per rung, taken from the
    # first pattern in the rung's list that matches.
    lexical_rungs = (
        ("ABSENT", _ABSENT_RE),
        ("PROPORTIONAL", _PROPORTIONAL_RE),
        ("FEW", _FEW_RE),
        ("MANY", _MANY_RE),
        ("ALL", _ALL_RE),
        ("COMPREHENSIVE", _COMPREHENSIVE_RE),
        ("OPEN_REQUEST", _OPEN_REQUEST_RE),
    )
    for rung, patterns in lexical_rungs:
        for pattern in patterns:
            hit = pattern.search(question)
            if hit:
                candidates.append((rung, hit.group(0), None))
                break

    # SMALL_NUM_EXPLICIT — explicit digit, else a number word.
    digit_hit = _SMALL_NUM_DIGIT.search(question)
    if digit_hit:
        # Exactly one alternative matched; its group carries the count.
        count = next(
            (int(g) for g in digit_hit.groups() if g is not None),
            None,
        )
        candidates.append(("SMALL_NUM_EXPLICIT", digit_hit.group(0), count))
    else:
        # Words are tried in lexicon order (not order of appearance in the
        # question); the bare word is reported as the token.
        for word, count in _NUMBER_WORDS.items():
            if re.search(
                rf"\b(?:top |first |last )?{word}\b", question, re.IGNORECASE
            ):
                candidates.append(("SMALL_NUM_EXPLICIT", word, count))
                break

    # `pair of` → 2; `a handful of` → 5.
    # NOTE(review): "a handful of" also matches the FEW pattern
    # "a handful", and FEW outranks SMALL_NUM_EXPLICIT in _RUNG_PRIORITY,
    # so the count of 5 recorded here never reaches the result — confirm
    # whether that is intended.
    if _PAIR_OF.search(question):
        candidates.append(("SMALL_NUM_EXPLICIT", "pair of", 2))
    if _HANDFUL_OF.search(question):
        candidates.append(("SMALL_NUM_EXPLICIT", "a handful of", 5))

    # COMPARATIVE_BOUND — explicit inequality; first matching pattern wins.
    for pattern in _COMPARATIVE_PATTERNS:
        hit = pattern.search(question)
        if hit:
            # `between A and B` → use the upper bound; otherwise the
            # single captured number.
            bounds = [int(g) for g in hit.groups() if g is not None]
            count = max(bounds) if bounds else None
            candidates.append(("COMPARATIVE_BOUND", hit.group(0), count))
            break

    return candidates


def classify_question_quantifier(question: str) -> dict:
    """Classify ``question`` onto the ten-rung intensity ladder.

    Highest-intensity-wins arbitration: when multiple rungs match,
    the one latest in ``_RUNG_PRIORITY`` is kept. Operationally that
    means "tell me about all the planets" classifies OPEN_REQUEST
    (later in the priority order than ALL), even though ALL also
    matched. The downstream cap is the broader rung's cap, which is
    what we want under enumeration pressure.

    Args:
        question: The raw question text.

    Returns:
        A dict with the keys ``intensity``, ``matched_token``,
        ``explicit_count``, ``is_broad``, ``operational_shape``,
        ``scope_bound_hint`` and ``classifier_version``.
        ``explicit_count`` is only populated when the winning rung is
        SMALL_NUM_EXPLICIT or COMPARATIVE_BOUND.

    Falsy or whitespace-only questions, and questions with no quantifier
    marker at all, classify SINGULAR with ``matched_token`` set to None.
    """
    if not question or not question.strip():
        return _quantifier_result(question, "SINGULAR", None, None)

    # Count-question short-circuit (caught by 2026-05-03 dry-run
    # review across bench/qa_questions.txt). `how many X?` and
    # `how much X?` ask for a SINGLE numeric answer ("50 states",
    # "206 bones") — not enumeration. Without it the `many` pattern
    # in _MANY_PATTERNS would put the question on the MANY rung.
    # Other measurement shapes (`how often`, `how long`, `how big`)
    # carry no quantifier marker and fall through to SINGULAR anyway.
    count_question = _COUNT_QUESTION_RE.search(question)
    if count_question:
        return _quantifier_result(
            question, "SINGULAR", count_question.group(0), None
        )

    candidates = _quantifier_candidates(question)
    if not candidates:
        # Default — SINGULAR for any wh-question or definite reference
        # without quantifier markers.
        return _quantifier_result(question, "SINGULAR", None, None)

    # Highest-intensity wins (later in _RUNG_PRIORITY = higher). ``max``
    # returns the first maximal item, so among candidates of the same
    # rung the one detected first is kept.
    rank = _RUNG_PRIORITY.index
    intensity, matched_token, explicit_count = max(
        candidates, key=lambda candidate: rank(candidate[0])
    )
    return _quantifier_result(question, intensity, matched_token, explicit_count)