# Source code for arborist.qa.quantifier

"""Pure quantifier preflight classifier — Ticket #000008 Phase 1.

Maps a question string onto the ten-rung intensity ladder defined in
``docs/tickets/ticket-000008-broad-quantifier-preflight-guard.md`` §2.
The classifier exists to estimate **expected number of claims in the
answer** and **format-discipline risk on small models**. It is not
formal-semantics quantifier theory; the operational axis is what
matters.

Pure function. No I/O. No model call. No retrieval call. Folds into
``governance_policy_hash`` via ``classifier_version`` (added to
``arborist.qa.keys._VERIFIER_POLICY_FIELDS`` in Phase 2).

Intensity rungs (highest wins for multi-quantifier questions)::

    1.  ABSENT             universal-negation, single-claim shape
    2.  SINGULAR           one-fact wh / definite reference
    3.  PROPORTIONAL       descriptive fraction (`most`, `half`)
    4.  SMALL_NUM_EXPLICIT bounded by digit/word (`top 3`, `seven X`)
    5.  COMPARATIVE_BOUND  bounded by inequality (`at least 5`)
    6.  FEW                small set, vague (`some`, `a few`)
    7.  MANY               medium set, vague (`many`, `numerous`)
    8.  ALL                universal quantifier (`all`, `every`)
    9.  COMPREHENSIVE      exhaustive request (`complete list of`,
                           `tell me everything`)
    10. OPEN_REQUEST       verb-driven enumeration (`tell me about`,
                           `describe`, `explain`)

Returns a dict with::

    intensity              one of the ten rungs (or "SINGULAR" by default)
    matched_token          the lexical surface form that triggered the rung
    explicit_count         int when SMALL_NUM_EXPLICIT or COMPARATIVE_BOUND;
                           None otherwise
    is_broad               True for ALL / COMPREHENSIVE / OPEN_REQUEST
    operational_shape      mnemonic for downstream policy (e.g.
                           "universal_enumeration", "exhaustive_request")
    scope_bound_hint       "bounded" | "unbounded" | "unknown"
                           (see ticket §10.1 -- bounded != unbounded
                           universals; "unknown" for every non-broad
                           intensity, "bounded" when a broad intensity
                           co-occurs with a domain anchor, "unbounded"
                           when it does not)
    classifier_version     the ``CLASSIFIER_VERSION`` string

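Highest-intensity-wins arbitration: when a question contains
overlapping markers (e.g. "tell me about all the planets"), pick the
rung farther from SINGULAR. The order in ``_RUNG_PRIORITY`` codifies
this — later rungs win. That tuple, not the numbering above, is the
arbitration order: it places ABSENT between MANY and ALL and ranks
COMPREHENSIVE above OPEN_REQUEST.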
"""

from __future__ import annotations

import re

# Version tag copied into every classification result under the
# "classifier_version" key.
CLASSIFIER_VERSION = "quantifier-v0.1"

# ---------------------------------------------------------------- intensities

# Arbitration order, lowest priority first. When several rungs match the
# same question, the classifier keeps the rung with the highest index in
# this tuple. Note that ABSENT is ranked between MANY and ALL here: it
# outranks the bounded and vague rungs but loses to every broad rung.
_RUNG_PRIORITY = (
    "SINGULAR",
    "PROPORTIONAL",
    "SMALL_NUM_EXPLICIT",
    "COMPARATIVE_BOUND",
    "FEW",
    "MANY",
    "ABSENT",
    "ALL",
    # OPEN_REQUEST sits below COMPREHENSIVE: "tell me everything
    # about X" is BOTH OPEN_REQUEST-shape ("tell me about") AND
    # COMPREHENSIVE-shape ("everything"). COMPREHENSIVE wins because
    # it carries the explicit exhaustive request — see §2.2 of
    # ticket #000008 ("COMPREHENSIVE strictly stronger than ALL").
    # On the cap table, COMPREHENSIVE caps higher than OPEN_REQUEST
    # for large models because the operator is asking for depth
    # explicitly; both clamp at 5 on small models.
    "OPEN_REQUEST",
    "COMPREHENSIVE",
)

# Rungs reported with is_broad=True. Only these receive a "bounded" /
# "unbounded" scope hint; every other rung reports "unknown".
_BROAD_RUNGS = frozenset({"ALL", "COMPREHENSIVE", "OPEN_REQUEST"})


# Lexical patterns per rung. Each list of pattern strings is compiled
# with re.IGNORECASE at module load (see the *_RE lists further down).
# Within a rung, the first pattern in list order that matches supplies
# matched_token, so list order only affects the reported token. Which
# rung wins is decided by _RUNG_PRIORITY (later rung = higher priority,
# see classify_question_quantifier).

# OPEN_REQUEST — verb-driven enumeration without explicit quantifier.
_OPEN_REQUEST_PATTERNS = [
    r"\btell me (?:about|all about)\b",
    # Also listed under COMPREHENSIVE, which outranks this rung in
    # _RUNG_PRIORITY, so a question containing it classifies
    # COMPREHENSIVE.
    r"\btell me everything\b",
    r"\bdescribe\b",
    r"\bexplain\b",
    r"\bsummari[sz]e\b",
    r"\bgive me (?:an? )?(?:overview|summary)\b",
    r"\bwalk me through\b",
    r"\bdiscuss\b",
    r"\belaborate on\b",
    r"\bexpound on\b",
    r"\bwhat about\b",
]

# COMPREHENSIVE — exhaustive request, strictly stronger than ALL.
# This is the last entry of _RUNG_PRIORITY, so a hit here wins over
# every other rung that matched the same question.
_COMPREHENSIVE_PATTERNS = [
    r"\bcomprehensive\b",
    r"\bcomplete (?:list|inventory|set|enumeration|account)\b",
    r"\bexhaustive\b",
    r"\bdefinitive\b",
    r"\beverything (?:you know|there is)\b",
    # Shared with _OPEN_REQUEST_PATTERNS; COMPREHENSIVE takes precedence.
    r"\btell me everything\b",
    # "the whole ..." phrases also hit the bare `\bthe whole\b` entry in
    # _ALL_PATTERNS; this rung outranks ALL.
    r"\bthe whole (?:story|picture|thing)\b",
    r"\bthe full (?:story|picture|account)\b",
    r"\bfrom a to z\b",
    r"\ball there is to know\b",
]

# ALL — universal quantifier.
# Within this list the first pattern that matches supplies matched_token
# (so e.g. "every single" is reported as "every").
# NOTE(review): the `each` pattern below now also matches a bare "each"
# before punctuation / at end of input; confirm whether that warrants a
# CLASSIFIER_VERSION bump.
_ALL_PATTERNS = [
    r"\ball\b",
    r"\bevery\b",
    # The optional " and every" tail carries its own leading space, so a
    # bare "each" still matches when followed by punctuation or the end
    # of the question ("... of each?"), and the reported token has no
    # trailing blank.
    r"\beach(?: and every)?\b",
    r"\bevery single\b",
    r"\bthe whole\b",
    r"\bthe entirety of\b",
    r"\bthe totality of\b",
    r"\bany\b",            # universal use ("any X is Y")
    r"\bwhatever\b",
    r"\bwhoever\b",
]

# MANY — medium set, vague.
# Questions that open with `how many` / `how much` are intercepted by
# _COUNT_QUESTION_RE before this list is consulted.
_MANY_PATTERNS = [
    # Listed first, so any phrase containing the word "many" (including
    # "a great many") is reported with matched_token "many".
    r"\bmany\b",
    r"\bvarious\b",
    r"\bmultiple\b",
    r"\bnumerous\b",
    r"\ba number of\b",
    r"\blots of\b",
    r"\bplenty of\b",
    r"\ba great many\b",
    r"\bmultitudes\b",
    # Also hits `\bseveral\b` in _FEW_PATTERNS; MANY outranks FEW in
    # _RUNG_PRIORITY, so the question classifies MANY.
    r"\bseveral dozen\b",
]

# FEW — small set, vague.
# NOTE(review): two entries here are always outranked by a higher rung
# that matches the same words — "not many" also hits `\bmany\b` (MANY)
# and "hardly any" also hits `\bany\b` (ALL) — so questions containing
# them do not classify FEW. Confirm whether that is intended.
_FEW_PATTERNS = [
    r"\bsome\b",
    r"\ba few\b",
    r"\bseveral\b",
    r"\ba couple\b",
    # Matches whenever "a handful of" does; FEW outranks
    # SMALL_NUM_EXPLICIT in _RUNG_PRIORITY.
    r"\ba handful\b",
    r"\ba small number of\b",
    r"\ba smattering of\b",
    r"\bnot many\b",
    r"\bhardly any\b",
]

# SMALL_NUM_EXPLICIT — bounded by digit or numeric word.
# explicit_count is populated from the captured digits or from the
# lexicon below. On the digit path matched_token is the whole matched
# phrase (e.g. "top 3"); on the number-word path the classifier reports
# the lexicon word itself.

# Three alternatives, each with one digit capture group; the classifier
# takes the first group that is not None as the count.
_SMALL_NUM_DIGIT = re.compile(
    r"\btop (\d+)\b|\b(\d+) (?:biggest|smallest|largest|most|best|worst)\b"
    r"|\b(?:first|last|top) (\d+)\b",
    re.IGNORECASE,
)
# Number word -> count. Iteration order is the order the classifier
# tries the words in (first hit wins).
_NUMBER_WORDS = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
    "eleven": 11, "twelve": 12, "dozen": 12,
}
# One uncompiled pattern string per number word.
# NOTE(review): nothing in the visible part of this file reads this list;
# classify_question_quantifier builds an equivalent pattern inline.
_SMALL_NUM_WORD_PATTERNS = [
    rf"\b(?:top |first |last )?({w})\b" for w in _NUMBER_WORDS
]
# Fixed-count idioms: the classifier records "pair of" with count 2 and
# "a handful of" with count 5.
_PAIR_OF = re.compile(r"\bpair of\b", re.IGNORECASE)
_HANDFUL_OF = re.compile(r"\ba handful of\b", re.IGNORECASE)

# COMPARATIVE_BOUND — bounded by inequality.
# at_least / at_most / more_than / fewer_than / under / over /
# up_to / between A and B.
# The classifier walks this list in order and stops at the first
# pattern that matches, so "no more than" is listed before "more than"
# to have the longer phrase reported as matched_token. explicit_count is
# the largest captured number (the upper bound for "between A and B").
_COMPARATIVE_PATTERNS = [
    re.compile(r"\bat least (\d+)\b", re.IGNORECASE),
    re.compile(r"\bat most (\d+)\b", re.IGNORECASE),
    re.compile(r"\bno more than (\d+)\b", re.IGNORECASE),
    re.compile(r"\bmore than (\d+)\b", re.IGNORECASE),
    re.compile(r"\bfewer than (\d+)\b", re.IGNORECASE),
    re.compile(r"\bless than (\d+)\b", re.IGNORECASE),
    re.compile(r"\bunder (\d+)\b", re.IGNORECASE),
    re.compile(r"\bover (\d+)\b", re.IGNORECASE),
    re.compile(r"\bup to (\d+)\b", re.IGNORECASE),
    re.compile(r"\bbetween (\d+) and (\d+)\b", re.IGNORECASE),
]

# PROPORTIONAL — descriptive fraction.
# Second-lowest entry in _RUNG_PRIORITY: any other matching rung except
# the SINGULAR default takes precedence over a hit from this list.
_PROPORTIONAL_PATTERNS = [
    r"\bmost\b",
    r"\bmajority of\b",
    r"\bminority of\b",
    r"\bhalf (?:of|the)\b",
    r"\ba third of\b",
    r"\ba quarter of\b",
    r"\b\d+%\s*of\b",       # e.g. "30% of", "30 %of" is not matched
    r"\bthe bulk of\b",
    r"\bthe lion's share of\b",
]

# Count-question short-circuit. `how many` / `how much` at the
# start of a question (optionally after leading whitespace and one
# leading conjunction: "and", "but" or "so", as in "and how many...")
# is asking for a single numeric answer, not enumeration. Without this
# short-circuit the bare `\bmany\b` pattern in _MANY_PATTERNS misfires.
#
# Anchored so we only short-circuit when the question is a count-
# question SHAPE — `how many` further into the question (e.g.
# "list the states; how many are there?") doesn't take precedence
# over the rest of the question's quantifier markers.
_COUNT_QUESTION_RE = re.compile(
    r"^\s*(?:and\s+|but\s+|so\s+)?how (?:many|much)\b",
    re.IGNORECASE,
)


# ABSENT — universal-negation.
# Ranked between MANY and ALL in _RUNG_PRIORITY, so a broad marker in
# the same question overrides a hit from this list.
_ABSENT_PATTERNS = [
    r"\bnone\b",
    r"\bno (?:one|body|where)\b",
    r"\bnothing\b",
    r"\bnobody\b",
    r"\bnowhere\b",
    r"\bneither\b",
    r"\bnever\b",
    r"\bnot a single\b",
    r"\bzero\b",
    # Negative wh-shape: "which X is/are/do/does/did/has/have not Y".
    # Catches both copular ("which X is not Y") and auxiliary
    # ("which X do not Y") forms — both express universal negation
    # over the X universe.
    # Only the `which` form allows exactly one word between the wh-word
    # and the auxiliary, and only it accepts "were" / "was".
    r"\bwhich \w+ (?:is |are |do |does |did |has |have |were |was )?not\b",
    r"\bwho (?:is |are |do |does |did |has |have )?not\b",
    r"\bwhat (?:is |are |do |does |did |has |have )?not\b",
]

# Build the compiled, case-insensitive form of every rung's pattern list
# once, at import time.
def _compile_ci(pattern_strings):
    """Compile each pattern string with re.IGNORECASE, keeping list order."""
    compiled = []
    for source in pattern_strings:
        compiled.append(re.compile(source, re.IGNORECASE))
    return compiled


_OPEN_REQUEST_RE = _compile_ci(_OPEN_REQUEST_PATTERNS)
_COMPREHENSIVE_RE = _compile_ci(_COMPREHENSIVE_PATTERNS)
_ALL_RE = _compile_ci(_ALL_PATTERNS)
_MANY_RE = _compile_ci(_MANY_PATTERNS)
_FEW_RE = _compile_ci(_FEW_PATTERNS)
_PROPORTIONAL_RE = _compile_ci(_PROPORTIONAL_PATTERNS)
_ABSENT_RE = _compile_ci(_ABSENT_PATTERNS)


# ---------------------------------------------------------------- shape mnemonics

# Rung name -> mnemonic reported as "operational_shape" in the result.
# The classifier indexes this dict directly with the winning rung, so
# every name in _RUNG_PRIORITY needs an entry here.
_OPERATIONAL_SHAPE = {
    "ABSENT": "universal_negation",
    "SINGULAR": "single_fact",
    "PROPORTIONAL": "descriptive_fraction",
    "SMALL_NUM_EXPLICIT": "bounded_count",
    "COMPARATIVE_BOUND": "bounded_inequality",
    "FEW": "small_set_vague",
    "MANY": "medium_set_vague",
    "ALL": "universal_enumeration",
    "COMPREHENSIVE": "exhaustive_request",
    "OPEN_REQUEST": "verb_driven_enumeration",
}


# ---------------------------------------------------------------- scope hint

# Lexical anchors that hint a bounded universe — when one is present
# alongside a broad quantifier, scope_bound_hint is "bounded" instead of
# "unbounded". This is a heuristic; the corpus arity check (§10.1) is
# left for a future refinement.
# NOTE(review): the `planets` anchor below now also matches at end of
# input / before punctuation; confirm whether that warrants a
# CLASSIFIER_VERSION bump.
_BOUNDED_DOMAIN_ANCHORS = (
    re.compile(r"\bthe (?:beatles|fab four)\b", re.IGNORECASE),
    re.compile(r"\b(?:US|U\.S\.|united states) (?:states|presidents)\b", re.IGNORECASE),
    # The separating space lives inside the optional group, so a bare
    # "planets" is recognised even when it ends the question or is
    # followed by punctuation ("... all the planets?").
    re.compile(r"\bplanets(?: in (?:the|our) solar system)?\b", re.IGNORECASE),
    re.compile(r"\bfounding fathers\b", re.IGNORECASE),
    re.compile(r"\bcontinents\b", re.IGNORECASE),
    re.compile(r"\boceans\b", re.IGNORECASE),
    # Year/season-anchored questions tend to bound the universe to one
    # event — "winners of the 2024 World Series" is bounded.
    re.compile(r"\b(?:19|20)\d{2}\b"),
    re.compile(r"\b(?:season|year|championship|tournament|event) (?:of|for)\b", re.IGNORECASE),
)


def _scope_bound_hint(question: str, intensity: str) -> str:
    """Estimate whether a broad question ranges over a finite universe.

    The result is one of three strings:

    * ``"unknown"`` — ``intensity`` is not a broad rung, so there is no
      universal pressure to gate and ``question`` is not inspected.
    * ``"bounded"`` — ``intensity`` is broad and at least one domain
      anchor matches ``question`` (e.g. ``"all members of the
      Beatles"``, ``"every US state"``).
    * ``"unbounded"`` — ``intensity`` is broad and no anchor matches
      (e.g. ``"winners of all major sports"``).

    This is a lexical heuristic only; the corpus arity check (§10.1
    future refinement) is expected to sharpen it.
    """
    if intensity in _BROAD_RUNGS:
        anchored = any(
            anchor_re.search(question) for anchor_re in _BOUNDED_DOMAIN_ANCHORS
        )
        return "bounded" if anchored else "unbounded"
    return "unknown"


# ---------------------------------------------------------------- classifier


# [docs] (Sphinx viewcode backlink marker; extraction residue, not code)
def _build_result(
    question: str,
    intensity: str,
    matched_token: str | None,
    explicit_count: int | None,
) -> dict:
    """Assemble the classifier's result dict for one decided rung.

    ``is_broad``, ``operational_shape`` and ``scope_bound_hint`` are
    derived from ``intensity`` (plus ``question`` for the scope hint), so
    every return path of the classifier reports them the same way. For a
    non-broad rung the scope hint is ``"unknown"`` and ``question`` is
    never inspected.
    """
    return {
        "intensity": intensity,
        "matched_token": matched_token,
        "explicit_count": explicit_count,
        "is_broad": intensity in _BROAD_RUNGS,
        "operational_shape": _OPERATIONAL_SHAPE[intensity],
        "scope_bound_hint": _scope_bound_hint(question, intensity),
        "classifier_version": CLASSIFIER_VERSION,
    }


def _first_surface_match(compiled_patterns, question: str) -> str | None:
    """Return the text matched by the first pattern that hits, else None."""
    for pattern in compiled_patterns:
        found = pattern.search(question)
        if found:
            return found.group(0)
    return None


def classify_question_quantifier(question: str) -> dict:
    """Classify ``question`` onto the ten-rung intensity ladder.

    Highest-intensity-wins arbitration: when multiple rungs match,
    the one that appears latest in ``_RUNG_PRIORITY`` is reported.
    Operationally that means "tell me about all the planets"
    classifies OPEN_REQUEST (later in the priority order than ALL),
    even though ALL also matched. The downstream cap is the broader
    rung's cap, which is what we want under enumeration pressure.

    Two inputs bypass arbitration and classify SINGULAR directly:

    * empty / whitespace-only questions (``matched_token`` is None);
    * questions that open with ``how many`` / ``how much`` (optionally
      after "and", "but" or "so"), which ask for one number rather than
      an enumeration; ``matched_token`` is the matched opener.

    Returns a dict with the keys ``intensity``, ``matched_token``,
    ``explicit_count``, ``is_broad``, ``operational_shape``,
    ``scope_bound_hint`` and ``classifier_version``. ``explicit_count``
    is an int only when the winning rung is SMALL_NUM_EXPLICIT or
    COMPARATIVE_BOUND.
    """
    if not question or not question.strip():
        return _build_result(question, "SINGULAR", None, None)

    # Count-question short-circuit (caught by 2026-05-03 dry-run
    # review across bench/qa_questions.txt). `how many X?` and
    # `how much X?` ask for a SINGLE numeric answer ("50 states",
    # "206 bones") — not enumeration. Without this short-circuit,
    # the bare `\bmany\b` pattern in _MANY_PATTERNS misfires on
    # `how many` and the question lands in the MANY rung.
    #
    # `how often`, `how long`, `how big` need no special case: they
    # carry no quantifier marker and fall through to the SINGULAR
    # default.
    count_opener = _COUNT_QUESTION_RE.search(question)
    if count_opener:
        return _build_result(question, "SINGULAR", count_opener.group(0), None)

    # (rung, matched_token, explicit_count) for every rung that fired.
    # Insertion order matters only between candidates of the same rung:
    # the earliest one is the one reported.
    candidates: list[tuple[str, str, int | None]] = []

    # Purely lexical rungs: at most one candidate each, taken from the
    # first pattern of the rung that matches.
    lexical_rungs = (
        ("ABSENT", _ABSENT_RE),
        ("PROPORTIONAL", _PROPORTIONAL_RE),
        ("FEW", _FEW_RE),
        ("MANY", _MANY_RE),
        ("ALL", _ALL_RE),
        ("COMPREHENSIVE", _COMPREHENSIVE_RE),
        ("OPEN_REQUEST", _OPEN_REQUEST_RE),
    )
    for rung_name, compiled in lexical_rungs:
        token = _first_surface_match(compiled, question)
        if token is not None:
            candidates.append((rung_name, token, None))

    # SMALL_NUM_EXPLICIT — explicit digit or number word.
    digit_match = _SMALL_NUM_DIGIT.search(question)
    if digit_match:
        # Exactly one alternative of the pattern matched; its capture
        # group is the only non-None one and holds the count.
        count = next(
            (int(g) for g in digit_match.groups() if g is not None),
            None,
        )
        candidates.append(("SMALL_NUM_EXPLICIT", digit_match.group(0), count))
    else:
        # Number-word path. Words are tried in lexicon order; the first
        # one present wins and is reported as the bare lexicon word.
        for word, count in _NUMBER_WORDS.items():
            if re.search(rf"\b(?:top |first |last )?{word}\b",
                         question, re.IGNORECASE):
                candidates.append(("SMALL_NUM_EXPLICIT", word, count))
                break
        # `pair of` → 2; `a handful of` → 5. These are appended after
        # any number-word candidate, so they are reported only when no
        # number word matched.
        # NOTE(review): "a handful of" always also matches the FEW
        # pattern "a handful", and FEW outranks SMALL_NUM_EXPLICIT, so
        # the count of 5 recorded here never reaches the result.
        if _PAIR_OF.search(question):
            candidates.append(("SMALL_NUM_EXPLICIT", "pair of", 2))
        if _HANDFUL_OF.search(question):
            candidates.append(("SMALL_NUM_EXPLICIT", "a handful of", 5))

    # COMPARATIVE_BOUND — explicit inequality; first matching pattern only.
    for pattern in _COMPARATIVE_PATTERNS:
        bound = pattern.search(question)
        if bound:
            # `between A and B` → use the upper bound; otherwise the
            # single captured number.
            numbers = [int(g) for g in bound.groups() if g is not None]
            count = max(numbers) if numbers else None
            candidates.append(("COMPARATIVE_BOUND", bound.group(0), count))
            break

    if not candidates:
        # Default — SINGULAR for any wh-question or definite reference
        # without quantifier markers.
        return _build_result(question, "SINGULAR", None, None)

    # Highest-intensity wins (later in _RUNG_PRIORITY = higher). max()
    # returns the first maximal item, so among candidates of the same
    # rung the earliest appended one is kept.
    rank = {rung: index for index, rung in enumerate(_RUNG_PRIORITY)}
    intensity, matched_token, explicit_count = max(
        candidates, key=lambda cand: rank.get(cand[0], -1)
    )
    return _build_result(question, intensity, matched_token, explicit_count)