Source code for arborist.distill.first_sentence

"""First-sentence-per-chunk distiller (deterministic, no ML).

Compresses a source by taking the first non-trivial sentence of each chunk
and concatenating. Useful as a stub to exercise the core/derivation schema
without external models.
"""

from __future__ import annotations

import re

from arborist.distill.base import DistillationResult, Distiller
from arborist.document import Document

# Sentence break: a run of whitespace that is preceded by ".", "!" or "?"
# and followed by an ASCII uppercase letter or a digit. Both conditions are
# zero-width lookarounds, so splitting on this pattern removes only the
# whitespace and keeps the terminating punctuation with the sentence.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9])")
# Minimum length (in characters, after stripping) for a candidate first
# sentence to be accepted; shorter candidates are skipped.
_MIN_SENTENCE_LEN = 10


class FirstSentenceDistiller(Distiller):
    """Distiller that keeps the first non-trivial sentence of each chunk.

    The selected sentences are joined with newlines, in chunk order, and the
    result is truncated to at most ``max_chars`` characters.
    """

    name = "first-sentence-v1"

    def __init__(self, max_chars: int = 4096):
        """Store the character budget for the distilled core.

        Args:
            max_chars: Upper bound on the length of the core text. Values
                below zero are treated as zero when distilling.
        """
        self.max_chars = max_chars

    def distill(
        self, source: Document, source_chunks: list[str]
    ) -> DistillationResult:
        """Build a core document from the first sentence of each chunk.

        Args:
            source: Document being distilled; its ``uri`` and ``title`` are
                used to derive the core document's ``uri`` and ``title``.
            source_chunks: Chunk texts, in order.

        Returns:
            A ``DistillationResult`` holding the core document and the
            indices of the chunks that have at least one character in the
            final (possibly truncated) core text.
        """
        sentences: list[str] = []
        # (chunk index, offset of that chunk's sentence in the joined text)
        placed: list[tuple[int, int]] = []
        offset = 0
        for chunk_idx, chunk in enumerate(source_chunks):
            first = self._first_sentence(chunk)
            if first is None:
                continue
            sentences.append(first)
            placed.append((chunk_idx, offset))
            offset += len(first) + 1  # +1 for the "\n" separator

        core_text = "\n".join(sentences)
        # Clamp so a negative budget cannot turn the slice below into
        # "drop the last N characters".
        budget = max(self.max_chars, 0)
        if len(core_text) > budget:
            core_text = core_text[:budget].rstrip()

        # Only report chunks whose sentence survived truncation, at least
        # partially. Sentences are stripped, so each begins with a
        # non-whitespace character that rstrip() above cannot remove; a
        # start offset inside the final text therefore means the chunk
        # really contributed.
        kept_len = len(core_text)
        contributing = [idx for idx, start in placed if start < kept_len]

        core = Document(
            uri=f"{source.uri}#core/{self.name}",
            content=core_text,
            source_type=f"core:{self.name}",
            title=(source.title or "") + " [CORE]",
        )
        return DistillationResult(core=core, contributing_chunk_indices=contributing)

    @staticmethod
    def _first_sentence(text: str) -> str | None:
        """Return the first sufficiently long leading sentence in ``text``.

        The text is split into paragraphs on blank lines (``"\\n\\n"``). For
        each paragraph in order, only its first sentence is considered; if
        that sentence is shorter than ``_MIN_SENTENCE_LEN`` the search moves
        on to the next paragraph rather than to the paragraph's second
        sentence.

        Returns:
            The stripped sentence, or ``None`` if ``text`` is empty or no
            paragraph starts with a long enough sentence.
        """
        text = text.strip()
        if not text:
            return None
        for para in text.split("\n\n"):
            para = para.strip()
            if not para:
                continue
            # maxsplit=1: only the text before the first boundary is needed.
            first = _SENTENCE_BOUNDARY.split(para, maxsplit=1)[0].strip()
            if len(first) >= _MIN_SENTENCE_LEN:
                return first
        return None