Source code for arborist.distill.first_sentence
"""First-sentence-per-chunk distiller (deterministic, no ML).
Compresses a source by taking the first non-trivial sentence of each chunk
and concatenating. Useful as a stub to exercise the core/derivation schema
without external models.
"""
from __future__ import annotations
import re
from arborist.distill.base import DistillationResult, Distiller
from arborist.document import Document
# Sentence boundary: a run of whitespace that directly follows '.', '!' or '?'
# and is directly followed by an ASCII uppercase letter or a digit. Both sides
# are zero-width assertions, so splitting on this pattern consumes only the
# whitespace and leaves the terminating punctuation attached to the sentence.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9])")
# Minimum length, in characters, for a candidate sentence to be accepted;
# shorter candidates are treated as trivial and skipped.
_MIN_SENTENCE_LEN = 10
[docs]
class FirstSentenceDistiller(Distiller):
    """Build a core document from one leading sentence per source chunk.

    For every chunk, the leading sentence of the first paragraph whose leading
    sentence is at least ``_MIN_SENTENCE_LEN`` characters long is selected.
    The selected sentences are joined with newlines and the result is capped
    at ``max_chars`` characters.
    """

    name = "first-sentence-v1"

    # Paragraph separator: a blank line. Lines holding only whitespace and
    # CRLF line endings count as blank too, not just a literal "\n\n".
    _PARAGRAPH_BREAK = re.compile(r"\n\s*\n")

    def __init__(self, max_chars: int = 4096):
        """Store the character budget for the distilled core text.

        Args:
            max_chars: Maximum length of the core text, in characters.
        """
        self.max_chars = max_chars

    def distill(
        self, source: Document, source_chunks: list[str]
    ) -> DistillationResult:
        """Distill ``source_chunks`` into a core document.

        Args:
            source: Document being distilled; its ``uri`` and ``title`` are
                used to derive the core document's ``uri`` and ``title``.
            source_chunks: Text chunks of the source, in order.

        Returns:
            A ``DistillationResult`` whose ``core`` holds the joined (and, if
            needed, truncated) sentences and whose
            ``contributing_chunk_indices`` lists the indices of the chunks
            that have at least one character in the core text.
        """
        # A negative budget would make the slice below drop characters from
        # the END of the text instead of capping it; treat it as "no room".
        limit = max(self.max_chars, 0)

        sentences: list[str] = []
        contributing: list[int] = []
        next_start = 0  # offset where the next sentence would begin once joined
        for chunk_idx, chunk in enumerate(source_chunks):
            if next_start >= limit:
                # Everything from here on would be cut off entirely, so it
                # must not be reported as contributing.
                break
            first = self._first_sentence(chunk)
            if not first:
                continue
            sentences.append(first)
            contributing.append(chunk_idx)
            next_start += len(first) + 1  # +1 for the "\n" separator

        core_text = "\n".join(sentences)
        if len(core_text) > limit:
            # Hard character cut; may end in the middle of the last sentence.
            core_text = core_text[:limit].rstrip()

        core = Document(
            uri=f"{source.uri}#core/{self.name}",
            content=core_text,
            source_type=f"core:{self.name}",
            title=(source.title or "") + " [CORE]",
        )
        return DistillationResult(core=core, contributing_chunk_indices=contributing)

    @staticmethod
    def _first_sentence(text: str) -> str | None:
        """Return the leading sentence of the first acceptable paragraph.

        Paragraphs are separated by blank lines. For each non-empty paragraph
        only its leading sentence is considered; if that sentence is shorter
        than ``_MIN_SENTENCE_LEN`` the whole paragraph is skipped and the next
        one is tried.

        Args:
            text: Raw chunk text.

        Returns:
            The selected sentence with surrounding whitespace removed, or
            ``None`` when ``text`` is empty or no paragraph qualifies.
        """
        text = text.strip()
        if not text:
            return None
        for para in FirstSentenceDistiller._PARAGRAPH_BREAK.split(text):
            para = para.strip()
            if not para:
                continue
            # maxsplit=1: only the text before the first boundary is needed.
            first = _SENTENCE_BOUNDARY.split(para, maxsplit=1)[0].strip()
            if len(first) >= _MIN_SENTENCE_LEN:
                return first
        return None