# Source code for arborist.distill.first_sentence

"""First-sentence-per-chunk distiller (deterministic, no ML).

Compresses a source by taking the first non-trivial sentence of each chunk
and concatenating. Useful as a stub to exercise the core/derivation schema
without external models.
"""

from __future__ import annotations

import re

from arborist.distill.base import DistillationResult, Distiller
from arborist.document import Document

# A sentence boundary is a run of whitespace that follows ".", "!" or "?"
# and is immediately followed by an ASCII capital letter or a digit. The
# look-behind/look-ahead keep the punctuation and the next sentence's first
# character out of the match, so splitting removes only the whitespace.
_SENTENCE_BOUNDARY = re.compile(
    r"(?<=[.!?])\s+(?=[A-Z0-9])"
)

# Candidates shorter than this many characters are treated as trivial
# and are not used as a chunk's sentence.
_MIN_SENTENCE_LEN = 10



# [docs]
class FirstSentenceDistiller(Distiller):
    """Distiller that keeps one sentence per chunk, with no model involved.

    For every source chunk the first sentence of at least
    ``_MIN_SENTENCE_LEN`` characters is selected; the selected sentences are
    joined with newlines and the result is capped at ``max_chars`` characters.
    """

    name = "first-sentence-v1"

    def __init__(self, max_chars: int = 4096) -> None:
        """Create a distiller.

        Args:
            max_chars: Upper bound on the length of the produced core text.
                ``0`` yields an empty core.

        Raises:
            ValueError: If ``max_chars`` is negative. A negative bound would
                otherwise be interpreted as a slice offset from the end of
                the text and silently chop the tail instead of capping it.
        """
        if max_chars < 0:
            raise ValueError(f"max_chars must be >= 0, got {max_chars}")
        self.max_chars = max_chars

    def distill(
        self, source: Document, source_chunks: list[str]
    ) -> DistillationResult:
        """Build the core document from the first sentence of each chunk.

        Args:
            source: Document being distilled; its ``uri`` and ``title`` are
                used to derive the core document's ``uri`` and ``title``.
            source_chunks: Chunk texts, in order. Chunks with no sentence of
                sufficient length contribute nothing.

        Returns:
            A ``DistillationResult`` holding the core ``Document`` and the
            indices (into ``source_chunks``) of the chunks whose sentence is
            present, at least partially, in the core text.
        """
        limit = self.max_chars
        sentences: list[str] = []
        contributing: list[int] = []
        joined_len = 0  # length of "\n".join(sentences) so far

        for chunk_idx, chunk in enumerate(source_chunks):
            first = self._first_sentence(chunk)
            if not first:
                continue
            # Offset at which this sentence would begin in the joined text
            # (one extra character for the "\n" separator after the first).
            start = joined_len + 1 if sentences else 0
            if start >= limit:
                # Nothing of this sentence (or of any later one) survives the
                # cap, so these chunks must not be reported as contributing.
                break
            sentences.append(first)
            contributing.append(chunk_idx)
            joined_len = start + len(first)

        core_text = "\n".join(sentences)
        if len(core_text) > limit:
            # Hard cap: the last kept sentence may be cut mid-way.
            core_text = core_text[:limit].rstrip()

        core = Document(
            uri=f"{source.uri}#core/{self.name}",
            content=core_text,
            source_type=f"core:{self.name}",
            title=(source.title or "") + " [CORE]",
        )
        return DistillationResult(core=core, contributing_chunk_indices=contributing)

    @staticmethod
    def _first_sentence(text: str) -> str | None:
        """Return the first non-trivial sentence of ``text``.

        Paragraphs (separated by blank lines) are scanned in order, and
        within each paragraph sentences are scanned in order. The first
        sentence that is at least ``_MIN_SENTENCE_LEN`` characters long after
        stripping is returned.

        Returns:
            The stripped sentence, or ``None`` when ``text`` is empty or
            contains no sentence of sufficient length.
        """
        # A "blank" line may still hold spaces or a stray "\r" (CRLF input),
        # so split on newline / optional whitespace / newline rather than on
        # the literal "\n\n".
        for para in re.split(r"\n\s*\n", text.strip()):
            # Look at every sentence, not just the paragraph's first one, so
            # a short opener ("Yes.") does not discard the whole paragraph.
            for sentence in _SENTENCE_BOUNDARY.split(para.strip()):
                candidate = sentence.strip()
                if len(candidate) >= _MIN_SENTENCE_LEN:
                    return candidate
        return None