# Source code for arborist.distill.tfidf
"""TF-IDF top-K keyword distiller.
Pure-Python, stdlib-only. Uses chunk-level "documents" as the corpus
baseline so a term that appears in every chunk of the same source gets
penalized relative to a term concentrated in fewer chunks.
Output core content is a deterministic comma-separated keyword list —
extreme compression toward the "tweet/haiku" end of the planet metaphor.
"""
from __future__ import annotations
import math
import re
from collections import Counter
from arborist.distill.base import DistillationResult, Distiller
from arborist.document import Document
# Candidate token: an ASCII letter followed by at least two more characters
# drawn from ASCII letters, hyphens, and apostrophes (three characters
# minimum), delimited by word boundaries. Digits and non-ASCII letters never
# start or extend a token.
_TOKEN_RE = re.compile(r"\b[a-zA-Z][a-zA-Z\-']{2,}\b")
# Lowercase function words that _tokenize filters out after lowercasing.
# The literal is split on whitespace, and the frozenset collapses repeated
# entries ("only" is listed twice). Two-letter entries can never match
# _TOKEN_RE (minimum length three), so they are harmless but redundant.
_STOPWORDS = frozenset(
"""
the and or in of to is are was were for with on as by an be this that
these those from at but not have has had been they their his her its
our we you he she it all any if no more than such also can may will
would should could do does did between which where when what who how
some many most into through during above below after before since
while well much even only still about other only own same so very
just over under
""".split()
)
def _tokenize(text: str) -> list[str]:
    """Return the lowercased tokens of *text* with stopwords removed.

    Tokens are the matches of ``_TOKEN_RE`` in order of appearance;
    duplicates are kept so callers can count term frequency.
    """
    kept: list[str] = []
    for raw in _TOKEN_RE.findall(text):
        lowered = raw.lower()
        if lowered in _STOPWORDS:
            continue
        kept.append(lowered)
    return kept
[docs]
class TfidfKeywordDistiller(Distiller):
    """Distill a document into its highest-scoring TF-IDF keywords.

    Each entry of ``source_chunks`` is treated as one "document" of the
    corpus: document frequency counts the chunks that contain a term,
    while term frequency is the term's total count across all chunks.

    Args:
        top_k: Maximum number of keywords to emit. Must be non-negative.

    Raises:
        ValueError: If ``top_k`` is negative.
    """

    name = "tfidf-keywords-v1"

    def __init__(self, top_k: int = 16):
        # A negative bound would turn the ``[: top_k]`` slice in distill()
        # into "drop the last |top_k| keywords" rather than a cap.
        if top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")
        self.top_k = top_k

    def distill(
        self, source: Document, source_chunks: list[str]
    ) -> DistillationResult:
        """Build the keyword core for *source* from its chunks.

        Args:
            source: Document being distilled; its ``uri`` and ``title``
                are used to label the resulting core document.
            source_chunks: Text chunks of *source*, each scored as one
                corpus document.

        Returns:
            A ``DistillationResult`` whose core content is the top
            keywords joined by ``", "`` (best first, ties broken
            alphabetically) and whose ``contributing_chunk_indices``
            lists, in ascending order, every chunk containing at least
            one selected keyword. With no chunks, the content is empty
            and no indices are reported.
        """
        if not source_chunks:
            return self._empty_result(source)

        chunk_tokens = [_tokenize(chunk) for chunk in source_chunks]
        n_chunks = len(chunk_tokens)

        # Single pass: tf counts every occurrence, df counts each term at
        # most once per chunk.
        tf: Counter[str] = Counter()
        df: Counter[str] = Counter()
        for tokens in chunk_tokens:
            tf.update(tokens)
            df.update(set(tokens))

        # Smoothed IDF: the +1 inside the log avoids division by zero and
        # the trailing +1.0 keeps terms present in every chunk above zero.
        scores: dict[str, float] = {
            term: count * (math.log((n_chunks + 1) / (df[term] + 1)) + 1.0)
            for term, count in tf.items()
        }

        # Sort by descending score, then by term, so output is deterministic.
        ranked = sorted(scores.items(), key=lambda kv: (-kv[1], kv[0]))
        keywords = [term for term, _ in ranked[: self.top_k]]

        keyword_set = set(keywords)
        # enumerate() yields ascending indices, so the list is already sorted.
        contributing = [
            index
            for index, tokens in enumerate(chunk_tokens)
            if not keyword_set.isdisjoint(tokens)
        ]
        return self._build_result(source, ", ".join(keywords), contributing)

    def _empty_result(self, source: Document) -> DistillationResult:
        """Return a result with empty content and no contributing chunks."""
        return self._build_result(source, "", [])

    def _build_result(
        self, source: Document, content: str, contributing: list[int]
    ) -> DistillationResult:
        """Wrap *content* in the core ``Document`` labelled for *source*."""
        return DistillationResult(
            core=Document(
                uri=f"{source.uri}#core/{self.name}",
                content=content,
                source_type=f"core:{self.name}",
                title=(source.title or "") + " [KEYWORDS]",
            ),
            contributing_chunk_indices=contributing,
        )