Auto_PPT_Generator / modules /text_processing.py
Corin1998's picture
Update modules/text_processing.py
a19c579 verified
raw
history blame
4.56 kB
import re
from typing import Dict, List, Tuple, Any
from .llm import LLMClient
# ----------------- Regex helpers -----------------
# Every pattern below is matched against a single line of text.
#
# The colon character classes accept both the ASCII colon and the full-width
# colon (U+FF1A) used in Japanese text. The full-width form is written as an
# explicit \uFF1A escape so it cannot be silently folded into a second ASCII
# ":" by Unicode normalisation of this source file.

# List item: "-", "*", "•", "・", "1." or "1)" followed by whitespace.
# Group 1 is the item text.
LIST_BULLET = re.compile(r"^(?:[-*•・]|\d+\.|\d+\))\s+(.*)")

# "key: value" line. Group 1 is the key (no colon inside), group 2 the value.
KEYVAL_LINE = re.compile(r"^\s*([^:\uFF1A]+?)\s*[:\uFF1A]\s*([^\n]+?)\s*$")

# "label: number" line. Group 1 is the label, group 2 a signed integer or
# decimal literal (no thousands separators, units, or exponent).
LABEL_NUM = re.compile(r"^\s*([^:\uFF1A]+?)\s*[:\uFF1A]\s*([+-]?\d+(?:\.\d+)?)\s*$")

# Heading: one or more "#", or "1." / "1)", then optional whitespace and the
# title (group 2).
# NOTE(review): the numeric markers overlap with LIST_BULLET, so a line such
# as "1. foo" satisfies both patterns; which one wins depends on the caller.
HEADER = re.compile(r"^(#+|\d+\.|\d+\))\s*(.+)$")
def naive_section_split(text: str, target_chars: int = 1200) -> List[Tuple[str, str]]:
    """Split text into (title, content) pairs using headings or by size.

    A line whose stripped form matches ``HEADER`` closes the current section
    and supplies the title of the next one; the heading line itself is not
    part of any content. Independently of headings, once the buffered lines
    of the current section exceed ``target_chars`` characters (newlines not
    counted) the section is closed and the following one gets a generated,
    numbered title.

    Args:
        text: The raw document text.
        target_chars: Soft upper bound on the number of characters per
            section. The line that crosses the bound stays in the section.

    Returns:
        The sections in document order. Content is stripped of surrounding
        whitespace, and sections whose content would be empty (for example a
        heading followed directly by another heading, or blank lines before
        the first heading) are omitted. If nothing qualifies, a single
        ``("本文", text)`` pair is returned.
    """
    sections: List[Tuple[str, str]] = []
    cur_title = "セクション"
    cur_buf: List[str] = []
    # Running character count of cur_buf, so the size check does not have to
    # re-sum the whole buffer for every appended line.
    cur_len = 0

    def flush() -> None:
        """Emit the buffered lines as a section and reset the buffer."""
        nonlocal cur_len
        content = "\n".join(cur_buf).strip()
        # Skip whitespace-only buffers: they would produce an empty section.
        if content:
            sections.append((cur_title, content))
        cur_buf.clear()
        cur_len = 0

    for ln in text.splitlines():
        m = HEADER.match(ln.strip())
        if m:
            flush()
            cur_title = m.group(2).strip()
            continue
        cur_buf.append(ln)
        cur_len += len(ln)
        if cur_len > target_chars:
            flush()
            # Size-based split: there is no heading to reuse, so number the
            # continuation by its 1-based position in the output.
            cur_title = f"セクション{len(sections) + 1}"
    flush()

    # Fallback: always hand back at least one section.
    if not sections:
        sections = [("本文", text)]
    return sections
# Sentence terminators for the prose fallback in extract_bullets: the
# ideographic full stop, ASCII and full-width "!" / "?", and an ASCII "." that
# is not followed by a digit (so decimals such as "3.5" stay in one piece).
# Trailing whitespace after the terminator is consumed as well.
_SENTENCE_END = re.compile(r"(?:[。!?\uFF01\uFF1F]|\.(?!\d))\s*")


def extract_bullets(section_text: str, max_items: int = 8) -> List[str]:
    """Return up to ``max_items`` bullet strings for a section.

    Lines that match ``LIST_BULLET`` (after stripping) are used as-is, minus
    the list marker. When the section has no such lines, the text is split
    into sentences instead and sentences of 8 to 120 characters are used.

    Args:
        section_text: The body text of one section.
        max_items: Maximum number of bullets to return.

    Returns:
        The bullet texts in document order; an empty list when neither
        strategy finds anything.
    """
    bullets: List[str] = []
    for line in section_text.splitlines():
        m = LIST_BULLET.match(line.strip())
        if m:
            bullets.append(m.group(1).strip())

    if not bullets:
        # Heuristic fallback: treat concise sentences as bullets. The length
        # window drops fragments and run-on paragraphs.
        for sentence in _SENTENCE_END.split(section_text):
            sentence = sentence.strip()
            if 8 <= len(sentence) <= 120:
                bullets.append(sentence)
                if len(bullets) >= max_items:
                    break
    return bullets[:max_items]
def extract_keyval_table(section_text: str) -> List[Tuple[str, str]]:
    """Collect the "key: value" lines of a section as table rows.

    Each line is tested against ``KEYVAL_LINE``; non-matching lines are
    ignored.

    Args:
        section_text: The body text of one section.

    Returns:
        ``(key, value)`` pairs in document order, both stripped. Pairs whose
        key or value is empty after stripping are left out.
    """
    hits = (KEYVAL_LINE.match(raw) for raw in section_text.splitlines())
    rows = (
        (hit.group(1).strip(), hit.group(2).strip())
        for hit in hits
        if hit
    )
    return [(key, value) for key, value in rows if key and value]
def extract_chart_data(section_text: str, top_k: int = 10) -> List[Tuple[str, float]]:
    """Collect "label: number" lines of a section as a chart series.

    Each line is tested against ``LABEL_NUM``. When a label occurs more than
    once, the value from its last occurrence is kept.

    Args:
        section_text: The body text of one section.
        top_k: Length of the slice taken from the ranked series.

    Returns:
        ``(label, value)`` pairs ordered by descending absolute value,
        truncated to ``top_k``. Ties keep the order in which the labels first
        appeared.
    """
    latest: Dict[str, float] = {}
    for raw in section_text.splitlines():
        hit = LABEL_NUM.match(raw)
        if hit is None:
            continue
        try:
            number = float(hit.group(2))
        except ValueError:
            continue
        # Re-assigning an existing key keeps its original position in the
        # dict but replaces the value, i.e. "last occurrence wins".
        latest[hit.group(1).strip()] = number

    ranked = sorted(latest.items(), key=lambda pair: abs(pair[1]), reverse=True)
    return ranked[:top_k]
def process_text(text: str,
                 use_inference_api: bool,
                 summarizer_model: str,
                 generator_model: str,
                 want_summary: bool,
                 want_tables: bool,
                 want_charts: bool,
                 max_summary_words: int = 200) -> Dict[str, Any]:
    """Turn raw text into the pieces a slide deck is built from.

    Args:
        text: The raw document text.
        use_inference_api: Forwarded to the ``LLMClient`` constructor.
        summarizer_model: Model name passed to ``LLMClient.summarize``.
        generator_model: Accepted for interface compatibility; not referenced
            in this function.
        want_summary: When true, request a summary of the whole text.
        want_tables: When true, extract key/value tables per section.
        want_charts: When true, extract numeric series per section.
        max_summary_words: Passed to ``LLMClient.summarize`` as ``max_words``.

    Returns:
        A dict with the keys ``"summary"`` (the summarizer result, or ``None``
        when not requested), ``"sections"`` (list of ``(title, text)``),
        ``"bullets"`` (section index -> list of bullet strings), ``"tables"``
        and ``"charts"`` (lists of dicts with a ``"title"`` plus ``"pairs"`` or
        ``"series"`` respectively; sections that yield nothing are skipped).
    """
    # The client is created unconditionally, before any other work.
    client = LLMClient(use_inference_api=use_inference_api)

    # 1) Executive summary (optional).
    summary = (
        client.summarize(text, model=summarizer_model, max_words=max_summary_words)
        if want_summary
        else None
    )

    # 2) Sections come from the rule-based splitter, not from the LLM.
    sections = naive_section_split(text)

    # 3) Per-section bullets, plus tables / charts when requested.
    bullets_by_section: Dict[int, List[str]] = {}
    tables: List[Dict[str, Any]] = []
    charts: List[Dict[str, Any]] = []

    for position, (heading, content) in enumerate(sections):
        bullets_by_section[position] = extract_bullets(content)

        pairs = extract_keyval_table(content) if want_tables else []
        if pairs:
            tables.append({"title": f"{heading} — 表", "pairs": pairs})

        series = extract_chart_data(content) if want_charts else []
        if series:
            charts.append({"title": f"{heading} — チャート", "series": series})

    return {
        "summary": summary,
        "sections": sections,  # list of (title, text)
        "bullets": bullets_by_section,
        "tables": tables,
        "charts": charts,
    }