import re
from typing import Any, Dict, List, Tuple

# Absolute rather than relative import: `llm` is loaded directly from the
# modules directory.
from llm import LLMClient

# ----------------- Regex helpers -----------------
# A list item: "-", "*", "•", "・", "1." or "1)" followed by whitespace.
LIST_BULLET = re.compile(r"^(?:[-*•・]|\d+\.|\d+\))\s+(.*)")
# "key: value" line. The separator may be an ASCII colon or a full-width
# colon (U+FF1A, written as an escape so it survives text normalisation).
KEYVAL_LINE = re.compile(r"^\s*([^:\uff1a]+?)\s*[:\uff1a]\s*([^\n]+?)\s*$")
# "label: number" line, same separators as KEYVAL_LINE; the value must be a
# plain signed integer or decimal.
LABEL_NUM = re.compile(
    r"^\s*([^:\uff1a]+?)\s*[:\uff1a]\s*([+-]?\d+(?:\.\d+)?)\s*$"
)
# Heading: "#..." or a "1." / "1)" prefix. The (?!\d) lookahead keeps a line
# that merely starts with a decimal number ("3.14 ...") from being read as a
# numbered heading.
HEADER = re.compile(r"^(#+|\d+\.(?!\d)|\d+\))\s*(.+)$")
# Sentence terminators for the bullet fallback: "。", ASCII/full-width "!" and
# "?", full-width ".", and an ASCII "." that is not a decimal point.
_SENTENCE_END = re.compile(r"(?:[。!?\uff0e\uff01\uff1f]|\.(?!\d))\s*")


def naive_section_split(text: str, target_chars: int = 1200) -> List[Tuple[str, str]]:
    """Split *text* into ``(title, content)`` sections.

    A line matching ``HEADER`` closes the current section and starts a new
    one titled with the heading text; the heading line itself is not part of
    any section's content. Independently of headings, the current section is
    closed as soon as the total length of its lines exceeds *target_chars*,
    and the continuation is titled ``"セクションN"``.

    Args:
        text: The document to split.
        target_chars: Size threshold (sum of line lengths) above which the
            running section is cut.

    Returns:
        The sections in document order. Sections whose content is empty after
        stripping are omitted. If nothing remains, a single
        ``("本文", text)`` section is returned.
    """
    sections: List[Tuple[str, str]] = []
    title = "セクション"
    buf: List[str] = []
    buf_len = 0  # running sum of len(line) for the lines in buf

    def flush() -> None:
        nonlocal buf, buf_len
        content = "\n".join(buf).strip()
        # Blank lines before or between headings would otherwise produce
        # sections with empty content.
        if content:
            sections.append((title, content))
        buf = []
        buf_len = 0

    for line in text.splitlines():
        heading = HEADER.match(line.strip())
        if heading:
            flush()
            title = heading.group(2).strip()
            continue
        buf.append(line)
        buf_len += len(line)
        if buf_len > target_chars:
            flush()
            title = f"セクション{len(sections) + 1}"
    flush()

    # Fallback: keep the whole text as a single section.
    if not sections:
        return [("本文", text)]
    return sections


def extract_bullets(section_text: str, max_items: int = 8) -> List[str]:
    """Return up to *max_items* bullet strings for a section.

    Lines matching ``LIST_BULLET`` are used when present. Otherwise the text
    is split into sentences and those between 8 and 120 characters long are
    used instead.

    Args:
        section_text: Body text of one section.
        max_items: Maximum number of bullets to return; a value of zero or
            less yields an empty list.

    Returns:
        The bullet texts, without their list markers, in document order.
    """
    if max_items <= 0:
        return []

    bullets: List[str] = []
    for line in section_text.splitlines():
        item = LIST_BULLET.match(line.strip())
        if item:
            bullets.append(item.group(1).strip())

    if not bullets:
        # Heuristic fallback: take concise sentences from the running text.
        for sentence in _SENTENCE_END.split(section_text):
            sentence = sentence.strip()
            if 8 <= len(sentence) <= 120:
                bullets.append(sentence)
                if len(bullets) >= max_items:
                    break
    return bullets[:max_items]


def extract_keyval_table(section_text: str) -> List[Tuple[str, str]]:
    """Collect ``(key, value)`` pairs from lines matching ``KEYVAL_LINE``.

    Args:
        section_text: Body text of one section.

    Returns:
        The stripped pairs in document order; lines whose key or value is
        empty after stripping are skipped.
    """
    pairs: List[Tuple[str, str]] = []
    for line in section_text.splitlines():
        found = KEYVAL_LINE.match(line)
        if not found:
            continue
        key = found.group(1).strip()
        value = found.group(2).strip()
        if key and value:
            pairs.append((key, value))
    return pairs


def extract_chart_data(section_text: str, top_k: int = 10) -> List[Tuple[str, float]]:
    """Collect ``(label, number)`` pairs from lines matching ``LABEL_NUM``.

    Args:
        section_text: Body text of one section.
        top_k: Maximum number of pairs to return; a value of zero or less
            yields an empty list.

    Returns:
        At most *top_k* pairs sorted by absolute value, largest first. When a
        label occurs more than once, its last value is used.
    """
    # A dict keeps one entry per label; re-assignment keeps the last value.
    latest: Dict[str, float] = {}
    for line in section_text.splitlines():
        found = LABEL_NUM.match(line)
        if not found:
            continue
        label = found.group(1).strip()
        if not label:
            # Same rule as extract_keyval_table: no empty keys.
            continue
        try:
            latest[label] = float(found.group(2))
        except ValueError:
            continue

    ranked = sorted(latest.items(), key=lambda item: abs(item[1]), reverse=True)
    return ranked[:max(top_k, 0)]


def process_text(
    text: str,
    use_inference_api: bool,
    summarizer_model: str,
    generator_model: str,
    want_summary: bool,
    want_tables: bool,
    want_charts: bool,
    max_summary_words: int = 200,
) -> Dict[str, Any]:
    """Build the summary, sections, bullets, tables and charts for *text*.

    Args:
        text: The document to process.
        use_inference_api: Forwarded to ``LLMClient``.
        summarizer_model: Model name passed to ``LLMClient.summarize``.
        generator_model: Accepted for interface compatibility; not used by
            this function.
        want_summary: Whether to request a summary. ``LLMClient`` is only
            instantiated when this is true.
        want_tables: Whether to extract key/value tables per section.
        want_charts: Whether to extract label/number chart series per section.
        max_summary_words: Passed to ``LLMClient.summarize`` as ``max_words``.

    Returns:
        A dict with the keys ``"summary"`` (``None`` when not requested),
        ``"sections"`` (list of ``(title, text)``), ``"bullets"`` (section
        index -> bullet list), ``"tables"`` and ``"charts"``.
    """
    # 1) Executive summary
    summary = None
    if want_summary:
        client = LLMClient(use_inference_api=use_inference_api)
        summary = client.summarize(
            text, model=summarizer_model, max_words=max_summary_words
        )

    # 2) Sections (rule-based; reliable on CPU)
    sections = naive_section_split(text)

    # 3) Per-section bullets / tables / charts
    bullets_by_section: Dict[int, List[str]] = {}
    tables: List[Dict[str, Any]] = []
    charts: List[Dict[str, Any]] = []
    for idx, (title, body) in enumerate(sections):
        bullets_by_section[idx] = extract_bullets(body)
        if want_tables:
            pairs = extract_keyval_table(body)
            if pairs:
                tables.append({"title": f"{title} — 表", "pairs": pairs})
        if want_charts:
            series = extract_chart_data(body)
            if series:
                charts.append({"title": f"{title} — チャート", "series": series})

    return {
        "summary": summary,
        "sections": sections,  # list of (title, text)
        "bullets": bullets_by_section,
        "tables": tables,
        "charts": charts,
    }