Spaces:
Sleeping
Sleeping
| import re | |
| from typing import Dict, List, Tuple, Any | |
| from .llm import LLMClient | |
# ----------------- Regex helpers -----------------
# List item: "-", "*", "•", "・", "1." or "1)" followed by whitespace.
# Group 1 is the item text.
LIST_BULLET = re.compile(r"^(?:[-*•・]|\d+\.|\d+\))\s+(.*)")
# "key: value" line. The separator is an ASCII ':' or a full-width ':'
# (written as the \uFF1A escape so the pattern survives copy/paste and
# Unicode normalization). Group 1 is the key, group 2 the value.
KEYVAL_LINE = re.compile(r"^\s*([^:\uFF1A]+?)\s*[:\uFF1A]\s*([^\n]+?)\s*$")
# "label: number" line with the same separators. Group 2 is an optionally
# signed integer or decimal literal.
LABEL_NUM = re.compile(r"^\s*([^:\uFF1A]+?)\s*[:\uFF1A]\s*([+-]?\d+(?:\.\d+)?)\s*$")
# Heading: one or more '#', or "1." / "1)", then the title text in group 2.
HEADER = re.compile(r"^(#+|\d+\.|\d+\))\s*(.+)$")
def naive_section_split(text: str, target_chars: int = 1200) -> List[Tuple[str, str]]:
    """Split ``text`` into ``(title, content)`` sections by headings or size.

    A line that matches ``HEADER`` (after stripping) closes the current
    section, and its captured text becomes the title of the next one; the
    heading line itself is not added to any content. A section is also
    closed as soon as the summed length of its lines (newlines not counted)
    exceeds ``target_chars``; the chunk that follows gets an auto-numbered
    title.

    Args:
        text: Raw document text.
        target_chars: Soft per-section size limit. A single line longer than
            the limit is kept whole.

    Returns:
        Sections in document order, each content stripped. Sections whose
        content is empty or whitespace-only are dropped. If no section
        remains, a single ``("本文", text)`` section is returned.
    """
    sections: List[Tuple[str, str]] = []
    cur_title = "セクション"
    cur_buf: List[str] = []
    # Running character count of cur_buf, so the size check does not have to
    # re-sum the whole buffer for every appended line.
    cur_len = 0

    def flush() -> None:
        nonlocal cur_buf, cur_len
        content = "\n".join(cur_buf).strip()
        # Blank-only buffers (e.g. empty lines before the first heading)
        # must not become sections with empty content.
        if content:
            sections.append((cur_title, content))
        cur_buf = []
        cur_len = 0

    for ln in text.splitlines():
        m = HEADER.match(ln.strip())
        if m:
            flush()
            cur_title = m.group(2).strip()
            continue
        cur_buf.append(ln)
        cur_len += len(ln)
        if cur_len > target_chars:
            flush()
            # The continuation chunk gets a numbered generic title.
            cur_title = f"セクション{len(sections)+1}"
    flush()

    # Fallback: treat the whole input as one section.
    if not sections:
        sections = [("本文", text)]
    return sections
def extract_bullets(section_text: str, max_items: int = 8) -> List[str]:
    """Return at most ``max_items`` bullet strings for one section.

    Lines that match ``LIST_BULLET`` (after stripping) contribute their
    captured item text. When no line matches, the text is split on
    '。', '.', '!' and '?' instead, and stripped fragments of 8 to 120
    characters are used as bullets.
    """
    hits = (LIST_BULLET.match(row.strip()) for row in section_text.splitlines())
    found = [hit.group(1).strip() for hit in hits if hit]
    if found:
        return found[:max_items]

    # No explicit list items: fall back to short sentence-like fragments.
    for fragment in re.split(r"[。\.!?]\s*", section_text):
        sentence = fragment.strip()
        if len(sentence) < 8 or len(sentence) > 120:
            continue
        found.append(sentence)
        if len(found) >= max_items:
            break
    return found[:max_items]
def extract_keyval_table(section_text: str) -> List[Tuple[str, str]]:
    """Collect ``(key, value)`` pairs from lines that match ``KEYVAL_LINE``.

    Key and value are stripped; a pair is kept only when both are non-empty.
    Pairs are returned in line order, duplicates included.
    """
    hits = (KEYVAL_LINE.match(row) for row in section_text.splitlines())
    stripped = (
        (hit.group(1).strip(), hit.group(2).strip())
        for hit in hits
        if hit
    )
    return [(key, value) for key, value in stripped if key and value]
def extract_chart_data(section_text: str, top_k: int = 10) -> List[Tuple[str, float]]:
    """Extract ``(label, value)`` points from lines that match ``LABEL_NUM``.

    A label that appears more than once keeps its first position and takes
    the value of its last occurrence. The points are ordered by absolute
    value, largest first, and cut to the first ``top_k``.
    """
    latest: Dict[str, float] = {}
    for row in section_text.splitlines():
        hit = LABEL_NUM.match(row)
        if not hit:
            continue
        name = hit.group(1).strip()
        try:
            number = float(hit.group(2))
        except ValueError:
            continue
        # Re-assigning an existing key keeps its slot but updates the value.
        latest[name] = number

    ranked = sorted(latest.items(), key=lambda pair: abs(pair[1]), reverse=True)
    return ranked[:top_k]
def process_text(text: str,
                 use_inference_api: bool,
                 summarizer_model: str,
                 generator_model: str,
                 want_summary: bool,
                 want_tables: bool,
                 want_charts: bool,
                 max_summary_words: int = 200) -> Dict[str, Any]:
    """Run the full pipeline over ``text`` and return the collected pieces.

    Args:
        text: Document to process.
        use_inference_api: Passed through to ``LLMClient``.
        summarizer_model: Model name passed to ``LLMClient.summarize``.
        generator_model: Accepted but not used by this function.
        want_summary: When true, request a summary from the client.
        want_tables: When true, collect key/value tables per section.
        want_charts: When true, collect label/number series per section.
        max_summary_words: Passed to ``summarize`` as ``max_words``.

    Returns:
        Dict with keys ``summary`` (``None`` unless requested), ``sections``
        (list of ``(title, text)``), ``bullets`` (section index -> bullet
        list), ``tables`` and ``charts`` (lists of dicts; only sections that
        produced data are included).
    """
    llm = LLMClient(use_inference_api=use_inference_api)

    # 1) Executive summary (optional)
    overview = (
        llm.summarize(text, model=summarizer_model, max_words=max_summary_words)
        if want_summary
        else None
    )

    # 2) Rule-based section split
    parts = naive_section_split(text)

    # 3) Bullets for every section; tables/charts only on request
    bullet_map: Dict[int, List[str]] = {}
    table_list: List[Dict[str, Any]] = []
    chart_list: List[Dict[str, Any]] = []
    for position, (heading, content) in enumerate(parts):
        bullet_map[position] = extract_bullets(content)

        if want_tables:
            pairs = extract_keyval_table(content)
            if pairs:
                table_list.append({"title": f"{heading} — 表", "pairs": pairs})

        if want_charts:
            points = extract_chart_data(content)
            if points:
                chart_list.append({"title": f"{heading} — チャート", "series": points})

    return {
        "summary": overview,
        "sections": parts,  # list of (title, text)
        "bullets": bullet_map,
        "tables": table_list,
        "charts": chart_list,
    }