Spaces:

Corin1998
/

Auto_PPT_Generator

Sleeping

App Files Files Community

Corin1998 committed on Sep 16

Commit

a19c579

verified ·

1 Parent(s): 8613cb4

Update modules/text_processing.py

Browse files

Files changed (1) hide show

modules/text_processing.py +52 -46

modules/text_processing.py CHANGED Viewed

@@ -2,35 +2,37 @@ import re
 from typing import Dict, List, Tuple, Any
 from .llm import LLMClient
-# ---------- Regex helpers ----------
 LIST_BULLET = re.compile(r"^(?:[-*•・]|\d+\.|\d+\))\s+(.*)")
-KEYVAL_LINE = re.compile(r"^\s*([^: :]+?)\s*[: :]\s*([^\n]+?)\s*$")
-LABEL_NUM = re.compile(r"^\s*([^: :]+?)\s*[: :]\s*([+-]?\d+(?:\.\d+)?)\s*$")
-HEADER = re.compile(r"^(#+|\d+\.|\d+\))\s+(.+)$")
 def naive_section_split(text: str, target_chars: int = 1200) -> List[Tuple[str, str]]:
-    """ Split into (title, section) using headings or by size."""
     lines = text.splitlines()
-    sections : List[Tuple[str, str]] = []
     cur_title = "セクション"
-    cur_buffer : List[str] = []
     def flush():
-        nonlocal cur_title, cur_buffer
-        if cur_buffer:
-            sections.append((cur_title, "\n".join(cur_buffer).strip()))
-            cur_buffer = []
-    for line in lines:
         m = HEADER.match(ln.strip())
         if m:
             flush()
             cur_title = m.group(2).strip()
             continue
-        cur_buffer.append(ln)
-        if sum(len(x) for x in cur_buffer) > target_chars:
             flush()
-            cur_title = f"セクション"{len(sections)+1}
     flush()
     # Fallback single section
@@ -38,24 +40,26 @@ def naive_section_split(text: str, target_chars: int = 1200) -> List[Tuple[str,
         sections = [("本文", text)]
     return sections
-def extract_list(section_text: str, max_items: int = 8) -> List[str]:
     bullets: List[str] = []
     for line in section_text.splitlines():
         m = LIST_BULLET.match(line.strip())
         if m:
             bullets.append(m.group(1).strip())
-   if not bullets:
-        # Heuristic: split by '。' or '、' and take concise sentences
-        ents = re.split(r'[。\. !?]\s*', section_text)
         for s in sents:
-            s  = s.strip()
-            if 8 < len(s) < 120:
                 bullets.append(s)
             if len(bullets) >= max_items:
                 break
     return bullets[:max_items]
-def extract_keyvals(section_text: str, ) -> List[Tuple[str, str]]:
     pairs: List[Tuple[str, str]] = []
     for line in section_text.splitlines():
         m = KEYVAL_LINE.match(line)
@@ -66,14 +70,15 @@ def extract_keyvals(section_text: str, ) -> List[Tuple[str, str]]:
                 pairs.append((k, v))
     return pairs
-def extract_chart_data(section_text: str, top_k: int =10) -> List[Tuple[str, float]]:
     data: List[Tuple[str, float]] = []
     for line in section_text.splitlines():
         m = LABEL_NUM.match(line)
         if m:
             label = m.group(1).strip()
             try:
-                val = float(m.group(2).strip())
             except ValueError:
                 continue
             data.append((label, val))
@@ -86,52 +91,53 @@ def extract_chart_data(section_text: str, top_k: int =10) -> List[Tuple[str, flo
     items.sort(key=lambda x: abs(x[1]), reverse=True)
     return items[:top_k]
 def process_text(text: str,
                  use_inference_api: bool,
-                 summaarizer_model:str,
-                 generator_model:str,
                  want_summary: bool,
-                 want_table: bool,
                  want_charts: bool,
                  max_summary_words: int = 200) -> Dict[str, Any]:
     client = LLMClient(use_inference_api=use_inference_api)
-    #1) Executive summary
     summary = None
     if want_summary:
-        summary = claient.summarize(text, model=summaarizer_model, max_words=max_summary_words)
-    #2) Section (rule-based:reliable on CPU)
     sections = naive_section_split(text)
-    #3) Per-section bullets
-    bullets_by_section: Dict[str, List[str]] = {}
     tables: List[Dict[str, Any]] = []
     charts: List[Dict[str, Any]] = []
     for idx, (title, body) in enumerate(sections):
         bullets_by_section[idx] = extract_bullets(body)
-        if want_table:
-            kv = extract_keyvals(body)
             if kv:
                 tables.append({
-                    "title": f"{title} - 表”,
                     "pairs": kv
                 })
         if want_charts:
-                series = extract_chart_data(body)
-                if series:
-                    charts.append({
-                        "title": f"{title} - チャート”,
-                        "series": series
-                    })
     return {
         "summary": summary,
-        "sections": sections, # list of (title, text)
-        "bullets": bullets_by_section,
         "tables": tables,
-        "charts": charts
     }

 from typing import Dict, List, Tuple, Any
 from .llm import LLMClient
+# ----------------- Regex helpers -----------------
 LIST_BULLET = re.compile(r"^(?:[-*•・]|\d+\.|\d+\))\s+(.*)")
+KEYVAL_LINE = re.compile(r"^\s*([^:：]+?)\s*[:：]\s*([^\n]+?)\s*$")
+LABEL_NUM = re.compile(r"^\s*([^:：]+?)\s*[:：]\s*([+-]?\d+(?:\.\d+)?)\s*$")
+HEADER = re.compile(r"^(#+|\d+\.|\d+\))\s*(.+)$")
 def naive_section_split(text: str, target_chars: int = 1200) -> List[Tuple[str, str]]:
+    """Split into (title, content) using headings or by size."""
     lines = text.splitlines()
+    sections: List[Tuple[str, str]] = []
     cur_title = "セクション"
+    cur_buf: List[str] = []
     def flush():
+        nonlocal cur_title, cur_buf
+        if cur_buf:
+            sections.append((cur_title, "\n".join(cur_buf).strip()))
+            cur_buf = []
+    for ln in lines:
         m = HEADER.match(ln.strip())
         if m:
             flush()
             cur_title = m.group(2).strip()
             continue
+        cur_buf.append(ln)
+        if sum(len(x) for x in cur_buf) > target_chars:
             flush()
+            # ★ Fix: the {...} placeholder had been left outside the f-string's quotes
+            cur_title = f"セクション{len(sections)+1}"
     flush()
     # Fallback single section
         sections = [("本文", text)]
     return sections
+def extract_bullets(section_text: str, max_items: int = 8) -> List[str]:
     bullets: List[str] = []
     for line in section_text.splitlines():
         m = LIST_BULLET.match(line.strip())
         if m:
             bullets.append(m.group(1).strip())
+    if not bullets:
+        # Heuristic: split on sentence-ending punctuation ('。', '.', '!', '?') and keep sentences of 8-120 characters
+        sents = re.split(r"[。\.!?]\s*", section_text)
         for s in sents:
+            s = s.strip()
+            if 8 <= len(s) <= 120:
                 bullets.append(s)
             if len(bullets) >= max_items:
                 break
     return bullets[:max_items]
+def extract_keyval_table(section_text: str) -> List[Tuple[str, str]]:
     pairs: List[Tuple[str, str]] = []
     for line in section_text.splitlines():
         m = KEYVAL_LINE.match(line)
                 pairs.append((k, v))
     return pairs
+def extract_chart_data(section_text: str, top_k: int = 10) -> List[Tuple[str, float]]:
     data: List[Tuple[str, float]] = []
     for line in section_text.splitlines():
         m = LABEL_NUM.match(line)
         if m:
             label = m.group(1).strip()
             try:
+                val = float(m.group(2))
             except ValueError:
                 continue
             data.append((label, val))
     items.sort(key=lambda x: abs(x[1]), reverse=True)
     return items[:top_k]
 def process_text(text: str,
                  use_inference_api: bool,
+                 summarizer_model: str,
+                 generator_model: str,
                  want_summary: bool,
+                 want_tables: bool,
                  want_charts: bool,
                  max_summary_words: int = 200) -> Dict[str, Any]:
     client = LLMClient(use_inference_api=use_inference_api)
+    # 1) Executive summary
     summary = None
     if want_summary:
+        summary = client.summarize(text, model=summarizer_model, max_words=max_summary_words)
+    # 2) Sections (rule-based; reliable on CPU)
     sections = naive_section_split(text)
+    # 3) Per-section bullets / tables / charts
+    bullets_by_section: Dict[int, List[str]] = {}
     tables: List[Dict[str, Any]] = []
     charts: List[Dict[str, Any]] = []
     for idx, (title, body) in enumerate(sections):
         bullets_by_section[idx] = extract_bullets(body)
+        if want_tables:
+            kv = extract_keyval_table(body)
             if kv:
                 tables.append({
+                    "title": f"{title} — 表",
                     "pairs": kv
                 })
         if want_charts:
+            series = extract_chart_data(body)
+            if series:
+                charts.append({
+                    "title": f"{title} — チャート",
+                    "series": series
+                })
     return {
         "summary": summary,
+        "sections": sections,  # list of (title, text)
+        "bullets": bullets_by_section,
         "tables": tables,
+        "charts": charts,
     }