Corin1998 committed on
Commit
a19c579
·
verified ·
1 Parent(s): 8613cb4

Update modules/text_processing.py

Browse files
Files changed (1) hide show
  1. modules/text_processing.py +52 -46
modules/text_processing.py CHANGED
@@ -2,35 +2,37 @@ import re
2
  from typing import Dict, List, Tuple, Any
3
  from .llm import LLMClient
4
 
5
- # ---------- Regex helpers ----------
6
  LIST_BULLET = re.compile(r"^(?:[-*•・]|\d+\.|\d+\))\s+(.*)")
7
- KEYVAL_LINE = re.compile(r"^\s*([^: :]+?)\s*[: :]\s*([^\n]+?)\s*$")
8
- LABEL_NUM = re.compile(r"^\s*([^: :]+?)\s*[: :]\s*([+-]?\d+(?:\.\d+)?)\s*$")
9
- HEADER = re.compile(r"^(#+|\d+\.|\d+\))\s+(.+)$")
 
10
 
11
  def naive_section_split(text: str, target_chars: int = 1200) -> List[Tuple[str, str]]:
12
- """ Split into (title, section) using headings or by size."""
13
  lines = text.splitlines()
14
- sections : List[Tuple[str, str]] = []
15
  cur_title = "セクション"
16
- cur_buffer : List[str] = []
17
 
18
  def flush():
19
- nonlocal cur_title, cur_buffer
20
- if cur_buffer:
21
- sections.append((cur_title, "\n".join(cur_buffer).strip()))
22
- cur_buffer = []
23
-
24
- for line in lines:
25
  m = HEADER.match(ln.strip())
26
  if m:
27
  flush()
28
  cur_title = m.group(2).strip()
29
  continue
30
- cur_buffer.append(ln)
31
- if sum(len(x) for x in cur_buffer) > target_chars:
32
  flush()
33
- cur_title = f"セクション"{len(sections)+1}
 
34
  flush()
35
 
36
  # Fallback single section
@@ -38,24 +40,26 @@ def naive_section_split(text: str, target_chars: int = 1200) -> List[Tuple[str,
38
  sections = [("本文", text)]
39
  return sections
40
 
41
- def extract_list(section_text: str, max_items: int = 8) -> List[str]:
 
42
  bullets: List[str] = []
43
  for line in section_text.splitlines():
44
  m = LIST_BULLET.match(line.strip())
45
  if m:
46
  bullets.append(m.group(1).strip())
47
- if not bullets:
48
- # Heuristic: split by '。' or '' and take concise sentences
49
- ents = re.split(r'[。\. !?]\s*', section_text)
50
  for s in sents:
51
- s = s.strip()
52
- if 8 < len(s) < 120:
53
  bullets.append(s)
54
  if len(bullets) >= max_items:
55
  break
56
  return bullets[:max_items]
57
 
58
- def extract_keyvals(section_text: str, ) -> List[Tuple[str, str]]:
 
59
  pairs: List[Tuple[str, str]] = []
60
  for line in section_text.splitlines():
61
  m = KEYVAL_LINE.match(line)
@@ -66,14 +70,15 @@ def extract_keyvals(section_text: str, ) -> List[Tuple[str, str]]:
66
  pairs.append((k, v))
67
  return pairs
68
 
69
- def extract_chart_data(section_text: str, top_k: int =10) -> List[Tuple[str, float]]:
 
70
  data: List[Tuple[str, float]] = []
71
  for line in section_text.splitlines():
72
  m = LABEL_NUM.match(line)
73
  if m:
74
  label = m.group(1).strip()
75
  try:
76
- val = float(m.group(2).strip())
77
  except ValueError:
78
  continue
79
  data.append((label, val))
@@ -86,52 +91,53 @@ def extract_chart_data(section_text: str, top_k: int =10) -> List[Tuple[str, flo
86
  items.sort(key=lambda x: abs(x[1]), reverse=True)
87
  return items[:top_k]
88
 
 
89
  def process_text(text: str,
90
  use_inference_api: bool,
91
- summaarizer_model:str,
92
- generator_model:str,
93
  want_summary: bool,
94
- want_table: bool,
95
  want_charts: bool,
96
  max_summary_words: int = 200) -> Dict[str, Any]:
97
  client = LLMClient(use_inference_api=use_inference_api)
98
 
99
- #1) Executive summary
100
  summary = None
101
  if want_summary:
102
- summary = claient.summarize(text, model=summaarizer_model, max_words=max_summary_words)
103
 
104
- #2) Section (rule-based:reliable on CPU)
105
  sections = naive_section_split(text)
106
 
107
- #3) Per-section bullets
108
- bullets_by_section: Dict[str, List[str]] = {}
109
  tables: List[Dict[str, Any]] = []
110
  charts: List[Dict[str, Any]] = []
111
 
112
  for idx, (title, body) in enumerate(sections):
113
  bullets_by_section[idx] = extract_bullets(body)
114
 
115
- if want_table:
116
- kv = extract_keyvals(body)
117
  if kv:
118
  tables.append({
119
- "title": f"{title} - 表”,
120
  "pairs": kv
121
  })
122
 
123
  if want_charts:
124
- series = extract_chart_data(body)
125
- if series:
126
- charts.append({
127
- "title": f"{title} - チャート”,
128
- "series": series
129
- })
 
130
  return {
131
  "summary": summary,
132
- "sections": sections, # list of (title, text)
133
- "bullets": bullets_by_section,
134
  "tables": tables,
135
- "charts": charts
136
  }
137
-
 
2
  from typing import Dict, List, Tuple, Any
3
  from .llm import LLMClient
4
 
5
# ----------------- Regex helpers -----------------
# Bullet line: "-", "*", "•", "・", "1." or "1)" followed by whitespace.
# Group 1 is the item text.
LIST_BULLET = re.compile(r"^(?:[-*•・]|\d+\.|\d+\))\s+(.*)")

# "key: value" line. The separator is either an ASCII ':' or a fullwidth
# colon (U+FF1A); the fullwidth form is written as an escape so it cannot be
# silently flattened to a second ASCII ':' by text normalisation.
# Group 1 is the key, group 2 is the value (surrounding whitespace excluded).
KEYVAL_LINE = re.compile(r"^\s*([^:\uFF1A]+?)\s*[:\uFF1A]\s*([^\n]+?)\s*$")

# "label: number" line, same separators as KEYVAL_LINE. Group 2 is an
# optionally signed integer or decimal literal.
LABEL_NUM = re.compile(r"^\s*([^:\uFF1A]+?)\s*[:\uFF1A]\s*([+-]?\d+(?:\.\d+)?)\s*$")

# Heading line: one or more '#', or "1." / "1)", then the title in group 2.
# Whitespace between the marker and the title is optional.
HEADER = re.compile(r"^(#+|\d+\.|\d+\))\s*(.+)$")
10
+
11
 
12
  def naive_section_split(text: str, target_chars: int = 1200) -> List[Tuple[str, str]]:
13
+ """Split into (title, content) using headings or by size."""
14
  lines = text.splitlines()
15
+ sections: List[Tuple[str, str]] = []
16
  cur_title = "セクション"
17
+ cur_buf: List[str] = []
18
 
19
  def flush():
20
+ nonlocal cur_title, cur_buf
21
+ if cur_buf:
22
+ sections.append((cur_title, "\n".join(cur_buf).strip()))
23
+ cur_buf = []
24
+
25
+ for ln in lines:
26
  m = HEADER.match(ln.strip())
27
  if m:
28
  flush()
29
  cur_title = m.group(2).strip()
30
  continue
31
+ cur_buf.append(ln)
32
+ if sum(len(x) for x in cur_buf) > target_chars:
33
  flush()
34
+ # 修正ポイント:f-string の {} が抜けていた
35
+ cur_title = f"セクション{len(sections)+1}"
36
  flush()
37
 
38
  # Fallback single section
 
40
  sections = [("本文", text)]
41
  return sections
42
 
43
+
44
def extract_bullets(section_text: str, max_items: int = 8) -> List[str]:
    """Return up to ``max_items`` bullet strings for one section.

    Lines that match ``LIST_BULLET`` (after stripping) are collected first,
    using the text after the bullet marker. If the section has no such
    lines, the text is split into sentences and every sentence whose
    stripped length is between 8 and 120 characters (inclusive) is used
    instead.

    Args:
        section_text: Raw text of a single section.
        max_items: Maximum number of bullets to return.

    Returns:
        A list of at most ``max_items`` stripped strings, in document order.
    """
    bullets: List[str] = []

    # Pass 1: explicit list items.
    for line in section_text.splitlines():
        m = LIST_BULLET.match(line.strip())
        if m:
            bullets.append(m.group(1).strip())

    if not bullets:
        # Pass 2 (heuristic): split on '。', '.', '!' or '?' and keep
        # sentences of a concise length.
        sents = re.split(r"[。\.!?]\s*", section_text)
        for s in sents:
            s = s.strip()
            if 8 <= len(s) <= 120:
                bullets.append(s)
            # Stop scanning once enough sentences have been gathered.
            if len(bullets) >= max_items:
                break

    return bullets[:max_items]
60
 
61
+
62
+ def extract_keyval_table(section_text: str) -> List[Tuple[str, str]]:
63
  pairs: List[Tuple[str, str]] = []
64
  for line in section_text.splitlines():
65
  m = KEYVAL_LINE.match(line)
 
70
  pairs.append((k, v))
71
  return pairs
72
 
73
+
74
+ def extract_chart_data(section_text: str, top_k: int = 10) -> List[Tuple[str, float]]:
75
  data: List[Tuple[str, float]] = []
76
  for line in section_text.splitlines():
77
  m = LABEL_NUM.match(line)
78
  if m:
79
  label = m.group(1).strip()
80
  try:
81
+ val = float(m.group(2))
82
  except ValueError:
83
  continue
84
  data.append((label, val))
 
91
  items.sort(key=lambda x: abs(x[1]), reverse=True)
92
  return items[:top_k]
93
 
94
+
95
def process_text(text: str,
                 use_inference_api: bool,
                 summarizer_model: str,
                 generator_model: str,
                 want_summary: bool,
                 want_tables: bool,
                 want_charts: bool,
                 max_summary_words: int = 200) -> Dict[str, Any]:
    """Turn raw text into a summary, sections, bullets, tables and charts.

    Args:
        text: The full input text.
        use_inference_api: Forwarded to ``LLMClient``.
        summarizer_model: Model name passed to ``client.summarize``.
        generator_model: Accepted for interface compatibility; this function
            does not read it.
        want_summary: When true, request a summary from the LLM client.
        want_tables: When true, build a key/value table per section.
        want_charts: When true, build a label/number series per section.
        max_summary_words: Word limit passed to ``client.summarize``.

    Returns:
        A dict with these keys:
            "summary":  the summarizer result, or ``None`` if not requested.
            "sections": list of ``(title, text)`` tuples.
            "bullets":  dict mapping section index to its list of bullets.
            "tables":   list of ``{"title", "pairs"}`` dicts.
            "charts":   list of ``{"title", "series"}`` dicts.
    """
    client = LLMClient(use_inference_api=use_inference_api)

    # 1) Executive summary
    summary = None
    if want_summary:
        summary = client.summarize(text, model=summarizer_model, max_words=max_summary_words)

    # 2) Sections (rule-based; reliable on CPU)
    sections = naive_section_split(text)

    # 3) Per-section bullets / tables / charts
    bullets_by_section: Dict[int, List[str]] = {}
    tables: List[Dict[str, Any]] = []
    charts: List[Dict[str, Any]] = []

    for idx, (title, body) in enumerate(sections):
        # Bullets are keyed by section index rather than title, so sections
        # that share a title do not overwrite each other.
        bullets_by_section[idx] = extract_bullets(body)

        if want_tables:
            kv = extract_keyval_table(body)
            if kv:  # skip sections with no key/value lines
                tables.append({
                    "title": f"{title} 表",
                    "pairs": kv
                })

        if want_charts:
            series = extract_chart_data(body)
            if series:  # skip sections with no numeric lines
                charts.append({
                    "title": f"{title} チャート",
                    "series": series
                })

    return {
        "summary": summary,
        "sections": sections,  # list of (title, text)
        "bullets": bullets_by_section,
        "tables": tables,
        "charts": charts,
    }