lianghsun committed on
Commit
2ca143e
·
1 Parent(s): 539fbd8

Build w/ love

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +154 -66
src/streamlit_app.py CHANGED
@@ -1,94 +1,182 @@
 
1
  import time
2
  import requests
3
  import pandas as pd
4
  import streamlit as st
5
- import urllib3
6
 
7
  API_URL = "https://taic.moda.gov.tw/api/v1/dataset.search.export"
8
 
9
- # 關閉 SSL 驗證 warning(HF Space 需要)
10
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
11
-
12
  st.set_page_config(page_title="TAIC Pulse", layout="wide")
13
 
14
- st.title("臺灣主權AI訓練語料庫 Explorer")
15
- st.caption(
16
- "⚡ 即時資料:本頁面在啟動時會從來源 API 抓取一次最新 JSON,"
17
- )
18
-
19
- # ---------------------------
20
- # Fetch once, cache forever (per Space runtime)
21
- # ---------------------------
22
 
 
 
 
23
 
24
- @st.cache_data
25
- def fetch_data_once():
26
- r = requests.get(API_URL, timeout=30, verify=False)
27
- r.raise_for_status()
28
- fetched_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
29
- return r.json(), fetched_at
30
 
31
 
32
- with st.spinner("載入資料中..."):
33
- data, fetched_at = fetch_data_once()
 
 
34
 
35
- st.metric("資料抓取時間", fetched_at)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  st.divider()
38
 
39
- # ---------------------------
40
- # Normalize JSON to DataFrame
41
- # ---------------------------
42
- items = data if isinstance(data, list) else data.get("data", data)
43
- df = pd.json_normalize(items)
44
-
45
- # ---------------------------
46
- # Sidebar filters
47
- # ---------------------------
48
- st.sidebar.header("篩選條件")
49
-
50
- candidate_fields = [
51
- c for c in df.columns
52
- if c.lower() in {"category", "theme", "publisher", "organization", "org", "format", "license", "city"}
53
- ]
54
-
55
- if not candidate_fields:
56
- candidate_fields = st.sidebar.multiselect(
57
- "選擇要做成下拉選單的欄位",
58
- options=df.columns.tolist(),
59
- default=['授權方式', '是否為開放資料', '資料提供機關']
60
- )
61
-
62
- filters = {}
63
- for field in candidate_fields:
64
- values = sorted(df[field].dropna().astype(str).unique().tolist())
65
- if not values:
66
- continue
67
- choice = st.sidebar.selectbox(f"{field}", ["(全部)"] + values)
68
- if choice != "(全部)":
69
- filters[field] = choice
70
-
71
- filtered = df.copy()
72
- for k, v in filters.items():
73
- filtered = filtered[filtered[k].astype(str) == v]
74
-
75
- q = st.sidebar.text_input("全文關鍵字搜尋")
76
  if q.strip():
77
  mask = filtered.astype(str).apply(
78
  lambda row: row.str.contains(q, case=False, na=False)
79
  ).any(axis=1)
80
  filtered = filtered[mask]
81
 
82
- # ---------------------------
83
- # Main view
84
- # ---------------------------
85
- st.subheader("資料預覽")
86
- st.write(f"{len(filtered):,} 筆(原始 {len(df):,} 筆)")
 
87
  st.dataframe(filtered, use_container_width=True)
88
 
89
- # ---------------------------
90
- # Download (simple & clean)
91
- # ---------------------------
92
  csv_bytes = filtered.to_csv(index=False).encode("utf-8-sig")
93
 
94
  st.download_button(
 
1
+ import os
2
  import time
3
  import requests
4
  import pandas as pd
5
  import streamlit as st
 
6
 
7
API_URL = "https://taic.moda.gov.tw/api/v1/dataset.search.export"

# ---- Config ----
st.set_page_config(page_title="TAIC Pulse", layout="wide")

APP_TITLE = "臺灣主權 AI 訓練語料庫 Explorer"
st.title(APP_TITLE)
st.caption("⚡ 即時資料:本頁面在啟動時從來源 API 抓取一次最新 JSON,並提供互動式篩選與檢視(非持續輪詢)。")

# TLS certificate verification switch, read from the TAIC_VERIFY_SSL
# environment variable (on HF Spaces: Settings -> Variables).
# Verification stays ON unless the variable is an explicit opt-out value, so a
# value such as "true" or a typo can never silently disable it. Set it to 0
# only when the upstream certificate chain cannot be validated.
_SSL_OPT_OUT_VALUES = {"0", "false", "no", "off"}
VERIFY_SSL = (
    os.getenv("TAIC_VERIFY_SSL", "1").strip().lower() not in _SSL_OPT_OUT_VALUES
)
19
 
20
+ # ---- Helpers ----
 
 
 
 
 
21
 
22
 
23
def fetch_json_once() -> dict | list:
    """Issue one GET request to ``API_URL`` and return the decoded JSON body.

    The request timeout (in seconds) is taken from the ``TAIC_TIMEOUT_SEC``
    environment variable. A missing, non-integer, or non-positive value falls
    back to 20 seconds instead of crashing the app at load time.

    Returns:
        The parsed JSON payload (whatever ``Response.json()`` yields).

    Raises:
        requests.HTTPError: if the server answers with an error status
            (raised by ``raise_for_status``).
    """
    default_timeout_sec = 20

    # No UI control for the timeout on purpose: a sane default plus an
    # environment override is enough.
    try:
        timeout_sec = int(os.getenv("TAIC_TIMEOUT_SEC", ""))
    except ValueError:
        timeout_sec = default_timeout_sec
    if timeout_sec <= 0:
        timeout_sec = default_timeout_sec

    r = requests.get(API_URL, timeout=timeout_sec, verify=VERIFY_SSL)
    r.raise_for_status()
    return r.json()
31
+
32
+
33
@st.cache_data(show_spinner=True)
def cached_fetch_json() -> dict | list:
    """Return the API payload through Streamlit's data cache.

    Cached variant of ``fetch_json_once``: per the original author's note, the
    cache can still be hit after a session restart.
    """
    payload = fetch_json_once()
    return payload
37
+
38
+
39
def load_data_once():
    """Load the payload at most once per session.

    - If ``st.session_state`` already holds the data, it is always reused and
      the API is not contacted again.
    - Otherwise the data comes from ``cached_fetch_json`` (which may be a
      cache hit or a real request) and is stored in the session together
      with a timestamp.

    Returns:
        A ``(data, fetched_at)`` tuple, where ``fetched_at`` is a
        ``"%Y-%m-%d %H:%M:%S"`` local-time string.
    """
    state = st.session_state
    if "taic_data" not in state:
        state.taic_data = cached_fetch_json()
        # NOTE(review): this is the moment the session stored the data; when
        # cached_fetch_json serves a cached payload the real request may have
        # happened earlier -- confirm this is the intended meaning.
        state.fetched_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return state.taic_data, state.fetched_at
51
+
52
+
53
def extract_items(data):
    """Pick the record collection out of the decoded API payload.

    A dict yields its ``"data"`` entry when present, otherwise the dict
    itself; every other value (including a list) is returned unchanged.
    """
    if not isinstance(data, dict):
        return data
    return data.get("data", data)
60
+
61
+
62
def _stringify_cell(value):
    """Return ``str(value)`` for any non-missing cell; keep missing cells as-is."""
    # Containers must be tested first: pd.isna/pd.notna on a list returns an
    # element-wise array, whose truth value is ambiguous and raises ValueError.
    if isinstance(value, (list, dict)):
        return str(value)
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return value
    return str(value)


def normalize_df(items) -> pd.DataFrame:
    """Flatten the JSON records into a DataFrame that is safe to show and filter.

    Args:
        items: records accepted by ``pd.json_normalize``.

    Returns:
        The normalized DataFrame. Every column that contains at least one
        ``list`` or ``dict`` cell has all of its non-missing cells converted
        to strings; missing cells are left untouched. An empty frame is
        returned unchanged.
    """
    df = pd.json_normalize(items)

    # Nothing to convert; callers handle the empty case themselves.
    if df.empty:
        return df

    # Columns holding containers cannot be hashed or compared reliably, so
    # turn them into text before they reach select boxes and the table.
    for c in df.columns:
        if df[c].map(lambda x: isinstance(x, (list, dict))).any():
            df[c] = df[c].map(_stringify_cell)

    return df
75
+
76
+
77
def pick_candidate_fields(df: pd.DataFrame) -> list[str]:
    """Choose which columns get a drop-down filter in the sidebar.

    Columns whose name (case-insensitively) is one of a small set of common
    field names are used automatically. If none match, the user is asked to
    pick columns through a sidebar multiselect, pre-filled with the first two
    columns (or fewer when the frame is narrower).

    Args:
        df: the normalized data.

    Returns:
        The selected column labels, in display / cascade order.
    """
    # Common field names; extend with the real TAIC column names as needed.
    preferred = {"category", "theme", "publisher",
                 "organization", "org", "format", "license", "city"}
    # str() guards against non-string column labels, which have no .lower().
    candidates = [c for c in df.columns if str(c).lower() in preferred]

    # No match: let the user choose. This is a plain widget interaction and
    # does not trigger another fetch.
    if not candidates:
        st.sidebar.info("找不到預設欄位,請自行選擇要做成下拉選單的欄位。")
        candidates = st.sidebar.multiselect(
            "選擇要做成下拉選單的欄位(連動順序=顯示順序)",
            df.columns.tolist(),
            # Slicing already copes with frames that have fewer than 2 columns.
            default=df.columns[:2].tolist(),
        )
    return candidates
93
+
94
+
95
def cascading_filters(df: pd.DataFrame, fields: list[str]) -> tuple[pd.DataFrame, dict]:
    """Render dependent sidebar select boxes and apply the chosen filters.

    The select boxes are created in the order of ``fields``. The options of
    each box are computed from the rows that survived all earlier choices, so
    later boxes only offer values that are still reachable.

    Args:
        df: the full data set; it is copied, not modified.
        fields: column labels to filter on; labels missing from the frame and
            columns with no remaining non-null values are skipped.

    Returns:
        ``(filtered_frame, chosen)`` where ``chosen`` maps each filtered
        column to the selected string value.
    """
    all_label = "(全部)"
    remaining = df.copy()
    chosen = {}

    for field in fields:
        if field not in remaining.columns:
            continue

        # Options come from the rows still left after earlier selections.
        as_text = remaining[field].dropna().astype(str)
        options = sorted(as_text.unique().tolist())
        if not options:
            # Nothing selectable in this column under the current filters.
            continue

        picked = st.sidebar.selectbox(
            f"{field} 篩選",
            [all_label] + options,
            index=0,
            key=f"filter_{field}",
        )
        if picked == all_label:
            continue

        chosen[field] = picked
        remaining = remaining[remaining[field].astype(str) == picked]

    return remaining, chosen
135
+
136
+
137
# ---- Load ----
with st.sidebar:
    st.header("篩選條件(連動選單)")

data, fetched_at = load_data_once()
items = extract_items(data)
df = normalize_df(items)

# ---- Header metrics ----
col1, col2 = st.columns([2, 1])
with col1:
    st.subheader("資料預覽與篩選")
with col2:
    st.metric("資料抓取時間", fetched_at)

st.divider()

# Nothing to filter or show: explain and halt this script run.
if df.empty:
    st.warning("資料是空的,或 JSON 結構不符合預期(items 解析後沒有表格資料)。")
    st.stop()

# ---- Filters ----
candidate_fields = pick_candidate_fields(df)
filtered, selected_filters = cascading_filters(df, candidate_fields)

# Optional free-text search across every column.
q = st.sidebar.text_input("全文關鍵字(contains)", "")
if q.strip():
    # regex=False: the keyword is user input and must be matched literally;
    # as a regex, input such as "(" would raise and "." would match anything.
    # DataFrame.apply hands the lambda one column at a time; .any(axis=1)
    # then keeps the rows where at least one column matched.
    mask = filtered.astype(str).apply(
        lambda col: col.str.contains(q, case=False, na=False, regex=False)
    ).any(axis=1)
    filtered = filtered[mask]

# ---- Table ----
st.write(f"共 **{len(filtered):,}** 筆(原始 **{len(df):,}** 筆)")
if selected_filters:
    st.caption(
        "已套用條件:" + "、".join(f"{k}={v}" for k, v in selected_filters.items()))

st.dataframe(filtered, use_container_width=True)

# ---- Download ----
# No progress bar: build the bytes directly. "utf-8-sig" prepends a BOM.
csv_bytes = filtered.to_csv(index=False).encode("utf-8-sig")
181
 
182
  st.download_button(