Spaces:
Runtime error
Runtime error
adding parquets
Browse files- app.py +10 -17
- assets/data/amazon_polarity_albert-base-v2-yelp-polarity.parquet +3 -0
- assets/data/amazon_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet +3 -0
- assets/data/yelp_polarity_albert-base-v2-yelp-polarity.parquet +3 -0
- assets/data/yelp_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet +3 -0
app.py
CHANGED
|
@@ -109,7 +109,7 @@ def quant_panel(embedding_df):
|
|
| 109 |
st.markdown("* Each **point** is an input example.")
|
| 110 |
st.markdown("* Gray points have low-loss and the colored have high-loss. High-loss instances are clustered using **kmeans** and each color represents a cluster.")
|
| 111 |
st.markdown("* The **shape** of each point reflects the label category -- positive (diamond) or negative sentiment (circle).")
|
| 112 |
-
st.altair_chart(data_comparison(down_samp(embedding_df)))
|
| 113 |
|
| 114 |
|
| 115 |
def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
|
|
@@ -156,15 +156,11 @@ def get_data(spotlight, emb):
|
|
| 156 |
|
| 157 |
@st.cache(ttl=600)
|
| 158 |
def clustering(data,num_clusters):
|
| 159 |
-
|
| 160 |
X = np.array(data['embedding'].tolist())
|
| 161 |
-
|
| 162 |
kclusterer = KMeansClusterer(
|
| 163 |
num_clusters, distance=nltk.cluster.util.cosine_distance,
|
| 164 |
repeats=25,avoid_empty_clusters=True)
|
| 165 |
-
|
| 166 |
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
|
| 167 |
-
|
| 168 |
data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
|
| 169 |
data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
|
| 170 |
|
|
@@ -222,22 +218,18 @@ if __name__ == "__main__":
|
|
| 222 |
# ******* loading the model and the data
|
| 223 |
dataset = st.sidebar.selectbox(
|
| 224 |
"Dataset",
|
| 225 |
-
["amazon_polarity", "
|
| 226 |
-
index=
|
| 227 |
)
|
| 228 |
|
| 229 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
| 230 |
-
"distilbert-base-uncased-finetuned-sst-2-english")
|
| 231 |
-
|
| 232 |
model = st.sidebar.selectbox(
|
| 233 |
"Model",
|
| 234 |
["distilbert-base-uncased-finetuned-sst-2-english",
|
| 235 |
-
"
|
| 236 |
-
index=0
|
| 237 |
)
|
| 238 |
|
| 239 |
loss_quantile = st.sidebar.slider(
|
| 240 |
-
"Loss Quantile", min_value=0.
|
| 241 |
)
|
| 242 |
|
| 243 |
run_kmeans = st.sidebar.radio("Cluster error slice?", ('True', 'False'), index=0)
|
|
@@ -245,10 +237,11 @@ if __name__ == "__main__":
|
|
| 245 |
num_clusters = st.sidebar.slider("# clusters", min_value=1, max_value=20, step=1, value=3)
|
| 246 |
|
| 247 |
### LOAD DATA AND SESSION VARIABLES ###
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
|
|
|
| 252 |
if "user_data" not in st.session_state:
|
| 253 |
st.session_state["user_data"] = data_df
|
| 254 |
if "selected_slice" not in st.session_state:
|
|
|
|
| 109 |
st.markdown("* Each **point** is an input example.")
|
| 110 |
st.markdown("* Gray points have low-loss and the colored have high-loss. High-loss instances are clustered using **kmeans** and each color represents a cluster.")
|
| 111 |
st.markdown("* The **shape** of each point reflects the label category -- positive (diamond) or negative sentiment (circle).")
|
| 112 |
+
st.altair_chart(data_comparison(down_samp(embedding_df)), use_container_width=True)
|
| 113 |
|
| 114 |
|
| 115 |
def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
|
|
|
|
| 156 |
|
| 157 |
@st.cache(ttl=600)
|
| 158 |
def clustering(data,num_clusters):
|
|
|
|
| 159 |
X = np.array(data['embedding'].tolist())
|
|
|
|
| 160 |
kclusterer = KMeansClusterer(
|
| 161 |
num_clusters, distance=nltk.cluster.util.cosine_distance,
|
| 162 |
repeats=25,avoid_empty_clusters=True)
|
|
|
|
| 163 |
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
|
|
|
|
| 164 |
data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
|
| 165 |
data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
|
| 166 |
|
|
|
|
| 218 |
# ******* loading the model and the data
|
| 219 |
dataset = st.sidebar.selectbox(
|
| 220 |
"Dataset",
|
| 221 |
+
["amazon_polarity", "yelp_polarity"],
|
| 222 |
+
index = 1
|
| 223 |
)
|
| 224 |
|
|
|
|
|
|
|
|
|
|
| 225 |
model = st.sidebar.selectbox(
|
| 226 |
"Model",
|
| 227 |
["distilbert-base-uncased-finetuned-sst-2-english",
|
| 228 |
+
"albert-base-v2-yelp-polarity"],
|
|
|
|
| 229 |
)
|
| 230 |
|
| 231 |
loss_quantile = st.sidebar.slider(
|
| 232 |
+
"Loss Quantile", min_value=0.5, max_value=1.0,step=0.01,value=0.95
|
| 233 |
)
|
| 234 |
|
| 235 |
run_kmeans = st.sidebar.radio("Cluster error slice?", ('True', 'False'), index=0)
|
|
|
|
| 237 |
num_clusters = st.sidebar.slider("# clusters", min_value=1, max_value=20, step=1, value=3)
|
| 238 |
|
| 239 |
### LOAD DATA AND SESSION VARIABLES ###
|
| 240 |
+
data_df = pd.read_parquet('./assets/data/'+dataset+ '_'+ model+'.parquet')
|
| 241 |
+
if model == 'albert-base-v2-yelp-polarity':
|
| 242 |
+
tokenizer = AutoTokenizer.from_pretrained('textattack/'+model)
|
| 243 |
+
else:
|
| 244 |
+
tokenizer = AutoTokenizer.from_pretrained(model)
|
| 245 |
if "user_data" not in st.session_state:
|
| 246 |
st.session_state["user_data"] = data_df
|
| 247 |
if "selected_slice" not in st.session_state:
|
assets/data/amazon_polarity_albert-base-v2-yelp-polarity.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bce0297bedc66865c01644421ea934008d74807befb7b0bd94aa92729bd02a59
|
| 3 |
+
size 56644779
|
assets/data/amazon_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a193c26851f48b7b76a35986ced0dc1fddafd26b92f1aaf9a4e69fd83fd2f2e4
|
| 3 |
+
size 56643545
|
assets/data/yelp_polarity_albert-base-v2-yelp-polarity.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a56147880841c6f78a868fb58f6e97661547009e570c2887ef7c12ffd54474e
|
| 3 |
+
size 103294569
|
assets/data/yelp_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:165515be2837df9b02f782fe1e7bd3b31bb01c49960e73238f77541eee7589ad
|
| 3 |
+
size 61796202
|