Spaces:

nazneen
/

error-analysis

Runtime error

App Files Files Community

nazneen committed on May 21, 2022

Commit

3d8ee64

1 Parent(s): 58f2ab9

adding parquets

Browse files

Files changed (5) hide show

app.py +10 -17
assets/data/amazon_polarity_albert-base-v2-yelp-polarity.parquet +3 -0
assets/data/amazon_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet +3 -0
assets/data/yelp_polarity_albert-base-v2-yelp-polarity.parquet +3 -0
assets/data/yelp_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet +3 -0

app.py CHANGED Viewed

@@ -109,7 +109,7 @@ def quant_panel(embedding_df):
         st.markdown("* Each **point** is an input example.")
         st.markdown("* Gray points have low-loss and the colored have high-loss. High-loss instances are clustered using **kmeans** and each color represents a cluster.")
         st.markdown("* The **shape** of each point reflects the label category --  positive (diamond) or negative sentiment (circle).")
-    st.altair_chart(data_comparison(down_samp(embedding_df)))
 def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
@@ -156,15 +156,11 @@ def get_data(spotlight, emb):
 @st.cache(ttl=600)
 def clustering(data,num_clusters):
     X = np.array(data['embedding'].tolist())
     kclusterer = KMeansClusterer(
         num_clusters, distance=nltk.cluster.util.cosine_distance,
         repeats=25,avoid_empty_clusters=True)
     assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
     data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
     data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
@@ -222,22 +218,18 @@ if __name__ == "__main__":
     # ******* loading the mode and the data
     dataset = st.sidebar.selectbox(
         "Dataset",
-        ["amazon_polarity", "squad", "movielens", "waterbirds"],
-        index=0
     )
-    tokenizer = AutoTokenizer.from_pretrained(
-        "distilbert-base-uncased-finetuned-sst-2-english")
     model = st.sidebar.selectbox(
         "Model",
         ["distilbert-base-uncased-finetuned-sst-2-english",
-            "distilbert-base-uncased-finetuned-sst-2-english"],
-        index=0
     )
     loss_quantile = st.sidebar.slider(
-        "Loss Quantile", min_value=0.0, max_value=1.0,step=0.01,value=0.95
     )
     run_kmeans = st.sidebar.radio("Cluster error slice?", ('True', 'False'), index=0)
@@ -245,10 +237,11 @@ if __name__ == "__main__":
     num_clusters = st.sidebar.slider("# clusters", min_value=1, max_value=20, step=1, value=3)
     ### LOAD DATA AND SESSION VARIABLES ###
-    data = pd.read_parquet('./assets/data/amazon_polarity.test.parquet')
-    embedding_umap = data[['x','y']]
-    emb_df = pd.read_parquet('./assets/data/amazon_test_emb.parquet')
-    data_df = pd.DataFrame([data['content'], data['label'], data['pred'], data['loss'], emb_df['embedding'], data['x'], data['y']]).transpose()
     if "user_data" not in st.session_state:
         st.session_state["user_data"] = data_df
     if "selected_slice" not in st.session_state:

         st.markdown("* Each **point** is an input example.")
         st.markdown("* Gray points have low-loss and the colored have high-loss. High-loss instances are clustered using **kmeans** and each color represents a cluster.")
         st.markdown("* The **shape** of each point reflects the label category --  positive (diamond) or negative sentiment (circle).")
+    st.altair_chart(data_comparison(down_samp(embedding_df)), use_container_width=True)
 def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
 @st.cache(ttl=600)
 def clustering(data,num_clusters):
     X = np.array(data['embedding'].tolist())
     kclusterer = KMeansClusterer(
         num_clusters, distance=nltk.cluster.util.cosine_distance,
         repeats=25,avoid_empty_clusters=True)
     assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
     data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
     data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
     # ******* loading the mode and the data
     dataset = st.sidebar.selectbox(
         "Dataset",
+        ["amazon_polarity", "yelp_polarity"],
+        index = 1
     )
     model = st.sidebar.selectbox(
         "Model",
         ["distilbert-base-uncased-finetuned-sst-2-english",
+            "albert-base-v2-yelp-polarity"],
     )
     loss_quantile = st.sidebar.slider(
+        "Loss Quantile", min_value=0.5, max_value=1.0,step=0.01,value=0.95
     )
     run_kmeans = st.sidebar.radio("Cluster error slice?", ('True', 'False'), index=0)
     num_clusters = st.sidebar.slider("# clusters", min_value=1, max_value=20, step=1, value=3)
     ### LOAD DATA AND SESSION VARIABLES ###
+    data_df = pd.read_parquet('./assets/data/'+dataset+ '_'+ model+'.parquet')
+    if model == 'albert-base-v2-yelp-polarity':
+        tokenizer = AutoTokenizer.from_pretrained('textattack/'+model)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(model)
     if "user_data" not in st.session_state:
         st.session_state["user_data"] = data_df
     if "selected_slice" not in st.session_state:

assets/data/amazon_polarity_albert-base-v2-yelp-polarity.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bce0297bedc66865c01644421ea934008d74807befb7b0bd94aa92729bd02a59
+size 56644779

assets/data/amazon_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a193c26851f48b7b76a35986ced0dc1fddafd26b92f1aaf9a4e69fd83fd2f2e4
+size 56643545

assets/data/yelp_polarity_albert-base-v2-yelp-polarity.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a56147880841c6f78a868fb58f6e97661547009e570c2887ef7c12ffd54474e
+size 103294569

assets/data/yelp_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:165515be2837df9b02f782fe1e7bd3b31bb01c49960e73238f77541eee7589ad
+size 61796202