import os
import re
import string

import numpy as np
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
import openai
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy.cluster.hierarchy import linkage, dendrogram

openai.api_key = os.getenv("OPENAI_API_KEY")

# Download the NLTK resources used by the preprocessing step
nltk.download("stopwords")
nltk.download("punkt")
# Text preprocessing: lowercase, strip stopwords, bracketed spans,
# ellipses, intra-word dashes, extra whitespace, and punctuation.
def clean_text_1(text):
    stop_words = set(stopwords.words("english"))

    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in stop_words])

    text = str(text).lower()                     # Lowercase first so capitalized stopwords are caught
    text = remove_stopwords(text)
    text = re.sub(r"\[(.*?)\]", " ", text)       # Remove [+XYZ chars] spans in content
    text = re.sub(r"\s+", " ", text)             # Collapse multiple spaces
    text = re.sub(r"\w+…|…", " ", text)          # Remove ellipsis (and the word before it)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Split dashes between words
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    return text
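
# Illustrative check of the cleaner (made-up input, not project data):
# stopwords and the bracketed span are dropped, the dash is split.
#   clean_text_1("Self-driving cars are the future [+200 chars]")
#   -> "self driving cars future" (up to stray whitespace)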
# Load the sentence-transformer model used for embeddings (imports are consolidated at the top)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def get_embedding(text):
    # Optionally clean the text first, e.g. text = clean_text_1(text)
    return model.encode(text)
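
# Note: calling model.encode() once per row re-enters the model for every
# text. A sketch of the equivalent batched call, which is typically much
# faster (encode() accepts a list and returns a 2D numpy array):
#
#   matrix = model.encode(df['text'].tolist(), batch_size=32, show_progress_bar=True)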
# Streamlit UI configuration
st.set_page_config(
    page_title="Text Clustering",
    page_icon="📊",
)
# Upload file
uploaded_file = st.file_uploader("Choose a file")

if uploaded_file:
    # Read data from file; a 'text' column is expected
    df = pd.read_csv(uploaded_file)

    # Drop rows with missing text
    df = df[df['text'].notna()].reset_index(drop=True)

    # Get embeddings
    df['embedding'] = df['text'].apply(get_embedding)
    matrix = np.vstack(df['embedding'].values)

    # Distance threshold slider
    distance_threshold = st.slider("Select Distance Threshold", min_value=0.1, max_value=2.0, value=1.1, step=0.1)

    # Perform agglomerative (hierarchical) clustering; the number of clusters
    # is inferred from the distance threshold
    agg_clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, linkage='ward')
    cluster_labels = agg_clustering.fit_predict(matrix)
    df['Cluster'] = cluster_labels
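
    # Optional sanity check of the chosen threshold using the metrics imported
    # above (a sketch; silhouette/Davies-Bouldin need at least 2 clusters):
    if len(np.unique(cluster_labels)) > 1:
        st.write(f"Silhouette score: {silhouette_score(matrix, cluster_labels):.3f}")
        st.write(f"Davies-Bouldin index: {davies_bouldin_score(matrix, cluster_labels):.3f}")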
    # Visualize clusters with t-SNE
    tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    vis_dims2 = tsne.fit_transform(matrix)
    x = [x for x, y in vis_dims2]
    y = [y for x, y in vis_dims2]

    unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)

    # Create a colormap with one color per cluster
    colormap = plt.cm.get_cmap("viridis", len(unique_clusters))

    # Plot each cluster, marking its centroid with an "x"
    fig, ax = plt.subplots()
    for category, (color, size) in enumerate(zip(colormap.colors, cluster_counts)):
        xs = np.array(x)[cluster_labels == category]
        ys = np.array(y)[cluster_labels == category]
        ax.scatter(xs, ys, color=color, alpha=0.3, label=f'Cluster {category} (Size: {size})')
        avg_x = xs.mean()
        avg_y = ys.mean()
        ax.scatter(avg_x, avg_y, marker="x", color=color, s=100)
    ax.set_title("Clusters in embedding space, visualized in 2D with t-SNE")
    ax.legend()

    # Display the plot in Streamlit
    st.pyplot(fig)

    st.text_area("Number of Cluster Labels", value=str(len(unique_clusters)))
    # Ask the chat model to name a theme for a sample review from each cluster
    rev_per_cluster = 1
    n_clusters = len(unique_clusters)
    for i in range(n_clusters):
        reviews = "\n".join(
            df[df.Cluster == i]
            .text.str.replace("Title: ", "")
            .str.replace("\n\nContent: ", ": ")
            .sample(rev_per_cluster, random_state=42)
            .values
        )
        messages = [
            {"role": "user", "content": f'What do the following have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:'}
        ]
        # Note: openai.ChatCompletion.create is the pre-1.0 OpenAI SDK interface
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0)
        st.text_area(f"Cluster {i} Theme", value=response.choices[0].message.content.replace("\n", ""))
        # sample_cluster_rows = df[df.Cluster == i].sample(rev_per_cluster, random_state=42)
        # for j in range(rev_per_cluster):
        #     print(sample_cluster_rows.Score.values[j], end=", ")
        #     print(sample_cluster_rows.Summary.values[j], end=": ")
        #     print(sample_cluster_rows.Text.str[:70].values[j])
        # print("-" * 100)
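
# Performance note (a sketch, not part of the original app): every rerun of
# the Streamlit script re-embeds the whole file. On a recent Streamlit that
# provides st.cache_data, the embedding step can be memoized across reruns:
#
#   @st.cache_data
#   def embed_texts(texts):
#       return np.vstack(model.encode(texts))
#
#   matrix = embed_texts(df['text'].tolist())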