import re
import string

import emoji
import nltk
import streamlit as st
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from streamlit import session_state
from transformers import pipeline

# NLTK resources required on the first run
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stopwords = nltk_stopwords.words('english')

# XLM-RoBERTa model fine-tuned for 16 risk categories
classifier = pipeline(
    "text-classification",
    model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch",
)
def pre_processing_str_esg(df_col):
    """Full cleanup for longer texts: lowercase, punctuation, stopwords, URLs, digits, emoji."""
    df_col = df_col.lower()

    # remove punctuation characters
    def remove_punctuation(text):
        return "".join([i for i in text if i not in string.punctuation])

    df_col = remove_punctuation(df_col)

    # drop URLs
    df_col = re.sub(r"http\S+", " ", df_col)

    # remove English stopwords
    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in stopwords])

    df_col = remove_stopwords(df_col)

    # replace remaining punctuation and special symbols with spaces
    df_col = re.sub('[%s]' % re.escape(string.punctuation), ' ', df_col)
    df_col = df_col.replace("¶", "")
    df_col = df_col.replace("§", "")
    df_col = df_col.replace('“', ' ')
    df_col = df_col.replace('”', ' ')
    df_col = df_col.replace('-', ' ')

    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
    df_col = REPLACE_BY_SPACE_RE.sub(' ', df_col)
    df_col = BAD_SYMBOLS_RE.sub(' ', df_col)

    # drop digits and collapse repeated spaces
    df_col = re.sub('[0-9]+', ' ', df_col)
    df_col = re.sub(' +', ' ', df_col)

    # strip emoji by Unicode range
    def remove_emoji(text):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    df_col = remove_emoji(df_col)
    return df_col
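# Illustrative sketch (not part of the original app): roughly what the long-text
# cleaner produces for a sample sentence. Exact output depends on the NLTK stopword list.
# >>> pre_processing_str_esg("Check https://example.com: CO2 emissions rose 12% in 2023! 🌍")
# something like 'check co emissions rose'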
def pre_processing_str(df_col):
    """Light cleanup for short texts; texts of 70+ words go through the full pipeline above."""
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)

    df_col = df_col.replace('#', '')
    df_col = df_col.replace('!', '')
    df_col = re.sub(r"http\S+", " ", df_col)
    df_col = re.sub('[0-9]+', ' ', df_col)
    df_col = re.sub(' +', ' ', df_col)

    # strip emoji, mentions/links, and non-printable characters
    df_col = emoji.replace_emoji(df_col, replace='')
    df_col = re.sub(r"(?:\@|https?\://)\S+", "", df_col)
    df_col = re.sub(r"[^\x20-\x7E]+", "", df_col)
    return df_col.strip()
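# Illustrative sketch (not part of the original app): short texts keep case and stopwords,
# so tweet-length input is only lightly cleaned.
# >>> pre_processing_str("Big fine for #ClimateFraud! 🌍 https://t.co/xyz")
# something like 'Big fine for ClimateFraud'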
# Entry point used by the Streamlit UI: preprocess the text and return the top-2 predicted classes.
def process(text):
    text = pre_processing_str(text)
    try:
        if len(text) != 0:
            results = classifier(text, top_k=2)
        else:
            results = 'No Text'
        return {'output_16': results}
    except Exception:
        return {'output_16': 'something went wrong'}
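# Sketch of the expected return shape (assumed from the transformers text-classification
# pipeline with top_k=2; actual label names come from the model's config):
# process("Factory fined over toxic waste dumping")
# -> {'output_16': [{'label': '<category>', 'score': 0.97}, {'label': '<category>', 'score': 0.02}]}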
st.set_page_config(page_title="core_risk", page_icon="📈")

if 'topic_class' not in session_state:
    session_state['topic_class'] = ""

st.title("Core Risk Category Classifier")
text = st.text_area(label="Please write the text below",
                    placeholder="What does the text say?")

def classify(text):
    session_state['topic_class'] = process(text)

st.text_area("result", value=session_state['topic_class'])
st.button("Classify", on_click=classify, args=[text])
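# To run locally (standard Streamlit invocation, assuming this file is saved as app.py):
#   streamlit run app.py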