import re
import string

import emoji
import nltk
import streamlit as st
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from streamlit import session_state
from transformers import pipeline

# NLTK resources required on the first run
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stopwords = nltk_stopwords.words('english')

# XLM-RoBERTa model fine-tuned for 16 risk categories
classifier = pipeline(
    "text-classification",
    model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch",
)
def pre_processing_str_esg(df_col):
    """Full cleanup for longer texts: lowercase, punctuation, stopwords, URLs, digits, emoji."""
    df_col = df_col.lower()

    # remove punctuation characters
    def remove_punctuation(text):
        return "".join([i for i in text if i not in string.punctuation])

    df_col = remove_punctuation(df_col)

    # drop URLs
    df_col = re.sub(r"http\S+", " ", df_col)

    # remove English stopwords
    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in stopwords])

    df_col = remove_stopwords(df_col)

    # replace remaining punctuation and special symbols with spaces
    df_col = re.sub('[%s]' % re.escape(string.punctuation), ' ', df_col)
    df_col = df_col.replace("¶", "")
    df_col = df_col.replace("§", "")
    df_col = df_col.replace('“', ' ')
    df_col = df_col.replace('”', ' ')
    df_col = df_col.replace('-', ' ')

    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
    df_col = REPLACE_BY_SPACE_RE.sub(' ', df_col)
    df_col = BAD_SYMBOLS_RE.sub(' ', df_col)

    # drop digits and collapse repeated spaces
    df_col = re.sub('[0-9]+', ' ', df_col)
    df_col = re.sub(' +', ' ', df_col)

    # strip emoji by Unicode range
    def remove_emoji(text):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    df_col = remove_emoji(df_col)
    return df_col
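# Illustrative sketch (not part of the original app): roughly what the long-text
# cleaner produces for a sample sentence. Exact output depends on the NLTK stopword list.
# >>> pre_processing_str_esg("Check https://example.com: CO2 emissions rose 12% in 2023! 🌍")
# something like 'check co emissions rose'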
def pre_processing_str(df_col):
    """Light cleanup for short texts; texts of 70+ words go through the full pipeline above."""
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)

    df_col = df_col.replace('#', '')
    df_col = df_col.replace('!', '')
    df_col = re.sub(r"http\S+", " ", df_col)
    df_col = re.sub('[0-9]+', ' ', df_col)
    df_col = re.sub(' +', ' ', df_col)

    # strip emoji, mentions/links, and non-printable characters
    df_col = emoji.replace_emoji(df_col, replace='')
    df_col = re.sub(r"(?:\@|https?\://)\S+", "", df_col)
    df_col = re.sub(r"[^\x20-\x7E]+", "", df_col)
    return df_col.strip()
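# Illustrative sketch (not part of the original app): short texts keep case and stopwords,
# so tweet-length input is only lightly cleaned.
# >>> pre_processing_str("Big fine for #ClimateFraud! 🌍 https://t.co/xyz")
# something like 'Big fine for ClimateFraud'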
# Entry point used by the Streamlit UI: preprocess the text and return the top-2 predicted classes.
def process(text):
    text = pre_processing_str(text)
    try:
        if len(text) != 0:
            results = classifier(text, top_k=2)
        else:
            results = 'No Text'
        return {'output_16': results}
    except Exception:
        return {'output_16': 'something went wrong'}
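# Sketch of the expected return shape (assumed from the transformers text-classification
# pipeline with top_k=2; actual label names come from the model's config):
# process("Factory fined over toxic waste dumping")
# -> {'output_16': [{'label': '<category>', 'score': 0.97}, {'label': '<category>', 'score': 0.02}]}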
st.set_page_config(page_title="core_risk", page_icon="📈")

if 'topic_class' not in session_state:
    session_state['topic_class'] = ""

st.title("Core Risk Category Classifier")
text = st.text_area(label="Please write the text below",
                    placeholder="What does the text say?")

def classify(text):
    session_state['topic_class'] = process(text)

st.text_area("result", value=session_state['topic_class'])
st.button("Classify", on_click=classify, args=[text])
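# To run locally (standard Streamlit invocation, assuming this file is saved as app.py):
#   streamlit run app.py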