# Text-preprocessing pipeline for social-media content: hashtags, cashtags,
# mentions, URLs, e-mail addresses and emoji are normalized into placeholder
# tokens before the text is lower-cased.
import html
import re
import urllib.parse
from unicodedata import normalize

from urlextract import URLExtract

from .demojize import demojize
def hashtag_handler(text: str):
    """Wrap each hashtag body in `` <hashtag> .. </hashtag> `` (hash sign dropped)."""
    tag_re = re.compile(r"(#([^\s]+))")
    return tag_re.sub(r" <hashtag> \2 </hashtag> ", text)
def cashtag_handler(text: str):
    """Wrap each $-prefixed ticker in `` <cashtag> .. </cashtag> `` (dollar sign dropped)."""
    tag_re = re.compile(r"(\$([^\s]+))")
    return tag_re.sub(r" <cashtag> \2 </cashtag> ", text)
def mention_handler(text: str):
    """Anonymize @-mentions: every ``@handle`` token becomes `` @user ``."""
    mention_re = re.compile(r"(@([^\s]+))")
    return mention_re.sub(" @user ", text)
# Module-level extractor instance, shared by url_handler so it is built once
# per process rather than on every call.
url_extractor = URLExtract()
def url_handler(text: str):
    """Replace every URL in *text* with `` <http> domain </http> ``.

    Bug fix: the original passed the scheme-normalized URLs through ``set()``
    (deduplicating and reordering them) and then indexed the resulting domain
    list against the *original* URL list, so a domain could be attached to
    the wrong URL, and duplicate URLs left some matches unprocessed. Here
    each extracted URL is paired directly with its own parsed domain.
    """
    for url in url_extractor.gen_urls(text):
        # Bare domains such as "example.com" need a scheme before
        # urlparse will populate .netloc.
        normalized = url if "http" in url else f"https://{url}"
        domain = urllib.parse.urlparse(normalized).netloc
        text = text.replace(url, f" <http> {domain} </http> ")
    return text
def email_handler(text: str):
    """Replace every e-mail address with the `` <email> `` placeholder token."""
    email_re = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
    for address in email_re.findall(text):
        text = text.replace(address, " <email> ").strip()
    return text
def emoji_handler(text: str):
    """Convert emoji to their (Turkish) names wrapped in `` <emoji> .. </emoji> ``."""
    wrapped = demojize(text, language="tr", delimiters=(" <emoji> ", " </emoji> "))
    return wrapped
def normalize_text(text: str):
    """Canonicalize *text* to Unicode NFC so composed/decomposed forms compare equal."""
    canonical_form = "NFC"
    return normalize(canonical_form, text)
def preprocess(text: str):
    """Run the full cleaning pipeline over *text*.

    Order matters: e-mails are tokenized before mentions (otherwise the
    ``@`` in an address would be caught by the mention handler), and URLs
    before hashtags/cashtags. The result is whitespace-collapsed,
    lower-cased and stripped.
    """
    steps = (
        html.unescape,
        normalize_text,
        email_handler,
        url_handler,
        hashtag_handler,
        cashtag_handler,
        mention_handler,
        emoji_handler,
    )
    output = text
    for step in steps:
        output = step(output)
    output = re.sub(r"\s+", " ", output)
    return output.lower().strip()
if __name__ == "__main__":
    # Smoke-test entry point: preprocess a sample string and show the result.
    sample_text = ""
    print(preprocess(sample_text))