# Text-preprocessing pipeline for social-media content: hashtags, cashtags,
# mentions, URLs, e-mail addresses and emoji are normalized into placeholder
# tokens before the text is lower-cased.
import html
import re
import urllib.parse
from unicodedata import normalize

from urlextract import URLExtract

from .demojize import demojize
def hashtag_handler(text: str):
    """Wrap each hashtag body in `` <hashtag> .. </hashtag> `` (hash sign dropped)."""
    tag_re = re.compile(r"(#([^\s]+))")
    return tag_re.sub(r" <hashtag> \2 </hashtag> ", text)
def cashtag_handler(text: str):
    """Wrap each $-prefixed ticker in `` <cashtag> .. </cashtag> `` (dollar sign dropped)."""
    tag_re = re.compile(r"(\$([^\s]+))")
    return tag_re.sub(r" <cashtag> \2 </cashtag> ", text)
def mention_handler(text: str):
    """Anonymize @-mentions: every ``@handle`` token becomes `` @user ``."""
    mention_re = re.compile(r"(@([^\s]+))")
    return mention_re.sub(" @user ", text)
# Module-level extractor instance, shared by url_handler so it is built once
# per process rather than on every call.
url_extractor = URLExtract()
def url_handler(text: str):
    """Replace every URL in *text* with `` <http> domain </http> ``.

    Bug fix: the original passed the scheme-normalized URLs through ``set()``
    (deduplicating and reordering them) and then indexed the resulting domain
    list against the *original* URL list, so a domain could be attached to
    the wrong URL, and duplicate URLs left some matches unprocessed. Here
    each extracted URL is paired directly with its own parsed domain.
    """
    for url in url_extractor.gen_urls(text):
        # Bare domains such as "example.com" need a scheme before
        # urlparse will populate .netloc.
        normalized = url if "http" in url else f"https://{url}"
        domain = urllib.parse.urlparse(normalized).netloc
        text = text.replace(url, f" <http> {domain} </http> ")
    return text
def email_handler(text: str):
    """Replace every e-mail address with the `` <email> `` placeholder token."""
    email_re = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
    for address in email_re.findall(text):
        text = text.replace(address, " <email> ").strip()
    return text
def emoji_handler(text: str):
    """Convert emoji to their (Turkish) names wrapped in `` <emoji> .. </emoji> ``."""
    wrapped = demojize(text, language="tr", delimiters=(" <emoji> ", " </emoji> "))
    return wrapped
def normalize_text(text: str):
    """Canonicalize *text* to Unicode NFC so composed/decomposed forms compare equal."""
    canonical_form = "NFC"
    return normalize(canonical_form, text)
def preprocess(text: str):
    """Run the full cleaning pipeline over *text*.

    Order matters: e-mails are tokenized before mentions (otherwise the
    ``@`` in an address would be caught by the mention handler), and URLs
    before hashtags/cashtags. The result is whitespace-collapsed,
    lower-cased and stripped.
    """
    steps = (
        html.unescape,
        normalize_text,
        email_handler,
        url_handler,
        hashtag_handler,
        cashtag_handler,
        mention_handler,
        emoji_handler,
    )
    output = text
    for step in steps:
        output = step(output)
    output = re.sub(r"\s+", " ", output)
    return output.lower().strip()
if __name__ == "__main__":
    # Smoke-test entry point: preprocess a sample string and show the result.
    sample_text = ""
    print(preprocess(sample_text))