#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 19 16:49:22 2025
@author: jacobwildt-persson
"""
# -----------------------------------------------
# Requirements & Setup Instructions
# -----------------------------------------------
# Python version:
#   Requires Python 3.10 or later (tested on 3.12).
#
# Run the script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
# Recreate the environment with these commands in a terminal:
#   conda env create -f environment.yml
#   conda activate sprakenv
#
# Install all required packages (run in a terminal):
#   pip install --upgrade gradio
#   pip install pdfplumber
#   pip install nltk
#   pip install transformers
#   pip install -U spacy
#
# Download language models and data:
#   python -m spacy download es_core_news_lg
#   python -m spacy download en_core_web_lg   # if you add NER for English
#   python -m nltk.downloader punkt wordnet
#
# Check the Gradio version used:
#   import gradio as gr
#   print(gr.__version__)  # Gradio version 4.18.0
#
# References:
#   Gradio Quickstart Guide: https://www.gradio.app/guides/quickstart
#   Hugging Face models: https://huggingface.co/models
#   English grammar API (LanguageTool): https://languagetool.org/http-api/swagger
# -----------------------------------------------
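# The environment.yml referenced above is not included here; a minimal file
# consistent with the packages listed would look roughly like this (an
# assumption for illustration, not the original file):
#
#   name: sprakenv
#   channels:
#     - conda-forge
#   dependencies:
#     - python=3.12
#     - pip
#     - pip:
#       - gradio
#       - pdfplumber
#       - nltk
#       - transformers
#       - spacy
#       - pandas
#       - requests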
| """ | |
| Language learning app with Gradio UI, on & multiple users: | |
| - Import text from file (.txt/.csv/.pdf) or manual text input | |
| - Grammar correction via transformers (Spanish) or LanguageTool API (English) | |
| - Analyze text (known/unknown words) per user & language | |
| - Save unknown words as known | |
| - Generate coherent practice sentence (Spanish & English) | |
| - Log grammar corrections and practice sentence suggestions to CSV | |
| """ | |
import os
import datetime
import sqlite3  # SQLite ships with Python's standard library (no extra install needed)
import requests
import random
import csv
import pandas as pd
import pdfplumber
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
import gradio as gr
import gradio_client.utils as _gcu
# --- Patch for a Gradio utils schema bug ---
# Some schemas passed to gradio_client are not dicts, which crashes the
# original helpers; fall back to "any" in that case.
_orig_json = _gcu.json_schema_to_python_type
_orig_get = _gcu.get_type

def _patched_json_to_py(schema, defs=None):
    if not isinstance(schema, dict):
        return "any"
    try:
        return _orig_json(schema, defs)
    except Exception:
        return "any"

def _patched_get_type(schema):
    if not isinstance(schema, dict):
        return "any"
    try:
        return _orig_get(schema)
    except Exception:
        return "any"

_gcu.json_schema_to_python_type = _patched_json_to_py
_gcu.get_type = _patched_get_type
# --- SQLite database initialization ---
DB_NAME = "vocabulary.db"
conn = sqlite3.connect(DB_NAME)
conn.execute("""
    CREATE TABLE IF NOT EXISTS vocabulary (
        user_id   TEXT,
        language  TEXT,
        word      TEXT,
        timestamp TEXT,
        UNIQUE(user_id, language, word)
    )
""")
conn.commit()
conn.close()
# --- Save a word to the database ---
def save_word_to_db(user_id: str, language: str, word: str):
    ts = datetime.datetime.now().isoformat()
    conn = sqlite3.connect(DB_NAME)
    conn.execute(
        "INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
        (user_id, language, word, ts)
    )
    conn.commit()
    conn.close()
# --- Retrieve known words for a user/language ---
def get_user_vocabulary(user_id: str, language: str) -> set[str]:
    conn = sqlite3.connect(DB_NAME)
    rows = conn.execute(
        "SELECT word FROM vocabulary WHERE user_id=? AND language=?",
        (user_id, language)
    ).fetchall()
    conn.close()
    return {r[0] for r in rows}
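# Design note: each helper opens and closes its own connection rather than
# sharing one, which keeps the sqlite3 usage thread-safe under Gradio's
# threaded request handling. Example usage (hypothetical data):
#   save_word_to_db("anna", "es", "gato")
#   get_user_vocabulary("anna", "es")   # -> {"gato"}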
# --- Load NLP models ---
nlp = spacy.load("es_core_news_lg")
tokenizer = AutoTokenizer.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")
model = BartForConditionalGeneration.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")
gpt2_tokenizer_es = AutoTokenizer.from_pretrained("mrm8488/spanish-gpt2")
gpt2_model_es = AutoModelForCausalLM.from_pretrained("mrm8488/spanish-gpt2")
gpt2_tokenizer_en = AutoTokenizer.from_pretrained("gpt2")
gpt2_model_en = AutoModelForCausalLM.from_pretrained("gpt2")
lemmatizer = WordNetLemmatizer()
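# word_tokenize and WordNetLemmatizer require NLTK's "punkt" and "wordnet"
# data. The setup notes install them via `python -m nltk.downloader punkt wordnet`;
# as a convenience, this startup check (an addition, not in the original code)
# downloads them only if they are missing:
import nltk
for _pkg, _path in (("punkt", "tokenizers/punkt"), ("wordnet", "corpora/wordnet")):
    try:
        nltk.data.find(_path)
    except LookupError:
        nltk.download(_pkg, quiet=True)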
# --- Log to CSV (grammar corrections and sentence suggestions) ---
def log_to_csv(filename, row, fieldnames):
    file_exists = os.path.isfile(filename)
    with open(filename, "a", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        writer.writerow(row)
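# Example (illustrative values): the call below appends one row to
# grammarlog.csv, writing the header first if the file does not exist yet.
#   log_to_csv("grammarlog.csv",
#              {"user": "anna", "language": "es", "input": "...", "output": "...",
#               "timestamp": datetime.datetime.now().isoformat()},
#              ["user", "language", "input", "output", "timestamp"])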
# --- File import ---
def import_file(path: str) -> str:
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        pages = []
        with pdfplumber.open(path) as pdf:
            for p in pdf.pages:
                pages.append(p.extract_text() or "")
        return "\n".join(pages)
    if ext == ".csv":
        df = pd.read_csv(path)
        if "text" in df:
            return "\n".join(df["text"].astype(str))
        raise ValueError("CSV saknar kolumnen 'text'.")
    if ext == ".txt":
        with open(path, encoding="utf-8") as f:
            return f.read()
    raise ValueError(f"Okänt filformat: {ext}")
# --- Grammar correction ---
def correct_grammar(text: str, language: str) -> str:
    if language == "es":
        # Spanish: sentence-by-sentence correction with the BARTO GEC model
        corrected = []
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if not s:
                continue
            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
            out = model.generate(
                **inp,
                max_new_tokens=inp.input_ids.shape[1],
                num_beams=5,
                early_stopping=True
            )
            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
        return " ".join(corrected)
    # English: LanguageTool API
    resp = requests.post(
        "https://api.languagetool.org/v2/check",
        data={"text": text, "language": language},
        timeout=30
    ).json()
    for m in reversed(resp.get("matches", [])):
        off, ln = m["offset"], m["length"]
        repls = m.get("replacements", [])
        val = repls[0]["value"] if repls else ""
        text = text[:off] + val + text[off+ln:]
    return text
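# LanguageTool returns matches shaped roughly like
#   {"offset": 4, "length": 2, "replacements": [{"value": "goes"}], ...}
# Applying them in reverse offset order, as above, keeps the earlier offsets
# valid after each splice into the text.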
# --- Analyze known and unknown words ---
def analyze_text(text: str, user_id: str, language: str):
    toks = word_tokenize(text)
    lems = [lemmatizer.lemmatize(w.lower()) for w in toks if w.isalpha()]
    vocab = get_user_vocabulary(user_id, language)
    known = [w for w in lems if w in vocab]
    unknown = [w for w in lems if w not in vocab]
    return known, unknown
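# Caveat: WordNetLemmatizer only knows English, so Spanish tokens pass through
# essentially unlemmatized. A more accurate Spanish variant could reuse the
# already loaded spaCy pipeline; a minimal sketch (not in the original code):
def lemmatize_spanish(text: str) -> list[str]:
    """Hypothetical helper: lemmatize Spanish text with es_core_news_lg."""
    return [t.lemma_.lower() for t in nlp(text) if t.is_alpha]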
# --- Generate a sentence with GPT-2 based on unknown words ---
def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
    kn, un = analyze_text(text, user_id, language)
    if not un:
        return "Inga okända ord att generera mening med."
    chosen = random.sample(un, min(num_unknown, len(un)))
    if language == "es":
        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(chosen) + "."
        tok, mdl = gpt2_tokenizer_es, gpt2_model_es
    else:
        prompt = "Write one clear sentence that includes the following words: " + ", ".join(chosen) + "."
        tok, mdl = gpt2_tokenizer_en, gpt2_model_en
    inp = tok(prompt, return_tensors="pt", truncation=True)
    outs = mdl.generate(
        **inp,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        pad_token_id=tok.eos_token_id  # GPT-2 has no pad token; reuse EOS to silence the warning
    )
    gen = tok.decode(outs[0], skip_special_tokens=True)
    # Strip the prompt and keep only the first full sentence of the continuation
    body = gen[len(prompt):].strip() if gen.startswith(prompt) else gen.strip()
    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
    if not any(c.isalpha() for c in sentence):
        return "Misslyckades att generera meningsfull övningsmening."
    return sentence
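# Example (illustrative): with unknown words ["gato", "correr"], the Spanish
# prompt becomes "Escribe una sola frase clara que incluya estas palabras:
# gato, correr." and the model's continuation is trimmed to its first sentence.
# Plain GPT-2 is not instruction-tuned, so the prompt is a best-effort nudge
# rather than a guarantee that the chosen words actually appear.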
# --- Gradio process callback ---
def process(user, language, txt, file, do_grammar, do_save):
    try:
        if txt and txt.strip():
            text = txt.strip()
        elif file:
            text = import_file(file.name)
        else:
            return "", "", "", "Ingen text angiven.", ""
        out = correct_grammar(text, language) if do_grammar else text
        kn, un = analyze_text(out, user, language)
        status = ""
        if do_save and un:
            for w in un:
                save_word_to_db(user, language, w)
            status = f"Sparade {len(un)} ord."
        # Log the grammar correction to CSV
        log_to_csv(
            "grammarlog.csv",
            {
                "user": user, "language": language, "input": text,
                "output": out, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return out, ", ".join(kn), ", ".join(un), status, ""
    except Exception:
        import traceback
        tb = traceback.format_exc()
        return "", "", "", f"FEL i process:\n{tb}", ""
# --- Sentence-generation callback ---
def coherent_fn(user, language, txt, num):
    try:
        suggestion = generate_coherent_sentence(txt or "", user, language, num)
        # Log the practice suggestion to CSV
        log_to_csv(
            "sentencelog.csv",
            {
                "user": user, "language": language, "input": txt,
                "output": suggestion, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return suggestion
    except Exception as e:
        return f"Fel vid generering: {e}"
# --- Gradio UI ---
demo = gr.Blocks()
with demo:
    gr.Markdown("### 🌟 Språkinlärningsapp med användare & flerspråkighet")
    with gr.Row():
        user_input = gr.Textbox(label="Användarnamn", placeholder="Ditt namn här")
        lang_dd = gr.Dropdown(choices=["es", "en"], value="es", label="Språk")
    with gr.Column():
        manual_input = gr.Textbox(lines=4, label="Skriv/klistra in text")
        file_input = gr.File(file_types=[".txt", ".csv", ".pdf"], label="Importera fil")
        grammar_cb = gr.Checkbox(label="Grammatikrättning")
        autosave_cb = gr.Checkbox(label="Spara okända ord")
        run_btn = gr.Button("Kör analys & korrigering")
        num_slider = gr.Slider(minimum=1, maximum=5, step=1, value=2, label="Antal okända ord för övning")
        coherent_btn = gr.Button("Koherent övningsmening")
        corr_out = gr.Textbox(label="Korrigerad text", lines=4)
        known_out = gr.Textbox(label="Kända ord")
        unknown_out = gr.Textbox(label="Okända ord")
        status_out = gr.Textbox(label="Status")
        coherent_out = gr.Textbox(label="Koherent övningsmening")
    # --- Button click bindings ---
    run_btn.click(
        fn=process,
        inputs=[user_input, lang_dd, manual_input, file_input, grammar_cb, autosave_cb],
        outputs=[corr_out, known_out, unknown_out, status_out, coherent_out]
    )
    coherent_btn.click(
        fn=coherent_fn,
        inputs=[user_input, lang_dd, manual_input, num_slider],
        outputs=[coherent_out]
    )
# NOTE: make sure the selected language matches the language of the text being analyzed.
# --- Start app ---
if __name__ == "__main__":
    # launch() prints the local and public URLs itself and blocks here,
    # keeping the server alive for as long as the script runs
    demo.launch(share=True, inbrowser=True)