import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import numpy as np  # To calculate the mean over chunk probabilities

# --- Global Configuration Parameters ---
model_name = "MonkeyDAnh/my-awesome-ai-detector-roberta-base-v4-human-vs-machine-finetune"
MAX_LENGTH_CHUNK = 512  # Max chunk size for RoBERTa
CHUNK_OVERLAP = 128     # Number of tokens to overlap between consecutive chunks
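# RoBERTa-base accepts at most 512 tokens per sequence (including the special
# <s> and </s> tokens), which is why MAX_LENGTH_CHUNK is capped at 512.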

# 1. Load model and tokenizer from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Check for a GPU and move the model to it if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode

# Get the label mapping from the model config (if available)
id_to_label = model.config.id2label if hasattr(model.config, 'id2label') else {0: "human-produced", 1: "machine-generated"}
label_to_id = model.config.label2id if hasattr(model.config, 'label2id') else {"human-produced": 0, "machine-generated": 1}

# Get the IDs of the "machine-generated" and "human-produced" labels for easy access
AI_LABEL_ID = label_to_id.get("machine-generated", 1)
HUMAN_LABEL_ID = label_to_id.get("human-produced", 0)
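# Note: the fallback dictionaries above assume the label scheme
#   {0: "human-produced", 1: "machine-generated"}.
# If the checkpoint's config on the Hub uses different label strings, the
# .get(...) calls fall back to index 1 for the AI label and 0 for the human label.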

# 2. Define the long-text processing function (chunking and aggregation)
def predict_long_text_with_chunking(text):
    """
    Process long text by splitting it into chunks, predicting each chunk,
    and aggregating the results.
    """
    if not text.strip():
        return "Please enter text for analysis."

    # Tokenize the original text to get the total token count
    tokens = tokenizer.encode(text, add_special_tokens=False)
    total_tokens = len(tokens)

    # If the text is short enough, process it as a single chunk
    if total_tokens <= MAX_LENGTH_CHUNK:
        return predict_single_chunk(text)

    # If the text is long, proceed with chunking
    chunk_ai_probabilities = []

    # Use `return_overflowing_tokens=True` so the tokenizer automatically creates chunks,
    # and `stride` to overlap consecutive chunks
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        max_length=MAX_LENGTH_CHUNK,
        padding=False,
        return_overflowing_tokens=True,
        stride=CHUNK_OVERLAP
    )
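
    # With return_overflowing_tokens=True, tokenized_inputs["input_ids"] is a list of
    # token-id lists, one per chunk; consecutive chunks share CHUNK_OVERLAP (stride) tokens.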
    # Iterate through each chunk for prediction
    for i in range(len(tokenized_inputs["input_ids"])):
        chunk_input_ids = tokenized_inputs["input_ids"][i]
        chunk_attention_mask = tokenized_inputs["attention_mask"][i]

        inputs = {
            "input_ids": torch.tensor([chunk_input_ids]).to(device),
            "attention_mask": torch.tensor([chunk_attention_mask]).to(device)
        }

        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = F.softmax(outputs.logits, dim=-1)

        # AI probability for this chunk
        ai_prob_chunk = probabilities[0, AI_LABEL_ID].item() * 100
        chunk_ai_probabilities.append(ai_prob_chunk)

    # Aggregate results: take the average of the AI probabilities over all chunks
    if chunk_ai_probabilities:
        avg_ai_probability = np.mean(chunk_ai_probabilities)
    else:
        # This case is unlikely if text.strip() is not empty
        return "Could not analyze text. Please try again."

    avg_human_probability = 100 - avg_ai_probability

    return (f"**Long Text Analysis (Chunked):**\n"
            f"AI-generated probability: {avg_ai_probability:.2f}% (Human-produced probability: {avg_human_probability:.2f}%)"
            f"\n\n*Processed {len(chunk_ai_probabilities)} text chunks with max length {MAX_LENGTH_CHUNK} tokens and {CHUNK_OVERLAP} token overlap.*")

# Prediction function for a single chunk (also called by the chunking function above)
def predict_single_chunk(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LENGTH_CHUNK)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = F.softmax(outputs.logits, dim=-1)

    ai_probability = probabilities[0, AI_LABEL_ID].item() * 100
    human_probability = probabilities[0, HUMAN_LABEL_ID].item() * 100

    return (f"**Short Text Analysis (Single Chunk):**\n"
            f"AI-generated probability: {ai_probability:.2f}% (Human-produced probability: {human_probability:.2f}%)")

# 3. Create the Gradio interface
iface = gr.Interface(
    fn=predict_long_text_with_chunking,  # Handles both short and long texts
    inputs=gr.Textbox(lines=10, label="Enter your full report/text here to check AI/Human ratio"),
    outputs=gr.Textbox(label="Analysis Results"),
    title="AI to Human Text Ratio Tester Tool (Long Report Analysis Supported)",
    description="This tool predicts the percentage of AI-generated text compared to human-written text. Please **paste your entire report, including titles, paragraphs, bullet points, indices, etc.,** into the box below. For long texts, the tool automatically splits them into chunks for analysis and aggregates the results.",
    theme="huggingface"
)

# 4. Launch the interface
if __name__ == "__main__":
    iface.launch()