import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import numpy as np  # To calculate mean

# --- Global Configuration Parameters ---
model_name = "MonkeyDAnh/my-awesome-ai-detector-roberta-base-v4-human-vs-machine-finetune"
MAX_LENGTH_CHUNK = 512  # Max chunk size for RoBERTa
CHUNK_OVERLAP = 128  # Number of tokens to overlap between chunks

# 1. Load model and tokenizer from Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Check for GPU and move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode

# Get label mapping from model config (if available)
id_to_label = model.config.id2label if hasattr(model.config, 'id2label') else {0: "human-produced", 1: "machine-generated"}
label_to_id = model.config.label2id if hasattr(model.config, 'label2id') else {"human-produced": 0, "machine-generated": 1}

# Get IDs for "machine-generated" and "human-produced" labels for easy access
AI_LABEL_ID = label_to_id.get("machine-generated", 1)
HUMAN_LABEL_ID = label_to_id.get("human-produced", 0)
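# Note (assumption): this checkpoint is expected to expose the labels
# "human-produced" and "machine-generated" in its config; the .get() defaults
# above cover configs that only provide generic LABEL_0 / LABEL_1 names.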

# 2. Define the long text processing function (chunking and aggregation)
def predict_long_text_with_chunking(text):
    """
    This function processes long text by splitting it into chunks,
    predicting for each chunk, and aggregating the results.
    """
    if not text.strip():
        return "Please enter text for analysis."

    # Tokenize the original text to get total token count
    tokens = tokenizer.encode(text, add_special_tokens=False)
    total_tokens = len(tokens)

    # If the text is short enough, process as a single chunk
    if total_tokens <= MAX_LENGTH_CHUNK:
        return predict_single_chunk(text)

    # If the text is long, proceed with chunking
    chunk_ai_probabilities = []

    # Use `return_overflowing_tokens=True` to let the tokenizer automatically create chunks
    # and `stride` for overlap
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        max_length=MAX_LENGTH_CHUNK,
        padding=False,
        return_overflowing_tokens=True,
        stride=CHUNK_OVERLAP
    )
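    # With return_overflowing_tokens=True (and no return_tensors argument), the fast
    # tokenizer returns "input_ids" as a list of token-ID lists, one per overlapping
    # chunk, which is what the loop below iterates over.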
    # Iterate through each chunk for prediction
    for i in range(len(tokenized_inputs["input_ids"])):
        chunk_input_ids = tokenized_inputs["input_ids"][i]
        chunk_attention_mask = tokenized_inputs["attention_mask"][i]
        inputs = {
            "input_ids": torch.tensor([chunk_input_ids]).to(device),
            "attention_mask": torch.tensor([chunk_attention_mask]).to(device)
        }
        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = F.softmax(outputs.logits, dim=-1)
        # AI probability for this chunk
        ai_prob_chunk = probabilities[0, AI_LABEL_ID].item() * 100
        chunk_ai_probabilities.append(ai_prob_chunk)

    # Aggregate results: take the average of AI probabilities from all chunks
    if chunk_ai_probabilities:
        avg_ai_probability = np.mean(chunk_ai_probabilities)
    else:
        # This case is unlikely if text.strip() is not empty
        return "Could not analyze text. Please try again."

    avg_human_probability = 100 - avg_ai_probability
    return (f"**Long Text Analysis (Chunked):**\n"
            f"AI-generated probability: {avg_ai_probability:.2f}% (Human-produced probability: {avg_human_probability:.2f}%)"
            f"\n\n*Processed {len(chunk_ai_probabilities)} text chunks with max length {MAX_LENGTH_CHUNK} tokens and {CHUNK_OVERLAP} token overlap.*")

# Prediction function for a single chunk (can be called by the main function)
def predict_single_chunk(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LENGTH_CHUNK)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = F.softmax(outputs.logits, dim=-1)
    ai_probability = probabilities[0, AI_LABEL_ID].item() * 100
    human_probability = probabilities[0, HUMAN_LABEL_ID].item() * 100
    return (f"**Short Text Analysis (Single Chunk):**\n"
            f"AI-generated probability: {ai_probability:.2f}% (Human-produced probability: {human_probability:.2f}%)")

# 3. Create the Gradio interface
iface = gr.Interface(
    fn=predict_long_text_with_chunking,  # Handles both short and long texts via chunking
    inputs=gr.Textbox(lines=10, label="Enter your full report/text here to check AI/Human ratio"),
    outputs=gr.Textbox(label="Analysis Results"),
    title="AI to Human Text Ratio Tester Tool (Long Report Analysis Supported)",
    description="This tool predicts the percentage of AI-generated text compared to human-written text. Please **paste your entire report, including titles, paragraphs, bullet points, indices, etc.,** into the box below. For long texts, the tool will automatically break them into chunks for analysis and aggregate the results.",
    theme="huggingface"
)
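
# Note: when running outside Hugging Face Spaces, iface.launch(share=True) can be
# used to get a temporary public URL; the plain launch() below is sufficient on Spaces.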

# 4. Launch the interface
if __name__ == "__main__":
    iface.launch()
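
# Running locally is a matter of installing the imported dependencies
# (gradio, transformers, torch, numpy) and executing `python app.py`;
# on Hugging Face Spaces this file is picked up automatically as app.py.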