import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import numpy as np  # To calculate mean

# --- Global Configuration Parameters ---
model_name = "MonkeyDAnh/my-awesome-ai-detector-roberta-base-v4-human-vs-machine-finetune"
MAX_LENGTH_CHUNK = 512  # Max chunk size for RoBERTa
CHUNK_OVERLAP = 128  # Number of tokens to overlap between chunks

# 1. Load model and tokenizer from Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Check for GPU and move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode

# Get label mapping from model config (if available)
id_to_label = model.config.id2label if hasattr(model.config, 'id2label') else {0: "human-produced", 1: "machine-generated"}
label_to_id = model.config.label2id if hasattr(model.config, 'label2id') else {"human-produced": 0, "machine-generated": 1}

# Get IDs for "machine-generated" and "human-produced" labels for easy access
AI_LABEL_ID = label_to_id.get("machine-generated", 1)
HUMAN_LABEL_ID = label_to_id.get("human-produced", 0)
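# Note (assumption): this checkpoint is expected to expose the labels
# "human-produced" and "machine-generated" in its config; the .get() defaults
# above cover configs that only provide generic LABEL_0 / LABEL_1 names.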

# 2. Define the long text processing function (chunking and aggregation)
def predict_long_text_with_chunking(text):
    """
    This function processes long text by splitting it into chunks,
    predicting for each chunk, and aggregating the results.
    """
    if not text.strip():
        return "Please enter text for analysis."

    # Tokenize the original text to get total token count
    tokens = tokenizer.encode(text, add_special_tokens=False)
    total_tokens = len(tokens)

    # If the text is short enough, process as a single chunk
    if total_tokens <= MAX_LENGTH_CHUNK:
        return predict_single_chunk(text)

    # If the text is long, proceed with chunking
    chunk_ai_probabilities = []

    # Use `return_overflowing_tokens=True` to let the tokenizer automatically create chunks
    # and `stride` for overlap
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        max_length=MAX_LENGTH_CHUNK,
        padding=False,
        return_overflowing_tokens=True,
        stride=CHUNK_OVERLAP
    )
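    # With return_overflowing_tokens=True (and no return_tensors argument), the fast
    # tokenizer returns "input_ids" as a list of token-ID lists, one per overlapping
    # chunk, which is what the loop below iterates over.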
    # Iterate through each chunk for prediction
    for i in range(len(tokenized_inputs["input_ids"])):
        chunk_input_ids = tokenized_inputs["input_ids"][i]
        chunk_attention_mask = tokenized_inputs["attention_mask"][i]
        inputs = {
            "input_ids": torch.tensor([chunk_input_ids]).to(device),
            "attention_mask": torch.tensor([chunk_attention_mask]).to(device)
        }
        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = F.softmax(outputs.logits, dim=-1)
        # AI probability for this chunk
        ai_prob_chunk = probabilities[0, AI_LABEL_ID].item() * 100
        chunk_ai_probabilities.append(ai_prob_chunk)

    # Aggregate results: take the average of AI probabilities from all chunks
    if chunk_ai_probabilities:
        avg_ai_probability = np.mean(chunk_ai_probabilities)
    else:
        # This case is unlikely if text.strip() is not empty
        return "Could not analyze text. Please try again."

    avg_human_probability = 100 - avg_ai_probability
    return (f"**Long Text Analysis (Chunked):**\n"
            f"AI-generated probability: {avg_ai_probability:.2f}% (Human-produced probability: {avg_human_probability:.2f}%)"
            f"\n\n*Processed {len(chunk_ai_probabilities)} text chunks with max length {MAX_LENGTH_CHUNK} tokens and {CHUNK_OVERLAP} token overlap.*")

# Prediction function for a single chunk (can be called by the main function)
def predict_single_chunk(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LENGTH_CHUNK)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = F.softmax(outputs.logits, dim=-1)
    ai_probability = probabilities[0, AI_LABEL_ID].item() * 100
    human_probability = probabilities[0, HUMAN_LABEL_ID].item() * 100
    return (f"**Short Text Analysis (Single Chunk):**\n"
            f"AI-generated probability: {ai_probability:.2f}% (Human-produced probability: {human_probability:.2f}%)")

# 3. Create the Gradio interface
iface = gr.Interface(
    fn=predict_long_text_with_chunking,  # Handles both short and long texts via chunking
    inputs=gr.Textbox(lines=10, label="Enter your full report/text here to check AI/Human ratio"),
    outputs=gr.Textbox(label="Analysis Results"),
    title="AI to Human Text Ratio Tester Tool (Long Report Analysis Supported)",
    description="This tool predicts the percentage of AI-generated text compared to human-written text. Please **paste your entire report, including titles, paragraphs, bullet points, indices, etc.,** into the box below. For long texts, the tool will automatically break them into chunks for analysis and aggregate the results.",
    theme="huggingface"
)
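
# Note: when running outside Hugging Face Spaces, iface.launch(share=True) can be
# used to get a temporary public URL; the plain launch() below is sufficient on Spaces.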

# 4. Launch the interface
if __name__ == "__main__":
    iface.launch()
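
# Running locally is a matter of installing the imported dependencies
# (gradio, transformers, torch, numpy) and executing `python app.py`;
# on Hugging Face Spaces this file is picked up automatically as app.py.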