"""
Model Handler for Two-Branch AI Detection Model
Combines DeBERTa embeddings with sentiment features
Uses XGBoost for final classification
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel, AutoModelForSequenceClassification
import os
import logging
from typing import Dict, Any, Optional, List, Tuple
import numpy as np
from pathlib import Path
import xgboost as xgb
import json
import nltk
from nltk.tokenize import sent_tokenize
# Download NLTK sentence-tokenizer data (newer NLTK releases look for 'punkt_tab')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab', quiet=True)
logger = logging.getLogger(__name__)
class DesklibAIDetectionModel(PreTrainedModel):
"""
DeBERTa-based AI detection model
Architecture from desklib/ai-text-detector-v1.01
"""
config_class = AutoConfig
def __init__(self, config):
super().__init__(config)
# Initialize the base transformer model
self.model = AutoModel.from_config(config)
# Define a classifier head
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights
self.init_weights()
def forward(self, input_ids, attention_mask=None, labels=None):
# Forward pass through the transformer
outputs = self.model(input_ids, attention_mask=attention_mask)
last_hidden_state = outputs[0]
# Mean pooling
input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
pooled_output = sum_embeddings / sum_mask
# Classifier
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
loss_fct = nn.BCEWithLogitsLoss()
loss = loss_fct(logits.view(-1), labels.float())
output = {"logits": logits}
if loss is not None:
output["loss"] = loss
return output
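# For clarity, a minimal standalone sketch of the masked mean pooling performed in
# forward() above. Nothing in this module calls it; it simply mirrors those three lines.
def _masked_mean_pool_sketch(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings over non-padding positions only."""
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    summed = torch.sum(last_hidden_state * mask, dim=1)  # sum embeddings of real tokens
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)      # count of real tokens, guarded against zero
    return summed / counts                               # shape: (batch, hidden_size)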
class AIDetectionModelHandler:
"""
Handles Two-Branch AI detection:
- DeBERTa for semantic embeddings
- Sentiment features (avg_polarity, polarity_variance)
- XGBoost for final classification
"""
def __init__(self, model_path: Optional[str] = None, max_length: int = 512):
"""
Initialize the model handler
Args:
model_path: Path to the model directory (default: env MODEL_PATH or /app/model)
max_length: Maximum token length for input text
"""
self.max_length = max_length
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.deberta_model = None
self.tokenizer = None
self.sentiment_model = None
self.sentiment_tokenizer = None
self.xgboost_model = None
self.model_loaded = False
# Default model paths
if model_path is None:
# Prefer explicit env var
env_model_path = os.getenv("MODEL_PATH")
if env_model_path and os.path.exists(env_model_path):
model_path = env_model_path
elif os.path.exists("/app/model"):
model_path = "/app/model"
else:
# Fallback to legacy relative path
backend_dir = Path(__file__).parent
model_path = str(backend_dir.parent / "model" / "model")
self.model_path = model_path
# XGBoost file is expected inside the same folder as the other model artifacts
self.xgboost_path = str(Path(model_path) / "xgboost_model.json")
# Load the models
self._load_models()
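    # Illustrative construction (paths hypothetical). Resolution order for the model
    # directory: explicit argument, MODEL_PATH env var, /app/model, legacy relative path.
    #
    #     handler = AIDetectionModelHandler()                           # use the resolution order
    #     handler = AIDetectionModelHandler(model_path="/opt/models")   # hypothetical explicit path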
def _load_models(self):
"""Load DeBERTa, sentiment model, and XGBoost classifier"""
try:
logger.info(f"Loading models from: {self.model_path}")
logger.info(f"Using device: {self.device}")
# Check if model path exists
if not os.path.exists(self.model_path):
logger.error(f"Model path does not exist: {self.model_path}")
raise FileNotFoundError(f"Model not found at {self.model_path}")
# 1. Load DeBERTa tokenizer and model
logger.info("Loading DeBERTa tokenizer...")
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
logger.info("Loading DeBERTa model...")
self.deberta_model = DesklibAIDetectionModel.from_pretrained(self.model_path)
self.deberta_model.to(self.device)
self.deberta_model.eval()
# 2. Load sentiment analysis model (DistilBERT)
logger.info("Loading sentiment model...")
sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
self.sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
self.sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
self.sentiment_model.to(self.device)
self.sentiment_model.eval()
# 3. Load XGBoost model
if os.path.exists(self.xgboost_path):
logger.info(f"Loading XGBoost model from: {self.xgboost_path}")
self.xgboost_model = xgb.Booster()
self.xgboost_model.load_model(self.xgboost_path)
logger.info("✅ XGBoost model loaded!")
else:
logger.warning(f"XGBoost model not found at {self.xgboost_path}, using DeBERTa only")
self.xgboost_model = None
self.model_loaded = True
logger.info("✅ All models loaded successfully!")
except Exception as e:
logger.error(f"Failed to load models: {e}", exc_info=True)
self.model_loaded = False
raise
def is_loaded(self) -> bool:
"""Check if model is loaded"""
return self.model_loaded
def get_sentiment_scores(self, text: str) -> List[float]:
"""
Extract sentiment scores for each sentence using DistilBERT
Args:
text: Input text
Returns:
List of sentiment scores (polarity) for each sentence
"""
try:
# Tokenize into sentences
sentences = sent_tokenize(text)
if not sentences:
                return [0.0]  # Neutral polarity (the scale is -1..1) if no sentences
scores = []
with torch.no_grad():
for sentence in sentences:
# Tokenize sentence
inputs = self.sentiment_tokenizer(
sentence,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
)
inputs = {k: v.to(self.device) for k, v in inputs.items()}
# Get sentiment prediction
outputs = self.sentiment_model(**inputs)
logits = outputs.logits
probabilities = F.softmax(logits, dim=-1)
                    # Get positive-class probability (SST-2 labels: index 0 = negative, 1 = positive)
                    pos_prob = probabilities[0][1].item()
                    # Map probability [0, 1] to polarity [-1, 1]; a 0.5 probability is neutral (0.0)
                    # e.g. pos_prob 0.9 -> 0.8, 0.5 -> 0.0, 0.1 -> -0.8
                    polarity = (pos_prob - 0.5) * 2
scores.append(polarity)
return scores
except Exception as e:
logger.error(f"Error extracting sentiment scores: {e}")
return [0.0] # Return neutral on error
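    # Illustrative call (the polarity values shown are hypothetical):
    #
    #     >>> handler.get_sentiment_scores("I loved the demo. The ending was weak.")
    #     [0.93, -0.72]   # one polarity in [-1, 1] per sentence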
def extract_sentiment_features(self, text: str) -> np.ndarray:
"""
Extract avg_polarity and polarity_variance from text
Args:
text: Input text
Returns:
Numpy array with [avg_polarity, polarity_variance]
"""
sentiment_scores = self.get_sentiment_scores(text)
# Calculate features
avg_polarity = float(np.mean(sentiment_scores)) if sentiment_scores else 0.0
polarity_variance = float(np.var(sentiment_scores)) if len(sentiment_scores) > 1 else 0.0
return np.array([avg_polarity, polarity_variance], dtype=np.float32)
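    # Worked example (hypothetical polarities): scores [0.8, -0.2, 0.4] yield
    # avg_polarity = 0.333 and polarity_variance = np.var([0.8, -0.2, 0.4]) ≈ 0.169,
    # so the returned vector is np.array([0.333, 0.169], dtype=np.float32).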
def get_deberta_embeddings(self, text: str) -> np.ndarray:
"""
Get DeBERTa embeddings for text using mean pooling
Args:
text: Input text
Returns:
Numpy array of embeddings
"""
try:
# Tokenize input
encoded = self.tokenizer(
text,
padding='max_length',
truncation=True,
max_length=self.max_length,
return_tensors='pt'
)
input_ids = encoded['input_ids'].to(self.device)
attention_mask = encoded['attention_mask'].to(self.device)
# Get embeddings
with torch.no_grad():
outputs = self.deberta_model.model(input_ids=input_ids, attention_mask=attention_mask)
last_hidden_state = outputs[0]
# Mean pooling
input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
pooled_output = sum_embeddings / sum_mask
# Convert to numpy
embeddings = pooled_output.cpu().numpy().flatten()
return embeddings
except Exception as e:
logger.error(f"Error extracting DeBERTa embeddings: {e}", exc_info=True)
raise
def predict_probability(self, text: str, threshold: float = 0.5) -> Dict[str, Any]:
"""
Predict if text is AI-generated using two-branch architecture
Args:
text: Input text to analyze
threshold: Classification threshold (default: 0.5)
Returns:
Dictionary with probability, label, sentiment features
"""
if not self.model_loaded:
raise RuntimeError("Model not loaded. Cannot perform prediction.")
try:
# Extract sentiment features
logger.info("Extracting sentiment features...")
sentiment_features = self.extract_sentiment_features(text)
avg_polarity = float(sentiment_features[0])
polarity_variance = float(sentiment_features[1])
# If XGBoost is available, use the full two-branch pipeline
if self.xgboost_model is not None:
logger.info("Using XGBoost two-branch model...")
# Get DeBERTa embeddings
deberta_embeddings = self.get_deberta_embeddings(text)
# Combine features: DeBERTa embeddings + sentiment features
combined_features = np.concatenate([deberta_embeddings, sentiment_features])
# Create DMatrix for XGBoost
dmatrix = xgb.DMatrix(combined_features.reshape(1, -1))
# Predict
probability = float(self.xgboost_model.predict(dmatrix)[0])
else:
# Fallback to DeBERTa only
logger.info("Using DeBERTa model only (XGBoost not found)...")
encoded = self.tokenizer(
text,
padding='max_length',
truncation=True,
max_length=self.max_length,
return_tensors='pt'
)
input_ids = encoded['input_ids'].to(self.device)
attention_mask = encoded['attention_mask'].to(self.device)
with torch.no_grad():
outputs = self.deberta_model(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs["logits"]
probability = torch.sigmoid(logits).item()
label = 1 if probability >= threshold else 0
return {
"probability": probability,
"label": label,
"classification": "ai" if label == 1 else "human",
"confidence": probability if label == 1 else (1 - probability),
"sentiment_features": {
"avg_polarity": avg_polarity,
"polarity_variance": polarity_variance
}
}
except Exception as e:
logger.error(f"Prediction error: {e}", exc_info=True)
raise
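    # Illustrative usage (keys match the dict returned above; numbers hypothetical):
    #
    #     result = handler.predict_probability(essay_text)
    #     result["probability"]     # e.g. 0.87 = estimated P(AI-generated)
    #     result["classification"]  # "ai" or "human"
    #     result["confidence"]      # probability if "ai", else 1 - probability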
def predict_single_text_xgboost(self, text: str) -> Tuple[float, int]:
"""
Predict AI probability and label for a single text using XGBoost model
Args:
text: Input text to analyze
Returns:
Tuple of (probability, label) where label is 0 for human, 1 for AI
"""
try:
# Extract sentiment features
sentiment_features = self.extract_sentiment_features(text)
avg_polarity = float(sentiment_features[0])
polarity_variance = float(sentiment_features[1])
# If XGBoost is available, use the full two-branch pipeline
if self.xgboost_model is not None:
# Get DeBERTa embeddings
deberta_embeddings = self.get_deberta_embeddings(text)
# Combine features: DeBERTa embeddings + sentiment features
combined_features = np.concatenate([deberta_embeddings, sentiment_features])
# Create DMatrix for XGBoost
dmatrix = xgb.DMatrix(combined_features.reshape(1, -1))
# Predict
probability = float(self.xgboost_model.predict(dmatrix)[0])
else:
# Fallback to DeBERTa only
encoded = self.tokenizer(
text,
padding='max_length',
truncation=True,
max_length=self.max_length,
return_tensors='pt'
)
input_ids = encoded['input_ids'].to(self.device)
attention_mask = encoded['attention_mask'].to(self.device)
with torch.no_grad():
outputs = self.deberta_model(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs["logits"]
probability = torch.sigmoid(logits).item()
label = 1 if probability >= 0.5 else 0
return probability, label
except Exception as e:
logger.error(f"Single text prediction error: {e}", exc_info=True)
raise
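    # Feature layout fed to XGBoost in the two-branch path: the 1024-dim mean-pooled
    # DeBERTa embedding followed by the two sentiment features, i.e.
    # [e_0, ..., e_1023, avg_polarity, polarity_variance] -> 1026 values per row.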
def detect_mixed_text_chunk_based(self, text: str, chunk_size: int = 4, overlap: int = 1, min_chunk_length: int = 50) -> Dict[str, Any]:
"""
Improved mixed text detection using chunk-based analysis that influences overall probability
Args:
text: Input text string
            chunk_size: Number of sentences per chunk (default: 4; superseded by dynamic sizing, see Note)
overlap: Number of sentences to overlap between chunks (default: 1)
min_chunk_length: Minimum character length for a chunk to be analyzed
Returns:
Dictionary with prediction results and analysis details
        Note:
            Input validation: text must be 80-7000 words. Texts of 5 or fewer sentences
            are analyzed as a whole; otherwise chunk_size is superseded by a dynamic
            size based on the total sentence count:
            - 6-10 sentences: 3 sentences per chunk
            - 11-20 sentences: 4 sentences per chunk
            - 21-30 sentences: 5 sentences per chunk
            - 31+ sentences: 6 sentences per chunk
            Uses overlapping chunks to capture transitions between AI and human content.
"""
# Get overall prediction (your current method)
overall_prob, overall_label = self.predict_single_text_xgboost(text)
# Split text into sentences
sentences = sent_tokenize(text)
# Validate input text length (80-7000 words)
total_words = len(text.split())
if total_words < 80:
return {
'prediction': 'Human' if overall_label == 0 else 'AI',
'confidence': abs(overall_prob - 0.5) * 2,
'is_mixed': False,
'reason': f'Text too short for analysis ({total_words} words, minimum 80 words required)',
'overall_probability': overall_prob,
'modified_probability': overall_prob,
'chunk_analysis': []
}
elif total_words > 7000:
return {
'prediction': 'Human' if overall_label == 0 else 'AI',
'confidence': abs(overall_prob - 0.5) * 2,
'is_mixed': False,
'reason': f'Text too long for analysis ({total_words} words, maximum 7000 words allowed)',
'overall_probability': overall_prob,
'modified_probability': overall_prob,
'chunk_analysis': []
}
# Dynamic chunking based on total sentence count
total_sentences = len(sentences)
        # For 5 or fewer sentences, analyze the text as a whole (no chunking)
if total_sentences <= 5:
return {
'prediction': 'Human' if overall_label == 0 else 'AI',
'confidence': abs(overall_prob - 0.5) * 2,
'is_mixed': False,
                'reason': f'Analyzing {total_sentences} sentences as a whole (5 or fewer sentences)',
'overall_probability': overall_prob,
'modified_probability': overall_prob,
'chunk_analysis': []
}
# Dynamic chunk size based on total sentences
if total_sentences <= 10:
dynamic_chunk_size = 3
elif total_sentences <= 20:
dynamic_chunk_size = 4
elif total_sentences <= 30:
dynamic_chunk_size = 5
else:
dynamic_chunk_size = 6 # For very long texts
# Ensure we have enough sentences for at least 2 chunks
if total_sentences < dynamic_chunk_size * 2:
return {
'prediction': 'Human' if overall_label == 0 else 'AI',
'confidence': abs(overall_prob - 0.5) * 2,
'is_mixed': False,
'reason': f'Text too short for chunk analysis ({total_sentences} sentences, need at least {dynamic_chunk_size * 2})',
'overall_probability': overall_prob,
'modified_probability': overall_prob,
'chunk_analysis': []
}
# Create overlapping chunks
chunks = []
chunk_predictions = []
chunk_probabilities = []
logger.info(f"Analyzing text with {total_sentences} sentences using dynamic chunk size of {dynamic_chunk_size}...")
for i in range(0, len(sentences) - dynamic_chunk_size + 1, dynamic_chunk_size - overlap):
# Create chunk from sentences
chunk_sentences = sentences[i:i + dynamic_chunk_size]
chunk_text = ' '.join(chunk_sentences)
# Only analyze chunks that meet minimum length requirement
if len(chunk_text.strip()) >= min_chunk_length:
chunks.append(chunk_text)
# Analyze this chunk
prob, label = self.predict_single_text_xgboost(chunk_text)
chunk_predictions.append((prob, label))
chunk_probabilities.append(prob)
logger.info(f" Chunk {len(chunks)}: {chunk_text[:60]}... → {'AI' if label == 1 else 'Human'} ({prob:.3f})")
if len(chunk_predictions) < 2:
return {
'prediction': 'Human' if overall_label == 0 else 'AI',
'confidence': abs(overall_prob - 0.5) * 2,
'is_mixed': False,
'reason': 'Too few chunks for mixed analysis',
'overall_probability': overall_prob,
'modified_probability': overall_prob,
'chunk_analysis': chunk_predictions
}
# Count human vs AI chunks
human_chunks = sum(1 for _, label in chunk_predictions if label == 0)
ai_chunks = sum(1 for _, label in chunk_predictions if label == 1)
total_chunks = len(chunk_predictions)
# Mixed text detection logic
is_mixed = human_chunks > 0 and ai_chunks > 0
mixed_ratio = min(human_chunks, ai_chunks) / total_chunks
logger.info(f"\nChunk Analysis Summary:")
logger.info(f" Total chunks analyzed: {total_chunks}")
logger.info(f" Human chunks: {human_chunks}")
logger.info(f" AI chunks: {ai_chunks}")
logger.info(f" Mixed ratio: {mixed_ratio:.2f}")
# MODIFY OVERALL PROBABILITY BASED ON CHUNK ANALYSIS
        if is_mixed and mixed_ratio > 0.25:  # minority chunk type exceeds 25% of chunks
# Calculate weighted average of chunk probabilities
# Weight by chunk length (longer chunks have more influence)
chunk_weights = [len(chunk) for chunk in chunks]
total_weight = sum(chunk_weights)
# Calculate weighted average probability
weighted_prob = sum(prob * weight for prob, weight in zip(chunk_probabilities, chunk_weights)) / total_weight
# Blend original overall probability with chunk-based probability
# More chunks = more influence from chunk analysis
chunk_influence = min(total_chunks / 5.0, 1.0) # Max influence at 5+ chunks
modified_prob = (overall_prob * (1 - chunk_influence)) + (weighted_prob * chunk_influence)
final_prediction = 'Mixed'
confidence = 1.0 - mixed_ratio # Lower confidence for mixed text
logger.info(f" → MIXED TEXT DETECTED!")
logger.info(f" → Original overall probability: {overall_prob:.3f}")
logger.info(f" → Weighted chunk probability: {weighted_prob:.3f}")
logger.info(f" → Chunk influence factor: {chunk_influence:.3f}")
logger.info(f" → Modified probability: {modified_prob:.3f}")
else:
# Pure text - use chunk analysis to refine overall probability
chunk_avg_prob = np.mean(chunk_probabilities)
# Blend overall and chunk probabilities (chunks have 30% influence for pure text)
modified_prob = (overall_prob * 0.7) + (chunk_avg_prob * 0.3)
final_prediction = 'Human' if modified_prob < 0.5 else 'AI'
# Base confidence from modified probability (0..1)
base_confidence = abs(modified_prob - 0.5) * 2
# For short texts/few chunks, incorporate chunk-majority evidence to avoid
# under-confident results when the label is clear but probability is near 0.5.
if total_chunks > 0:
majority_ratio = max(human_chunks, ai_chunks) / total_chunks # e.g., 3/4 => 0.75
combined_confidence = max(
base_confidence,
0.6 * majority_ratio + 0.4 * base_confidence
)
# If every chunk agrees, ensure a reasonable floor
if majority_ratio == 1.0 and total_chunks >= 3:
combined_confidence = max(combined_confidence, 0.85)
confidence = min(0.99, combined_confidence)
else:
confidence = base_confidence
logger.info(f" → Pure {final_prediction} text")
logger.info(f" → Original overall probability: {overall_prob:.3f}")
logger.info(f" → Average chunk probability: {chunk_avg_prob:.3f}")
logger.info(f" → Modified probability: {modified_prob:.3f}")
return {
'prediction': final_prediction,
'confidence': confidence,
'is_mixed': is_mixed,
'mixed_ratio': mixed_ratio,
'human_chunks': human_chunks,
'ai_chunks': ai_chunks,
'total_chunks': total_chunks,
'overall_probability': overall_prob,
'modified_probability': modified_prob,
'chunk_probabilities': chunk_probabilities,
'chunk_analysis': chunk_predictions,
            'chunk_size': dynamic_chunk_size,  # report the dynamic size actually used
'overlap': overlap
}
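    # Worked example of the mixed-text blend above (numbers hypothetical): with
    # overall_prob = 0.62, a length-weighted chunk probability of 0.48, and 4 chunks,
    # chunk_influence = min(4 / 5.0, 1.0) = 0.8, so
    # modified_prob = 0.62 * 0.2 + 0.48 * 0.8 = 0.508.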
def detect_ai(self, text: str) -> Dict[str, Any]:
"""
AI detection with chunk-based mixed text analysis
Args:
text: Input text
Returns:
Detection results with sentiment features and mixed text analysis
"""
# Use chunk-based detection for better mixed text handling
chunk_result = self.detect_mixed_text_chunk_based(text)
# Get sentiment features for explanation
sentiment_features = self.extract_sentiment_features(text)
avg_pol = float(sentiment_features[0])
pol_var = float(sentiment_features[1])
# Generate explanation based on prediction type
confidence_pct = chunk_result["confidence"] * 100
prediction = chunk_result["prediction"]
if confidence_pct > 90:
certainty = "very high confidence"
elif confidence_pct > 75:
certainty = "high confidence"
elif confidence_pct > 60:
certainty = "moderate confidence"
else:
certainty = "low confidence"
# Generate explanation based on prediction type
        if prediction == "Mixed":
            explanation = "This text appears to be a mixture of AI-generated and human-authored content."
            explanation += " This mixed composition suggests the text may have been collaboratively written or heavily edited."
# Add sentiment insights for mixed text
if pol_var > 0.60:
explanation += " High emotional variation across sections indicates significant style differences between parts."
elif pol_var >= 0.36:
explanation += " Moderate emotional variation suggests different writing styles in various sections."
else:
explanation += " Low emotional variation may indicate consistent editing or similar writing styles throughout."
elif prediction == "AI":
explanation = f"This text is classified as AI-Generated with {certainty}."
explanation += " The text exhibits patterns typical of AI-generated content, including consistent structure and predictable phrasing."
            if pol_var <= 0.10:
                explanation += " Very low emotional variation, which is typical of AI text with a uniform style."
            elif pol_var <= 0.35:
                explanation += " Low emotional variation, which is common in AI-generated content."
            elif pol_var <= 0.60:
                explanation += " Moderate emotional variation, which is rare for AI and may reflect multiple viewpoints."
            else:
                explanation += " High emotional variation is unusual for AI and may indicate a balanced argument structure."
else: # Human
explanation = f"This text is classified as Human-Authored with {certainty}."
explanation += " The text shows characteristics of human writing, such as natural variations and organic flow."
            if pol_var > 0.60:
                explanation += " High emotional variation, which is typical of human writing with emotional swings in debates, reviews, and narratives."
            elif pol_var >= 0.36:
                explanation += " Moderate emotional variation, which shows human-like sentiment shifts."
            elif pol_var >= 0.11:
                explanation += " Low emotional variation, which may indicate formal or academic human writing."
            else:
                explanation += " Very low emotional variation indicates a consistent tone and a focused perspective."
# Convert prediction to classification format for backward compatibility
classification_map = {"AI": "ai", "Human": "human", "Mixed": "mixed"}
classification = classification_map.get(prediction, "unknown")
return {
"classification": classification,
"prediction": prediction,
"probability": chunk_result["modified_probability"],
"confidence": confidence_pct,
"explanation": explanation,
"sentiment_features": {
"avg_polarity": avg_pol,
"polarity_variance": pol_var
},
"mixed_analysis": {
"is_mixed": chunk_result["is_mixed"],
"mixed_ratio": chunk_result.get("mixed_ratio", 0),
"human_chunks": chunk_result.get("human_chunks", 0),
"ai_chunks": chunk_result.get("ai_chunks", 0),
"total_chunks": chunk_result.get("total_chunks", 0),
"overall_probability": chunk_result["overall_probability"],
"modified_probability": chunk_result["modified_probability"]
}
}
def analyze_text(self, text: str) -> Dict[str, Any]:
"""
Comprehensive text analysis combining AI detection with sentiment features
Args:
text: Input text to analyze
Returns:
Complete analysis results with model-based sentiment features
"""
# Validate input text length (80-7000 words)
total_words = len(text.split())
if total_words < 80:
raise ValueError(f"Text too short for analysis ({total_words} words, minimum 80 words required)")
elif total_words > 7000:
raise ValueError(f"Text too long for analysis ({total_words} words, maximum 7000 words allowed)")
# Get AI detection results (includes sentiment features from model)
ai_detection = self.detect_ai(text)
model_sentiment = ai_detection.get("sentiment_features", {})
        # Perform basic text analysis (a lightweight heuristic sentence split is sufficient here)
        words = text.split()
        sentences = [s.strip() for s in text.replace('!', '.').replace('?', '.').split('.') if s.strip()]
# Calculate basic metrics
word_count = len(words)
sentence_count = len(sentences)
avg_word_length = np.mean([len(w) for w in words]) if words else 0
avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
# Determine complexity based on AI probability and text metrics
is_ai = ai_detection["classification"] == "ai"
is_mixed = ai_detection["classification"] == "mixed"
ai_prob = ai_detection["probability"]
# Handle different prediction types
if is_mixed:
formality = "mixed"
complexity = "variable"
tone = "Mixed (AI/Human)"
audience = "Variable"
elif is_ai:
formality = "formal" if ai_prob > 0.7 else "neutral"
complexity = "complex" if avg_word_length > 6 else "moderate"
tone = "Professional"
audience = "General to Academic"
else:
formality = "casual" if avg_word_length < 5 else "neutral"
complexity = "simple" if avg_sentence_length < 15 else "moderate"
tone = "Conversational"
audience = "General Public"
# Generate insights based on detection results
insights = []
if is_mixed and ai_detection["confidence"] > 60:
mixed_analysis = ai_detection.get("mixed_analysis", {})
insights.append({
"type": "observation",
"title": "Mixed Content Detected",
"description": f"This text contains both AI-generated and human-authored sections ({ai_detection['confidence']:.1f}% confidence).",
"suggestion": "Consider reviewing the text for consistency and ensuring all sections align with your intended voice and style."
})
insights.append({
"type": "observation",
"title": "Content Composition",
"description": f"Analysis found {mixed_analysis.get('human_chunks', 0)} human-like sections and {mixed_analysis.get('ai_chunks', 0)} AI-like sections.",
"suggestion": "The mixed nature suggests collaborative writing or heavy editing. Consider standardizing the writing style throughout."
})
elif is_ai and ai_detection["confidence"] > 75:
insights.append({
"type": "observation",
"title": "AI-Generated Content Detected",
"description": f"This text shows strong indicators of AI generation ({ai_detection['confidence']:.1f}% confidence).",
"suggestion": "Consider adding personal anecdotes, varied sentence structures, or unique perspectives to make it more human-like."
})
elif not is_ai and ai_detection["confidence"] > 75:
insights.append({
"type": "strength",
"title": "Human Writing Characteristics",
"description": f"This text exhibits clear human writing patterns ({ai_detection['confidence']:.1f}% confidence)."
})
# Sentence variety analysis
if sentence_count > 2:
sentence_lengths = [len(s.split()) for s in sentences]
std_dev = np.std(sentence_lengths)
if std_dev < 3:
insights.append({
"type": "improvement",
"title": "Sentence Variety",
"description": "Sentences have similar lengths, which may indicate AI generation.",
"suggestion": "Vary sentence lengths to create more natural rhythm."
})
else:
insights.append({
"type": "strength",
"title": "Good Sentence Variety",
"description": "Text shows natural variation in sentence structure."
})
# Generate emotions based on model sentiment polarity (data-driven ranges)
avg_polarity = model_sentiment.get("avg_polarity", 0)
emotions = []
if avg_polarity >= 0.71:
emotions.append({"emotion": "very_positive", "score": min(abs(avg_polarity), 1.0), "intensity": "high"})
elif avg_polarity >= 0.30:
emotions.append({"emotion": "positive", "score": min(abs(avg_polarity), 1.0), "intensity": "medium"})
elif avg_polarity >= -0.29:
emotions.append({"emotion": "neutral", "score": 0.8, "intensity": "medium"})
else:
emotions.append({"emotion": "negative", "score": min(abs(avg_polarity), 1.0), "intensity": "high"})
# Construct full analysis response with model sentiment features
polarity_variance = model_sentiment.get("polarity_variance", 0)
return {
"advancedSentiment": {
"emotions": emotions,
"confidence": 70 + (ai_detection["confidence"] * 0.3),
"context": f"The text appears to be {'AI-Generated' if ai_detection['classification'] == 'ai' else 'Human-Authored'} based on linguistic patterns and sentiment analysis.",
"avg_polarity": model_sentiment.get("avg_polarity", 0),
"polarity_variance": polarity_variance
},
"topics": [
{
"topic": "General Content",
"relevance": 0.8,
"keywords": words[:5] if len(words) >= 5 else words
}
],
"writingStyle": {
"tone": tone,
"formality": formality,
"complexity": complexity,
"style": [formality, complexity, tone],
"audience": audience,
"sentiment_consistency": "very_low" if polarity_variance <= 0.10 else "low" if polarity_variance <= 0.35 else "moderate" if polarity_variance <= 0.60 else "high"
},
"insights": insights,
"plagiarismRisk": {
"score": int(ai_prob * 100) if is_ai else (int(ai_prob * 70) if is_mixed else 10),
"level": "high" if is_ai and ai_prob > 0.8 else "medium" if (is_ai or is_mixed) else "low",
"details": f"{'High' if is_ai else 'Moderate' if is_mixed else 'Low'} similarity to AI-generated patterns detected."
},
"contentQuality": {
"overall": int(85 - (ai_prob * 20)) if is_ai else (int(80 - (ai_prob * 15)) if is_mixed else 90),
"clarity": int(90 - (ai_prob * 10)) if not is_mixed else int(85 - (ai_prob * 8)),
"coherence": int(88 - (ai_prob * 8)) if not is_mixed else int(82 - (ai_prob * 6)),
"engagement": int(75 - (ai_prob * 25)) if not is_mixed else int(70 - (ai_prob * 20)),
"originality": int(60 - (ai_prob * 40)) if is_ai else (int(70 - (ai_prob * 30)) if is_mixed else 85)
},
"aiOrHuman": ai_detection["classification"],
"aiOrHumanConfidence": ai_detection["confidence"],
"aiOrHumanExplanation": ai_detection["explanation"]
}
def get_model_info(self) -> Dict[str, Any]:
"""Get information about the loaded models"""
return {
"model_loaded": self.model_loaded,
"model_path": self.model_path,
"device": str(self.device),
"max_length": self.max_length,
"architecture": "Two-Branch (DeBERTa + Sentiment Features)",
"primary_model": "DeBERTa-v3-large (desklib/ai-text-detector-v1.01)",
"sentiment_model": "DistilBERT-SST-2",
"classifier": "XGBoost" if self.xgboost_model is not None else "DeBERTa Linear",
"features": [
"DeBERTa embeddings (1024 dimensions)",
"Average sentiment polarity",
"Sentiment polarity variance"
],
"description": "Two-branch model for detecting AI-Generated vs Human-Authored text using DeBERTa semantic embeddings combined with sentiment features"
}
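# Minimal smoke-test sketch, assuming the model artifacts exist at the resolved path.
# Illustrative only; this module is normally imported by the backend service.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    handler = AIDetectionModelHandler()
    sample = "This sample sentence is repeated to clear the 80-word minimum. " * 20
    print(handler.get_model_info())
    print(handler.detect_ai(sample))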