Jaja-09 committed on
Commit 78a22a6 · 1 Parent(s): b8013b4

backend: add FastAPI app, requirements, Dockerfile (HF model download)

Files changed (3)
  1. app.py +256 -0
  2. model_handler.py +859 -0
  3. requirements.txt +25 -0
app.py ADDED
@@ -0,0 +1,256 @@
"""
FastAPI Backend Server for AuthorCheck AI Detection
Uses DeBERTa model for AI-generated text detection
"""

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from typing import List, Optional
import uvicorn
from model_handler import AIDetectionModelHandler
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize FastAPI app
app = FastAPI(
    title="AuthorCheck API",
    description="AI-powered text analysis and detection API",
    version="1.0.0"
)

# Configure CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with your frontend URL
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize model handler
model_handler = AIDetectionModelHandler()


# Request/Response Models
class AnalysisRequest(BaseModel):
    text: str = Field(..., min_length=1, description="Text to analyze")
    analysisTypes: Optional[List[str]] = Field(
        default=["all"],
        description="Types of analysis to perform"
    )
    model: Optional[str] = Field(
        default="ai-detector",
        description="Model to use for analysis"
    )


class EmotionData(BaseModel):
    emotion: str
    score: float
    intensity: str


class AdvancedSentiment(BaseModel):
    emotions: List[EmotionData]
    confidence: float
    context: str
    avg_polarity: Optional[float] = None
    polarity_variance: Optional[float] = None


class TopicData(BaseModel):
    topic: str
    relevance: float
    keywords: List[str]


class WritingStyle(BaseModel):
    tone: str
    formality: str
    complexity: str
    style: List[str]
    audience: str
    sentiment_consistency: Optional[str] = None


class Insight(BaseModel):
    type: str
    title: str
    description: str
    suggestion: Optional[str] = None


class PlagiarismRisk(BaseModel):
    score: int
    level: str
    details: str


class ContentQuality(BaseModel):
    overall: int
    clarity: int
    coherence: int
    engagement: int
    originality: int


class AnalysisResponse(BaseModel):
    advancedSentiment: AdvancedSentiment
    topics: List[TopicData]
    writingStyle: WritingStyle
    insights: List[Insight]
    plagiarismRisk: PlagiarismRisk
    contentQuality: ContentQuality
    aiOrHuman: str
    aiOrHumanConfidence: float
    aiOrHumanExplanation: str


# API Endpoints
@app.get("/")
async def root():
    """Root endpoint - API health check"""
    return {
        "status": "online",
        "message": "AuthorCheck API is running",
        "version": "1.0.0"
    }


@app.get("/health")
async def health_check():
    """Health check endpoint"""
    try:
        model_loaded = model_handler.is_loaded()
        return {
            "status": "healthy" if model_loaded else "degraded",
            "model_loaded": model_loaded,
            "model_type": "DeBERTa AI Detector"
        }
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        return {
            "status": "unhealthy",
            "error": str(e)
        }


@app.post("/api/analyze", response_model=AnalysisResponse)
async def analyze_text(request: AnalysisRequest):
    """
    Analyze text using the DeBERTa AI detection model

    Returns comprehensive analysis including:
    - AI vs Human detection
    - Sentiment analysis
    - Topic detection
    - Writing style analysis
    - Content quality metrics
    """
    try:
        if not request.text or len(request.text.strip()) == 0:
            raise HTTPException(status_code=400, detail="Text cannot be empty")

        # Check text length for meaningful analysis (200-7000 words)
        word_count = len(request.text.split())
        if word_count < 200:
            raise HTTPException(
                status_code=400,
                detail="Text is too short for analysis. Please provide at least 200 words for accurate AI detection and sentiment analysis."
            )

        if word_count > 7000:
            raise HTTPException(
                status_code=400,
                detail="Text is too long for analysis. Maximum 7,000 words allowed."
            )

        # Perform AI detection using the model
        logger.info(f"Analyzing text of length: {len(request.text)}")
        analysis_result = model_handler.analyze_text(request.text)

        logger.info(f"Analysis complete: {analysis_result['aiOrHuman']} ({analysis_result['aiOrHumanConfidence']:.2f}%)")

        return analysis_result

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Analysis error: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Analysis failed: {str(e)}"
        )


@app.post("/api/detect")
async def detect_ai(request: AnalysisRequest):
    """
    Simple endpoint for AI detection only
    Returns just the AI/Human classification
    """
    try:
        if not request.text or len(request.text.strip()) == 0:
            raise HTTPException(status_code=400, detail="Text cannot be empty")

        # Check text length (200-7000 words)
        word_count = len(request.text.split())
        if word_count < 200:
            raise HTTPException(
                status_code=400,
                detail="Text is too short. Please provide at least 200 words."
            )
        elif word_count > 7000:
            raise HTTPException(
                status_code=400,
                detail="Text is too long. Maximum 7,000 words allowed."
            )

        result = model_handler.detect_ai(request.text)

        return {
            "text": request.text[:100] + "..." if len(request.text) > 100 else request.text,
            "classification": result["classification"],
            "prediction": result.get("prediction", result["classification"]),
            "probability": result["probability"],
            "confidence": result["confidence"],
            "explanation": result["explanation"],
            "mixed_analysis": result.get("mixed_analysis", None)
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Detection error: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Detection failed: {str(e)}"
        )


@app.get("/api/model/info")
async def model_info():
    """Get information about the loaded model"""
    try:
        return model_handler.get_model_info()
    except Exception as e:
        logger.error(f"Model info error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


# Run the server
if __name__ == "__main__":
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
        log_level="info"
    )
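
Once the server is running (python app.py, or uvicorn app:app), the API can be exercised with a short client script. A minimal sketch, assuming a local server on port 8000 and using the requests package already pinned in requirements.txt; the sample text and variable names here are hypothetical:

# Hypothetical client for the /api/analyze endpoint (not part of this commit).
import requests

# Placeholder document; the endpoint requires 200-7,000 words.
SAMPLE_TEXT = "Your document goes here. " * 250

response = requests.post(
    "http://localhost:8000/api/analyze",
    json={"text": SAMPLE_TEXT, "analysisTypes": ["all"], "model": "ai-detector"},
    timeout=120,  # the first request can be slow while the models warm up
)
response.raise_for_status()
result = response.json()
print(result["aiOrHuman"], f"{result['aiOrHumanConfidence']:.1f}%")
print(result["aiOrHumanExplanation"])

The /health endpoint can be polled the same way (a GET returning model_loaded) to confirm the models finished loading before sending documents.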
model_handler.py ADDED
@@ -0,0 +1,859 @@
"""
Model Handler for Two-Branch AI Detection Model
Combines DeBERTa embeddings with sentiment features
Uses XGBoost for final classification
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel, AutoModelForSequenceClassification
import os
import logging
from typing import Dict, Any, Optional, List, Tuple
import numpy as np
from pathlib import Path
import xgboost as xgb
import json
import nltk
from nltk.tokenize import sent_tokenize

# Download NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab', quiet=True)

logger = logging.getLogger(__name__)


class DesklibAIDetectionModel(PreTrainedModel):
    """
    DeBERTa-based AI detection model
    Architecture from desklib/ai-text-detector-v1.01
    """
    config_class = AutoConfig

    def __init__(self, config):
        super().__init__(config)
        # Initialize the base transformer model
        self.model = AutoModel.from_config(config)
        # Define a classifier head
        self.classifier = nn.Linear(config.hidden_size, 1)
        # Initialize weights
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Forward pass through the transformer
        outputs = self.model(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]

        # Mean pooling
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
        sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
        pooled_output = sum_embeddings / sum_mask

        # Classifier
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1), labels.float())

        output = {"logits": logits}
        if loss is not None:
            output["loss"] = loss
        return output


class AIDetectionModelHandler:
    """
    Handles Two-Branch AI detection:
    - DeBERTa for semantic embeddings
    - Sentiment features (avg_polarity, polarity_variance)
    - XGBoost for final classification
    """

    def __init__(self, model_path: Optional[str] = None, max_length: int = 512):
        """
        Initialize the model handler

        Args:
            model_path: Path to the model directory (default: ../model/model)
            max_length: Maximum token length for input text
        """
        self.max_length = max_length
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.deberta_model = None
        self.tokenizer = None
        self.sentiment_model = None
        self.sentiment_tokenizer = None
        self.xgboost_model = None
        self.model_loaded = False

        # Default model paths
        if model_path is None:
            backend_dir = Path(__file__).parent
            model_path = str(backend_dir.parent / "model" / "model")

        self.model_path = model_path
        self.xgboost_path = str(Path(model_path).parent / "xgboost_model.json")

        # Load the models
        self._load_models()

    def _load_models(self):
        """Load DeBERTa, sentiment model, and XGBoost classifier"""
        try:
            logger.info(f"Loading models from: {self.model_path}")
            logger.info(f"Using device: {self.device}")

            # Check if model path exists
            if not os.path.exists(self.model_path):
                logger.error(f"Model path does not exist: {self.model_path}")
                raise FileNotFoundError(f"Model not found at {self.model_path}")

            # 1. Load DeBERTa tokenizer and model
            logger.info("Loading DeBERTa tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            logger.info("Loading DeBERTa model...")
            self.deberta_model = DesklibAIDetectionModel.from_pretrained(self.model_path)
            self.deberta_model.to(self.device)
            self.deberta_model.eval()

            # 2. Load sentiment analysis model (DistilBERT)
            logger.info("Loading sentiment model...")
            sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
            self.sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
            self.sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
            self.sentiment_model.to(self.device)
            self.sentiment_model.eval()

            # 3. Load XGBoost model
            if os.path.exists(self.xgboost_path):
                logger.info(f"Loading XGBoost model from: {self.xgboost_path}")
                self.xgboost_model = xgb.Booster()
                self.xgboost_model.load_model(self.xgboost_path)
                logger.info("✅ XGBoost model loaded!")
            else:
                logger.warning(f"XGBoost model not found at {self.xgboost_path}, using DeBERTa only")
                self.xgboost_model = None

            self.model_loaded = True
            logger.info("✅ All models loaded successfully!")

        except Exception as e:
            logger.error(f"Failed to load models: {e}", exc_info=True)
            self.model_loaded = False
            raise

    def is_loaded(self) -> bool:
        """Check if model is loaded"""
        return self.model_loaded

    def get_sentiment_scores(self, text: str) -> List[float]:
        """
        Extract sentiment scores for each sentence using DistilBERT

        Args:
            text: Input text

        Returns:
            List of sentiment scores (polarity) for each sentence
        """
        try:
            # Tokenize into sentences
            sentences = sent_tokenize(text)
            if not sentences:
                return [0.5]  # Neutral if no sentences

            scores = []

            with torch.no_grad():
                for sentence in sentences:
                    # Tokenize sentence
                    inputs = self.sentiment_tokenizer(
                        sentence,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=512
                    )
                    inputs = {k: v.to(self.device) for k, v in inputs.items()}

                    # Get sentiment prediction
                    outputs = self.sentiment_model(**inputs)
                    logits = outputs.logits
                    probabilities = F.softmax(logits, dim=-1)

                    # Get positive sentiment probability (index 1)
                    pos_prob = probabilities[0][1].item()
                    # Convert to polarity score (-1 to 1, where 0.5 is neutral)
                    polarity = (pos_prob - 0.5) * 2  # Maps [0,1] to [-1,1]
                    scores.append(polarity)

            return scores

        except Exception as e:
            logger.error(f"Error extracting sentiment scores: {e}")
            return [0.0]  # Return neutral on error

    def extract_sentiment_features(self, text: str) -> np.ndarray:
        """
        Extract avg_polarity and polarity_variance from text

        Args:
            text: Input text

        Returns:
            Numpy array with [avg_polarity, polarity_variance]
        """
        sentiment_scores = self.get_sentiment_scores(text)

        # Calculate features
        avg_polarity = float(np.mean(sentiment_scores)) if sentiment_scores else 0.0
        polarity_variance = float(np.var(sentiment_scores)) if len(sentiment_scores) > 1 else 0.0

        return np.array([avg_polarity, polarity_variance], dtype=np.float32)

    def get_deberta_embeddings(self, text: str) -> np.ndarray:
        """
        Get DeBERTa embeddings for text using mean pooling

        Args:
            text: Input text

        Returns:
            Numpy array of embeddings
        """
        try:
            # Tokenize input
            encoded = self.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt'
            )

            input_ids = encoded['input_ids'].to(self.device)
            attention_mask = encoded['attention_mask'].to(self.device)

            # Get embeddings
            with torch.no_grad():
                outputs = self.deberta_model.model(input_ids=input_ids, attention_mask=attention_mask)
                last_hidden_state = outputs[0]

                # Mean pooling
                input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
                sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
                sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
                pooled_output = sum_embeddings / sum_mask

            # Convert to numpy
            embeddings = pooled_output.cpu().numpy().flatten()

            return embeddings

        except Exception as e:
            logger.error(f"Error extracting DeBERTa embeddings: {e}", exc_info=True)
            raise

    def predict_probability(self, text: str, threshold: float = 0.5) -> Dict[str, Any]:
        """
        Predict if text is AI-generated using two-branch architecture

        Args:
            text: Input text to analyze
            threshold: Classification threshold (default: 0.5)

        Returns:
            Dictionary with probability, label, sentiment features
        """
        if not self.model_loaded:
            raise RuntimeError("Model not loaded. Cannot perform prediction.")

        try:
            # Extract sentiment features
            logger.info("Extracting sentiment features...")
            sentiment_features = self.extract_sentiment_features(text)
            avg_polarity = float(sentiment_features[0])
            polarity_variance = float(sentiment_features[1])

            # If XGBoost is available, use the full two-branch pipeline
            if self.xgboost_model is not None:
                logger.info("Using XGBoost two-branch model...")

                # Get DeBERTa embeddings
                deberta_embeddings = self.get_deberta_embeddings(text)

                # Combine features: DeBERTa embeddings + sentiment features
                combined_features = np.concatenate([deberta_embeddings, sentiment_features])

                # Create DMatrix for XGBoost
                dmatrix = xgb.DMatrix(combined_features.reshape(1, -1))

                # Predict
                probability = float(self.xgboost_model.predict(dmatrix)[0])

            else:
                # Fallback to DeBERTa only
                logger.info("Using DeBERTa model only (XGBoost not found)...")

                encoded = self.tokenizer(
                    text,
                    padding='max_length',
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors='pt'
                )

                input_ids = encoded['input_ids'].to(self.device)
                attention_mask = encoded['attention_mask'].to(self.device)

                with torch.no_grad():
                    outputs = self.deberta_model(input_ids=input_ids, attention_mask=attention_mask)
                    logits = outputs["logits"]
                    probability = torch.sigmoid(logits).item()

            label = 1 if probability >= threshold else 0

            return {
                "probability": probability,
                "label": label,
                "classification": "ai" if label == 1 else "human",
                "confidence": probability if label == 1 else (1 - probability),
                "sentiment_features": {
                    "avg_polarity": avg_polarity,
                    "polarity_variance": polarity_variance
                }
            }

        except Exception as e:
            logger.error(f"Prediction error: {e}", exc_info=True)
            raise

    def predict_single_text_xgboost(self, text: str) -> Tuple[float, int]:
        """
        Predict AI probability and label for a single text using XGBoost model

        Args:
            text: Input text to analyze

        Returns:
            Tuple of (probability, label) where label is 0 for human, 1 for AI
        """
        try:
            # Extract sentiment features
            sentiment_features = self.extract_sentiment_features(text)
            avg_polarity = float(sentiment_features[0])
            polarity_variance = float(sentiment_features[1])

            # If XGBoost is available, use the full two-branch pipeline
            if self.xgboost_model is not None:
                # Get DeBERTa embeddings
                deberta_embeddings = self.get_deberta_embeddings(text)

                # Combine features: DeBERTa embeddings + sentiment features
                combined_features = np.concatenate([deberta_embeddings, sentiment_features])

                # Create DMatrix for XGBoost
                dmatrix = xgb.DMatrix(combined_features.reshape(1, -1))

                # Predict
                probability = float(self.xgboost_model.predict(dmatrix)[0])

            else:
                # Fallback to DeBERTa only
                encoded = self.tokenizer(
                    text,
                    padding='max_length',
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors='pt'
                )

                input_ids = encoded['input_ids'].to(self.device)
                attention_mask = encoded['attention_mask'].to(self.device)

                with torch.no_grad():
                    outputs = self.deberta_model(input_ids=input_ids, attention_mask=attention_mask)
                    logits = outputs["logits"]
                    probability = torch.sigmoid(logits).item()

            label = 1 if probability >= 0.5 else 0

            return probability, label

        except Exception as e:
            logger.error(f"Single text prediction error: {e}", exc_info=True)
            raise

    def detect_mixed_text_chunk_based(self, text: str, chunk_size: int = 4, overlap: int = 1, min_chunk_length: int = 50) -> Dict[str, Any]:
        """
        Improved mixed text detection using chunk-based analysis that influences overall probability

        Args:
            text: Input text string
            chunk_size: Number of sentences per chunk (default: 4)
            overlap: Number of sentences to overlap between chunks (default: 1)
            min_chunk_length: Minimum character length for a chunk to be analyzed

        Returns:
            Dictionary with prediction results and analysis details

        Note:
            Input validation: Text must be 200-7000 words. Dynamic chunking: 4-5 sentences
            are analyzed as a whole; beyond that, chunk size varies:
            - 6-10 sentences: 3 sentences per chunk
            - 11-20 sentences: 4 sentences per chunk
            - 21-30 sentences: 5 sentences per chunk
            - 31+ sentences: 6 sentences per chunk
            Uses overlapping chunks to capture transitions between AI and human content.
        """
        # Get overall prediction for the full text
        overall_prob, overall_label = self.predict_single_text_xgboost(text)

        # Split text into sentences
        sentences = sent_tokenize(text)

        # Validate input text length (200-7000 words)
        total_words = len(text.split())
        if total_words < 200:
            return {
                'prediction': 'Human' if overall_label == 0 else 'AI',
                'confidence': abs(overall_prob - 0.5) * 2,
                'is_mixed': False,
                'reason': f'Text too short for analysis ({total_words} words, minimum 200 words required)',
                'overall_probability': overall_prob,
                'modified_probability': overall_prob,
                'chunk_analysis': []
            }
        elif total_words > 7000:
            return {
                'prediction': 'Human' if overall_label == 0 else 'AI',
                'confidence': abs(overall_prob - 0.5) * 2,
                'is_mixed': False,
                'reason': f'Text too long for analysis ({total_words} words, maximum 7000 words allowed)',
                'overall_probability': overall_prob,
                'modified_probability': overall_prob,
                'chunk_analysis': []
            }

        # Dynamic chunking based on total sentence count
        total_sentences = len(sentences)

        # For 4-5 sentences, analyze as whole (no chunking)
        if total_sentences <= 5:
            return {
                'prediction': 'Human' if overall_label == 0 else 'AI',
                'confidence': abs(overall_prob - 0.5) * 2,
                'is_mixed': False,
                'reason': f'Analyzing {total_sentences} sentences as whole (4-5 sentence range)',
                'overall_probability': overall_prob,
                'modified_probability': overall_prob,
                'chunk_analysis': []
            }

        # Dynamic chunk size based on total sentences
        if total_sentences <= 10:
            dynamic_chunk_size = 3
        elif total_sentences <= 20:
            dynamic_chunk_size = 4
        elif total_sentences <= 30:
            dynamic_chunk_size = 5
        else:
            dynamic_chunk_size = 6  # For very long texts

        # Ensure we have enough sentences for at least 2 chunks
        if total_sentences < dynamic_chunk_size * 2:
            return {
                'prediction': 'Human' if overall_label == 0 else 'AI',
                'confidence': abs(overall_prob - 0.5) * 2,
                'is_mixed': False,
                'reason': f'Text too short for chunk analysis ({total_sentences} sentences, need at least {dynamic_chunk_size * 2})',
                'overall_probability': overall_prob,
                'modified_probability': overall_prob,
                'chunk_analysis': []
            }

        # Create overlapping chunks
        chunks = []
        chunk_predictions = []
        chunk_probabilities = []

        logger.info(f"Analyzing text with {total_sentences} sentences using dynamic chunk size of {dynamic_chunk_size}...")

        for i in range(0, len(sentences) - dynamic_chunk_size + 1, dynamic_chunk_size - overlap):
            # Create chunk from sentences
            chunk_sentences = sentences[i:i + dynamic_chunk_size]
            chunk_text = ' '.join(chunk_sentences)

            # Only analyze chunks that meet minimum length requirement
            if len(chunk_text.strip()) >= min_chunk_length:
                chunks.append(chunk_text)

                # Analyze this chunk
                prob, label = self.predict_single_text_xgboost(chunk_text)
                chunk_predictions.append((prob, label))
                chunk_probabilities.append(prob)

                logger.info(f"  Chunk {len(chunks)}: {chunk_text[:60]}... → {'AI' if label == 1 else 'Human'} ({prob:.3f})")

        if len(chunk_predictions) < 2:
            return {
                'prediction': 'Human' if overall_label == 0 else 'AI',
                'confidence': abs(overall_prob - 0.5) * 2,
                'is_mixed': False,
                'reason': 'Too few chunks for mixed analysis',
                'overall_probability': overall_prob,
                'modified_probability': overall_prob,
                'chunk_analysis': chunk_predictions
            }

        # Count human vs AI chunks
        human_chunks = sum(1 for _, label in chunk_predictions if label == 0)
        ai_chunks = sum(1 for _, label in chunk_predictions if label == 1)
        total_chunks = len(chunk_predictions)

        # Mixed text detection logic
        is_mixed = human_chunks > 0 and ai_chunks > 0
        mixed_ratio = min(human_chunks, ai_chunks) / total_chunks

        logger.info("\nChunk Analysis Summary:")
        logger.info(f"  Total chunks analyzed: {total_chunks}")
        logger.info(f"  Human chunks: {human_chunks}")
        logger.info(f"  AI chunks: {ai_chunks}")
        logger.info(f"  Mixed ratio: {mixed_ratio:.2f}")

        # MODIFY OVERALL PROBABILITY BASED ON CHUNK ANALYSIS
        if is_mixed and mixed_ratio > 0.25:  # At least 25% of each type
            # Calculate weighted average of chunk probabilities
            # Weight by chunk length (longer chunks have more influence)
            chunk_weights = [len(chunk) for chunk in chunks]
            total_weight = sum(chunk_weights)

            # Calculate weighted average probability
            weighted_prob = sum(prob * weight for prob, weight in zip(chunk_probabilities, chunk_weights)) / total_weight

            # Blend original overall probability with chunk-based probability
            # More chunks = more influence from chunk analysis
            chunk_influence = min(total_chunks / 5.0, 1.0)  # Max influence at 5+ chunks
            modified_prob = (overall_prob * (1 - chunk_influence)) + (weighted_prob * chunk_influence)

            final_prediction = 'Mixed'
            confidence = 1.0 - mixed_ratio  # Lower confidence for mixed text

            logger.info("  → MIXED TEXT DETECTED!")
            logger.info(f"  → Original overall probability: {overall_prob:.3f}")
            logger.info(f"  → Weighted chunk probability: {weighted_prob:.3f}")
            logger.info(f"  → Chunk influence factor: {chunk_influence:.3f}")
            logger.info(f"  → Modified probability: {modified_prob:.3f}")

        else:
            # Pure text - use chunk analysis to refine overall probability
            chunk_avg_prob = np.mean(chunk_probabilities)

            # Blend overall and chunk probabilities (chunks have 30% influence for pure text)
            modified_prob = (overall_prob * 0.7) + (chunk_avg_prob * 0.3)

            final_prediction = 'Human' if modified_prob < 0.5 else 'AI'
            confidence = abs(modified_prob - 0.5) * 2

            logger.info(f"  → Pure {final_prediction} text")
            logger.info(f"  → Original overall probability: {overall_prob:.3f}")
            logger.info(f"  → Average chunk probability: {chunk_avg_prob:.3f}")
            logger.info(f"  → Modified probability: {modified_prob:.3f}")

        return {
            'prediction': final_prediction,
            'confidence': confidence,
            'is_mixed': is_mixed,
            'mixed_ratio': mixed_ratio,
            'human_chunks': human_chunks,
            'ai_chunks': ai_chunks,
            'total_chunks': total_chunks,
            'overall_probability': overall_prob,
            'modified_probability': modified_prob,
            'chunk_probabilities': chunk_probabilities,
            'chunk_analysis': chunk_predictions,
            'chunk_size': chunk_size,
            'overlap': overlap
        }

    def detect_ai(self, text: str) -> Dict[str, Any]:
        """
        AI detection with chunk-based mixed text analysis

        Args:
            text: Input text

        Returns:
            Detection results with sentiment features and mixed text analysis
        """
        # Use chunk-based detection for better mixed text handling
        chunk_result = self.detect_mixed_text_chunk_based(text)

        # Get sentiment features for explanation
        sentiment_features = self.extract_sentiment_features(text)
        avg_pol = float(sentiment_features[0])
        pol_var = float(sentiment_features[1])

        # Map confidence to a qualitative certainty level
        confidence_pct = chunk_result["confidence"] * 100
        prediction = chunk_result["prediction"]

        if confidence_pct > 90:
            certainty = "very high confidence"
        elif confidence_pct > 75:
            certainty = "high confidence"
        elif confidence_pct > 60:
            certainty = "moderate confidence"
        else:
            certainty = "low confidence"

        # Generate explanation based on prediction type
        if prediction == "Mixed":
            explanation = "This text appears to be a mixture of AI-generated and human-authored text."
            explanation += " This mixed composition suggests the text may have been collaboratively written or heavily edited."

            # Add sentiment insights for mixed text
            if pol_var > 0.60:
                explanation += " High emotional variation across sections indicates significant style differences between parts."
            elif pol_var >= 0.36:
                explanation += " Moderate emotional variation suggests different writing styles in various sections."
            else:
                explanation += " Low emotional variation may indicate consistent editing or similar writing styles throughout."

        elif prediction == "AI":
            explanation = f"This text is classified as AI-Generated with {certainty}."
            explanation += " The text exhibits patterns typical of AI-generated content, including consistent structure and predictable phrasing."
            if pol_var <= 0.10:
                explanation += " Very low emotional variation, which is typical of AI texts with uniform style."
            elif pol_var <= 0.35:
                explanation += " Low emotional variation, which is common in AI-generated content."
            elif pol_var <= 0.60:
                explanation += " Moderate emotional variation, which is rare in AI and may reflect multiple viewpoints."
            else:
                explanation += " High emotional variation is unusual for AI and may indicate a balanced argument structure."
        else:  # Human
            explanation = f"This text is classified as Human-Authored with {certainty}."
            explanation += " The text shows characteristics of human writing, such as natural variations and organic flow."
            if pol_var > 0.60:
                explanation += " High emotional variation, which is typical of human writing with emotional swings in debates, reviews, and narratives."
            elif pol_var >= 0.36:
                explanation += " Moderate emotional variation, which shows human-like sentiment shifts."
            elif pol_var >= 0.11:
                explanation += " Low emotional variation, which may indicate formal or academic human writing."
            else:
                explanation += " Very low emotional variation indicates a consistent tone with a focused perspective."

        # Convert prediction to classification format for backward compatibility
        classification_map = {"AI": "ai", "Human": "human", "Mixed": "mixed"}
        classification = classification_map.get(prediction, "unknown")

        return {
            "classification": classification,
            "prediction": prediction,
            "probability": chunk_result["modified_probability"],
            "confidence": confidence_pct,
            "explanation": explanation,
            "sentiment_features": {
                "avg_polarity": avg_pol,
                "polarity_variance": pol_var
            },
            "mixed_analysis": {
                "is_mixed": chunk_result["is_mixed"],
                "mixed_ratio": chunk_result.get("mixed_ratio", 0),
                "human_chunks": chunk_result.get("human_chunks", 0),
                "ai_chunks": chunk_result.get("ai_chunks", 0),
                "total_chunks": chunk_result.get("total_chunks", 0),
                "overall_probability": chunk_result["overall_probability"],
                "modified_probability": chunk_result["modified_probability"]
            }
        }

    def analyze_text(self, text: str) -> Dict[str, Any]:
        """
        Comprehensive text analysis combining AI detection with sentiment features

        Args:
            text: Input text to analyze

        Returns:
            Complete analysis results with model-based sentiment features
        """
        # Validate input text length (200-7000 words)
        total_words = len(text.split())
        if total_words < 200:
            raise ValueError(f"Text too short for analysis ({total_words} words, minimum 200 words required)")
        elif total_words > 7000:
            raise ValueError(f"Text too long for analysis ({total_words} words, maximum 7000 words allowed)")

        # Get AI detection results (includes sentiment features from model)
        ai_detection = self.detect_ai(text)
        model_sentiment = ai_detection.get("sentiment_features", {})

        # Perform basic text analysis
        words = text.split()
        sentences = [s.strip() for s in text.replace('!', '.').replace('?', '.').split('.') if s.strip()]

        # Calculate basic metrics
        word_count = len(words)
        sentence_count = len(sentences)
        avg_word_length = np.mean([len(w) for w in words]) if words else 0
        avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0

        # Determine complexity based on AI probability and text metrics
        is_ai = ai_detection["classification"] == "ai"
        is_mixed = ai_detection["classification"] == "mixed"
        ai_prob = ai_detection["probability"]

        # Handle different prediction types
        if is_mixed:
            formality = "mixed"
            complexity = "variable"
            tone = "Mixed (AI/Human)"
            audience = "Variable"
        elif is_ai:
            formality = "formal" if ai_prob > 0.7 else "neutral"
            complexity = "complex" if avg_word_length > 6 else "moderate"
            tone = "Professional"
            audience = "General to Academic"
        else:
            formality = "casual" if avg_word_length < 5 else "neutral"
            complexity = "simple" if avg_sentence_length < 15 else "moderate"
            tone = "Conversational"
            audience = "General Public"

        # Generate insights based on detection results
        insights = []

        if is_mixed and ai_detection["confidence"] > 60:
            mixed_analysis = ai_detection.get("mixed_analysis", {})
            insights.append({
                "type": "observation",
                "title": "Mixed Content Detected",
                "description": f"This text contains both AI-generated and human-authored sections ({ai_detection['confidence']:.1f}% confidence).",
                "suggestion": "Consider reviewing the text for consistency and ensuring all sections align with your intended voice and style."
            })
            insights.append({
                "type": "observation",
                "title": "Content Composition",
                "description": f"Analysis found {mixed_analysis.get('human_chunks', 0)} human-like sections and {mixed_analysis.get('ai_chunks', 0)} AI-like sections.",
                "suggestion": "The mixed nature suggests collaborative writing or heavy editing. Consider standardizing the writing style throughout."
            })
        elif is_ai and ai_detection["confidence"] > 75:
            insights.append({
                "type": "observation",
                "title": "AI-Generated Content Detected",
                "description": f"This text shows strong indicators of AI generation ({ai_detection['confidence']:.1f}% confidence).",
                "suggestion": "Consider adding personal anecdotes, varied sentence structures, or unique perspectives to make it more human-like."
            })
        elif not is_ai and ai_detection["confidence"] > 75:
            insights.append({
                "type": "strength",
                "title": "Human Writing Characteristics",
                "description": f"This text exhibits clear human writing patterns ({ai_detection['confidence']:.1f}% confidence)."
            })

        # Sentence variety analysis
        if sentence_count > 2:
            sentence_lengths = [len(s.split()) for s in sentences]
            std_dev = np.std(sentence_lengths)
            if std_dev < 3:
                insights.append({
                    "type": "improvement",
                    "title": "Sentence Variety",
                    "description": "Sentences have similar lengths, which may indicate AI generation.",
                    "suggestion": "Vary sentence lengths to create more natural rhythm."
                })
            else:
                insights.append({
                    "type": "strength",
                    "title": "Good Sentence Variety",
                    "description": "Text shows natural variation in sentence structure."
                })

        # Generate emotions based on model sentiment polarity (data-driven ranges)
        avg_polarity = model_sentiment.get("avg_polarity", 0)
        emotions = []

        if avg_polarity >= 0.71:
            emotions.append({"emotion": "very_positive", "score": min(abs(avg_polarity), 1.0), "intensity": "high"})
        elif avg_polarity >= 0.30:
            emotions.append({"emotion": "positive", "score": min(abs(avg_polarity), 1.0), "intensity": "medium"})
        elif avg_polarity >= -0.29:
            emotions.append({"emotion": "neutral", "score": 0.8, "intensity": "medium"})
        else:
            emotions.append({"emotion": "negative", "score": min(abs(avg_polarity), 1.0), "intensity": "high"})

        # Construct full analysis response with model sentiment features
        polarity_variance = model_sentiment.get("polarity_variance", 0)

        return {
            "advancedSentiment": {
                "emotions": emotions,
                "confidence": 70 + (ai_detection["confidence"] * 0.3),
                "context": f"The text appears to be {'AI-Generated' if ai_detection['classification'] == 'ai' else 'Human-Authored'} based on linguistic patterns and sentiment analysis.",
                "avg_polarity": model_sentiment.get("avg_polarity", 0),
                "polarity_variance": polarity_variance
            },
            "topics": [
                {
                    "topic": "General Content",
                    "relevance": 0.8,
                    "keywords": words[:5] if len(words) >= 5 else words
                }
            ],
            "writingStyle": {
                "tone": tone,
                "formality": formality,
                "complexity": complexity,
                "style": [formality, complexity, tone],
                "audience": audience,
                "sentiment_consistency": "very_low" if polarity_variance <= 0.10 else "low" if polarity_variance <= 0.35 else "moderate" if polarity_variance <= 0.60 else "high"
            },
            "insights": insights,
            "plagiarismRisk": {
                "score": int(ai_prob * 100) if is_ai else (int(ai_prob * 70) if is_mixed else 10),
                "level": "high" if is_ai and ai_prob > 0.8 else "medium" if (is_ai or is_mixed) else "low",
                "details": f"{'High' if is_ai else 'Moderate' if is_mixed else 'Low'} similarity to AI-generated patterns detected."
            },
            "contentQuality": {
                "overall": int(85 - (ai_prob * 20)) if is_ai else (int(80 - (ai_prob * 15)) if is_mixed else 90),
                "clarity": int(90 - (ai_prob * 10)) if not is_mixed else int(85 - (ai_prob * 8)),
                "coherence": int(88 - (ai_prob * 8)) if not is_mixed else int(82 - (ai_prob * 6)),
                "engagement": int(75 - (ai_prob * 25)) if not is_mixed else int(70 - (ai_prob * 20)),
                "originality": int(60 - (ai_prob * 40)) if is_ai else (int(70 - (ai_prob * 30)) if is_mixed else 85)
            },
            "aiOrHuman": ai_detection["classification"],
            "aiOrHumanConfidence": ai_detection["confidence"],
            "aiOrHumanExplanation": ai_detection["explanation"]
        }

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded models"""
        return {
            "model_loaded": self.model_loaded,
            "model_path": self.model_path,
            "device": str(self.device),
            "max_length": self.max_length,
            "architecture": "Two-Branch (DeBERTa + Sentiment Features)",
            "primary_model": "DeBERTa-v3-large (desklib/ai-text-detector-v1.01)",
            "sentiment_model": "DistilBERT-SST-2",
            "classifier": "XGBoost" if self.xgboost_model is not None else "DeBERTa Linear",
            "features": [
                "DeBERTa embeddings (1024 dimensions)",
                "Average sentiment polarity",
                "Sentiment polarity variance"
            ],
            "description": "Two-branch model for detecting AI-Generated vs Human-Authored text using DeBERTa semantic embeddings combined with sentiment features"
        }
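
The dynamic chunking in detect_mixed_text_chunk_based is easiest to see with the index arithmetic pulled out. A small illustrative sketch (the chunk_spans helper is hypothetical, written only to mirror the loop above with its default overlap of 1):

# Hypothetical helper mirroring the overlapping-chunk indexing above.
def chunk_spans(total_sentences: int, chunk_size: int, overlap: int = 1):
    step = chunk_size - overlap
    return [(i, i + chunk_size) for i in range(0, total_sentences - chunk_size + 1, step)]

# For 12 sentences the table in the docstring gives chunk_size = 4, so step = 3:
print(chunk_spans(12, 4))  # [(0, 4), (3, 7), (6, 10)]

Note that with this stepping the final sentences of a text can fall outside any full chunk (indices 10 and 11 above); the full-text prediction still covers them when the overall and chunk probabilities are blended.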
requirements.txt ADDED
@@ -0,0 +1,25 @@
# FastAPI and Server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
python-multipart>=0.0.6
pydantic>=2.5.0

# Machine Learning & AI (Compatible with Python 3.13)
torch>=2.8.0
transformers>=4.35.0
safetensors>=0.4.0
xgboost>=2.0.0
nltk>=3.8.0
scikit-learn>=1.3.0

# Data Processing
numpy>=1.26.2
pandas>=2.1.3

# Utilities
python-dotenv>=1.0.0
requests>=2.31.0

# Optional: For better performance
# accelerate>=0.24.1
# optimum>=1.14.0
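
The commit message mentions a Dockerfile that performs an HF model download, but no Dockerfile appears in this change set. For context, a minimal sketch of such a download step, assuming huggingface_hub (installed as a transformers dependency) and the ../model/model layout that AIDetectionModelHandler expects by default; the repo id is the one credited in model_handler.py:

# Hypothetical model-download step (the actual Dockerfile is not in this diff).
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="desklib/ai-text-detector-v1.01",  # architecture credited in model_handler.py
    local_dir="model/model",  # assumption: matches the handler's default path
)
# xgboost_model.json is expected one directory up (model/xgboost_model.json);
# per _load_models it is a separate artifact, and the handler falls back to
# DeBERTa-only scoring if it is absent.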