Spaces:

Jaja-09
/

authorchecks-backend

Sleeping

App Files Files Community

Jaja-09 committed on Oct 12

Commit

78a22a6

1 Parent(s): b8013b4

backend: add FastAPI app, requirements, Dockerfile (HF model download)

Browse files

Files changed (3) hide show

app.py +256 -0
model_handler.py +859 -0
requirements.txt +25 -0

app.py ADDED Viewed

	@@ -0,0 +1,256 @@

+"""
+FastAPI Backend Server for AuthorCheck AI Detection
+Uses DeBERTa model for AI-generated text detection
+"""
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from typing import List, Optional
+import uvicorn
+from model_handler import AIDetectionModelHandler
+import logging
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+# Initialize FastAPI app
+app = FastAPI(
+    title="AuthorCheck API",
+    description="AI-powered text analysis and detection API",
+    version="1.0.0"
+)
+# Configure CORS
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, replace with your frontend URL
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Initialize model handler
+model_handler = AIDetectionModelHandler()
+# Request/Response Models
+class AnalysisRequest(BaseModel):
+    # Request body accepted by the /api/analyze and /api/detect endpoints.
+    # `text` must be at least one character; the endpoints additionally
+    # enforce a 200-7000 word range before running the model.
+    text: str = Field(..., min_length=1, description="Text to analyze")
+    # NOTE(review): neither endpoint visible in this file reads `analysisTypes`
+    # or `model`; they are accepted and currently ignored.
+    analysisTypes: Optional[List[str]] = Field(
+        default=["all"],
+        description="Types of analysis to perform"
+    )
+    model: Optional[str] = Field(
+        default="ai-detector",
+        description="Model to use for analysis"
+    )
+class EmotionData(BaseModel):
+    # One entry of AdvancedSentiment.emotions: an emotion label with its
+    # numeric score and a textual intensity descriptor.
+    # NOTE(review): the score range and the set of intensity values are not
+    # defined here - confirm against the model handler.
+    emotion: str
+    score: float
+    intensity: str
+class AdvancedSentiment(BaseModel):
+    # Sentiment section of AnalysisResponse: a list of emotion entries plus an
+    # overall confidence and a free-text context string.
+    emotions: List[EmotionData]
+    confidence: float
+    context: str
+    # Optional polarity statistics; presumably the sentence-level mean and
+    # variance produced by the model handler - TODO confirm.
+    avg_polarity: Optional[float] = None
+    polarity_variance: Optional[float] = None
+class TopicData(BaseModel):
+    # One entry of AnalysisResponse.topics: a topic name, its relevance score
+    # and the keywords associated with it.
+    topic: str
+    relevance: float
+    keywords: List[str]
+class WritingStyle(BaseModel):
+    # Writing-style section of AnalysisResponse. All fields are free-form
+    # strings; the vocabulary of allowed values is not constrained here.
+    tone: str
+    formality: str
+    complexity: str
+    style: List[str]
+    audience: str
+    # Optional; omitted (None) when the producer does not supply it.
+    sentiment_consistency: Optional[str] = None
+class Insight(BaseModel):
+    # One entry of AnalysisResponse.insights: a typed, titled observation about
+    # the text with an optional suggestion.
+    type: str
+    title: str
+    description: str
+    suggestion: Optional[str] = None
+class PlagiarismRisk(BaseModel):
+    # Plagiarism-risk section of AnalysisResponse: an integer score, a textual
+    # level and a details string.
+    # NOTE(review): the score scale is not defined in this file - confirm.
+    score: int
+    level: str
+    details: str
+class ContentQuality(BaseModel):
+    # Content-quality section of AnalysisResponse: an overall integer score
+    # and four integer sub-scores.
+    # NOTE(review): the scale of these scores is not defined in this file.
+    overall: int
+    clarity: int
+    coherence: int
+    engagement: int
+    originality: int
+class AnalysisResponse(BaseModel):
+    # Response schema of POST /api/analyze (used as its response_model).
+    advancedSentiment: AdvancedSentiment
+    topics: List[TopicData]
+    writingStyle: WritingStyle
+    insights: List[Insight]
+    plagiarismRisk: PlagiarismRisk
+    contentQuality: ContentQuality
+    # AI-vs-human verdict, its confidence and a human-readable explanation.
+    # The analyze endpoint logs the confidence with a "%" suffix, so it is
+    # presumably expressed on a 0-100 scale - TODO confirm.
+    aiOrHuman: str
+    aiOrHumanConfidence: float
+    aiOrHumanExplanation: str
+# API Endpoints
+@app.get("/")
+async def root():
+    """Root endpoint - API health check"""
+    return {
+        "status": "online",
+        "message": "AuthorCheck API is running",
+        "version": "1.0.0"
+    }
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    try:
+        model_loaded = model_handler.is_loaded()
+        return {
+            "status": "healthy" if model_loaded else "degraded",
+            "model_loaded": model_loaded,
+            "model_type": "DeBERTa AI Detector"
+        }
+    except Exception as e:
+        logger.error(f"Health check failed: {e}")
+        return {
+            "status": "unhealthy",
+            "error": str(e)
+        }
+@app.post("/api/analyze", response_model=AnalysisResponse)
+async def analyze_text(request: AnalysisRequest):
+    """
+    Analyze text using the DeBERTa AI detection model
+    Returns comprehensive analysis including:
+    - AI vs Human detection
+    - Sentiment analysis
+    - Topic detection
+    - Writing style analysis
+    - Content quality metrics
+    """
+    try:
+        if not request.text or len(request.text.strip()) == 0:
+            raise HTTPException(status_code=400, detail="Text cannot be empty")
+        # Check text length for meaningful analysis (200-7000 words)
+        word_count = len(request.text.split())
+        if word_count < 200:
+            raise HTTPException(
+                status_code=400,
+                detail="Text is too short for analysis. Please provide at least 200 words for accurate AI detection and sentiment analysis."
+            )
+        if word_count > 7000:
+            raise HTTPException(
+                status_code=400,
+                detail="Text is too long for analysis. Maximum 7,000 words allowed."
+            )
+        # Perform AI detection using the model
+        logger.info(f"Analyzing text of length: {len(request.text)}")
+        analysis_result = model_handler.analyze_text(request.text)
+        logger.info(f"Analysis complete: {analysis_result['aiOrHuman']} ({analysis_result['aiOrHumanConfidence']:.2f}%)")
+        return analysis_result
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Analysis error: {e}", exc_info=True)
+        raise HTTPException(
+            status_code=500,
+            detail=f"Analysis failed: {str(e)}"
+        )
+@app.post("/api/detect")
+async def detect_ai(request: AnalysisRequest):
+    """
+    Simple endpoint for AI detection only
+    Returns just the AI/Human classification
+    """
+    try:
+        if not request.text or len(request.text.strip()) == 0:
+            raise HTTPException(status_code=400, detail="Text cannot be empty")
+        # Check text length (200-7000 words)
+        word_count = len(request.text.split())
+        if word_count < 200:
+            raise HTTPException(
+                status_code=400,
+                detail="Text is too short. Please provide at least 200 words."
+            )
+        elif word_count > 7000:
+            raise HTTPException(
+                status_code=400,
+                detail="Text is too long. Maximum 7,000 words allowed."
+            )
+        result = model_handler.detect_ai(request.text)
+        return {
+            "text": request.text[:100] + "..." if len(request.text) > 100 else request.text,
+            "classification": result["classification"],
+            "prediction": result.get("prediction", result["classification"]),
+            "probability": result["probability"],
+            "confidence": result["confidence"],
+            "explanation": result["explanation"],
+            "mixed_analysis": result.get("mixed_analysis", None)
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Detection error: {e}", exc_info=True)
+        raise HTTPException(
+            status_code=500,
+            detail=f"Detection failed: {str(e)}"
+        )
+@app.get("/api/model/info")
+async def model_info():
+    """Get information about the loaded model"""
+    try:
+        return model_handler.get_model_info()
+    except Exception as e:
+        logger.error(f"Model info error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+# Run the server
+if __name__ == "__main__":
+    uvicorn.run(
+        "app:app",
+        host="0.0.0.0",
+        port=8000,
+        reload=True,
+        log_level="info"
+    )

model_handler.py ADDED Viewed

	@@ -0,0 +1,859 @@

+"""
+Model Handler for Two-Branch AI Detection Model
+Combines DeBERTa embeddings with sentiment features
+Uses XGBoost for final classification
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel, AutoModelForSequenceClassification
+import os
+import logging
+from typing import Dict, Any, Optional, List, Tuple
+import numpy as np
+from pathlib import Path
+import xgboost as xgb
+import json
+import nltk
+from nltk.tokenize import sent_tokenize
+# Download NLTK data
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt', quiet=True)
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab', quiet=True)
+logger = logging.getLogger(__name__)
+class DesklibAIDetectionModel(PreTrainedModel):
+    """
+    DeBERTa-based AI detection model
+    Architecture from desklib/ai-text-detector-v1.01
+    """
+    config_class = AutoConfig
+    def __init__(self, config):
+        super().__init__(config)
+        # Initialize the base transformer model
+        self.model = AutoModel.from_config(config)
+        # Define a classifier head
+        self.classifier = nn.Linear(config.hidden_size, 1)
+        # Initialize weights
+        self.init_weights()
+    def forward(self, input_ids, attention_mask=None, labels=None):
+        # Forward pass through the transformer
+        outputs = self.model(input_ids, attention_mask=attention_mask)
+        last_hidden_state = outputs[0]
+        # Mean pooling
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
+        sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
+        pooled_output = sum_embeddings / sum_mask
+        # Classifier
+        logits = self.classifier(pooled_output)
+        loss = None
+        if labels is not None:
+            loss_fct = nn.BCEWithLogitsLoss()
+            loss = loss_fct(logits.view(-1), labels.float())
+        output = {"logits": logits}
+        if loss is not None:
+            output["loss"] = loss
+        return output
+class AIDetectionModelHandler:
+    """
+    Handles Two-Branch AI detection:
+    - DeBERTa for semantic embeddings
+    - Sentiment features (avg_polarity, polarity_variance)
+    - XGBoost for final classification
+    """
+    def __init__(self, model_path: Optional[str] = None, max_length: int = 512):
+        """
+        Initialize the model handler
+        Args:
+            model_path: Path to the model directory (default: ../model/model)
+            max_length: Maximum token length for input text
+        """
+        self.max_length = max_length
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.deberta_model = None
+        self.tokenizer = None
+        self.sentiment_model = None
+        self.sentiment_tokenizer = None
+        self.xgboost_model = None
+        self.model_loaded = False
+        # Default model paths
+        if model_path is None:
+            backend_dir = Path(__file__).parent
+            model_path = str(backend_dir.parent / "model" / "model")
+        self.model_path = model_path
+        self.xgboost_path = str(Path(model_path).parent / "xgboost_model.json")
+        # Load the models
+        self._load_models()
+    def _load_models(self):
+        """Load DeBERTa, sentiment model, and XGBoost classifier"""
+        try:
+            logger.info(f"Loading models from: {self.model_path}")
+            logger.info(f"Using device: {self.device}")
+            # Check if model path exists
+            if not os.path.exists(self.model_path):
+                logger.error(f"Model path does not exist: {self.model_path}")
+                raise FileNotFoundError(f"Model not found at {self.model_path}")
+            # 1. Load DeBERTa tokenizer and model
+            logger.info("Loading DeBERTa tokenizer...")
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+            logger.info("Loading DeBERTa model...")
+            self.deberta_model = DesklibAIDetectionModel.from_pretrained(self.model_path)
+            self.deberta_model.to(self.device)
+            self.deberta_model.eval()
+            # 2. Load sentiment analysis model (DistilBERT)
+            logger.info("Loading sentiment model...")
+            sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+            self.sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
+            self.sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
+            self.sentiment_model.to(self.device)
+            self.sentiment_model.eval()
+            # 3. Load XGBoost model
+            if os.path.exists(self.xgboost_path):
+                logger.info(f"Loading XGBoost model from: {self.xgboost_path}")
+                self.xgboost_model = xgb.Booster()
+                self.xgboost_model.load_model(self.xgboost_path)
+                logger.info("✅ XGBoost model loaded!")
+            else:
+                logger.warning(f"XGBoost model not found at {self.xgboost_path}, using DeBERTa only")
+                self.xgboost_model = None
+            self.model_loaded = True
+            logger.info("✅ All models loaded successfully!")
+        except Exception as e:
+            logger.error(f"Failed to load models: {e}", exc_info=True)
+            self.model_loaded = False
+            raise
+    def is_loaded(self) -> bool:
+        """
+        Check if model is loaded
+        Returns:
+            The ``model_loaded`` flag, which _load_models() sets to True only
+            after every model has been loaded without error.
+        """
+        return self.model_loaded
+    def get_sentiment_scores(self, text: str) -> List[float]:
+        """
+        Extract sentiment scores for each sentence using DistilBERT
+        Args:
+            text: Input text
+        Returns:
+            List of sentiment scores (polarity) for each sentence
+        """
+        try:
+            # Tokenize into sentences
+            sentences = sent_tokenize(text)
+            if not sentences:
+                return [0.5]  # Neutral if no sentences
+            scores = []
+            with torch.no_grad():
+                for sentence in sentences:
+                    # Tokenize sentence
+                    inputs = self.sentiment_tokenizer(
+                        sentence,
+                        return_tensors="pt",
+                        padding=True,
+                        truncation=True,
+                        max_length=512
+                    )
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                    # Get sentiment prediction
+                    outputs = self.sentiment_model(**inputs)
+                    logits = outputs.logits
+                    probabilities = F.softmax(logits, dim=-1)
+                    # Get positive sentiment probability (index 1)
+                    pos_prob = probabilities[0][1].item()
+                    # Convert to polarity score (-1 to 1, where 0.5 is neutral)
+                    polarity = (pos_prob - 0.5) * 2  # Maps [0,1] to [-1,1]
+                    scores.append(polarity)
+            return scores
+        except Exception as e:
+            logger.error(f"Error extracting sentiment scores: {e}")
+            return [0.0]  # Return neutral on error
+    def extract_sentiment_features(self, text: str) -> np.ndarray:
+        """
+        Extract avg_polarity and polarity_variance from text
+        Args:
+            text: Input text
+        Returns:
+            Numpy array with [avg_polarity, polarity_variance]
+        """
+        sentiment_scores = self.get_sentiment_scores(text)
+        # Calculate features
+        avg_polarity = float(np.mean(sentiment_scores)) if sentiment_scores else 0.0
+        polarity_variance = float(np.var(sentiment_scores)) if len(sentiment_scores) > 1 else 0.0
+        return np.array([avg_polarity, polarity_variance], dtype=np.float32)
+    def get_deberta_embeddings(self, text: str) -> np.ndarray:
+        """
+        Get DeBERTa embeddings for text using mean pooling
+        Args:
+            text: Input text
+        Returns:
+            Numpy array of embeddings
+        """
+        try:
+            # Tokenize input
+            encoded = self.tokenizer(
+                text,
+                padding='max_length',
+                truncation=True,
+                max_length=self.max_length,
+                return_tensors='pt'
+            )
+            input_ids = encoded['input_ids'].to(self.device)
+            attention_mask = encoded['attention_mask'].to(self.device)
+            # Get embeddings
+            with torch.no_grad():
+                outputs = self.deberta_model.model(input_ids=input_ids, attention_mask=attention_mask)
+                last_hidden_state = outputs[0]
+                # Mean pooling
+                input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+                sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
+                sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
+                pooled_output = sum_embeddings / sum_mask
+            # Convert to numpy
+            embeddings = pooled_output.cpu().numpy().flatten()
+            return embeddings
+        except Exception as e:
+            logger.error(f"Error extracting DeBERTa embeddings: {e}", exc_info=True)
+            raise
+    def predict_probability(self, text: str, threshold: float = 0.5) -> Dict[str, Any]:
+        """
+        Predict if text is AI-generated using two-branch architecture
+        Args:
+            text: Input text to analyze
+            threshold: Classification threshold (default: 0.5)
+        Returns:
+            Dictionary with probability, label, sentiment features
+        """
+        if not self.model_loaded:
+            raise RuntimeError("Model not loaded. Cannot perform prediction.")
+        try:
+            # Extract sentiment features
+            logger.info("Extracting sentiment features...")
+            sentiment_features = self.extract_sentiment_features(text)
+            avg_polarity = float(sentiment_features[0])
+            polarity_variance = float(sentiment_features[1])
+            # If XGBoost is available, use the full two-branch pipeline
+            if self.xgboost_model is not None:
+                logger.info("Using XGBoost two-branch model...")
+                # Get DeBERTa embeddings
+                deberta_embeddings = self.get_deberta_embeddings(text)
+                # Combine features: DeBERTa embeddings + sentiment features
+                combined_features = np.concatenate([deberta_embeddings, sentiment_features])
+                # Create DMatrix for XGBoost
+                dmatrix = xgb.DMatrix(combined_features.reshape(1, -1))
+                # Predict
+                probability = float(self.xgboost_model.predict(dmatrix)[0])
+            else:
+                # Fallback to DeBERTa only
+                logger.info("Using DeBERTa model only (XGBoost not found)...")
+                encoded = self.tokenizer(
+                    text,
+                    padding='max_length',
+                    truncation=True,
+                    max_length=self.max_length,
+                    return_tensors='pt'
+                )
+                input_ids = encoded['input_ids'].to(self.device)
+                attention_mask = encoded['attention_mask'].to(self.device)
+                with torch.no_grad():
+                    outputs = self.deberta_model(input_ids=input_ids, attention_mask=attention_mask)
+                    logits = outputs["logits"]
+                    probability = torch.sigmoid(logits).item()
+            label = 1 if probability >= threshold else 0
+            return {
+                "probability": probability,
+                "label": label,
+                "classification": "ai" if label == 1 else "human",
+                "confidence": probability if label == 1 else (1 - probability),
+                "sentiment_features": {
+                    "avg_polarity": avg_polarity,
+                    "polarity_variance": polarity_variance
+                }
+            }
+        except Exception as e:
+            logger.error(f"Prediction error: {e}", exc_info=True)
+            raise
+    def predict_single_text_xgboost(self, text: str) -> Tuple[float, int]:
+        """
+        Predict AI probability and label for a single text using XGBoost model
+        Args:
+            text: Input text to analyze
+        Returns:
+            Tuple of (probability, label) where label is 0 for human, 1 for AI
+        """
+        try:
+            # Extract sentiment features
+            sentiment_features = self.extract_sentiment_features(text)
+            avg_polarity = float(sentiment_features[0])
+            polarity_variance = float(sentiment_features[1])
+            # If XGBoost is available, use the full two-branch pipeline
+            if self.xgboost_model is not None:
+                # Get DeBERTa embeddings
+                deberta_embeddings = self.get_deberta_embeddings(text)
+                # Combine features: DeBERTa embeddings + sentiment features
+                combined_features = np.concatenate([deberta_embeddings, sentiment_features])
+                # Create DMatrix for XGBoost
+                dmatrix = xgb.DMatrix(combined_features.reshape(1, -1))
+                # Predict
+                probability = float(self.xgboost_model.predict(dmatrix)[0])
+            else:
+                # Fallback to DeBERTa only
+                encoded = self.tokenizer(
+                    text,
+                    padding='max_length',
+                    truncation=True,
+                    max_length=self.max_length,
+                    return_tensors='pt'
+                )
+                input_ids = encoded['input_ids'].to(self.device)
+                attention_mask = encoded['attention_mask'].to(self.device)
+                with torch.no_grad():
+                    outputs = self.deberta_model(input_ids=input_ids, attention_mask=attention_mask)
+                    logits = outputs["logits"]
+                    probability = torch.sigmoid(logits).item()
+            label = 1 if probability >= 0.5 else 0
+            return probability, label
+        except Exception as e:
+            logger.error(f"Single text prediction error: {e}", exc_info=True)
+            raise
+    def detect_mixed_text_chunk_based(self, text: str, chunk_size: int = 4, overlap: int = 1, min_chunk_length: int = 50) -> Dict[str, Any]:
+        """
+        Improved mixed text detection using chunk-based analysis that influences overall probability
+        Args:
+            text: Input text string
+            chunk_size: Number of sentences per chunk (default: 4)
+            overlap: Number of sentences to overlap between chunks (default: 1)
+            min_chunk_length: Minimum character length for a chunk to be analyzed
+        Returns:
+            Dictionary with prediction results and analysis details
+        Note:
+            Input validation: Text must be 200-7000 words. Dynamic chunking: 4-5 sentences
+            analyzed as whole, then chunk size varies:
+            - 6-10 sentences: 3 sentences per chunk
+            - 11-20 sentences: 4 sentences per chunk
+            - 21-30 sentences: 5 sentences per chunk
+            - 31+ sentences: 6 sentences per chunk
+            Uses overlapping chunks to capture transitions between AI and human content.
+        """
+        # Get overall prediction (your current method)
+        overall_prob, overall_label = self.predict_single_text_xgboost(text)
+        # Split text into sentences
+        sentences = sent_tokenize(text)
+        # Validate input text length (200-7000 words)
+        total_words = len(text.split())
+        if total_words < 200:
+            return {
+                'prediction': 'Human' if overall_label == 0 else 'AI',
+                'confidence': abs(overall_prob - 0.5) * 2,
+                'is_mixed': False,
+                'reason': f'Text too short for analysis ({total_words} words, minimum 200 words required)',
+                'overall_probability': overall_prob,
+                'modified_probability': overall_prob,
+                'chunk_analysis': []
+            }
+        elif total_words > 7000:
+            return {
+                'prediction': 'Human' if overall_label == 0 else 'AI',
+                'confidence': abs(overall_prob - 0.5) * 2,
+                'is_mixed': False,
+                'reason': f'Text too long for analysis ({total_words} words, maximum 7000 words allowed)',
+                'overall_probability': overall_prob,
+                'modified_probability': overall_prob,
+                'chunk_analysis': []
+            }
+        # Dynamic chunking based on total sentence count
+        total_sentences = len(sentences)
+        # For 4-5 sentences, analyze as whole (no chunking)
+        if total_sentences <= 5:
+            return {
+                'prediction': 'Human' if overall_label == 0 else 'AI',
+                'confidence': abs(overall_prob - 0.5) * 2,
+                'is_mixed': False,
+                'reason': f'Analyzing {total_sentences} sentences as whole (4-5 sentence range)',
+                'overall_probability': overall_prob,
+                'modified_probability': overall_prob,
+                'chunk_analysis': []
+            }
+        # Dynamic chunk size based on total sentences
+        if total_sentences <= 10:
+            dynamic_chunk_size = 3
+        elif total_sentences <= 20:
+            dynamic_chunk_size = 4
+        elif total_sentences <= 30:
+            dynamic_chunk_size = 5
+        else:
+            dynamic_chunk_size = 6  # For very long texts
+        # Ensure we have enough sentences for at least 2 chunks
+        if total_sentences < dynamic_chunk_size * 2:
+            return {
+                'prediction': 'Human' if overall_label == 0 else 'AI',
+                'confidence': abs(overall_prob - 0.5) * 2,
+                'is_mixed': False,
+                'reason': f'Text too short for chunk analysis ({total_sentences} sentences, need at least {dynamic_chunk_size * 2})',
+                'overall_probability': overall_prob,
+                'modified_probability': overall_prob,
+                'chunk_analysis': []
+            }
+        # Create overlapping chunks
+        chunks = []
+        chunk_predictions = []
+        chunk_probabilities = []
+        logger.info(f"Analyzing text with {total_sentences} sentences using dynamic chunk size of {dynamic_chunk_size}...")
+        for i in range(0, len(sentences) - dynamic_chunk_size + 1, dynamic_chunk_size - overlap):
+            # Create chunk from sentences
+            chunk_sentences = sentences[i:i + dynamic_chunk_size]
+            chunk_text = ' '.join(chunk_sentences)
+            # Only analyze chunks that meet minimum length requirement
+            if len(chunk_text.strip()) >= min_chunk_length:
+                chunks.append(chunk_text)
+                # Analyze this chunk
+                prob, label = self.predict_single_text_xgboost(chunk_text)
+                chunk_predictions.append((prob, label))
+                chunk_probabilities.append(prob)
+                logger.info(f"  Chunk {len(chunks)}: {chunk_text[:60]}... → {'AI' if label == 1 else 'Human'} ({prob:.3f})")
+        if len(chunk_predictions) < 2:
+            return {
+                'prediction': 'Human' if overall_label == 0 else 'AI',
+                'confidence': abs(overall_prob - 0.5) * 2,
+                'is_mixed': False,
+                'reason': 'Too few chunks for mixed analysis',
+                'overall_probability': overall_prob,
+                'modified_probability': overall_prob,
+                'chunk_analysis': chunk_predictions
+            }
+        # Count human vs AI chunks
+        human_chunks = sum(1 for _, label in chunk_predictions if label == 0)
+        ai_chunks = sum(1 for _, label in chunk_predictions if label == 1)
+        total_chunks = len(chunk_predictions)
+        # Mixed text detection logic
+        is_mixed = human_chunks > 0 and ai_chunks > 0
+        mixed_ratio = min(human_chunks, ai_chunks) / total_chunks
+        logger.info(f"\nChunk Analysis Summary:")
+        logger.info(f"  Total chunks analyzed: {total_chunks}")
+        logger.info(f"  Human chunks: {human_chunks}")
+        logger.info(f"  AI chunks: {ai_chunks}")
+        logger.info(f"  Mixed ratio: {mixed_ratio:.2f}")
+        # MODIFY OVERALL PROBABILITY BASED ON CHUNK ANALYSIS
+        if is_mixed and mixed_ratio > 0.25:  # At least 25% of each type
+            # Calculate weighted average of chunk probabilities
+            # Weight by chunk length (longer chunks have more influence)
+            chunk_weights = [len(chunk) for chunk in chunks]
+            total_weight = sum(chunk_weights)
+            # Calculate weighted average probability
+            weighted_prob = sum(prob * weight for prob, weight in zip(chunk_probabilities, chunk_weights)) / total_weight
+            # Blend original overall probability with chunk-based probability
+            # More chunks = more influence from chunk analysis
+            chunk_influence = min(total_chunks / 5.0, 1.0)  # Max influence at 5+ chunks
+            modified_prob = (overall_prob * (1 - chunk_influence)) + (weighted_prob * chunk_influence)
+            final_prediction = 'Mixed'
+            confidence = 1.0 - mixed_ratio  # Lower confidence for mixed text
+            logger.info(f"  → MIXED TEXT DETECTED!")
+            logger.info(f"  → Original overall probability: {overall_prob:.3f}")
+            logger.info(f"  → Weighted chunk probability: {weighted_prob:.3f}")
+            logger.info(f"  → Chunk influence factor: {chunk_influence:.3f}")
+            logger.info(f"  → Modified probability: {modified_prob:.3f}")
+        else:
+            # Pure text - use chunk analysis to refine overall probability
+            chunk_avg_prob = np.mean(chunk_probabilities)
+            # Blend overall and chunk probabilities (chunks have 30% influence for pure text)
+            modified_prob = (overall_prob * 0.7) + (chunk_avg_prob * 0.3)
+            final_prediction = 'Human' if modified_prob < 0.5 else 'AI'
+            confidence = abs(modified_prob - 0.5) * 2
+            logger.info(f"  → Pure {final_prediction} text")
+            logger.info(f"  → Original overall probability: {overall_prob:.3f}")
+            logger.info(f"  → Average chunk probability: {chunk_avg_prob:.3f}")
+            logger.info(f"  → Modified probability: {modified_prob:.3f}")
+        return {
+            'prediction': final_prediction,
+            'confidence': confidence,
+            'is_mixed': is_mixed,
+            'mixed_ratio': mixed_ratio,
+            'human_chunks': human_chunks,
+            'ai_chunks': ai_chunks,
+            'total_chunks': total_chunks,
+            'overall_probability': overall_prob,
+            'modified_probability': modified_prob,
+            'chunk_probabilities': chunk_probabilities,
+            'chunk_analysis': chunk_predictions,
+            'chunk_size': chunk_size,
+            'overlap': overlap
+        }
+    def detect_ai(self, text: str) -> Dict[str, Any]:
+        """
+        AI detection with chunk-based mixed text analysis
+        Args:
+            text: Input text
+        Returns:
+            Detection results with sentiment features and mixed text analysis
+        """
+        # Use chunk-based detection for better mixed text handling
+        chunk_result = self.detect_mixed_text_chunk_based(text)
+        # Get sentiment features for explanation
+        sentiment_features = self.extract_sentiment_features(text)
+        avg_pol = float(sentiment_features[0])
+        pol_var = float(sentiment_features[1])
+        # Generate explanation based on prediction type
+        confidence_pct = chunk_result["confidence"] * 100
+        prediction = chunk_result["prediction"]
+        if confidence_pct > 90:
+            certainty = "very high confidence"
+        elif confidence_pct > 75:
+            certainty = "high confidence"
+        elif confidence_pct > 60:
+            certainty = "moderate confidence"
+        else:
+            certainty = "low confidence"
+        # Generate explanation based on prediction type
+        if prediction == "Mixed":
+            explanation = f"This text appears to be a mixture of AI-generated and human-authored text."
+            explanation += " This mixed composition suggests the text may have been collaboratively written or heavily edited."
+            # Add sentiment insights for mixed text
+            if pol_var > 0.60:
+                explanation += " High emotional variation across sections indicates significant style differences between parts."
+            elif pol_var >= 0.36:
+                explanation += " Moderate emotional variation suggests different writing styles in various sections."
+            else:
+                explanation += " Low emotional variation may indicate consistent editing or similar writing styles throughout."
+        elif prediction == "AI":
+            explanation = f"This text is classified as AI-Generated with {certainty}."
+            explanation += " The text exhibits patterns typical of AI-generated content, including consistent structure and predictable phrasing."
+            if pol_var <= 0.10:
+                explanation += " Very low emotional variation which is typical of AI texts with uniform style."
+            elif pol_var <= 0.35:
+                explanation += " Low emotional variation which is common in AI-generated content."
+            elif pol_var <= 0.60:
+                explanation += " Moderate emotional variation which is rare in AI, possibly presenting multiple viewpoints."
+            else:
+                explanation += " High emotional variation is unusual for AI, may indicate balanced argument structure."
+        else:  # Human
+            explanation = f"This text is classified as Human-Authored with {certainty}."
+            explanation += " The text shows characteristics of human writing, such as natural variations and organic flow."
+            if pol_var > 0.60:
+                explanation += " High emotional variation which is typical of human writing with emotional swings in debates, reviews, and narratives."
+            elif pol_var >= 0.36:
+                explanation += " Moderate emotional variation which shows human-like sentiment shifts."
+            elif pol_var >= 0.11:
+                explanation += " Low emotional variation which may indicate formal or academic human writing."
+            else:
+                explanation += " Very low emotional variation indicates consistent tone with focused perspective."
+        # Convert prediction to classification format for backward compatibility
+        classification_map = {"AI": "ai", "Human": "human", "Mixed": "mixed"}
+        classification = classification_map.get(prediction, "unknown")
+        return {
+            "classification": classification,
+            "prediction": prediction,
+            "probability": chunk_result["modified_probability"],
+            "confidence": confidence_pct,
+            "explanation": explanation,
+            "sentiment_features": {
+                "avg_polarity": avg_pol,
+                "polarity_variance": pol_var
+            },
+            "mixed_analysis": {
+                "is_mixed": chunk_result["is_mixed"],
+                "mixed_ratio": chunk_result.get("mixed_ratio", 0),
+                "human_chunks": chunk_result.get("human_chunks", 0),
+                "ai_chunks": chunk_result.get("ai_chunks", 0),
+                "total_chunks": chunk_result.get("total_chunks", 0),
+                "overall_probability": chunk_result["overall_probability"],
+                "modified_probability": chunk_result["modified_probability"]
+            }
+        }
+    def analyze_text(self, text: str) -> Dict[str, Any]:
+        """
+        Comprehensive text analysis combining AI detection with sentiment features
+        Args:
+            text: Input text to analyze
+        Returns:
+            Complete analysis results with model-based sentiment features
+        """
+        # Validate input text length (200-7000 words)
+        total_words = len(text.split())
+        if total_words < 200:
+            raise ValueError(f"Text too short for analysis ({total_words} words, minimum 200 words required)")
+        elif total_words > 7000:
+            raise ValueError(f"Text too long for analysis ({total_words} words, maximum 7000 words allowed)")
+        # Get AI detection results (includes sentiment features from model)
+        ai_detection = self.detect_ai(text)
+        model_sentiment = ai_detection.get("sentiment_features", {})
+        # Perform basic text analysis
+        words = text.split()
+        sentences = [s.strip() for s in text.replace('!', '.').replace('?', '.').split('.') if s.strip()]
+        # Calculate basic metrics
+        word_count = len(words)
+        sentence_count = len(sentences)
+        avg_word_length = np.mean([len(w) for w in words]) if words else 0
+        avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
+        # Determine complexity based on AI probability and text metrics
+        is_ai = ai_detection["classification"] == "ai"
+        is_mixed = ai_detection["classification"] == "mixed"
+        ai_prob = ai_detection["probability"]
+        # Handle different prediction types
+        if is_mixed:
+            formality = "mixed"
+            complexity = "variable"
+            tone = "Mixed (AI/Human)"
+            audience = "Variable"
+        elif is_ai:
+            formality = "formal" if ai_prob > 0.7 else "neutral"
+            complexity = "complex" if avg_word_length > 6 else "moderate"
+            tone = "Professional"
+            audience = "General to Academic"
+        else:
+            formality = "casual" if avg_word_length < 5 else "neutral"
+            complexity = "simple" if avg_sentence_length < 15 else "moderate"
+            tone = "Conversational"
+            audience = "General Public"
+        # Generate insights based on detection results
+        insights = []
+        if is_mixed and ai_detection["confidence"] > 60:
+            mixed_analysis = ai_detection.get("mixed_analysis", {})
+            insights.append({
+                "type": "observation",
+                "title": "Mixed Content Detected",
+                "description": f"This text contains both AI-generated and human-authored sections ({ai_detection['confidence']:.1f}% confidence).",
+                "suggestion": "Consider reviewing the text for consistency and ensuring all sections align with your intended voice and style."
+            })
+            insights.append({
+                "type": "observation",
+                "title": "Content Composition",
+                "description": f"Analysis found {mixed_analysis.get('human_chunks', 0)} human-like sections and {mixed_analysis.get('ai_chunks', 0)} AI-like sections.",
+                "suggestion": "The mixed nature suggests collaborative writing or heavy editing. Consider standardizing the writing style throughout."
+            })
+        elif is_ai and ai_detection["confidence"] > 75:
+            insights.append({
+                "type": "observation",
+                "title": "AI-Generated Content Detected",
+                "description": f"This text shows strong indicators of AI generation ({ai_detection['confidence']:.1f}% confidence).",
+                "suggestion": "Consider adding personal anecdotes, varied sentence structures, or unique perspectives to make it more human-like."
+            })
+        elif not is_ai and ai_detection["confidence"] > 75:
+            insights.append({
+                "type": "strength",
+                "title": "Human Writing Characteristics",
+                "description": f"This text exhibits clear human writing patterns ({ai_detection['confidence']:.1f}% confidence)."
+            })
+        # Sentence variety analysis
+        if sentence_count > 2:
+            sentence_lengths = [len(s.split()) for s in sentences]
+            std_dev = np.std(sentence_lengths)
+            if std_dev < 3:
+                insights.append({
+                    "type": "improvement",
+                    "title": "Sentence Variety",
+                    "description": "Sentences have similar lengths, which may indicate AI generation.",
+                    "suggestion": "Vary sentence lengths to create more natural rhythm."
+                })
+            else:
+                insights.append({
+                    "type": "strength",
+                    "title": "Good Sentence Variety",
+                    "description": "Text shows natural variation in sentence structure."
+                })
+        # Generate emotions based on model sentiment polarity (data-driven ranges)
+        avg_polarity = model_sentiment.get("avg_polarity", 0)
+        emotions = []
+        if avg_polarity >= 0.71:
+            emotions.append({"emotion": "very_positive", "score": min(abs(avg_polarity), 1.0), "intensity": "high"})
+        elif avg_polarity >= 0.30:
+            emotions.append({"emotion": "positive", "score": min(abs(avg_polarity), 1.0), "intensity": "medium"})
+        elif avg_polarity >= -0.29:
+            emotions.append({"emotion": "neutral", "score": 0.8, "intensity": "medium"})
+        else:
+            emotions.append({"emotion": "negative", "score": min(abs(avg_polarity), 1.0), "intensity": "high"})
+        # Construct full analysis response with model sentiment features
+        polarity_variance = model_sentiment.get("polarity_variance", 0)
+        return {
+            "advancedSentiment": {
+                "emotions": emotions,
+                "confidence": 70 + (ai_detection["confidence"] * 0.3),
+                "context": f"The text appears to be {'AI-Generated' if ai_detection['classification'] == 'ai' else 'Human-Authored'} based on linguistic patterns and sentiment analysis.",
+                "avg_polarity": model_sentiment.get("avg_polarity", 0),
+                "polarity_variance": polarity_variance
+            },
+            "topics": [
+                {
+                    "topic": "General Content",
+                    "relevance": 0.8,
+                    "keywords": words[:5] if len(words) >= 5 else words
+                }
+            ],
+            "writingStyle": {
+                "tone": tone,
+                "formality": formality,
+                "complexity": complexity,
+                "style": [formality, complexity, tone],
+                "audience": audience,
+                "sentiment_consistency": "very_low" if polarity_variance <= 0.10 else "low" if polarity_variance <= 0.35 else "moderate" if polarity_variance <= 0.60 else "high"
+            },
+            "insights": insights,
+            "plagiarismRisk": {
+                "score": int(ai_prob * 100) if is_ai else (int(ai_prob * 70) if is_mixed else 10),
+                "level": "high" if is_ai and ai_prob > 0.8 else "medium" if (is_ai or is_mixed) else "low",
+                "details": f"{'High' if is_ai else 'Moderate' if is_mixed else 'Low'} similarity to AI-generated patterns detected."
+            },
+            "contentQuality": {
+                "overall": int(85 - (ai_prob * 20)) if is_ai else (int(80 - (ai_prob * 15)) if is_mixed else 90),
+                "clarity": int(90 - (ai_prob * 10)) if not is_mixed else int(85 - (ai_prob * 8)),
+                "coherence": int(88 - (ai_prob * 8)) if not is_mixed else int(82 - (ai_prob * 6)),
+                "engagement": int(75 - (ai_prob * 25)) if not is_mixed else int(70 - (ai_prob * 20)),
+                "originality": int(60 - (ai_prob * 40)) if is_ai else (int(70 - (ai_prob * 30)) if is_mixed else 85)
+            },
+            "aiOrHuman": ai_detection["classification"],
+            "aiOrHumanConfidence": ai_detection["confidence"],
+            "aiOrHumanExplanation": ai_detection["explanation"]
+        }
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get information about the loaded models"""
+        return {
+            "model_loaded": self.model_loaded,
+            "model_path": self.model_path,
+            "device": str(self.device),
+            "max_length": self.max_length,
+            "architecture": "Two-Branch (DeBERTa + Sentiment Features)",
+            "primary_model": "DeBERTa-v3-large (desklib/ai-text-detector-v1.01)",
+            "sentiment_model": "DistilBERT-SST-2",
+            "classifier": "XGBoost" if self.xgboost_model is not None else "DeBERTa Linear",
+            "features": [
+                "DeBERTa embeddings (1024 dimensions)",
+                "Average sentiment polarity",
+                "Sentiment polarity variance"
+            ],
+            "description": "Two-branch model for detecting AI-Generated vs Human-Authored text using DeBERTa semantic embeddings combined with sentiment features"
+        }

requirements.txt ADDED Viewed

	@@ -0,0 +1,25 @@

+# FastAPI and Server
+fastapi>=0.104.1
+uvicorn[standard]>=0.24.0
+python-multipart>=0.0.6
+pydantic>=2.5.0
+# Machine Learning & AI (Compatible with Python 3.13)
+torch>=2.8.0
+transformers>=4.35.0
+safetensors>=0.4.0
+xgboost>=2.0.0
+nltk>=3.8.0
+scikit-learn>=1.3.0
+# Data Processing
+numpy>=1.26.2
+pandas>=2.1.3
+# Utilities
+python-dotenv>=1.0.0
+requests>=2.31.0
+# Optional: For better performance
+# accelerate>=0.24.1
+# optimum>=1.14.0