Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import random | |
| from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer | |
| import torch | |
| import io | |
| from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
| from pdfminer.converter import TextConverter | |
| from pdfminer.layout import LAParams | |
| from pdfminer.pdfpage import PDFPage | |
| from docx import Document | |
| # --- Streamlit Page Configuration --- | |
# Browser-tab title and icon, plus the full-width page layout.
# NOTE(review): the icon glyph looks like a mis-decoded emoji -- confirm the
# intended character; it is reproduced here exactly as found.
st.set_page_config(
    page_title="AI & Plagiarism Detection",
    page_icon="π",
    layout="wide",
)
| # --- DeepSeek Theme --- | |
# Palette and font consumed by apply_theme() when it builds the page CSS.
DEEPSEEK_THEME = dict(
    backgroundColor="#282c34",
    textColor="#abb2bf",
    inputAreaColor="#3E4451",
    accentColor="#61afef",
    sidebarColor="#21252b",
    font="sans-serif",
)
| # --- Function to Apply Theme --- | |
def apply_theme(theme: dict) -> None:
    """Inject a ``<style>`` block built from ``theme`` into the page.

    The CSS is rendered through ``st.markdown`` with
    ``unsafe_allow_html=True`` so the raw HTML is emitted rather than escaped.

    Args:
        theme: Mapping that must provide the keys ``textColor``,
            ``backgroundColor``, ``font``, ``inputAreaColor``,
            ``accentColor`` and ``sidebarColor``; each value is interpolated
            verbatim into the stylesheet.
    """
    # Doubled braces ({{ }}) are literal CSS braces inside the f-string;
    # single braces interpolate theme values.
    # NOTE(review): the selectors below target Streamlit-generated class names
    # and test ids -- presumably version dependent; verify against the deployed
    # Streamlit release.
    st.markdown(f"""
<style>
body {{
color: {theme["textColor"]};
background-color: {theme["backgroundColor"]};
font-family: {theme["font"]};
}}
.welcome-text {{
color: {theme["textColor"]};
font-size: 36px;
font-weight: bold;
text-align: center;
margin-bottom: 20px;
}}
.output-box {{
background-color: {theme["inputAreaColor"]};
color: {theme["textColor"]};
padding: 10px;
border-radius: 5px;
margin-top: 20px;
}}
.stTextArea textarea {{
background-color: {theme["inputAreaColor"]};
color: {theme["textColor"]};
border: 1px solid {theme["accentColor"]};
border-radius: 5px;
}}
.stFileUploader > div > div:nth-child(1) > div > button {{
background-color: {theme["accentColor"]};
color: {theme["backgroundColor"]};
border-radius: 5px;
}}
.stMetricLabel {{
color: {theme["textColor"]} !important;
}}
.stMetricValue {{
color: {theme["textColor"]} !important;
}}
.streamlit-expanderHeader {{
color: {theme["textColor"]};
}}
.streamlit-expanderContent {{
color: {theme["textColor"]};
}}
[data-testid="stSidebar"] {{
background-color: {theme["sidebarColor"]};
color: {theme["textColor"]};
}}
</style>
""", unsafe_allow_html=True)
| # --- Helper Functions --- | |
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF using pdfminer.

    Args:
        pdf_file: Binary file-like object positioned at the start of the PDF
            (the caller passes a Streamlit uploaded file).

    Returns:
        The concatenated text of all pages as a single string.

    Raises:
        Whatever pdfminer raises for unreadable or non-extractable documents;
        the converter and the output buffer are released even in that case.
    """
    resource_manager = PDFResourceManager()
    laparams = LAParams()
    # The context manager closes the buffer on every exit path.
    with io.StringIO() as output_string:
        device = TextConverter(resource_manager, output_string, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            for page in PDFPage.get_pages(pdf_file, caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Read the text before the converter and buffer are closed.
            return output_string.getvalue()
        finally:
            device.close()
def extract_text_from_docx(docx_file):
    """Return the text of a DOCX file, one line per paragraph.

    Args:
        docx_file: Path or file-like object accepted by ``docx.Document``.

    Returns:
        The ``text`` of each entry in ``Document.paragraphs`` joined with
        newline characters.
    """
    document = Document(docx_file)
    return '\n'.join(para.text for para in document.paragraphs)
def split_text_into_chunks(text, tokenizer, max_length=512):
    """Split ``text`` into pieces that fit a model's token budget.

    The text is tokenized once, cut into consecutive windows, and each window
    is converted back to a string.

    Args:
        text: The text to split.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)``. If it also provides
            ``num_special_tokens_to_add()``, that many tokens are reserved in
            every window.
        max_length: Total token budget per chunk, special tokens included.

    Returns:
        A list of chunk strings in document order; empty when ``text`` yields
        no tokens.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    # The models re-tokenize each chunk with truncation at the same budget and
    # add their own special tokens. Without reserving room for them, the tail
    # of every full chunk would be silently truncated away.
    count_special = getattr(tokenizer, "num_special_tokens_to_add", None)
    reserved = count_special() if callable(count_special) else 0
    window = max(1, max_length - reserved)

    tokens = tokenizer.tokenize(text)
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + window])
        for start in range(0, len(tokens), window)
    ]
def load_ai_detection_model(model_name="Hello-SimpleAI/chatgpt-detector-roberta"):
    """Build the text-classification pipeline used for AI-content detection.

    Args:
        model_name: Model identifier handed to ``transformers.pipeline``.

    Returns:
        The pipeline (configured to truncate inputs to 512 tokens), or
        ``None`` after reporting the failure with ``st.error``.
    """
    # NOTE(review): nothing here caches the pipeline, so each call builds it
    # again -- consider a Streamlit resource cache; confirm the deployed
    # Streamlit version first.
    try:
        detector = pipeline(
            "text-classification",
            model=model_name,
            truncation=True,
            max_length=512,
        )
    except Exception as exc:
        st.error(f"Error loading AI detection model: {exc}")
        return None
    return detector
def load_plagiarism_model(model_name="jpwahle/longformer-base-plagiarism-detection"):
    """Load the tokenizer and sequence classifier used for plagiarism checks.

    Args:
        model_name: Model identifier handed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` pair. On failure the error is reported with
        ``st.error`` and ``(None, None)`` is returned, so callers that unpack
        the result into two names keep working.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
    except Exception as e:
        st.error(f"Error loading plagiarism detection model: {e}")
        # A bare None here would raise TypeError at the caller's tuple unpack.
        return None, None
    return tokenizer, model
def detect_ai_content(text_chunks, ai_detection_model, ai_threshold=0.4, ai_labels=("AI", "ChatGPT")):
    """Score each chunk for machine-generated content.

    Args:
        text_chunks: Iterable of text chunks to classify.
        ai_detection_model: Callable returning, for one chunk, a list whose
            first item is a dict with ``'label'`` and ``'score'`` keys.
        ai_threshold: Minimum score an AI-labelled prediction must exceed to
            be counted.
        ai_labels: Labels, compared case-insensitively, that mean "machine
            generated". The default covers both the generic ``AI`` label and
            the ``ChatGPT`` label.

    Returns:
        One value per chunk: the prediction score when the chunk is labelled
        as AI above the threshold, otherwise ``0``. Returns ``None`` after
        reporting with ``st.error`` if classification raises.
    """
    try:
        # NOTE(review): the default detector is understood to emit
        # "Human"/"ChatGPT" rather than "AI" -- confirm against its model card.
        machine_labels = {str(label).casefold() for label in ai_labels}
        ai_percentages = []
        for chunk in text_chunks:
            top = ai_detection_model(chunk)[0]
            is_ai = str(top['label']).casefold() in machine_labels
            # Every other outcome (human label, or AI below threshold) is 0.
            if is_ai and top['score'] > ai_threshold:
                ai_percentages.append(top['score'])
            else:
                ai_percentages.append(0)
        return ai_percentages
    except Exception as e:
        st.error(f"Error during AI content detection: {e}")
        return None
def plagiarism_check(text_chunks, tokenizer, model):
    """Return the percentage of chunks the classifier predicts as class 1.

    Args:
        text_chunks: Sequence of text chunks to classify.
        tokenizer: Callable that turns one chunk into keyword inputs for
            ``model`` (called with ``return_tensors="pt"``, truncation and
            padding to at most 512 tokens).
        model: Callable whose result exposes ``logits``; the arg-max over the
            last dimension is taken as the predicted class.

    Returns:
        A float in ``[0, 100]``, or ``None`` when there are no chunks to
        score or when inference raises (the latter is reported with
        ``st.error``).
    """
    # Without chunks the ratio is undefined; skip inference rather than
    # surfacing a "division by zero" error in the UI.
    if not text_chunks:
        return None
    try:
        plagiarized_count = 0
        for chunk in text_chunks:
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
            # Inference only: no gradients needed.
            with torch.no_grad():
                outputs = model(**inputs)
            predicted_class = torch.argmax(outputs.logits, dim=-1).item()
            if predicted_class == 1:
                plagiarized_count += 1
        return (plagiarized_count / len(text_chunks)) * 100
    except Exception as e:
        st.error(f"Error during plagiarism detection: {e}")
        return None
| # --- Main Function --- | |
def _collect_uploaded_text(uploaded_files):
    """Extract text from each supported upload, reporting per-file problems in the UI."""
    extractors = {
        "application/pdf": extract_text_from_pdf,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": extract_text_from_docx,
    }
    pieces = []
    for uploaded_file in uploaded_files:
        if len(uploaded_file.getvalue()) > 1000000000:
            st.error(f"{uploaded_file.name}: File size exceeds the 1GB limit.")
            continue
        extractor = extractors.get(uploaded_file.type)
        if extractor is None:
            st.error(f"{uploaded_file.name}: Unsupported file type")
            continue
        try:
            pieces.append(extractor(uploaded_file) + "\n")
        except Exception as e:
            # One unreadable file must not abort the remaining uploads.
            st.error(f"Error processing {uploaded_file.name}: {e}")
            continue
        st.write(f"Extracted text from {uploaded_file.name}")
    return "".join(pieces)


def main():
    """Render the page, gather typed and uploaded text, and show the analysis.

    The sidebar and input widgets are always drawn. Models are loaded only
    once there is input to analyse. Analysis is skipped, with a message, when
    no text could be gathered or when the tokenizer needed for chunking is
    unavailable.
    """
    # --- Apply DeepSeek Theme ---
    apply_theme(DEEPSEEK_THEME)

    # --- Sidebar ---
    with st.sidebar:
        st.markdown("<h1 style='color:#61afef;'>AI & Plagiarism</h1>", unsafe_allow_html=True)
        st.markdown("Navigation")
        menu_options = ["New Chat"]
        # The selection is not used yet; the widget is rendered for layout only.
        st.radio("Choose an option", menu_options)
        st.markdown("---")
        st.markdown("Today")
        for chat in ["Chat 1", "Chat 2", "Chat 3"]:
            st.markdown(f"- {chat}")

    # --- Main Content ---
    # The narrow first column is an empty spacer.
    # NOTE(review): block nesting was reconstructed because the extracted
    # source lost its indentation -- confirm the results belong in this column.
    _, col2 = st.columns([1, 3])
    with col2:
        st.markdown("<h1 class='welcome-text'>Hi, I'm AI & Plagiarism Assistant.</h1>", unsafe_allow_html=True)
        st.markdown("How can I help you today?")

        # --- Input Area: Text Area and File Upload ---
        input_text = st.text_area("Message", "", height=200)
        uploaded_files = st.file_uploader(
            "Attach documents (PDF or DOCX)",
            type=["pdf", "docx"],
            accept_multiple_files=True,
        )

        if not (input_text or uploaded_files):
            return

        # --- Process Uploaded Files ---
        raw_text = ""
        if uploaded_files:
            with st.expander("Uploaded Files", expanded=False):
                raw_text = _collect_uploaded_text(uploaded_files)

        # --- Append Manual Text ---
        raw_text = (raw_text + input_text.strip()).strip()
        if not raw_text:
            st.warning("No text could be extracted from the provided input.")
            return

        # --- Load models (only when there is something to analyse) ---
        ai_detection_model, tokenizer, plagiarism_model = load_models()
        if tokenizer is None:
            # Chunking needs the tokenizer; without it neither check can run.
            st.error("Tokenizer is unavailable, so the input cannot be split into chunks for analysis.")
            return

        # --- Split text into manageable chunks ---
        text_chunks = split_text_into_chunks(raw_text, tokenizer)

        # --- Process and Display Results ---
        process_and_display(text_chunks, "Combined Input", ai_detection_model, tokenizer, plagiarism_model)
| # --- Helper function to process text and display results --- | |
def process_and_display(text_chunks, source_name, ai_detection_model, tokenizer, plagiarism_model):
    """Run both detectors over ``text_chunks`` and render the results.

    Args:
        text_chunks: Chunks to analyse.
        source_name: Heading shown above the results.
        ai_detection_model: Detector passed to ``detect_ai_content``; skipped
            when falsy.
        tokenizer: Tokenizer passed to ``plagiarism_check``.
        plagiarism_model: Classifier passed to ``plagiarism_check``; the check
            is skipped unless both it and ``tokenizer`` are truthy.
    """
    # AI detection: average the per-chunk scores into a percentage.
    ai_share = None
    human_share = None
    if ai_detection_model:
        chunk_scores = detect_ai_content(text_chunks, ai_detection_model)
        if chunk_scores:
            ai_share = sum(chunk_scores) / len(chunk_scores) * 100
            human_share = 100 - ai_share

    # Plagiarism check.
    plagiarism_share = (
        plagiarism_check(text_chunks, tokenizer, plagiarism_model)
        if tokenizer and plagiarism_model
        else None
    )

    # --- Tiled Output ---
    with st.container():
        st.markdown(f"<div class='output-box'><h3>{source_name}</h3></div>", unsafe_allow_html=True)
        ai_column, plagiarism_column = st.columns(2)

        with ai_column:
            st.markdown("<div class='output-box'><h4>AI Detection:</h4></div>", unsafe_allow_html=True)
            if ai_share is None:
                st.write("AI Detection not available")
            else:
                st.metric(label="AI Content", value=f"{ai_share:.2f}%", delta="AI Generated")
                st.metric(label="Human Written", value=f"{human_share:.2f}%", delta="Humanized Text")

        with plagiarism_column:
            st.markdown("<div class='output-box'><h4>Plagiarism Detection:</h4></div>", unsafe_allow_html=True)
            if plagiarism_share is None:
                st.write("Plagiarism Detection not available")
            else:
                verdict = "Plagiarized" if plagiarism_share > 0 else "Original"
                st.metric(label="Plagiarism", value=f"{plagiarism_share:.2f}%", delta=verdict)
| # --- Load models globally --- | |
def load_models():
    """Load the AI detector and the plagiarism tokenizer/model.

    Returns:
        A ``(ai_detection_model, tokenizer, plagiarism_model)`` tuple. Any
        element is ``None`` when the corresponding loader failed.
    """
    # NOTE(review): nothing here caches the models, so every call loads them
    # again -- consider a Streamlit resource cache; confirm the deployed
    # Streamlit version first.
    ai_detection_model = load_ai_detection_model()

    # Tolerate a loader that signals failure with a bare None rather than a
    # pair, so the unpack below cannot raise TypeError.
    loaded = load_plagiarism_model()
    tokenizer, plagiarism_model = loaded if loaded is not None else (None, None)

    return ai_detection_model, tokenizer, plagiarism_model
| # --- Call Main --- | |
if __name__ == "__main__":
    # main() obtains the models through load_models() itself. Loading them
    # here as well only duplicated the work and left unused module globals.
    main()