Alishbah committed on
Commit
4d2cd01
·
verified ·
1 Parent(s): d9ae89a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -22
app.py CHANGED
@@ -1,31 +1,157 @@
1
  import streamlit as st
2
  from transformers import pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  @st.cache_resource
5
- def load_model():
6
- """
7
- Loads a pre-trained sentiment analysis model from Hugging Face Transformers.
8
- """
9
- model = pipeline("sentiment-analysis") # You can replace this with your desired model
10
- return model
 
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def main():
13
- """
14
- Main function to run the Streamlit app.
15
- """
16
- st.title("Hugging Face Model Demo")
17
-
18
- # Load the model
19
- model = load_model()
20
-
21
- # Create an input text box
22
- input_text = st.text_input("Enter your text", "")
23
-
24
- # Create a button to trigger model inference
25
- if st.button("Analyze"):
26
- # Perform inference using the loaded model
27
- result = model(input_text)
28
- st.write("Prediction:", result[0]['label'], "| Score:", result[0]['score'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  if __name__ == "__main__":
31
  main()
 
1
  import streamlit as st
2
  from transformers import pipeline
3
+ import io
4
+ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
5
+ from pdfminer.converter import TextConverter
6
+ from pdfminer.layout import LAParams
7
+ from pdfminer.pdfpage import PDFPage
8
+ from docx import Document
9
+ import torch
10
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
11
+
12
+ # Functions for file processing
13
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_file: Binary file-like object containing the PDF; it is handed
            directly to ``PDFPage.get_pages``.

    Returns:
        The text that pdfminer's ``TextConverter`` wrote for all pages,
        in the order the pages were processed.

    Raises:
        Any exception raised by pdfminer while reading or interpreting the
        document is propagated to the caller.
    """
    resource_manager = PDFResourceManager()
    # The context manager and the ``finally`` below guarantee that the buffer
    # and the converter are released even when a page fails to parse.
    with io.StringIO() as output_string:
        device = TextConverter(
            resource_manager,
            output_string,
            codec='utf-8',
            laparams=LAParams(),
        )
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            for page in PDFPage.get_pages(pdf_file, caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed on the way out.
            return output_string.getvalue()
        finally:
            device.close()
27
+
28
def extract_text_from_docx(docx_file):
    """Return the text of a DOCX file, one paragraph per line.

    Args:
        docx_file: Path or file-like object accepted by ``Document``.

    Returns:
        The ``text`` of every entry in ``Document(docx_file).paragraphs``,
        joined with newline characters.
    """
    document = Document(docx_file)
    return '\n'.join(para.text for para in document.paragraphs)
34
+
35
+ # Functions for AI and Plagiarism detection
36
@st.cache_resource
def load_ai_detection_model():
    """Create the text-classification pipeline used for AI-content detection.

    Returns:
        The ``roberta-base-openai-detector`` pipeline, or ``None`` when the
        pipeline could not be created (the error is reported through
        ``st.error``).
    """
    # NOTE(review): the function is wrapped in st.cache_resource, so a None
    # result from a failed load is presumably cached as well - confirm whether
    # a retry on the next rerun is wanted.
    try:
        detector = pipeline("text-classification", model="roberta-base-openai-detector")
    except Exception as e:
        st.error(f"Error loading AI detection model: {e}")
        return None
    return detector
44
+
45
def detect_ai_content(text, ai_detection_model):
    """Classify ``text`` with the AI-content detection pipeline.

    Args:
        text: The document text to classify.
        ai_detection_model: Callable pipeline; it is invoked as
            ``ai_detection_model(text, truncation=True, max_length=512)``.

    Returns:
        Whatever the pipeline returns (the caller reads ``result[0]['label']``
        and ``result[0]['score']``), or ``None`` if the call raised; in that
        case the error is reported through ``st.error``.
    """
    try:
        # Whole uploaded documents are normally much longer than the model's
        # input window; without truncation the pipeline call fails on them and
        # the report would always show "Not available". 512 matches the limit
        # already used for the plagiarism tokenizer in this file.
        return ai_detection_model(text, truncation=True, max_length=512)
    except Exception as e:
        st.error(f"Error during AI content detection: {e}")
        return None
52
 
53
  @st.cache_resource
54
def load_plagiarism_model(model_name="jpwahle/longformer-base-plagiarism-detection"):
    """Load the tokenizer and sequence-classification model for plagiarism checks.

    Args:
        model_name: Hugging Face model identifier passed to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForSequenceClassification.from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple. When loading fails the error is
        reported through ``st.error`` and ``(None, None)`` is returned.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
    except Exception as e:
        st.error(f"Error loading plagiarism detection model: {e}")
        # Always return a pair: the caller unpacks the result into two names,
        # so a bare None here would raise TypeError instead of degrading
        # gracefully to "Not available".
        return None, None
    return tokenizer, model
62
 
63
def plagiarism_check(text, tokenizer, model, max_length=512):
    """Run the plagiarism classifier on ``text`` and return the predicted class.

    Args:
        text: The document text to classify.
        tokenizer: Callable tokenizer; invoked with ``return_tensors="pt"``,
            ``truncation=True``, ``padding=True`` and ``max_length``.
        model: Callable model; invoked with the tokenizer output as keyword
            arguments and expected to return an object with a ``logits``
            tensor.
        max_length: Maximum number of tokens kept by the tokenizer. Defaults
            to 512 (the previously hard-coded value); a model with a longer
            context window can be given a larger limit.

    Returns:
        The index of the highest logit as a Python int, or ``None`` if
        tokenization or inference raised (the error is reported through
        ``st.error``).
    """
    try:
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)
        # .item() requires a single prediction, i.e. one input text.
        return torch.argmax(outputs.logits, dim=-1).item()
    except Exception as e:
        st.error(f"Error during plagiarism detection: {e}")
        return None
73
+
74
# Streamlit app
def main():
    """Render the page: upload a PDF/DOCX, analyse its text, show a report.

    Steps:
        1. Load the AI-detection pipeline and the plagiarism tokenizer/model.
        2. Style and show the file uploader.
        3. Extract text from the uploaded PDF or DOCX file.
        4. Run both detectors on the text and display the results side by
           side. A detector whose model is unavailable, or whose call failed,
           is reported as "Not available".
    """
    st.title("AI & Plagiarism Detection")

    # Load models
    ai_detection_model = load_ai_detection_model()
    # The loader may signal failure with None; normalise that to a pair so
    # the unpacking below cannot raise TypeError.
    loaded = load_plagiarism_model()
    tokenizer, plagiarism_model = loaded if loaded is not None else (None, None)

    # File uploader with custom styling
    st.markdown(
        """
        <style>
        .stFileUploader > div > div:nth-child(1) > div > button {
        background-color: #4CAF50;
        color: white;
        padding: 10px 24px;
        border: none;
        border-radius: 4px;
        cursor: pointer;
        }
        .stFileUploader > div > div:nth-child(1) > div > button:hover {
        background-color: #367C39;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )
    # NOTE(review): the 1GB figure is only enforced by the check below; the
    # server-side upload limit is configured separately - confirm they agree.
    uploaded_file = st.file_uploader(
        "Upload a file (PDF, DOCX)",
        type=["pdf", "docx"],
        help="Maximum file size: 1GB",
    )
    if uploaded_file is None:
        return

    file_size = len(uploaded_file.getvalue())
    if file_size > 1000000000:  # 1 GB limit
        st.error("File size exceeds the 1GB limit.")
        return

    try:
        if uploaded_file.type == "application/pdf":
            raw_text = extract_text_from_pdf(uploaded_file)
        elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            raw_text = extract_text_from_docx(uploaded_file)
        else:
            st.error("Unsupported file type")
            return
    except Exception as e:
        st.error(f"Error processing file: {e}")
        return

    # A file without usable text used to end the run silently; tell the user.
    if not raw_text or not raw_text.strip():
        st.warning("No extractable text was found in the uploaded file.")
        return

    # AI Detection
    if ai_detection_model is not None:
        ai_result = detect_ai_content(raw_text, ai_detection_model)
    else:
        ai_result = None

    # Plagiarism Check
    if tokenizer is not None and plagiarism_model is not None:
        plagiarism_result = plagiarism_check(raw_text, tokenizer, plagiarism_model)
    else:
        plagiarism_result = None

    # Report Generation
    st.subheader("Analysis Report")
    col1, col2 = st.columns(2)

    with col1:
        st.markdown("AI Detection:")
        if ai_result:
            ai_label = ai_result[0]['label']
            ai_score = ai_result[0]['score']
            # The score is the confidence of the predicted label, not the
            # share of AI-written text, so show the label as the value and
            # the confidence as a neutral (uncoloured) annotation.
            st.metric(
                label="AI Content",
                value=ai_label,
                delta=f"{ai_score:.2%} confidence",
                delta_color="off",
            )
        else:
            st.write("Not available")

    with col2:
        st.markdown("Plagiarism Detection:")
        if plagiarism_result is not None:
            plagiarism_status = "Plagiarized" if plagiarism_result == 1 else "Original"
            st.metric(label="Plagiarism", value=plagiarism_status)
        else:
            st.write("Not available")
155
 
156
  if __name__ == "__main__":
157
  main()