Spaces:

khalil2233
/

MDSS

Runtime error

App Files Files Community

MDSS / app.py

khalil2233

ulpad

f168ba9 verified 10 months ago

raw

history blame contribute delete

6.23 kB

	import os
	import json
	import numpy as np
	import faiss
	import gradio as gr
	from datasets import load_dataset
	from sentence_transformers import SentenceTransformer
	from groq import Groq
	import nltk
	import re
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize, sent_tokenize
	from nltk.stem import WordNetLemmatizer
	from multiprocessing import Pool, cpu_count

	# Fetch only the NLTK resources this module uses rather than the entire
	# "all" collection, which is far larger than needed and slows every start:
	#   - punkt / punkt_tab: tokenizer models for word_tokenize and sent_tokenize
	#     (newer NLTK releases look for "punkt_tab"; older ones for "punkt")
	#   - stopwords: the English stopword list loaded below
	#   - wordnet / omw-1.4: data behind WordNetLemmatizer
	# Each id is requested separately so that an id unknown to the installed
	# NLTK version does not stop the remaining downloads.
	for _nltk_resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
	    nltk.download(_nltk_resource)

	# Shared by preprocess_text: the English stopword set and a lemmatizer.
	stop_words = set(stopwords.words("english"))
	lemmatizer = WordNetLemmatizer()

	# Load dataset
	def load_and_preprocess_dataset():
	    """Load the "MedRAG/textbooks" dataset and return it as loaded.

	    No preprocessing is applied here despite the name; cleaning and
	    chunking are done later by generate_embeddings.
	    """
	    textbooks = load_dataset("MedRAG/textbooks")
	    print("Dataset loaded successfully.")
	    return textbooks

	# Preprocessing function
	def preprocess_text(text):
	    """Return a normalized copy of ``text``.

	    The text is lowercased, every character that is neither a word
	    character nor whitespace is removed, the result is tokenized, English
	    stopwords are dropped and the remaining tokens are lemmatized. Tokens
	    are joined back with single spaces.
	    """
	    normalized = re.sub(r"[^\w\s]", "", text.lower())
	    kept_tokens = []
	    for token in word_tokenize(normalized):
	        if token in stop_words:
	            continue
	        kept_tokens.append(lemmatizer.lemmatize(token))
	    return " ".join(kept_tokens)

	# Chunking function
	def chunk_text(text, chunk_size=3):
	    """Group the sentences of ``text`` into chunks of ``chunk_size`` sentences.

	    Returns a list of strings; each string is up to ``chunk_size``
	    consecutive sentences joined with a space. The last chunk may be shorter.
	    """
	    sentences = sent_tokenize(text)
	    chunks = []
	    for start in range(0, len(sentences), chunk_size):
	        group = sentences[start:start + chunk_size]
	        chunks.append(" ".join(group))
	    return chunks

	# Generate embeddings in parallel
	def generate_embeddings_parallel(chunks):
	    """Embed every text in ``chunks`` with all-MiniLM-L6-v2.

	    The texts are passed to ``SentenceTransformer.encode`` as one list so
	    the library batches them itself. The previous implementation sent the
	    bound ``encode`` method through a ``multiprocessing.Pool``, which has to
	    pickle the model for the workers and encodes one string per task.

	    Args:
	        chunks: iterable of strings to embed.

	    Returns:
	        A list with one embedding vector per input text, in input order
	        (an empty list for empty input).
	    """
	    texts = list(chunks)
	    if not texts:
	        # Nothing to embed: skip loading the model altogether.
	        return []
	    embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
	    embedding_matrix = embed_model.encode(texts)
	    # Callers index the result per text, so hand back a list of row vectors
	    # exactly as the Pool-based version did.
	    return list(embedding_matrix)

	# Generate embeddings for the dataset
	def generate_embeddings(dataset):
	    """Add "cleaned_content", "chunks" and "embedding" columns to ``dataset``.

	    Each row receives exactly one embedding, computed from that row's chunks
	    joined with spaces. The earlier version embedded a flat list of all
	    chunks and then looked the result up by *row* index, so any row with
	    zero or several chunks shifted every following row onto another row's
	    vector.

	    Args:
	        dataset: dataset object with a "train" split whose rows have a
	            "content" field.

	    Returns:
	        The dataset with the three added columns.
	    """
	    print("Preprocessing dataset...")
	    dataset = dataset.map(lambda row: {"cleaned_content": preprocess_text(row["content"])})
	    # NOTE(review): preprocess_text strips punctuation, so sentence splitting
	    # on the cleaned text probably yields a single chunk per row - confirm
	    # whether chunking was meant to run on the raw content instead.
	    dataset = dataset.map(lambda row: {"chunks": chunk_text(row["cleaned_content"])})

	    print("Generating embeddings...")
	    # One text per row keeps embeddings[i] aligned with row i.
	    row_texts = [" ".join(row_chunks) for row_chunks in dataset["train"]["chunks"]]
	    embeddings = generate_embeddings_parallel(row_texts)

	    # Add embeddings to the dataset
	    dataset = dataset.map(lambda row, idx: {"embedding": embeddings[idx]}, with_indices=True)
	    return dataset

	# Create FAISS index
	def create_faiss_index(dataset):
	    """Build an L2 FAISS index from the "embedding" column and save it.

	    Reads every row of ``dataset["train"]``, flattens its embedding, stacks
	    the vectors into a float32 matrix, and writes the resulting index to
	    "faiss_medical.index". Returns nothing.
	    """
	    vectors = []
	    for row in dataset["train"]:
	        flat_vector = np.array(row["embedding"]).flatten()
	        vectors.append(flat_vector.tolist())
	    matrix = np.array(vectors, dtype=np.float32)

	    dimension = matrix.shape[1]
	    flat_index = faiss.IndexFlatL2(dimension)
	    flat_index.add(matrix)
	    faiss.write_index(flat_index, "faiss_medical.index")
	    print("FAISS index created and saved.")

	# Load FAISS index
	def load_faiss_index():
	    """Read the index stored in "faiss_medical.index" and return it."""
	    loaded_index = faiss.read_index("faiss_medical.index")
	    print("FAISS index loaded.")
	    return loaded_index

	# Retrieve medical summary
	# Models already loaded for query encoding, keyed by model name, so that a
	# model is constructed once per process instead of once per question.
	_QUERY_EMBED_MODELS = {}


	def _get_query_embed_model(model_name="sentence-transformers/all-MiniLM-L6-v2"):
	    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
	    if model_name not in _QUERY_EMBED_MODELS:
	        _QUERY_EMBED_MODELS[model_name] = SentenceTransformer(model_name)
	    return _QUERY_EMBED_MODELS[model_name]


	def retrieve_medical_summary(query, index, id_to_text, k=3):
	    """Return the texts of the ``k`` nearest index entries for ``query``.

	    Args:
	        query: question text to embed and search for.
	        index: FAISS index to search.
	        id_to_text: mapping from index position to text (a string or a list
	            of strings). Integer keys are tried first, then their string
	            form, so a mapping reloaded from JSON also works.
	        k: number of neighbours to request.

	    Returns:
	        The matching texts joined by a "---" separator line, or
	        "No relevant data found." when nothing usable was retrieved.
	    """
	    query_embedding = _get_query_embed_model().encode([query])
	    _distances, neighbor_ids = index.search(np.asarray(query_embedding, dtype="float32"), k)

	    retrieved_docs = []
	    for raw_id in neighbor_ids[0]:
	        doc_id = int(raw_id)
	        if doc_id < 0:
	            # FAISS pads the result with -1 when fewer than k vectors exist.
	            continue
	        doc = id_to_text.get(doc_id)
	        if doc is None:
	            doc = id_to_text.get(str(doc_id))
	        if doc is None:
	            # Unknown id: skip it rather than mixing a placeholder sentence
	            # into the retrieved literature.
	            continue
	        retrieved_docs.append(doc if isinstance(doc, str) else " ".join(doc))

	    return "\n\n---\n\n".join(retrieved_docs) if retrieved_docs else "No relevant data found."

	# Generate medical answer using Groq
	def generate_medical_answer_groq(query, index, id_to_text):
	    """Answer ``query`` with Groq's chat API, grounded in retrieved literature.

	    Args:
	        query: the user's medical question.
	        index: FAISS index passed through to retrieve_medical_summary.
	        id_to_text: id-to-text mapping passed through to
	            retrieve_medical_summary.

	    Returns:
	        The model's answer with surrounding whitespace stripped; a fixed
	        fallback message when nothing relevant was retrieved; or a string
	        starting with "Error generating response:" when the client cannot be
	        created or the request fails.
	    """
	    retrieved_summary = retrieve_medical_summary(query, index, id_to_text)
	    if not retrieved_summary or retrieved_summary == "No relevant data found.":
	        return "No relevant medical data found. Please consult a healthcare professional."

	    try:
	        # The client is built inside the try block so that a missing or
	        # invalid GROQ_API_KEY is reported through the same error string as
	        # a failed request instead of escaping as an unhandled exception.
	        client = Groq(api_key=os.getenv("GROQ_API_KEY"))
	        response = client.chat.completions.create(
	            model="llama-3.3-70b-versatile",
	            messages=[
	                {"role": "system", "content": "You are an expert AI specializing in medical knowledge."},
	                {"role": "user", "content": f"Summarize the following medical literature and provide a structured medical answer:\n\n### Medical Literature ###\n{retrieved_summary}\n\n### Patient Question ###\n{query}\n\n### Medical Advice ###"}
	            ],
	            max_tokens=500,
	            temperature=0.3
	        )
	        return response.choices[0].message.content.strip()
	    except Exception as e:
	        # Top-level boundary for the UI: surface the failure as text.
	        return f"Error generating response: {str(e)}"

	# Gradio interface
	def ask_medical_question(question):
	    """Gradio callback: answer ``question`` using the module-level index and id_to_text.

	    Both globals are assigned by main(), so this must only be called after
	    main() has set them.
	    """
	    answer = generate_medical_answer_groq(question, index, id_to_text)
	    return answer

	# Main function
	def main():
	    """Build the retrieval artifacts and start the Gradio app.

	    Steps, in order: load the dataset, add embeddings, write the FAISS index
	    to disk and read it back into the global ``index``, build the global
	    ``id_to_text`` mapping from the "chunks" column and dump it to
	    "id_to_text.json", then launch the web interface.
	    """
	    global index, id_to_text

	    # Dataset with cleaned text, chunks and embeddings attached.
	    dataset = generate_embeddings(load_and_preprocess_dataset())

	    # Persist the index, then load the saved copy for serving.
	    create_faiss_index(dataset)
	    index = load_faiss_index()

	    # Row position -> that row's chunks, also saved alongside the index.
	    id_to_text = dict(enumerate(dataset["train"]["chunks"]))
	    with open("id_to_text.json", "w") as mapping_file:
	        json.dump(id_to_text, mapping_file)

	    # Gradio front end.
	    question_box = gr.Textbox(lines=2, placeholder="Enter your medical question here...")
	    answer_box = gr.Textbox(lines=10, placeholder="AI-generated medical advice will appear here...")
	    iface = gr.Interface(
	        fn=ask_medical_question,
	        inputs=question_box,
	        outputs=answer_box,
	        title="Medical Question Answering System",
	        description="Ask any medical question, and the AI will provide an answer based on medical literature."
	    )
	    iface.launch()

	# Script entry point: run the full setup and launch the app only when this
	# file is executed directly, not when it is imported.
	if __name__ == "__main__":
	    main()