Spaces:

sudhir2016
/

PPP_Demo

Sleeping

App Files Files Community

sudhir2016 commited on Nov 14

Commit

a5b8f58

verified ·

1 Parent(s): 0d05e09

Create app.py

Browse files

Files changed (1) hide show

app.py +77 -0

app.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import gradio as gr, numpy as np
+import torch
+from transformers import EsmTokenizer,EsmForMaskedLM
+model_name = "facebook/esm2_t6_8M_UR50D"
+tokenizer = EsmTokenizer.from_pretrained(model_name)
+device = torch.device("cpu")
+model_mlm = EsmForMaskedLM.from_pretrained(model_name).to(device)
+# 2. Define the PLL Calculation Function
+def predict_ppp(sequence) -> float:
+    """
+    Calculates the ESM2 Pseudolog-Likelihood (PLL) for a single amino acid sequence.
+    PLL = sum_i( log P(x_i | x_{~i}) )
+    """
+    # Tokenize the sequence
+    # This automatically adds <CLS> and <EOS> tokens
+    input_ids = tokenizer(sequence, return_tensors='pt')['input_ids']
+    # The true sequence length (excluding special tokens)
+    L = len(sequence)
+    # The mask indices correspond to the AA sequence positions
+    # We ignore the first (<CLS>) and last (<EOS>) tokens.
+    mask_indices = torch.arange(1, L + 1)
+    # Accumulator for the log-likelihood sum
+    pll_sum = 0.0
+    # Iterate over each position in the sequence to mask it
+    for i in mask_indices:
+        # Create a copy of the input_ids
+        masked_input = input_ids.clone()
+        # Mask the current residue (token ID for MASK is 1)
+        masked_input[0, i] = tokenizer.mask_token_id
+        # Get model logits (unnormalized log-probabilities)
+        with torch.no_grad():
+            outputs = model_mlm(masked_input)
+            logits = outputs.logits # shape: (batch_size, seq_len, vocab_size)
+        # Extract the log-probabilities for the prediction at the masked position
+        # We use log_softmax to get log-probabilities
+        log_probs = torch.log_softmax(logits[0, i], dim=-1)
+        # Get the token ID of the *actual* residue at the masked position
+        target_token_id = input_ids[0, i].item()
+        # Get the log-probability of the actual residue
+        log_prob_of_target = log_probs[target_token_id].item()
+        # Add to the sum
+        pll_sum += log_prob_of_target
+        L = len(sequence)
+        ppp = np.exp(-pll_sum / L)
+    return ppp
+demo = gr.Interface(
+    fn=predict_ppp,
+    inputs=[
+        gr.Textbox(label="Enter Protein Amino Acid Sequence (1-letter code)",
+        placeholder="ACDEFGHIKLMNPQRSTVWY"),
+           ],
+    outputs="text",
+    title="Nano Protein Language Model for Pseudo Perplexity (PPP) prediction of a protein sequence",
+    description="Enter an amino acid sequence (using the 1-letter code) to predict its Pseudo Perplexity (PPP)",
+    examples=[
+        ["MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"], # Example sequence
+            ]
+)
+demo.launch()