Spaces:

cafierom
/

MoDrAg2-OpenAI

Paused

App Files Files Community

cafierom committed on Feb 23

Commit

426afd8

verified ·

1 Parent(s): b9805ff

Upload 6 files

Browse files

Files changed (6) hide show

app.py +129 -0
finetune_gpt.py +442 -0
modrag_molecule_functions.py +178 -0
modrag_property_functions.py +227 -0
modrag_protein_functions.py +763 -0
requirements.txt +26 -0

app.py ADDED Viewed

	@@ -0,0 +1,129 @@

+from langchain_openai.chat_models import ChatOpenAI
+from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
+from google.colab import userdata
+from langchain_core.tools import tool
+from langgraph.graph import START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.prebuilt import ToolNode, tools_condition
+import gradio as gr
+import spaces
+from PIL import Image
+from collections import Counter
+from typing import Annotated, TypedDict
+import time, sys, os
+sys.path.append('code')
+from modrag_molecule_functions import *
+from modrag_property_functions import *
+from modrag_protein_functions import *
+# The OpenAI key is read from the environment; os.getenv returns None when the variable is unset.
+openai_key = os.getenv("OPENAI_API_KEY")
+# Tool callables offered to the model. These names are expected to be provided by the
+# star-imports of the modrag_* modules above.
+tools = [name_node, smiles_node, related_node, structure_node,
+         substitution_node, lipinski_node, pharmfeature_node,
+         uniprot_node, listbioactives_node, getbioactives_node,
+         predict_node, gpt_node, pdb_node, find_node, docking_node,
+         target_node]
+# Chat model with the tool list bound to it.
+model = ChatOpenAI(model_name="gpt-5.2", api_key=openai_key).bind_tools(tools)
+class State(TypedDict):
+  '''
+    State carried through the graph: a single 'messages' list, annotated with
+    add_messages as the function used to combine updates to that list.
+  '''
+  messages: Annotated[list, add_messages]
+def model_node(state: State) -> State:
+  res = model.invoke(state['messages'])
+  return {'messages': res}
+# Two-node graph: START -> model; tools_condition decides what follows 'model';
+# 'tools' always hands control back to 'model'.
+builder = StateGraph(State)
+builder.add_node('model', model_node)
+builder.add_node('tools', ToolNode(tools))
+builder.add_edge(START, 'model')
+builder.add_conditional_edges('model', tools_condition)
+builder.add_edge('tools',  'model')
+graph = builder.compile()
+# System prompt placed at the head of every conversation.
+sys_message = SystemMessage(content="You are a helpful cat who says nyan and meow a lot.")
+# NOTE(review): a 'global' statement at module scope has no effect; 'messages' is already module-level.
+global messages
+# Running conversation; start_chat() resets it to just the system message.
+messages = [sys_message]
+def start_chat():
+  '''
+  '''
+  global chat_history, messages, reasoning
+  chat_history = []
+  reasoning = []
+  messages = [sys_message]
+@spaces.GPU
+def chat_turn(prompt: str):
+  '''
+  '''
+  human_message = HumanMessage(content=prompt)
+  messages.append(human_message)
+  global chat_history
+  local_history = [prompt]
+  input = {
+      'messages' : messages
+  }
+  for c in graph.stream(input):
+    try:
+      ai_mes = c['model']['messages'].content
+      messages.append(AIMessage(ai_mes))
+      if ai_mes != '':
+        print(f'message is {ai_mes}')
+        local_history.append(ai_mes)
+    except:
+      pass
+    try:
+      if os.path.exists('current_image.png'):
+        if os.path.getmtime('current_image.png') > time.time() - 30:
+          img = Image.open('current_image.png')
+        else:
+          img = None
+      else:
+        img = None
+    except:
+      img = None
+    try:
+      reasoning.append(c['tools']['messages'][0].content)
+    except:
+      pass
+  if len(local_history) != 2:
+    local_history.append('no message')
+  chat_history.append(local_history)
+  return '', img, chat_history
+def send_reasoning():
+  global reasoning
+  return reasoning
+start_chat()
+with gr.Blocks(fill_height=True) as OpenAIMoDrAg:
+  gr.Markdown('''
+              # MoDrAg Chatbot using ChatGPT 5.2
+              - The *MOdular DRug design AGent*!
+              - This chatbot can answer questions about molecules, proteins, and their interactions.
+              It can also perform tasks such as predicting properties, finding similar molecules, and docking. Try it out!
+              - See the tool log box at the bottom for direct tool outputs.
+              ''')
+  chat = gr.Chatbot()
+  with gr.Row(equal_height = True):
+    msg = gr.Textbox(label = 'query', scale = 8)
+    sub_button = gr.Button("Submit", scale = 2)
+  clear = gr.ClearButton([msg, chat])
+  img_box = gr.Image()
+  reasoning_box = gr.Textbox(label="Tool logs", lines = 20)
+  msg.submit(chat_turn, [msg], [msg, img_box, chat]).then(send_reasoning, [], [reasoning_box])
+  sub_button.click(chat_turn, [msg], [msg, img_box, chat])
+  clear.click(start_chat, [], [])
+OpenAIMoDrAg.launch(mcp_server = True)

finetune_gpt.py ADDED Viewed

	@@ -0,0 +1,442 @@

+import deepchem as dc
+import tensorflow as tf
+import numpy as np
+import random
+import pandas as pd
+from rdkit import Chem
+from rdkit.Chem import Draw
+import os
+def finetune_gpt(df, chembl_id):
+  '''
+  accepts a dataframe with SMILES and uses deepchem to tokenize the dataset,
+  then uses tensorflow and a pre-trained model to fine tune the model on the dataset.
+  The pretrained model was trained on 305K molecules from the ZN15 dataset, including at least
+  50K that are bioactive.
+  If a file named gen_smiles_{chembl_id}.csv already exists, its SMILES are read and
+  returned and no training is performed.
+  Args:
+    df: dataframe with a "SMILES" column
+    chembl_id: identifier used in the cache file name and in the output text
+  Returns:
+    final_smiles: the list of unique generated SMILES strings
+    out_text: the generated molecules
+    img: the image of the generated molecules
+  requires files:
+    vocab.txt
+    vocab_305K.txt
+    GPT_ZN305_50epochs.weights.h5
+    layer_store_GPT_ZN305_50epochs.txt
+    ZN305K_smiles.csv
+  '''
+  # Reuse earlier results when a cache file for this chembl_id exists.
+  if os.path.exists(f"gen_smiles_{chembl_id}.csv"):
+    df = pd.read_csv(f"gen_smiles_{chembl_id}.csv")
+    final_smiles = df["SMILES"].to_list()
+    final_mols = [Chem.MolFromSmiles(smile) for smile in final_smiles]
+  else:
+    # Prepare dataset from chembl ==========================================
+    # Cap the training set at 2000 rows (fixed random_state for the sample).
+    if len(df) > 2000:
+      df = df.sample(n=2000, random_state=42)
+    smiles_list = df["SMILES"].to_list()
+    Xa = []
+    # Strip a fixed set of counter-ion fragments from each SMILES string.
+    for smiles in smiles_list:
+      smiles = smiles.replace("[Na+].","").replace("[Cl-].","").replace(".[Cl-]","").replace(".[Na+]","")
+      smiles = smiles.replace("[K+].","").replace("[Br-].","").replace(".[K+]","").replace(".[Br-]","")
+      smiles = smiles.replace("[I-].","").replace(".[I-]","").replace("[Ca2+].","").replace(".[Ca2+]","")
+      Xa.append(smiles)
+    # First pass: tokenize with vocab.txt to find which tokens the dataset uses.
+    tokenizer=dc.feat.SmilesTokenizer(vocab_file="vocab.txt")
+    featname="SMILES Tokenizer"
+    fl = list(map(lambda x: tokenizer.encode(x),Xa))
+    # Longest and shortest encoded sequence lengths.
+    biggest = 1
+    smallest = 200
+    for i in range(len(fl)):
+        temp = len(fl[i])
+        if temp > biggest:
+            biggest = temp
+        if temp < smallest:
+            smallest = temp
+    print(biggest, smallest)
+    # string_length is assigned here but not read again in this function.
+    string_length = smallest - 1
+    max_length = biggest
+    fl2 = list(map(lambda x: tokenizer.add_padding_tokens(x,max_length),fl))
+    # Set of distinct token ids appearing in the padded dataset.
+    fl2set=set()
+    for sublist in fl2:
+      fl2set.update(sublist)
+    new_vocab_size = len(fl2set)
+    print("New vocabulary size: ",new_vocab_size)
+    f = open("vocab_305K.txt", "r")
+    raw_lines = f.readlines()
+    f.close()
+    VOCAB_SIZE = len(raw_lines)
+    print("Vocabulary size for standard dataset: ",VOCAB_SIZE)
+    lines = []
+    for line in raw_lines:
+      lines.append(line.replace("\n",""))
+    # Decode each token id back to text and collect those missing from vocab_305K.txt.
+    novel_items = []
+    for item in fl2set:
+      item = tokenizer.decode([item])
+      item = tokenizer.convert_tokens_to_string(item)
+      item = item.replace(" ","")
+      if item not in lines:
+        print(f"{item} not in standard vocabulary")
+        novel_items.append(item)
+    if(len(novel_items) > 0):
+      print("This dataset is not compatible with the Foundation model vocabulary")
+    else:
+      print("This dataset is compatible with the Foundation model vocabulary")
+    if max_length > 166:
+      print("This dataset's context window is not compatible with the Foundation model.")
+    else:
+      print("This dataset's context window is compatible with the Foundation model")
+    # Drop SMILES that contain any novel token (substring test on the raw string).
+    smiles_removed_tokens = []
+    for i,smiles in enumerate(Xa):
+      bad_list = [True if (token in smiles) else False for token in novel_items]
+      if not any(bad_list):
+        smiles_removed_tokens.append(smiles)
+    # Drop SMILES longer than 166 characters (string length, not token count).
+    smiles_no_long = []
+    for i,smiles in enumerate(smiles_removed_tokens):
+      if len(smiles) <= 166:
+        smiles_no_long.append(smiles)
+    print(f"Removed {len(Xa) - len(smiles_no_long)} entries from the list!")
+    new_dict = {"SMILES": smiles_no_long}
+    new_df = pd.DataFrame(new_dict)
+    Xa = []
+    for smiles in new_df['SMILES']:
+      Xa.append(smiles)
+    # Second pass: tokenize the filtered set with vocab_305K.txt.
+    tokenizer=dc.feat.SmilesTokenizer(vocab_file="vocab_305K.txt")
+    featname="SMILES Tokenizer"
+    fl = list(map(lambda x: tokenizer.encode(x),Xa))
+    biggest = 1
+    smallest = 200
+    for i in range(len(fl)):
+        temp = len(fl[i])
+        if temp > biggest:
+            biggest = temp
+        if temp < smallest:
+            smallest = temp
+    print(biggest, smallest)
+    string_length = smallest - 1
+    max_length = biggest
+    fl2 = list(map(lambda x: tokenizer.add_padding_tokens(x,max_length),fl))
+    f = open("vocab_305K.txt", "r")
+    lines = f.readlines()
+    f.close()
+    VOCAB_SIZE = len(lines)
+    print("Vocabulary size for this dataset: ",VOCAB_SIZE)
+    # Inputs are each sequence without its last token; targets are the same
+    # sequence shifted by one position.
+    x = []
+    y = []
+    i=0
+    for string in fl2:
+        x.append(string[0:max_length-1]) #string_length
+        y.append(string[1:max_length]) #string_length+1
+    fx = np.array(x)
+    fy = np.array(y)
+    print("Number of features and datapoints, targets: ",fx.shape,fy.shape)
+    # Load foundation model ==================================================
+    # NOTE(review): VOCAB_SIZE and max_length are overwritten with fixed values here,
+    # replacing the values computed from the files/dataset above.
+    VOCAB_SIZE = 100
+    max_length = 166
+    num_new_blocks = 2
+    EMBEDDING_DIM = 256
+    N_HEADS = 4
+    KEY_DIM = 256
+    FEED_FORWARD_DIM = 256
+    inputs = tf.keras.layers.Input(shape=(None,),dtype=tf.int32)
+    # 'x' is reused here for the network tensor (it held the input sequences above).
+    x = TokenAndPositionEmbedding(max_length,VOCAB_SIZE,EMBEDDING_DIM)(inputs)
+    # num_new_blocks+2 transformer blocks are stacked; only the last block's
+    # attention scores are kept as a model output.
+    for i in range(num_new_blocks+2):
+      x, attentions_scores = TransformerBlock(N_HEADS,KEY_DIM,EMBEDDING_DIM,FEED_FORWARD_DIM)(x)
+    outputs = tf.keras.layers.Dense(VOCAB_SIZE,activation="softmax")(x)
+    gpt_ft = tf.keras.models.Model(inputs = inputs, outputs =[outputs, attentions_scores])
+    f = open("layer_store_GPT_ZN305_50epochs.txt", "r")
+    layer_name_store_raw = f.readlines()
+    f.close()
+    print("Reading in layers:")
+    layer_name_store = []
+    for line in layer_name_store_raw:
+        line = line.replace("\n","")
+        layer_name_store.append(line)
+        print(line)
+    print("===========================================")
+    # Layers before the last new_layers take their names from the stored list;
+    # the remaining ones get "_X" names.
+    new_layers = num_new_blocks + 1
+    for i,layer in enumerate(gpt_ft.layers[:-new_layers]):
+      layer.name = layer_name_store[i]
+      print(f"{layer.name} has been named!")
+    for i,layer in enumerate(gpt_ft.layers[-new_layers:-1]):
+      layer.name = f"transformer_block_X_{i+1}"
+      print(f"{layer.name} has been named!")
+    gpt_ft.layers[-1].name = "dense_X"
+    gpt_ft.load_weights("GPT_ZN305_50epochs.weights.h5", skip_mismatch=True)
+    for layer in gpt_ft.layers[0:-new_layers]:                 #make old layers freeze and only train new layers
+      layer.trainable=False
+      print(f"setting layer {layer.name} untrainable.")
+    for layer in gpt_ft.layers[-new_layers:]:
+      layer.trainable=True
+      print(f"setting layer {layer.name} trainable.")
+    # train new layers =======================================================
+    batch_size = 512
+    # The loss is applied to the token output only; the attention-score output has no loss.
+    gpt_ft.compile("adam",loss=[tf.keras.losses.SparseCategoricalCrossentropy(),None])
+    gpt_ft.fit(fx,fy,epochs = 50, batch_size = batch_size)
+    # train all together =====================================================
+    for layer in gpt_ft.layers:
+      layer.trainable=True
+      print(f"setting layer {layer.name} trainable.")
+    gpt_ft.compile("adam",loss=[tf.keras.losses.SparseCategoricalCrossentropy(),None])
+    gpt_ft.fit(fx,fy,epochs = 25, batch_size = batch_size)
+    # make prompts ============================================================
+    df_prompts = pd.read_csv("ZN305K_smiles.csv")
+    Xap = []
+    for smiles in df_prompts["SMILES"]:
+      smiles = smiles.replace("[Na+].","").replace("[Cl-].","").replace(".[Cl-]","").replace(".[Na+]","")
+      smiles = smiles.replace("[K+].","").replace("[Br-].","").replace(".[K+]","").replace(".[Br-]","")
+      smiles = smiles.replace("[I-].","").replace(".[I-]","").replace("[Ca2+].","").replace(".[Ca2+]","")
+      Xap.append(smiles)
+    # 50 prompts drawn with replacement; each prompt is the first two characters of a SMILES.
+    raw_prompts = random.choices(Xap,k=50)
+    test_string = []
+    for smile in raw_prompts:
+      test_string.append(smile[:2])
+    # inference ================================================================
+    # Only the TensorFlow seed is set here; random.choices and np.random.choice
+    # are not seeded in this function.
+    tf.random.set_seed(42)
+    batch_length = len(test_string)
+    # prompt_length is the character length of the first prompt string.
+    prompt_length = len(test_string[0])
+    test_xlist = np.empty([batch_length,prompt_length], dtype=int)
+    test_tokenized = list(map(lambda x: tokenizer.encode(x),test_string))
+    for i in range(batch_length):
+        test_xlist[i][:] = test_tokenized[i][:prompt_length]
+    test_array = np.array(test_xlist)
+    proba = np.empty([batch_length,VOCAB_SIZE])
+    rescaled_logits = np.empty([batch_length,VOCAB_SIZE])
+    preds = np.empty([batch_length])
+    gen_molecules = np.empty([batch_length])
+    # One token is appended per step, for c_final steps.
+    c_final = 60 - prompt_length
+    sig_start = 0.10
+    TEMP = 1.5
+    for c in range(0,c_final,1):
+        # Temperature follows a logistic curve in the step index, centred at c_o and scaled by TEMP.
+        c_o = int(c_final*sig_start)
+        T_int = TEMP*(1/(1+np.exp(-(c-c_o))))
+        results, _ = gpt_ft.predict(test_array)
+        if T_int < 0.015:
+            # Very low temperature: take the most probable next token.
+            print(f"using zero temp generation with {T_int}.")
+            for j in range(batch_length):
+                preds[j] = tf.argmax(results[j][-1])
+                preds = list(map(lambda x: int(x),preds))
+        else:
+            # Otherwise: raise probabilities to 1/T, renormalise, and sample.
+            print(f"using variable temp generation with {T_int}.")
+            for j in range(batch_length):
+                proba[j] = (results[j][-1:]) ** (1/T_int)
+                rescaled_logits[j] = ( proba[j][:] ) / np.sum(proba[j][:])
+                preds[j] = np.random.choice(len(rescaled_logits[j][:]),
+                                            p=rescaled_logits[j][:])
+                preds = list(map(lambda x: int(x),preds))
+        # Append the new tokens as an extra column.
+        test_array = np.c_[test_array,preds]
+        print(test_array.shape)
+    gen_molecules = list(map(lambda x: tokenizer.decode(x),test_array))
+    gen_molecules = list(map(lambda x: tokenizer.convert_tokens_to_string(x),
+                              gen_molecules))
+    gen_molecules = list(map(lambda x: strip_smiles(x),gen_molecules))
+    mols, smiles = mols_from_smiles(gen_molecules)
+    # De-duplicate while keeping first-seen order.
+    final_smiles = []
+    final_mols = []
+    for smile, mol in zip(smiles,mols):
+        if smile not in final_smiles:
+            final_smiles.append(smile)
+            final_mols.append(mol)
+    final_dict = {"SMILES": final_smiles}
+    final_df = pd.DataFrame.from_dict(final_dict)
+    # Write the cache file that the check at the top of this function looks for.
+    final_df.to_csv(f"gen_smiles_{chembl_id}.csv", index = False)
+  print(f"Generated {len(final_smiles)} unique molecules.")
+  img = Draw.MolsToGridImage(final_mols,molsPerRow=3,legends=final_smiles)
+  #img.save("Substitution_image.png")
+  out_text = f'The novel molecules generated by a GPT trained on {chembl_id} are: \n'
+  for smile in final_smiles:
+    out_text += f'{smile}\n'
+  return final_smiles, out_text, img
+def casual_attention_mask(batch_size,n_dest,n_src,dtype):
+  '''
+    Make a causal attention mask
+  '''
+  i = tf.range(n_dest)[:,None]
+  j = tf.range(n_src)
+  m = i >= j - n_src + n_dest
+  mask = tf.cast(m,dtype)
+  mask = tf.reshape(mask,[1,n_dest,n_src])
+  mult = tf.concat([tf.expand_dims(batch_size,-1),tf.constant([1,1],dtype=tf.int32)],0)
+  return tf.tile(mask,mult)
+class TransformerBlock(tf.keras.layers.Layer):
+  '''
+    Transformer block with multi-head attention.
+    Self-attention under a causal mask, followed by a two-layer feed-forward
+    network; each sub-step is followed by dropout, a residual add and layer
+    normalization. Calling the layer returns (output, attention_scores).
+      Args:
+        num_heads: number of attention heads
+        key_dim: key dimension passed to MultiHeadAttention
+        embed_dim: output size of the attention layer and of the second dense layer
+        ff_dim: size of the first (relu) dense layer
+        dropout_rate: rate used by both dropout layers
+  '''
+  def __init__(self,num_heads,key_dim,embed_dim,ff_dim,dropout_rate=0.1):
+    super(TransformerBlock,self).__init__()
+    # Hyper-parameters are kept on the instance so get_config can report them.
+    self.num_heads = num_heads
+    self.key_dim = key_dim
+    self.embed_dim = embed_dim
+    self.ff_dim = ff_dim
+    self.dropout_rate = dropout_rate
+    self.attn = tf.keras.layers.MultiHeadAttention(self.num_heads,self.key_dim,
+                                                    output_shape=self.embed_dim)
+    self.dropout_1 = tf.keras.layers.Dropout(self.dropout_rate)
+    self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=0.000001)
+    self.ffn_1 = tf.keras.layers.Dense(self.ff_dim,activation="relu")
+    self.ffn_2 = tf.keras.layers.Dense(self.embed_dim)
+    self.dropout_2 = tf.keras.layers.Dropout(self.dropout_rate)
+    self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=0.000001)
+  def call(self,inputs):
+    # Batch size and sequence length are read from the runtime shape of the input.
+    input_shape = tf.shape(inputs)
+    batch_size2 = input_shape[0]
+    seq_len = input_shape[1]
+    casual_mask = casual_attention_mask(batch_size2,seq_len,seq_len,tf.bool)
+    # Self-attention: the same tensor is passed as query and value.
+    attention_output, attention_scores = self.attn(inputs,inputs,
+                                                    attention_mask=casual_mask,
+                                                    return_attention_scores=True)
+    attention_output = self.dropout_1(attention_output)
+    # Residual add, then layer norm.
+    out1 = self.ln_1(inputs + attention_output)
+    ffn_1 = self.ffn_1(out1)
+    ffn_2 = self.ffn_2(ffn_1)
+    ffn_output = self.dropout_2(ffn_2)
+    return (self.ln_2(out1+ffn_output),attention_scores)
+  def get_config(self):
+    # Adds this layer's constructor arguments to the base layer config.
+    config = super().get_config()
+    config.update({"key_dim": self.key_dim, "embed_dim": self.embed_dim,
+                  "num_heads": self.num_heads,"ff_dim": self.ff_dim,
+                  "dropout_rate": self.dropout_rate})
+    return config
+class TokenAndPositionEmbedding(tf.keras.layers.Layer):
+  '''
+    Embeds tokens and positions.
+    The output is the sum of a token embedding and a position embedding.
+      Args:
+        max_len: number of rows in the position embedding table
+        vocab_size: number of rows in the token embedding table
+        embed_dim: embedding width of both tables
+  '''
+  def __init__(self,max_len,vocab_size,embed_dim):
+    super(TokenAndPositionEmbedding,self).__init__()
+    self.max_len = max_len
+    self.vocab_size = vocab_size
+    self.embed_dim = embed_dim
+    self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size,
+                                                output_dim = embed_dim)
+    self.pos_emb = tf.keras.layers.Embedding(input_dim=max_len,output_dim=embed_dim)
+  def call(self,x):
+    # Positions 0..L-1, where L is the size of the last axis of x.
+    maxlen = tf.shape(x)[-1]
+    positions = tf.range(start=0,limit=maxlen,delta=1)
+    positions = self.pos_emb(positions)
+    x = self.token_emb(x)
+    return x + positions
+  def get_config(self):
+    # Adds this layer's constructor arguments to the base layer config.
+    config = super().get_config()
+    config.update({"max_len": self.max_len, "vocab_size": self.vocab_size,
+                  "embed_dim": self.embed_dim})
+    return config
+def strip_smiles(input_string):
+  '''
+    Cleans un-needed tokens from the SMILES string.
+      Args:
+        input_string: SMILES string
+      Returns:
+        output_string: cleaned SMILES string
+  '''
+  output_string = input_string.replace(" ","").replace("[CLS]","").replace("[SEP]","").replace("[PAD]","")
+  output_string = output_string.replace("[Na+].","").replace(".[Na+]","")
+  return output_string
+def mols_from_smiles(input_smiles_list):
+  '''
+    Converts a list of SMILES strings to a list of RDKit molecules.
+      Args:
+        input_smiles_list: list of SMILES strings
+      Returns:
+        valid_mols: list of RDKit molecules
+        valid_smiles: list of SMILES strings
+  '''
+  valid_mols = []
+  valid_smiles = []
+  good_count = 0
+  for ti, smile in enumerate(input_smiles_list):
+    temp_mol = Chem.MolFromSmiles(smile)
+    if temp_mol != None:
+      valid_mols.append(temp_mol)
+      valid_smiles.append(smile)
+      good_count += 1
+    else:
+      print(f"SMILES {ti} was not valid!")
+  if len(valid_mols) == len(valid_smiles) == good_count:
+    print(f"Generated a total of {good_count} mol objects")
+  else:
+    print("mismatch!")
+  return valid_mols, valid_smiles

modrag_molecule_functions.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import matplotlib.pyplot as plt
+from rdkit import Chem
+from rdkit.Chem import AllChem, QED
+from rdkit.Chem import Draw
+from rdkit.Chem.Draw import MolsToGridImage
+from rdkit import rdBase
+from rdkit.Chem import rdMolAlign
+import os, re
+from rdkit import RDConfig
+import pubchempy as pcp
+from PIL import Image
+from collections import Counter
+from langchain_core.tools import tool
+@tool
+def name_node(smiles_list: list[str]) -> (list[str], str):
+  '''
+    Queries Pubchem for the name of the molecule based on the smiles string.
+      Args:
+        smiles_list: the list of input smiles strings
+      Returns:
+        names_list: the list of names of the molecules
+        name_string: a string of the tool results
+  '''
+  print("name tool")
+  print('===================================================')
+  names = []
+  name_string = ''
+  for smiles in smiles_list:
+    try:
+        res = pcp.get_compounds(smiles, "smiles")
+        name = res[0].iupac_name
+        names.append(name)
+        name_string += f'{smiles}: IUPAC molecule name: {name}\n'
+        print(smiles, name)
+        syn_list = pcp.get_synonyms(res[0].cid)
+        for alt_name in syn_list[0]['Synonym'][:5]:
+            name_string += f'{smiles}: alternative or common name: {alt_name}\n'
+    except:
+        name = "unknown"
+        name_string += f'{smiles}: Fail\n'
+  return names, name_string, None
+@tool
+def smiles_node(names_list: list[str]) -> (list[str], str):
+  '''
+    Queries Pubchem for the smiles string of the molecule based on the name.
+      Args:
+        names_list: the list of molecule names
+      Returns:
+        smiles_list: the list of smiles strings of the molecules
+        smiles_string: a string of the tool results
+  '''
+  print("smiles tool")
+  print('===================================================')
+  smiles_list = []
+  smiles_string = ''
+  for name in names_list:
+    try:
+        res = pcp.get_compounds(name, "name")
+        smiles = res[0].smiles
+        #smiles = smiles.replace('#','~')
+        smiles_list.append(smiles)
+        smiles_string += f'{name}: The SMILES string for the molecule is: {smiles}\n'
+    except:
+        smiles = "unknown"
+        smiles_string += f'{name}: Fail\n'
+  return smiles_list, smiles_string, None
+@tool
+def related_node(smiles_list: list[str]) -> (list[list[str]], str, list):
+  '''
+    Queries Pubchem for similar molecules based on the smiles string or name
+      Args:
+        smiles: the input smiles string, OR
+        name: the molecule name
+      Returns:
+        total_similar_list: a list of lists of similar molecules
+        related_string: a string of the tool results
+        all_images: a list of images of the similar molecules
+  '''
+  print("related tool")
+  print('===================================================')
+  total_similar_list = []
+  all_images = []
+  related_string = ''
+  for smiles in smiles_list:
+    try:
+        res = pcp.get_compounds(smiles, "smiles", searchtype="similarity",listkey_count=50)
+        related_string += f'The following molecules are similar to {smiles}: \n'
+        print('got related molecules with smiles')
+        sub_smiles = []
+        i = 0
+        for compound in res:
+            if i == 0:
+                print(compound.iupac_name)
+                i+=1
+            sub_smiles.append(compound.smiles)
+            related_string += f'Name: {compound.iupac_name}\n'
+            related_string += f'SMILES: {compound.smiles}\n'
+            related_string += f'Molecular Weight: {compound.molecular_weight}\n'
+            related_string += f'LogP: {compound.xlogp}\n'
+            related_string += '===================\n'
+        sub_mols = [Chem.MolFromSmiles(smile) for smile in sub_smiles]
+        legend = [str(compound.smiles) for compound in res]
+        total_similar_list.append(sub_smiles)
+        img = Draw.MolsToGridImage(sub_mols, legends=legend, molsPerRow=4, subImgSize=(250, 250))
+        #pic = img.data
+        all_images.append(img)
+    except:
+        related_string += f'{smiles}: Fail\n'
+        total_similar_list.append([])
+        all_images.append(None)
+  pic = img.data
+  with open('current_image.png', 'wb') as f:
+    f.write(pic)
+  img = Image.open('current_image.png')
+  return total_similar_list, related_string, img
+@tool
+def structure_node(smiles_list: list[str]) -> (list[str], str, list):
+  '''
+    Generates the 3D structure of the molecule based on the smiles string.
+      Args:
+        smiles: the input smiles string
+      Returns:
+        all_structures: a list of strings of the 3D structure of the molecule
+        output_string: a string of the chemical formulae.
+        all_images: a list of images of the 3D structure of the molecule
+  '''
+  print("structure tool")
+  all_mols = []
+  all_structures = []
+  output_string = ''
+  for smile in smiles_list:
+    mol = Chem.MolFromSmiles(smile)
+    molH = Chem.AddHs(mol)
+    AllChem.EmbedMolecule(molH)
+    AllChem.MMFFOptimizeMolecule(molH)
+    structure_string = ""
+    all_symbols = []
+    for atom in molH.GetAtoms():
+      symbol = atom.GetSymbol()
+      all_symbols.append(symbol)
+      pos = molH.GetConformer().GetAtomPosition(atom.GetIdx())
+      structure_string += f'{symbol}  {pos[0]}  {pos[1]}  {pos[2]}\n'
+    atom_freqs = Counter(all_symbols)
+    formula = ''.join([f'{atom}{count}' for atom, count in atom_freqs.items()])
+    output_string += f'For {smile}: Formula is: {formula}\n'
+    all_structures.append(structure_string)
+    all_mols.append(molH)
+  img = Draw.MolsToGridImage(all_mols, molsPerRow=3, subImgSize=(250, 250))
+  #save the image as current_image.png
+  pic = img.data
+  with open('current_image.png', 'wb') as f:
+    f.write(pic)
+  img = Image.open('current_image.png')
+  return all_structures, output_string, img

modrag_property_functions.py ADDED Viewed

	@@ -0,0 +1,227 @@

+from rdkit import Chem
+from rdkit.Chem import AllChem, QED
+from rdkit.Chem import Draw
+from rdkit import rdBase
+from rdkit.Chem import rdMolAlign
+import os, re
+from rdkit import RDConfig
+from rdkit.Chem.Features.ShowFeats import _featColors as featColors
+from rdkit.Chem.FeatMaps import FeatMaps
+from PIL import Image
+from langchain_core.tools import tool
+# Feature factory built from the BaseFeatures.fdef file in RDKit's data directory.
+fdef = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef'))
+# One default FeatMapParams object per feature family, keyed by family name.
+fmParams = {}
+for k in fdef.GetFeatureFamilies():
+    fparams = FeatMaps.FeatMapParams()
+    fmParams[k] = fparams
+@tool
+def substitution_node(smiles_list: list[str]) -> (list[str], str, list):
+  '''
+    A simple substitution routine that looks for a substituent on a phenyl ring and
+    substitutes different fragments in that location. Returns a list of novel molecules and their
+    QED score (1 is most drug-like, 0 is least drug-like).
+      Args:
+        smiles: the input smiles string
+      Returns:
+        new_smiles_list: a list of novel molecules and their QED scores.
+        new_smiles_string: a string of the tool results
+  '''
+  print("substitution tool")
+  print('===================================================')
+  new_fragments = ["c(Cl)c", "c(F)c", "c(O)c", "c(C)c", "c(OC)c", "c([NH3+])c",
+                   "c(Br)c", "c(C(F)(F)(F))c"]
+  total_sub_smiles_list = []
+  total_sub_smiles_string = ''
+  total_sub_images = []
+  for smiles in smiles_list:
+    try:
+        new_smiles = []
+        for fragment in new_fragments:
+            m = re.findall(r"c(\D\D*)c", smiles)
+            if len(m) != 0:
+                for group in m:
+                    #print(group)
+                    if fragment not in group:
+                        new_smile = smiles.replace(group[1:], fragment)
+                        new_smiles.append(new_smile)
+        qeds = []
+        for new_smile in new_smiles:
+            qeds.append(get_qed(new_smile))
+        original_qed = get_qed(smiles)
+        total_sub_smiles_string += "Substitution or Analogue creation tool results: \n"
+        total_sub_smiles_string += f"The original molecule SMILES was {smiles} with QED {original_qed}.\n"
+        total_sub_smiles_string += "Novel Molecules or Analogues and QED values: \n"
+        for i in range(len(new_smiles)):
+            total_sub_smiles_string += f"SMILES: {new_smiles[i]}, QED: {qeds[i]:.3f}\n"
+        total_sub_smiles_list.append(new_smiles)
+        mols = [Chem.MolFromSmiles(smile) for smile in new_smiles]
+        img = Draw.MolsToGridImage(mols,legends=new_smiles, molsPerRow=4, subImgSize=(250, 250))
+        total_sub_images.append(img)
+    except:
+        total_sub_smiles_list.append([])
+        total_sub_smiles_string += f"SMILES: {smiles}, Fail\n"
+        total_sub_images.append(None)
+  pic = img.data
+  with open('current_image.png', 'wb') as f:
+    f.write(pic)
+  img = Image.open('current_image.png')
+  return total_sub_smiles_list, total_sub_smiles_string, img
+def get_qed(smiles):
+  '''
+    Helper function to compute QED for a given molecule.
+      Args:
+        smiles: the input smiles string
+      Returns:
+        qed: the QED score of the molecule.
+  '''
+  mol = Chem.MolFromSmiles(smiles)
+  qed = Chem.QED.default(mol)
+  return qed
+@tool
+def lipinski_node(smiles_list: list[str]) -> (list[float], str):
+  '''
+    A tool to calculate QED and other lipinski properties of a molecule.
+      Args:
+        smiles: the input smiles string
+      Returns:
+        total_lipinski_list: a list of the QED and other lipinski properties of the molecules,
+                      including Molecular Weight, LogP, HBA, HBD, Polar Surface Area,
+                      Rotatable Bonds, Aromatic Rings and Undesireable Moieties.
+        total_lipinski_string: a string of the tool results
+  '''
+  print("lipinski tool")
+  print('===================================================')
+  total_lipinski_list = []
+  total_lipinski_string = ''
+  for smiles in smiles_list:
+    for ion in ['.[Na+]', '.[K+]', '.[Cl-]', '.[Br-]', '[Na+].', '[K+].', '[Cl-].', '[Br-].']:
+        smiles = smiles.replace(ion, '')
+    lipinski_list = []
+    try:
+        mol = Chem.MolFromSmiles(smiles)
+        qed = Chem.QED.default(mol)
+        p = Chem.QED.properties(mol)
+        mw = p[0]
+        logP = p[1]
+        hba = p[2]
+        hbd = p[3]
+        psa = p[4]
+        rb = p[5]
+        ar = p[6]
+        um = p[7]
+        lipinski_list.append(qed)
+        lipinski_list.append(mw)
+        lipinski_list.append(logP)
+        lipinski_list.append(hba)
+        lipinski_list.append(hbd)
+        lipinski_list.append(psa)
+        lipinski_list.append(rb)
+        lipinski_list.append(ar)
+        lipinski_list.append(um)
+        total_lipinski_string += f"Properties of SMILES: {smiles}: QED: {qed:.3f}\n"
+        total_lipinski_string += f"Molecular Weight: {mw:.3f}, LogP: {logP:.3f}\n"
+        total_lipinski_string += f"Hydrogen bond acceptors: {hba}, Hydrogen bond donors: {hbd}\n"
+        total_lipinski_string += f"Polar Surface Area: {psa:.3f}, Rotatable Bonds: {rb}\n"
+        total_lipinski_string += f"Aromatic Rings: {ar}, Undesireable moieties: {um}\n"
+        total_lipinski_string += "===================================================\n"
+        total_lipinski_list.append(lipinski_list)
+    except:
+        total_lipinski_list.append([])
+        total_lipinski_string += f"SMILES: {smiles}, Could not get properties\n"
+  return total_lipinski_list, total_lipinski_string, None
@tool
def pharmfeature_node(known_smiles: str, test_smiles: list[str]) -> tuple[list, str, None]:
  '''
    A tool to compare the pharmacophore features of one or more query molecules
    against those of a reference molecule. It reports the pharmacophore features
    of both molecules and the feature overlap score of each query molecule.
      Args:
        known_smiles: the reference SMILES string
        test_smiles: a list of query SMILES strings
      Returns:
        total_pharmfeature_scores: one pharmacophore feature overlap score per query
          molecule, in input order (None where a molecule could not be scored).
        total_pharmfeature_string: a string of the tool results
        None: the third slot is always None for this tool
  '''
  # Local import: the file-level import section is outside this block.
  from collections import Counter

  print("pharmfeature tool")
  print('===================================================')
  keep = ('Donor', 'Acceptor', 'NegIonizable', 'PosIonizable', 'ZnBinder', 'Aromatic', 'LumpedHydrophobe')
  feat_hash = {'Donor': 'Hydrogen bond donors', 'Acceptor': 'Hydrogen bond acceptors',
               'NegIonizable': 'Negatively ionizable groups', 'PosIonizable': 'Positively ionizable groups',
               'ZnBinder': 'Zinc Binders', 'Aromatic': 'Aromatic rings', 'LumpedHydrophobe': 'Hydrophobic/non-polar groups' }
  separator = "===================================================\n"
  ps = AllChem.ETKDGv3()

  def _embed(smi):
    # Parse the SMILES, add hydrogens and embed one 3D conformer.
    # Raises instead of letting a None molecule or a failed embedding
    # propagate into the alignment step.
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
      raise ValueError(f"could not parse SMILES: {smi}")
    mol = Chem.AddHs(mol)
    if AllChem.EmbedMolecule(mol, ps) < 0:
      raise ValueError(f"could not embed a 3D conformer for: {smi}")
    return mol

  def _kept_features(mol):
    # Only the feature families listed in `keep` take part in the comparison.
    return [f for f in fdef.GetFeaturesForMol(mol) if f.GetFamily() in keep]

  total_pharmfeature_scores = []
  total_pharmfeature_string = ''

  # The reference molecule is prepared once; the alignment below only moves the probe.
  try:
    ref_mol = _embed(known_smiles)
    ref_feats = _kept_features(ref_mol)
    if not ref_feats:
      # The score is normalised by the number of reference features.
      raise ValueError("reference molecule has no pharmacophore features")
  except Exception as e:
    print(f"Could not process reference molecule {known_smiles}: {e}")
    total_pharmfeature_string += f"Could not get pharmacophore features for the reference SMILES: {known_smiles}\n"
    total_pharmfeature_string += separator
    return [None] * len(test_smiles), total_pharmfeature_string, None
  feats_known = Counter(f.GetFamily() for f in ref_feats)

  for query in test_smiles:
    try:
      probe = _embed(query)
      # Open3DAlign the query (probe) onto the reference before scoring.
      rdMolAlign.GetO3A(probe, ref_mol).Align()
      feat_vectors = [ref_feats, _kept_features(probe)]
      feat_maps = [FeatMaps.FeatMap(feats=x, weights=[1]*len(x), params=fmParams) for x in feat_vectors]
      test_score = feat_maps[0].ScoreFeats(feat_maps[1].GetFeatures())/(feat_maps[0].GetNumFeatures())
      feats_test = Counter(f.GetFamily() for f in feat_vectors[1])
    except Exception as e:
      # One bad query molecule must not abort the remaining ones.
      print(f"Could not score molecule {query}: {e}")
      total_pharmfeature_scores.append(None)
      total_pharmfeature_string += f"PharmFeature tool results for SMILES: {query}: \n"
      total_pharmfeature_string += "Could not compute the pharmacophore feature score for this molecule.\n"
      total_pharmfeature_string += separator
      continue
    total_pharmfeature_string += f"PharmFeature tool results for SMILES: {query}: \n"
    total_pharmfeature_string += ("The Pharmacophore Feature Overlap Score of the test molecule "
                                  f"versus the reference molecule is {test_score:.3f}. \n\n")
    total_pharmfeature_scores.append(test_score)
    for family, count in feats_known.items():
      total_pharmfeature_string += f"There are {count} {feat_hash[family]} in the reference molecule. \n"
    for family, count in feats_test.items():
      total_pharmfeature_string += f"There are {count} {feat_hash[family]} in the test molecule. \n"
    total_pharmfeature_string += separator
  return total_pharmfeature_scores, total_pharmfeature_string, None

modrag_protein_functions.py ADDED Viewed

	@@ -0,0 +1,763 @@

+from rdkit import Chem
+from rdkit.Chem import AllChem, QED
+from rdkit.Chem import Draw
+from rdkit.Chem.Draw import MolsToGridImage
+from rdkit import rdBase
+from rdkit.Chem import rdMolAlign
+import os, re
+from rdkit import RDConfig
+from PIL import Image
+import numpy as np
+import pandas as pd
+from chembl_webresource_client.new_client import new_client
+from tqdm.auto import tqdm
+import requests, json
+from rcsbapi.search import TextQuery
+import itertools
+import lightgbm as lgb
+from lightgbm import LGBMRegressor
+import deepchem as dc
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.preprocessing import StandardScaler
+import tensorflow as tf
+import random
+from finetune_gpt import *
+from dockstring import load_target
+from langchain_core.tools import tool
@tool
def uniprot_node(protein_names: list[str], human_flag: bool = False) -> tuple[list, str, None]:
  '''
    This tool takes in the user requested proteins and searches UNIPROT for matches.
    It returns a string containing the protein ID, gene name, organism, and protein name
    of every match.
      Args:
        protein_names: the names of the proteins to search for.
        human_flag: if True, keep only matches whose organism is "Homo sapiens (Human)".
      Returns:
        total_ids: for each protein name, the list of matching UNIPROT IDs
          (an empty list when the search failed).
        protein_string: a string containing the protein ID, gene name, organism, and protein name.
        None: the third slot is always None for this tool
  '''
  # Local import: the file-level import section is outside this block.
  import io

  print("UNIPROT tool")
  print('===================================================')
  separator = '==========================================================================================\n'
  total_ids = []
  protein_string = ''
  for protein_name in protein_names:
    try:
      # Passing the name through `params` URL-encodes it, so names with
      # spaces, '&' or '#' no longer corrupt the query string.
      response = requests.get('https://rest.uniprot.org/uniprotkb/search',
                              params={'query': protein_name, 'format': 'tsv'},
                              timeout=60)
      response.raise_for_status()
      # Parse the TSV in memory rather than via a file named after user input.
      prot_df_raw = pd.read_csv(io.StringIO(response.text), sep='\t')
      if human_flag:
        prot_df = prot_df_raw[prot_df_raw['Organism'] == "Homo sapiens (Human)"]
        print(f"Found {len(prot_df)} Human proteins out of {len(prot_df_raw)} total proteins")
      else:
        prot_df = prot_df_raw
      sub_ids = []
      found_string = ''
      rows = zip(prot_df['Entry'].tolist(), prot_df['Gene Names'].tolist(),
                 prot_df['Organism'].tolist(), prot_df['Protein names'].tolist())
      for prot_id, gene, organism, name in rows:
        found_string += f'Protein {protein_name}, ID: {prot_id}, Gene: {gene}, Organism: {organism}, Name: {name}\n'
        sub_ids.append(prot_id)
      if not sub_ids:
        found_string += f'No proteins found for {protein_name}\n'
      protein_string += found_string + separator
      total_ids.append(sub_ids)
    except Exception as e:
      print(f"UNIPROT search failed for {protein_name}: {e}")
      protein_string += f'No proteins found for {protein_name}\n'
      protein_string += separator
      total_ids.append([])
  return total_ids, protein_string, None
def get_qed(smiles):
  '''
    Helper function: parse a SMILES string and score the molecule with
    RDKit's QED using the default weights.
      Args:
        smiles: the input smiles string
      Returns:
        the QED score of the molecule.
  '''
  return Chem.QED.default(Chem.MolFromSmiles(smiles))
@tool
def listbioactives_node(up_ids_list: list[str]) -> tuple[list, str, None]:
  '''
    Accepts UNIPROT IDs and reports how many bioactive molecules (IC50 records)
    ChEMBL holds for each ChEMBL target linked to each ID.
      Args:
        up_ids_list: the UNIPROT IDs of the proteins to search for.
      Returns:
        total_bioacts_list: for each UNIPROT ID, a list with the number of IC50
          records of each matching ChEMBL target (empty list on failure).
        bioact_string: a string containing the results of the search, including
          the ChEMBL IDs (the IDs are only reported in this string).
        None: the third slot is always None for this tool
  '''
  print("List bioactives tool")
  print('===================================================')
  separator = '================================================================================================\n'
  # The API handles do not depend on the ID, so create them once.
  targets = new_client.target
  bioact = new_client.activity
  total_bioacts_list = []
  bioact_string = ''
  for up_id in up_ids_list:
    try:
      target_info = targets.get(target_components__accession=up_id).only("target_chembl_id","organism", "pref_name", "target_type")
      target_info = pd.DataFrame.from_records(target_info)
      print(target_info)
      if target_info.empty:
        raise LookupError(f"no ChEMBL target for Uniprot ID {up_id}")
      print(f"Found info for Uniprot ID: {up_id}")
      # dict.fromkeys removes duplicates but keeps a reproducible order.
      chembl_ids = list(dict.fromkeys(target_info['target_chembl_id'].tolist()))
      print(f"Found {len(chembl_ids)} unique ChEMBL IDs")
      len_all_bioacts = []
      found_string = ''
      for chembl_id in chembl_ids:
        bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
            "molecule_chembl_id",
            "type",
            "standard_units",
            "relation",
            "standard_value",
        )
        len_this_bioacts = len(bioact_chosen)
        len_all_bioacts.append(len_this_bioacts)
        found_string += f"For Uniprot {up_id}: length of Bioactivities for ChEMBL ID {chembl_id}: {len_this_bioacts}\n"
      # Only commit the text once every target was queried, so a late failure
      # does not leave partial lines followed by a "not found" message.
      bioact_string += found_string + separator
      total_bioacts_list.append(len_all_bioacts)
    except Exception as e:
      print(f"Bioactivity lookup failed for {up_id}: {e}")
      bioact_string += f'No bioactives found for Uniprot {up_id}\n'
      bioact_string += separator
      total_bioacts_list.append([])
  return total_bioacts_list, bioact_string, None
def _load_bioactives_df(chembl_id):
  '''
    Helper: return the bioactivity table (columns chembl_ids, IC50s, SMILES) for an
    upper-cased ChEMBL target ID, sorted by IC50. Reads '<ID>_bioactives.csv' when
    it exists, otherwise queries ChEMBL and writes that CSV if any rows were found.
  '''
  cache_file = f'{chembl_id}_bioactives.csv'
  if os.path.exists(cache_file):
    print(f'Found {chembl_id}_bioactives.csv')
    cached_df = pd.read_csv(cache_file)
    print(f"number of records: {len(cached_df)}")
    return cached_df

  bioact_chosen = new_client.activity.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
      "molecule_chembl_id",
      "type",
      "standard_units",
      "relation",
      "standard_value",
  )
  chembl_ids = []
  ic50s = []
  for record in bioact_chosen:
    # Keep nM records only; a missing value would make float() raise.
    if record["standard_units"] == 'nM' and record["standard_value"] is not None:
      chembl_ids.append(record["molecule_chembl_id"])
      ic50s.append(float(record["standard_value"]))
  bioact_df = pd.DataFrame.from_dict({'chembl_ids': chembl_ids, 'IC50s': ic50s})
  # drop_duplicates returns a new frame; the result must be assigned.
  bioact_df = bioact_df.drop_duplicates(subset=["chembl_ids"], keep="last")
  print(f"Number of records: {len(bioact_df)}")
  print(bioact_df.shape)
  if bioact_df.empty:
    # Nothing to look up; do not send an empty ID filter to the molecule endpoint.
    return pd.DataFrame({'chembl_ids': [], 'IC50s': [], 'SMILES': []})

  compounds_provider = new_client.molecule.filter(molecule_chembl_id__in=bioact_df["chembl_ids"].to_list()).only(
      "molecule_chembl_id",
      "molecule_structures"
  )
  cids_list = []
  smiles_list = []
  for record in compounds_provider:
    cids_list.append(record['molecule_chembl_id'])
    structures = record['molecule_structures']
    if not structures:
      print('no structures')
      smile = None
    elif not structures.get('canonical_smiles'):
      print("No canonical smiles")
      smile = None
    else:
      smile = structures['canonical_smiles']
    smiles_list.append(smile)
  new_df = pd.DataFrame.from_dict({'SMILES': smiles_list, 'chembl_ids_2': cids_list})
  total_bioact_df = pd.merge(bioact_df, new_df, left_on='chembl_ids', right_on='chembl_ids_2')
  print(f"number of records: {len(total_bioact_df)}")
  total_bioact_df = total_bioact_df.drop_duplicates(subset=["chembl_ids"], keep="last")
  print(f"number of records after removing duplicates: {len(total_bioact_df)}")
  total_bioact_df = total_bioact_df.dropna(axis=0, how='any')
  total_bioact_df = total_bioact_df.drop(["chembl_ids_2"], axis=1)
  print(f"number of records after dropping Null values: {len(total_bioact_df)}")
  total_bioact_df = total_bioact_df.sort_values(by=["IC50s"])
  if len(total_bioact_df) > 0:
    total_bioact_df.to_csv(cache_file)
  return total_bioact_df


def _grid_image_to_pil(grid, path='current_image.png'):
  '''
    Helper: write a MolsToGridImage result to `path` and return it as a PIL image.
    Handles a PIL image, raw PNG bytes, or an object exposing PNG bytes as `.data`
    (presumably the IPython display image RDKit returns in notebooks).
    Returns None when there is no image or the conversion fails.
  '''
  if grid is None:
    return None
  try:
    if isinstance(grid, Image.Image):
      grid.save(path)
    else:
      png_bytes = grid if isinstance(grid, (bytes, bytearray)) else grid.data
      with open(path, 'wb') as f:
        f.write(png_bytes)
    return Image.open(path)
  except Exception as e:
    print(f"Error occurred while processing image: {e}")
    return None


@tool
def getbioactives_node(chembl_ids_list: list[str]) -> tuple[list, str, object]:
  '''
    Accepts ChEMBL target IDs and gets the bioactive molecule SMILES and IC50s for each ID.
    Results are cached in '<CHEMBL_ID>_bioactives.csv'; only the 50 molecules with the
    lowest IC50 are reported for each ID.
      Args:
        chembl_ids_list: the ChEMBL IDs to query
      Returns:
        bioactives_list: for each ChEMBL ID, a list of (SMILES, IC50 in nM) tuples
          (an empty list when nothing was found).
        bioactives_string: a string containing the results of the search.
        img: a grid image of the molecules of the last ChEMBL ID that produced one,
          also written to 'current_image.png'; None if no image could be made.
  '''
  print("Get bioactives tool")
  print('===================================================')
  separator = '=========================================================================================\n'
  limit = 50
  bioactives_list = []
  bioactives_string = ''
  last_grid = None
  for chembl_id in chembl_ids_list:
    try:
      chembl_id = chembl_id.upper()
      total_bioact_df = _load_bioactives_df(chembl_id).iloc[:limit]
      if total_bioact_df.empty:
        raise LookupError(f"no IC50 records for {chembl_id}")
      bioact_tuple_list = list(zip(total_bioact_df['SMILES'], total_bioact_df['IC50s']))
      found_string = f'Results for top bioactivity (IC50 value) for molecules in ChEMBL ID: {chembl_id}. \n'
      for smile, ic50 in bioact_tuple_list:
        found_string += f'Molecule SMILES: {smile}, IC50 (nM): {ic50}\n'
      found_string += separator
    except Exception as e:
      print(f"Could not get bioactives for {chembl_id}: {e}")
      bioactives_list.append([])
      bioactives_string += f'No bioactives found for ChEMBL ID: {chembl_id}\n'
      bioactives_string += separator
      continue
    bioactives_string += found_string
    bioactives_list.append(bioact_tuple_list)
    # Drawing is best-effort: a failure here must not discard the data above.
    try:
      mols = [Chem.MolFromSmiles(smile) for smile, _ in bioact_tuple_list]
      legends = [f'IC50: {ic50}' for _, ic50 in bioact_tuple_list]
      last_grid = MolsToGridImage(mols, molsPerRow=5, legends=legends, subImgSize=(200,200))
    except Exception as e:
      print(f"Could not draw molecules for {chembl_id}: {e}")
  return bioactives_list, bioactives_string, _grid_image_to_pil(last_grid)
@tool
def predict_node(smiles_list_in: list[str], chembl_id: str) -> tuple[list, str, None]:
  '''
    Uses the '<CHEMBL_ID>_bioactives.csv' file written by the get_bioactives node to fit a
    LightGBM model on RDKit descriptors and predict the IC50 for the given SMILES.
    If the CSV file does not exist yet, the get_bioactives node is run first to create it.
      Args:
        smiles_list_in: the SMILES strings of the molecules to predict
        chembl_id: the ChEMBL ID whose bioactivity data is used for training
      Returns:
        preds: a list of predicted IC50 values (nM) for the input SMILES, with None
          where a prediction failed; an empty list if the model could not be trained.
        preds_string: a string containing the predicted IC50 values for the input SMILES
        None: the third slot is always None for this tool
  '''
  print("Predict Tool")
  print('===================================================')
  ions_to_clean = ["[Na+].",".[Na+]","[Cl-].",".[Cl-]","[K+].",".[K+]"]
  # Normalise the ID before the cache check: the CSV is stored under the upper-cased ID.
  chembl_id = chembl_id.upper()
  csv_path = f'{chembl_id}_bioactives.csv'
  try:
    if not os.path.exists(csv_path):
      # getbioactives_node is wrapped by @tool; call the wrapped function
      # (exposed as `.func`) instead of the tool object itself.
      fetch_bioactives = getattr(getbioactives_node, 'func', getbioactives_node)
      fetch_bioactives([chembl_id])
    df = pd.read_csv(csv_path)
    #if length of the dataframe is over 2000, take a random sample of 2000 points
    if len(df) > 2000:
      df = df.sample(n=2000, random_state=42)
    mols = []
    y = []
    for smile, value in zip(df["SMILES"].to_list(), df["IC50s"].to_list()):
      if not value > 0:
        # log10 is undefined for zero/negative (and NaN) IC50 values.
        continue
      for ion in ions_to_clean:
        smile = smile.replace(ion,"")
      mol = Chem.MolFromSmiles(smile)
      if mol is None:
        continue
      mols.append(mol)
      y.append(np.log10(value))
    print(f"Number of molecules: {len(mols)}")
    featurizer = dc.feat.RDKitDescriptors()
    f = featurizer.featurize(mols)
    y = np.asarray(y)
    print(f"Old dimensions are: {f.shape}.")
    # Drop every molecule whose descriptor row contains NaN or inf.
    finite_rows = np.isfinite(f).all(axis=1)
    print(f"Deleting {int((~finite_rows).sum())} rows with non-finite descriptors.")
    f = f[finite_rows]
    y = y[finite_rows]
    print(f"New dimensions are: {f.shape}")
    if f.shape[0] != len(y):
      raise ValueError("Number of rows in X and y do not match.")
    X_train, X_test, y_train, y_test = train_test_split(f, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    model = LGBMRegressor(metric='rmse', max_depth = 50, verbose = -1, num_leaves = 31,
                          feature_fraction = 0.8, min_data_in_leaf = 20)
    model.fit(X_train, y_train)
    train_score = model.score(X_train,y_train)
    print(f"score for training set: {train_score:.3f}")
    valid_score = model.score(X_test, y_test)
    print(f"score for validation set: {valid_score:.3f}")
  except Exception as e:
    print(f"Model training failed: {e}")
    return [], 'Model training failed, unable to predict.', None
  preds = []
  preds_string = ''
  for smiles in smiles_list_in:
    print(f"in predict node, smiles: {smiles}")
    try:
      for ion in ions_to_clean:
        smiles = smiles.replace(ion,"")
      test_mol = Chem.MolFromSmiles(smiles)
      if test_mol is None:
        raise ValueError(f"could not parse SMILES: {smiles}")
      test_feat = featurizer.featurize([test_mol])
      test_feat = scaler.transform(test_feat)
      prediction = model.predict(test_feat)
      # The model was trained on log10(IC50); convert back and return a plain float.
      test_ic50 = float(10**(prediction[0]))
      print(f"Predicted IC50 for {smiles}: {test_ic50}")
      preds_string += f"The predicted IC50 value for {smiles} is : {test_ic50:.3f} nM.\n"
      preds.append(test_ic50)
    except Exception as e:
      print(f"Prediction failed for {smiles}: {e}")
      preds.append(None)
      preds_string += f"The prediction for {smiles} failed.\n"
  preds_string += ("The Bioactive data was fitted with the LightGBM model, using RDKit descriptors. The training score "
                   f"was {train_score:.3f} and the testing score was {valid_score:.3f}. ")
  return preds, preds_string, None
@tool
def gpt_node(chembl_id: str) -> tuple[list[str], str, Image.Image]:
  '''
    Uses a ChEMBL dataset, stored in '<CHEMBL_ID>_bioactives.csv' by the get_bioactives node,
    to finetune a GPT model to generate novel molecules for the target protein.
    If the CSV file does not exist yet, the get_bioactives node is run first to create it.
    Args:
      chembl_id: the ChEMBL ID to query
    returns:
      smiles_list: a list of generated SMILES strings (empty on failure)
      gpt_string: a string containing the results of the GPT finetuning and generation,
        or a failure message.
      img: an image containing the generated molecules (None on failure).
  '''
  print("GPT node")
  print('===================================================')
  chembl_id = chembl_id.upper()
  csv_path = f'{chembl_id}_bioactives.csv'
  try:
    if not os.path.exists(csv_path):
      # getbioactives_node is wrapped by @tool; call the wrapped function
      # (exposed as `.func`) instead of the tool object itself.
      fetch_bioactives = getattr(getbioactives_node, 'func', getbioactives_node)
      fetch_bioactives([chembl_id])
    df = pd.read_csv(csv_path)
    smiles_list, gpt_string, img = finetune_gpt(df, chembl_id)
  except Exception as e:
    # Report the failure in the result string rather than returning an empty one.
    print(f"GPT finetuning failed for {chembl_id}: {e}")
    smiles_list = []
    gpt_string = f'GPT finetuning and molecule generation failed for ChEMBL ID: {chembl_id}\n'
    img = None
  return smiles_list, gpt_string, img
def get_protein_from_pdb(pdb_id):
  '''
    Helper function to download the PDB-format file of an entry from files.rcsb.org.
    Args:
      pdb_id: the PDB ID of the protein
    Returns:
      r.text: the PDB file contents as a string
    Raises:
      requests.HTTPError: if the server answers with an error status (e.g. unknown ID)
      requests.RequestException: on connection problems or timeout
  '''
  url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
  # A timeout keeps a stalled download from hanging the tool indefinitely.
  r = requests.get(url, timeout=60)
  # Without this check the body of an error page would be returned as if it were PDB text.
  r.raise_for_status()
  return r.text
# One-letter -> three-letter codes of the 20 standard amino acids.
_ONE_TO_THREE = {
    'A': 'ALA',
    'R': 'ARG',
    'N': 'ASN',
    'D': 'ASP',
    'C': 'CYS',
    'Q': 'GLN',
    'E': 'GLU',
    'G': 'GLY',
    'H': 'HIS',
    'I': 'ILE',
    'L': 'LEU',
    'K': 'LYS',
    'M': 'MET',
    'F': 'PHE',
    'P': 'PRO',
    'S': 'SER',
    'T': 'THR',
    'W': 'TRP',
    'Y': 'TYR',
    'V': 'VAL',
}


def one_to_three(one_seq):
  '''
    Converts a single one-letter amino acid code to its three-letter code.
    Args:
      one_seq: the one-letter amino acid code (upper case), e.g. 'A'
    Returns:
      three_seq: the three-letter code, or 'X' if the input is not one of the
        20 standard one-letter codes
  '''
  # Non-string input can never be a valid code; answer 'X' as for any unknown value.
  if not isinstance(one_seq, str):
    return 'X'
  return _ONE_TO_THREE.get(one_seq, 'X')
# Three-letter -> one-letter codes of the 20 standard amino acids.
_THREE_TO_ONE = {
    'ALA': 'A',
    'ARG': 'R',
    'ASN': 'N',
    'ASP': 'D',
    'CYS': 'C',
    'GLN': 'Q',
    'GLU': 'E',
    'GLY': 'G',
    'HIS': 'H',
    'ILE': 'I',
    'LEU': 'L',
    'LYS': 'K',
    'MET': 'M',
    'PHE': 'F',
    'PRO': 'P',
    'SER': 'S',
    'THR': 'T',
    'TRP': 'W',
    'TYR': 'Y',
    'VAL': 'V',
}


def three_to_one(three_seq):
  '''
  Converts a sequence of three-letter amino acid codes to one-letter codes.
  Args:
    three_seq: an iterable of three-letter residue codes (upper case),
      e.g. ['ALA', 'GLY']
  Returns:
    one_seq: a list with one one-letter code per input residue; residues that
      are not among the 20 standard codes are reported as 'X'
  '''
  return [
      _THREE_TO_ONE.get(residue, 'X') if isinstance(residue, str) else 'X'
      for residue in three_seq
  ]
@tool
def pdb_node(test_pdb_list: list[str]) -> tuple[list, str, None]:
  '''
    Accepts PDB IDs and queries the protein databank for the sequence of each protein chain,
    as well as the names of the other molecules (ligands) in the entry.
      Args:
        test_pdb_list: the PDB IDs to query
      Returns:
        all_seqs: for each PDB ID, a list of the one-letter sequences of its chains
          (empty list when the entry could not be read)
        total_pdb_string: a string containing the results of the PDB query.
        None: the third slot is always None for this tool
      (collects all ligands but does not return them currently)
  '''
  print(f"pdb toolS")
  print('===================================================')
  separator = '=========================================================================================\n'
  total_pdb_string = ''
  all_seqs = []
  all_ligands = []
  for test_pdb in test_pdb_list:
    try:
      pdb_str = get_protein_from_pdb(test_pdb)
      chains = {}
      other_molecules = {}
      for line in pdb_str.split('\n'):
        if line.startswith('SEQRES'):
          # SEQRES <serial> <chain> <numRes> <residue> <residue> ...
          parts = line.split()
          if len(parts) > 4:
            chains.setdefault(parts[2], []).extend(parts[4:])
        elif line.startswith('HETNAM'):
          # HETNAM is a fixed-column record: continuation number in columns 9-10,
          # het ID in columns 12-14, name text from column 16 on. Slicing by column
          # keeps the continuation number and the het ID out of the name.
          het_id = line[11:14].strip()
          text = line[15:70].strip()
          if not het_id:
            continue
          print(het_id)
          previous = other_molecules.get(het_id)
          if previous is None:
            other_molecules[het_id] = text
          elif previous.endswith('-'):
            # Heuristic: a line ending in '-' is taken as a name wrapped mid-word.
            other_molecules[het_id] = previous + text
          else:
            other_molecules[het_id] = previous + ' ' + text
      if not chains and not other_molecules:
        # e.g. an error page was returned instead of a PDB file
        raise ValueError(f"no SEQRES or HETNAM records found for {test_pdb}")
      # Convert once, after all SEQRES lines have been collected.
      chains_ol = {chain: three_to_one(residues) for chain, residues in chains.items()}
      sub_seqs = []
      sub_ligands = []
      entry_string = f"Chains in PDB ID {test_pdb}: {', '.join(chains.keys())} \n"
      for chain, one_letter in chains_ol.items():
        sequence = ''.join(one_letter)
        entry_string += f"Chain {chain}: {sequence} \n"
        sub_seqs.append(sequence)
        print(f"Chain {chain}: {sequence}")
      entry_string += f"Ligands in PDB ID {test_pdb}.\n"
      for mol, mol_name in other_molecules.items():
        entry_string += f"Molecule {mol}: {mol_name} \n"
        sub_ligands.append(mol_name)
      entry_string += separator
      total_pdb_string += entry_string
      all_seqs.append(sub_seqs)
      all_ligands.append(sub_ligands)
    except Exception as e:
      print(f"PDB query failed for {test_pdb}: {e}")
      total_pdb_string += f'Failed to get data for PDB ID {test_pdb}\n'
      total_pdb_string += separator
      all_seqs.append([])
      all_ligands.append([])
  return all_seqs, total_pdb_string, None
@tool
def find_node(test_protein_list: list[str]) -> tuple[list, str, None]:
  '''
    Accepts protein names and searches the protein databank for PDB IDs that match,
    along with the entry titles. At most 10 PDB IDs are reported for each protein.
      Args:
        test_protein_list: the protein names to query
      Returns:
        total_ids: for each protein name, a list of the matching PDB IDs
          (empty list when the search failed)
        pdb_string: a string containing the results of the PDB search.
        None: the third slot is always None for this tool
  '''
  print(f"PDB search tool")
  print('===================================================')
  total_ids = []
  pdb_string = ''
  which_pdbs = 0  # offset of the first search hit to report
  for test_protein in test_protein_list:
    try:
      results = TextQuery(value=test_protein)()
      local_ids = []
      entry_lines = ''
      for pdb in itertools.islice(results, which_pdbs, which_pdbs + 10):
        # A failed title lookup must not discard the ID itself
        # (or the other IDs found for this protein).
        try:
          data = requests.get(f"https://data.rcsb.org/rest/v1/core/entry/{pdb}", timeout=60).json()
          title = data['struct']['title']
        except Exception as e:
          print(f"Could not get title for {pdb}: {e}")
          title = 'title not available'
        entry_lines += f'PDB ID: {pdb}, with title: {title} \n'
        local_ids.append(pdb)
      if local_ids:
        # Report the real number of hits; the search can return fewer than 10.
        pdb_string += f'{len(local_ids)} PDBs that match the protein {test_protein} are: \n'
        pdb_string += entry_lines
      else:
        pdb_string += f'No PDB IDs found for protein {test_protein}\n'
      total_ids.append(local_ids)
    except Exception as e:
      print(f"PDB search failed for {test_protein}: {e}")
      pdb_string += f'Failed to get PDB IDs for protein {test_protein}\n'
      total_ids.append([])
  return total_ids, pdb_string, None
@tool
def docking_node(smiles_list: list[str], query_protein: str) -> tuple[list, str, None]:
  '''
    Docking tool: uses dockstring to dock the molecules into the protein
    Args:
      smiles_list: the SMILES strings of the molecules to dock
      query_protein: the name of the dockstring target (protein) to dock into
    Returns:
      scores_list: a list of docking scores, one per molecule (None where docking failed)
      scores_string: a string containing the results of the docking, including the
        XYZ coordinates of each docked pose.
      None: the third slot is always None for this tool
  '''
  print("docking tool")
  print('===================================================')
  cpuCount = os.cpu_count()
  print(f"Number of CPUs: {cpuCount}")
  print(f'query_protein: {query_protein}')
  # Counter-ion fragments removed from the SMILES before docking, in both positions.
  ions_to_clean = ['[Na+].', '.[Na+]', '[K+].', '.[K+]', '[Cl-].', '.[Cl-]']
  scores_list = []
  scores_string = 'Docking below performed with AutoDock Vina on protein structures from the DUDE database.\n'
  # The target does not depend on the molecule, so load it once for all of them.
  try:
    target = load_target(query_protein)
  except Exception as e:
    print(f"Could not load docking target {query_protein}: {e}")
    scores_string += f"Could not load the docking target {query_protein}; no molecules were docked.\n"
    return [None] * len(smiles_list), scores_string, None
  for query_smiles in smiles_list:
    try:
      for ion in ions_to_clean:
        query_smiles = query_smiles.replace(ion, '')
      print("===============================================")
      print(f"Docking molecule with {cpuCount} cpu cores.")
      score, aux = target.dock(query_smiles, num_cpus = cpuCount)
      mol = aux['ligand']
      print(f"Docking score: {score}")
      print("===============================================")
      # Add hydrogens and embed them onto the docked pose (used as template).
      molH = Chem.AddHs(mol)
      AllChem.ConstrainedEmbed(molH, mol, useTethers=True)
      conformer = molH.GetConformer()
      pose_string = ''
      for atom in molH.GetAtoms():
        pos = conformer.GetAtomPosition(atom.GetIdx())
        pose_string += f"{atom.GetSymbol()} {pos.x} {pos.y} {pos.z}\n"
      scores_list.append(score)
      scores_string += f"Docking score for molecule with SMILES: {query_smiles} is: {score} kcal/mol \n\n"
      scores_string += f"pose XYZ structure for molecule with SMILES: {query_smiles} is: \n"
      scores_string += pose_string + '\n'
      scores_string += f"=========================================================\n"
    except Exception as e:
      print(f"Molecule {query_smiles} could not be docked! ({e})")
      # Append, so that the results of the other molecules are kept.
      scores_string += f"Could not dock molecule with SMILES: {query_smiles}\n"
      scores_string += f"=========================================================\n"
      scores_list.append(None)
  return scores_list, scores_string, None
@tool
def target_node(search_descriptors: list[str]) -> tuple[list, str, None]:
  '''
  Accepts disease names and searches Open Targets for associated targets.
  For each name, the first search hit that is a disease is used.
  Args:
    search_descriptors (list[str]): Disease names
  Returns:
    total_targets_list (list): for each disease name, a list of the approved symbols
      of the associated targets (empty list when nothing was found)
    total_targets_string (str): String of targets
    None
  '''
  base_url = "https://api.platform.opentargets.org/api/v4/graphql"
  disease_query_string = """
    query searchEntity($queryString: String!) {
      search(queryString: $queryString){
        total
        hits  {
          id
          entity
          description
        }
      }
    }
  """
  target_query_string = """
    query associatedTargets($efo_id: String!) {
      disease(efoId: $efo_id) {
        id
        name
        associatedTargets {
          count
          rows {
            target {
              id
              approvedSymbol
            }
            score
          }
        }
      }
    }
  """

  def _run_query(query, variables):
    # POST one GraphQL query; return its 'data' payload, or None unless the status is 200.
    response = requests.post(base_url, json={"query": query, "variables": variables}, timeout=60)
    if response.status_code != 200:
      return None
    return response.json().get('data')

  total_targets_list = []
  total_targets_string = ''
  for search_descriptor in search_descriptors:
    targets_list = []
    try:
      search_data = _run_query(disease_query_string, {"queryString": search_descriptor})
      if search_data is None:
        print('Could not find results.')
      # Every level can be missing or null in the response, hence the `or` fallbacks.
      hits = ((search_data or {}).get('search') or {}).get('hits') or []
      disease_list = [hit['id'] for hit in hits if hit.get('entity') == 'disease']
      if disease_list:
        disease_data = _run_query(target_query_string, {"efo_id": disease_list[0]})
        disease = (disease_data or {}).get('disease') or {}
        rows = (disease.get('associatedTargets') or {}).get('rows') or []
        targets_list = [row['target']['approvedSymbol'] for row in rows]
    except Exception as e:
      # A failed request for one disease must not abort the remaining ones.
      print(f"Open Targets query failed for {search_descriptor}: {e}")
      targets_list = []
    if targets_list:
      targets_string = f'Possible targets for {search_descriptor} include: \n'
      for i, target in enumerate(targets_list):
        targets_string += f'{i+1}. {target}\n'
    else:
      # Trailing newline keeps the results of consecutive diseases apart.
      targets_string = f'No targets found for {search_descriptor}\n'
    total_targets_list.append(targets_list)
    total_targets_string += targets_string
  return total_targets_list, total_targets_string, None

requirements.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+bitsandbytes
+pubchempy
+rdkit
+chembl_webresource_client
+rcsb-api
+deepchem
+dockstring
+openbabel-wheel
+openai
+langchain_core
+langchain_openai
+langgraph
+gradio
+torch
+matplotlib
+pillow
+gradio-client
+transformers
+dockstring
+openbabel-wheel
+numpy
+elevenlabs
+lightgbm
+tf-keras
+tensorflow
+accelerate