#Python 3.8.16
#@import pandas as pd
#@from sklearn.ensemble import RandomForestClassifier
#@from sklearn.metrics import roc_auc_score
#@from sklearn.metrics import roc_curve
#@from sklearn import metrics
#@from rdkit import Chem
#@from rdkit.Chem import AllChem
#@import os
################################################################## Input data
#@dir_path = "C:\\Users\\liyix\\OneDrive\\Desktop\\RF_predict\\"
# List all files in the folder
#@files = os.listdir(dir_path)
#@files #['RF_ECFP4_predict_python.py', 'training data.csv']
# Filter CSV files
#@csv_files = [file for file in files if file.endswith('.csv')]
#@csv_files #['training data.csv']
# Read the training data
#@training_data = pd.read_csv(dir_path + "training data.csv")
#@len(training_data) #3910
#@type(training_data) #<class 'pandas.core.frame.DataFrame'>
#@training_data.shape #(3910, 397)
#@training_data.columns.tolist()[0] #'SampleID'
# Remove the 'SampleID' column
#@training_data = training_data.drop('SampleID', axis=1)
#@training_data.shape #(3910, 396)
# Prepare data
#@target_col = "CYP3A7_inhibitor"
#@features = training_data.columns[:-1]  # Exclude the last column
#@X_train = training_data[features]
#@y_train = training_data[target_col]
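# Optional sanity check (not part of the original workflow): look at the class balance of the
# target before training, since a strong imbalance would affect the threshold chosen below.
#@y_train.value_counts() # counts of inhibitors (1) vs. non-inhibitors (0); actual numbers depend on the data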
# Train the model
#@model = RandomForestClassifier()
#@model.fit(X_train, y_train)
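# Note: RandomForestClassifier() above uses scikit-learn defaults and is not seeded, so the fitted
# forest (and the 0.56 threshold reported below) may vary slightly between runs. A seeded variant,
# if exact reproducibility is needed (the hyperparameter values here are illustrative, not from the
# original script):
# model = RandomForestClassifier(n_estimators=100, random_state=42)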
# Predict probabilities and calculate AUC on training data
#@train_pred_proba = model.predict_proba(X_train)[:, 1]
#@train_auc = roc_auc_score(y_train, train_pred_proba)
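# The AUC above is measured on the same data the forest was trained on, so it is optimistic.
# A minimal sketch of an out-of-sample estimate with scikit-learn's cross_val_score (not part of
# the original script; the 5-fold setup is an assumption):
#@from sklearn.model_selection import cross_val_score
#@cv_auc = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
#@print("Mean 5-fold CV AUC:", cv_auc.mean())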
#############################
# ROC curve for random forest model
#@fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_train, train_pred_proba)
# Find the optimal threshold
#@optimal_idx = (tpr_rf - fpr_rf).argmax()
#@optimal_threshold = thresholds_rf[optimal_idx]
#@print("Optimal Threshold:", optimal_threshold) #Optimal Threshold: 0.56
##################################################################### Predict probabilities on one new compound
# List of SMILES strings representing the compounds
#@smiles_list = ["Oc1ccc(cc1)C3(OS(=O)(=O)c2ccccc23)c4ccc(O)cc4"]
# Create empty lists to store molecule structures and fingerprints
#@molecule_structures = []
#@fingerprints = []
#@molecule_smiles = []
# Convert each SMILES string to a molecule object and generate fingerprints
#@for i, smiles in enumerate(smiles_list):
#@    try:
#@        mol = Chem.MolFromSmiles(smiles)  # returns None if the SMILES cannot be parsed
#@        # ECFP4: Morgan fingerprint with radius 2, folded to 1024 bits
#@        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useBondTypes=True, useChirality=False, useFeatures=False)
#@        fp_bits = fp.ToBitString()
#@        # Add molecule structure and fingerprint to the respective lists
#@        molecule_structures.append(mol)
#@        fingerprints.append(fp_bits)
#@        molecule_smiles.append(smiles)
#@    except Exception as e:
#@        continue
#@    else:
#@        print("No error occurred")
#@print("End of the loop")

# Create a data frame from the lists
#@data_new_compound = pd.DataFrame({'Structure': molecule_structures, 'Fingerprint': fingerprints, 'Smiles': molecule_smiles})
#@pd.set_option('display.max_columns', None)
#@pd.set_option('display.max_rows', None)
#@data_new_compound.head()
# Split fingerprint string into separate columns
#@split_fingerprints = data_new_compound['Fingerprint'].apply(lambda x: pd.Series([int(bit) for bit in x]))  # one integer column per bit
#@type(split_fingerprints)
# Rename columns to represent bit positions
#@split_fingerprints.columns = ['Bit_'+str(i+1) for i in range(split_fingerprints.shape[1])]
# Concatenate the original data frame with the split fingerprint columns
#@data_split = pd.concat([data_new_compound, split_fingerprints], axis=1)
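# Optional check (not in the original script): confirm that every training feature name exists among
# the new compound's columns before the selection below, otherwise data_split[features] raises a KeyError.
#@missing_cols = [col for col in features if col not in data_split.columns]
#@print("Missing feature columns:", missing_cols) # expected to be an empty list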
#@X_test = data_split[features]
##################
#@test_pred_proba = model.predict_proba(X_test)[:, 1]
#@data_new_compound["probability"] = test_pred_proba
###################################################################
# Convert the predicted probability for the new compound into a class label
#@test_pred_label_full = (test_pred_proba > 0.56).astype(int)  # 0.56 = Youden-optimal threshold found above
#@data_new_compound["predict_type"] = test_pred_label_full
#@data_new_compound.columns.tolist()
#@data_new_compound
###########################
#save the results
#@output_file = dir_path + "new_compound_prediction_CYP3A7_inhibitor.csv"
#@data_new_compound.to_csv(output_file, index=False)
print("well done")
## [1] "well done"