1.1 Sample Data
The Solubility dataset from the AppliedPredictiveModeling package was used for this illustrated example. The original numeric response was transformed to simulate a dichotomous categorical variable, and the original predictors were reduced to a subset of numeric predictors used during the analysis.
Preliminary dataset assessment:
[A] 1267 rows (observations)
[A.1] Train Set = 951 observations
[A.2] Test Set = 316 observations
[B] 229 columns (variables), reduced to 5 for the analysis
[B.1] 1/5 response = Log_Solubility_Class variable (factor)
[B.1.1] Levels = Low < High
[B.2] 4/5 predictors = All remaining variables (0/4 factor + 4/4 numeric)
##################################
# Loading R libraries
##################################
library(AppliedPredictiveModeling)
library(caret)
library(rpart)
library(lattice)
library(dplyr)
library(tidyr)
library(moments)
library(skimr)
library(RANN)
library(pls)
library(corrplot)
library(tidyverse)
library(lares)
library(DMwR)
library(gridExtra)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(stats)
library(nnet)
library(elasticnet)
library(earth)
library(party)
library(kernlab)
library(randomForest)
library(Cubist)
library(pROC)
library(mda)
library(klaR)
library(pamr)
##################################
# Loading the source data and
# formulating the train and test sets
##################################
data(solubility)
Solubility_Train <- as.data.frame(cbind(solTrainY,solTrainX))
Solubility_Test <- as.data.frame(cbind(solTestY,solTestX))
##################################
# Applying dichotomization and
# defining the response variable
##################################
Solubility_Train$Log_Solubility_Class <- ifelse(Solubility_Train$solTrainY<mean(Solubility_Train$solTrainY),
"Low","High")
Solubility_Train$Log_Solubility_Class <- factor(Solubility_Train$Log_Solubility_Class,
levels = c("Low","High"))
Solubility_Test$Log_Solubility_Class <- ifelse(Solubility_Test$solTestY<mean(Solubility_Train$solTrainY),
"Low","High")
Solubility_Test$Log_Solubility_Class <- factor(Solubility_Test$Log_Solubility_Class,
levels = c("Low","High"))
Solubility_Train$solTrainY <- NULL
Solubility_Test$solTestY <- NULL
##################################
# Filtering in a subset of variables
# for the analysis
##################################
Solubility_Train <- Solubility_Train[,c("HydrophilicFactor",
"NumAtoms",
"NumNonHAtoms",
"NumCarbon",
"Log_Solubility_Class")]
Solubility_Test <- Solubility_Test[,c("HydrophilicFactor",
"NumAtoms",
"NumNonHAtoms",
"NumCarbon",
"Log_Solubility_Class")]
##################################
# Performing a general exploration of the train set
##################################
dim(Solubility_Train)
## [1] 951 5
str(Solubility_Train)
## 'data.frame': 951 obs. of 5 variables:
## $ HydrophilicFactor : num -0.856 -0.37 -0.33 -0.96 -0.069 -0.651 -0.729 -0.835 0.194 0.353 ...
## $ NumAtoms : int 28 49 33 26 31 32 35 38 56 37 ...
## $ NumNonHAtoms : int 16 26 15 10 15 15 23 14 27 17 ...
## $ NumCarbon : int 14 21 13 10 9 10 17 12 22 14 ...
## $ Log_Solubility_Class: Factor w/ 2 levels "Low","High": 1 1 1 1 1 1 1 1 1 1 ...
summary(Solubility_Train)
## HydrophilicFactor NumAtoms NumNonHAtoms NumCarbon
## Min. :-0.98500 Min. : 5.00 Min. : 2.00 Min. : 1.000
## 1st Qu.:-0.76300 1st Qu.:17.00 1st Qu.: 8.00 1st Qu.: 6.000
## Median :-0.31400 Median :22.00 Median :12.00 Median : 9.000
## Mean :-0.02059 Mean :25.51 Mean :13.16 Mean : 9.893
## 3rd Qu.: 0.31300 3rd Qu.:31.00 3rd Qu.:17.00 3rd Qu.:12.000
## Max. :13.48300 Max. :94.00 Max. :47.00 Max. :33.000
## Log_Solubility_Class
## Low :427
## High:524
##
##
##
##
##################################
# Performing a general exploration of the test set
##################################
dim(Solubility_Test)
## [1] 316 5
str(Solubility_Test)
## 'data.frame': 316 obs. of 5 variables:
## $ HydrophilicFactor : num 0.492 1.317 0.846 0.984 0.843 ...
## $ NumAtoms : int 8 13 14 19 15 8 8 13 13 17 ...
## $ NumNonHAtoms : int 5 6 8 7 9 4 4 5 5 8 ...
## $ NumCarbon : int 2 3 6 5 6 2 3 4 4 7 ...
## $ Log_Solubility_Class: Factor w/ 2 levels "Low","High": 2 2 2 2 2 2 2 2 2 2 ...
summary(Solubility_Test)
## HydrophilicFactor NumAtoms NumNonHAtoms NumCarbon
## Min. :-0.9860 Min. : 5.0 Min. : 3.00 Min. : 1.000
## 1st Qu.:-0.7670 1st Qu.:17.0 1st Qu.: 8.00 1st Qu.: 6.000
## Median :-0.3970 Median :22.0 Median :11.00 Median : 8.000
## Mean :-0.1022 Mean :24.6 Mean :12.71 Mean : 9.785
## 3rd Qu.: 0.2140 3rd Qu.:29.0 3rd Qu.:16.00 3rd Qu.:12.000
## Max. : 5.0000 Max. :68.0 Max. :33.00 Max. :24.000
## Log_Solubility_Class
## Low :143
## High:173
##
##
##
##
##################################
# Formulating a data type assessment summary
##################################
PDA <- Solubility_Train
(PDA.Summary <- data.frame(
Column.Index=c(1:length(names(PDA))),
Column.Name= names(PDA),
Column.Type=sapply(PDA, function(x) class(x)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type
## 1 1 HydrophilicFactor numeric
## 2 2 NumAtoms integer
## 3 3 NumNonHAtoms integer
## 4 4 NumCarbon integer
## 5 5 Log_Solubility_Class factor
1.2 Data Quality Assessment
Data quality assessment:
[A] No missing observations noted for any variable.
[B] No low variance observed for any variable with First.Second.Mode.Ratio>5.
[C] No low variance observed for any variable with Unique.Count.Ratio<0.01.
[D] High skewness observed for 1 variable with Skewness>3 or Skewness<(-3).
[D.1] HydrophilicFactor variable (numeric)
##################################
# Loading dataset
##################################
DQA <- Solubility_Train
##################################
# Formulating an overall data quality assessment summary
##################################
(DQA.Summary <- data.frame(
Column.Index=c(1:length(names(DQA))),
Column.Name= names(DQA),
Column.Type=sapply(DQA, function(x) class(x)),
Row.Count=sapply(DQA, function(x) nrow(DQA)),
NA.Count=sapply(DQA,function(x)sum(is.na(x))),
Fill.Rate=sapply(DQA,function(x)format(round((sum(!is.na(x))/nrow(DQA)),3),nsmall=3)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 HydrophilicFactor numeric 951 0 1.000
## 2 2 NumAtoms integer 951 0 1.000
## 3 3 NumNonHAtoms integer 951 0 1.000
## 4 4 NumCarbon integer 951 0 1.000
## 5 5 Log_Solubility_Class factor 951 0 1.000
##################################
# Listing all predictors
##################################
DQA.Predictors <- DQA[,!names(DQA) %in% c("Log_Solubility_Class")]
##################################
# Listing all numeric predictors
##################################
DQA.Predictors.Numeric <- DQA.Predictors[,sapply(DQA.Predictors, is.numeric)]
if (length(names(DQA.Predictors.Numeric))>0) {
print(paste0("There are ",
(length(names(DQA.Predictors.Numeric))),
" numeric predictor variable(s)."))
} else {
print("There are no numeric predictor variables.")
}
## [1] "There are 4 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
DQA.Predictors.Factor <- DQA.Predictors[,sapply(DQA.Predictors, is.factor)]
if (length(names(DQA.Predictors.Factor))>0) {
print(paste0("There are ",
(length(names(DQA.Predictors.Factor))),
" factor predictor variable(s)."))
} else {
print("There are no factor predictor variables.")
}
## [1] "There are no factor predictor variables."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor))>0) {
##################################
# Formulating a function to determine the first mode
##################################
FirstModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
##################################
# Formulating a function to determine the second mode
##################################
SecondModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
fm = ux[tab == max(tab)]
sm = x[!(x %in% fm)]
usm <- unique(sm)
tabsm <- tabulate(match(sm, usm))
ifelse(is.na(usm[tabsm == max(tabsm)])==TRUE,
return("x"),
return(usm[tabsm == max(tabsm)]))
}
(DQA.Predictors.Factor.Summary <- data.frame(
Column.Name= names(DQA.Predictors.Factor),
Column.Type=sapply(DQA.Predictors.Factor, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Factor, function(x) length(unique(x))),
First.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(FirstModes(x)[1])),
Second.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(SecondModes(x)[1])),
First.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == SecondModes(x)[1])),
Unique.Count.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Factor)),3), nsmall=3)),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
row.names=NULL)
)
}
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric))>0) {
##################################
# Formulating a function to determine the first mode
##################################
FirstModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
##################################
# Formulating a function to determine the second mode
##################################
SecondModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
fm = ux[tab == max(tab)]
sm = na.omit(x)[!(na.omit(x) %in% fm)]
usm <- unique(sm)
tabsm <- tabulate(match(sm, usm))
ifelse(is.na(usm[tabsm == max(tabsm)])==TRUE,
return(0.00001),
return(usm[tabsm == max(tabsm)]))
}
(DQA.Predictors.Numeric.Summary <- data.frame(
Column.Name= names(DQA.Predictors.Numeric),
Column.Type=sapply(DQA.Predictors.Numeric, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Numeric, function(x) length(unique(x))),
Unique.Count.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Numeric)),3), nsmall=3)),
First.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((FirstModes(x)[1]),3),nsmall=3)),
Second.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((SecondModes(x)[1]),3),nsmall=3)),
First.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == SecondModes(x)[1])),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
Minimum=sapply(DQA.Predictors.Numeric, function(x) format(round(min(x,na.rm = TRUE),3), nsmall=3)),
Mean=sapply(DQA.Predictors.Numeric, function(x) format(round(mean(x,na.rm = TRUE),3), nsmall=3)),
Median=sapply(DQA.Predictors.Numeric, function(x) format(round(median(x,na.rm = TRUE),3), nsmall=3)),
Maximum=sapply(DQA.Predictors.Numeric, function(x) format(round(max(x,na.rm = TRUE),3), nsmall=3)),
Skewness=sapply(DQA.Predictors.Numeric, function(x) format(round(skewness(x,na.rm = TRUE),3), nsmall=3)),
Kurtosis=sapply(DQA.Predictors.Numeric, function(x) format(round(kurtosis(x,na.rm = TRUE),3), nsmall=3)),
Percentile25th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.25,na.rm = TRUE),3), nsmall=3)),
Percentile75th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.75,na.rm = TRUE),3), nsmall=3)),
row.names=NULL)
)
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio
## 1 HydrophilicFactor numeric 369 0.388
## 2 NumAtoms integer 66 0.069
## 3 NumNonHAtoms integer 36 0.038
## 4 NumCarbon integer 28 0.029
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 1 -0.828 -0.158 21 20
## 2 22.000 24.000 73 51
## 3 8.000 11.000 104 73
## 4 6.000 7.000 105 97
## First.Second.Mode.Ratio Minimum Mean Median Maximum Skewness Kurtosis
## 1 1.050 -0.985 -0.021 -0.314 13.483 3.404 27.504
## 2 1.431 5.000 25.507 22.000 94.000 1.364 5.523
## 3 1.425 2.000 13.161 12.000 47.000 0.993 4.129
## 4 1.082 1.000 9.893 9.000 33.000 0.927 3.616
## Percentile25th Percentile75th
## 1 -0.763 0.313
## 2 17.000 31.000
## 3 8.000 17.000
## 4 6.000 12.000
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
if ((nrow(DQA.Summary[DQA.Summary$NA.Count>0,]))>0){
print(paste0("Missing observations noted for ",
(nrow(DQA.Summary[DQA.Summary$NA.Count>0,])),
" variable(s) with NA.Count>0 and Fill.Rate<1.0."))
DQA.Summary[DQA.Summary$NA.Count>0,]
} else {
print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
if (length(names(DQA.Predictors.Factor))==0) {
print("No factor predictors noted.")
} else if (nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])),
" factor variable(s) with First.Second.Mode.Ratio>5."))
DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,]
} else {
print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "No factor predictors noted."
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])),
" numeric variable(s) with First.Second.Mode.Ratio>5."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,]
} else {
print("No low variance numeric predictors due to high first-second mode ratio noted.")
}
## [1] "No low variance numeric predictors due to high first-second mode ratio noted."
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])),
" numeric variable(s) with Unique.Count.Ratio<0.01."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,]
} else {
print("No low variance numeric predictors due to low unique count ratio noted.")
}
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])>0){
print(paste0("High skewness observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])),
" numeric variable(s) with Skewness>3 or Skewness<(-3)."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),]
} else {
print("No skewed numeric predictors noted.")
}
## [1] "High skewness observed for 1 numeric variable(s) with Skewness>3 or Skewness<(-3)."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio
## 1 HydrophilicFactor numeric 369 0.388
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 1 -0.828 -0.158 21 20
## First.Second.Mode.Ratio Minimum Mean Median Maximum Skewness Kurtosis
## 1 1.050 -0.985 -0.021 -0.314 13.483 3.404 27.504
## Percentile25th Percentile75th
## 1 -0.763 0.313
1.5 Predictive Model Development
1.5.1 Logistic Regression Without Skewness and Outlier Treatment (LR_REF)
Logistic Regression models the probability of an event (between two outcome levels) by expressing the log-odds of the event as a linear combination of a set of predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood: the fitting procedure iteratively evaluates candidate coefficient values and retains those that maximize the log-likelihood function, which is obtained by taking the conditional probability of each observation's observed outcome, logging these probabilities, and summing them. Given the optimal parameters, the model yields a predicted probability for each observation.
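To make the estimation concrete, the minimal sketch below (illustrative only, not part of the modeling pipeline; glm_sketch, p_high, y_high, and manual_loglik are hypothetical helper names) fits the same binary logit directly with glm() and recomputes its log-likelihood by hand from the predicted conditional probabilities, which agrees with the value reported by logLik().
##################################
# Illustrative sketch only:
# recomputing the logistic regression
# log-likelihood by hand
##################################
glm_sketch <- glm(Log_Solubility_Class ~ ., data = Solubility_Train, family = binomial)
# Predicted probability of the second factor level ("High") per observation
p_high <- predict(glm_sketch, type = "response")
y_high <- as.numeric(Solubility_Train$Log_Solubility_Class == "High")
# Log-likelihood = sum of the logged conditional probabilities
# of the observed outcomes
manual_loglik <- sum(y_high*log(p_high) + (1 - y_high)*log(1 - p_high))
manual_loglik
logLik(glm_sketch)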
[A] The logistic regression model from the stats package was implemented through the caret package, with no treatment applied for either data skewness or outliers.
[B] The model does not contain any hyperparameters.
[C] The cross-validated model performance of the final model is summarized as follows:
[C.1] Final model configuration is fixed due to the absence of a hyperparameter
[C.2] ROC Curve AUC = 0.87475
[D] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows:
[D.1] HydrophilicFactor variable (numeric)
[D.2] NumNonHAtoms variable (numeric)
[D.3] NumAtoms variable (numeric)
[E] The independent test model performance of the final model is summarized as follows:
[E.1] ROC Curve AUC = 0.88447
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
##################################
# Treating data skewness
# for the train set
##################################
# No actions applied
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)],
y = PMA_PreModelling_Train_LR$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_REF Train Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)],
y = PMA_PreModelling_Train_LR$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_REF Train Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR$Log_Solubility_Class)))))

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR$Log_Solubility_Class,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices,
summaryFunction = twoClassSummary,
classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
LR_Tune <- train(x = PMA_PreModelling_Train_LR[,!names(PMA_PreModelling_Train_LR) %in% c("Log_Solubility_Class")],
y = PMA_PreModelling_Train_LR$Log_Solubility_Class,
method = "glm",
metric = "ROC",
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8747542 0.7001107 0.8397678
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 4.04521 1.16804 0.05341 -0.28105
## NumCarbon
## -0.14277
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 834.6 AIC: 844.6
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8747542 0.7001107 0.8397678 0.03649375 0.06155467 0.0638174
(LR_Train_ROCCurveAUC <- LR_Tune$results$ROC)
## [1] 0.8747542
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_VarImp <- varImp(LR_Tune, scale = TRUE)
plot(LR_VarImp,
top=4,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Logistic Regression",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Treating data skewness
# for the test set
##################################
# No actions applied
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
##################################
# Formulating the box plots
##################################
featurePlot(x = PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)],
y = PMA_PreModelling_Test_LR$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_REF Test Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)],
y = PMA_PreModelling_Test_LR$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_REF Test Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Test_LR$Log_Solubility_Class)))))

##################################
# Independently evaluating the model
# on the test set
##################################
LR_Test <- data.frame(LR_Observed = PMA_PreModelling_Test_LR$Log_Solubility_Class,
LR_Predicted = predict(LR_Tune,
PMA_PreModelling_Test_LR[,!names(PMA_PreModelling_Test_LR) %in% c("Log_Solubility_Class")],
type = "prob"))
LR_Test
## LR_Observed LR_Predicted.Low LR_Predicted.High
## 20 High 0.0336865657 0.966313434
## 21 High 0.0153183414 0.984681659
## 23 High 0.0643972187 0.935602781
## 25 High 0.0285205288 0.971479471
## 28 High 0.0798066290 0.920193371
## 31 High 0.0718016713 0.928198329
## 32 High 0.1028972691 0.897102731
## 33 High 0.1274812074 0.872518793
## 34 High 0.1274812074 0.872518793
## 37 High 0.3233789907 0.676621009
## 38 High 0.3233789907 0.676621009
## 42 High 0.5594454651 0.440554535
## 49 High 0.2002171263 0.799782874
## 54 High 0.0684240570 0.931575943
## 55 High 0.0181095804 0.981890420
## 58 High 0.3886335524 0.611366448
## 60 High 0.1402991233 0.859700877
## 61 High 0.1274812074 0.872518793
## 65 High 0.0864344530 0.913565547
## 69 High 0.4637906536 0.536209346
## 73 High 0.0029275515 0.997072449
## 86 High 0.1127673204 0.887232680
## 90 High 0.0962059194 0.903794081
## 91 High 0.0047906462 0.995209354
## 93 High 0.0962059194 0.903794081
## 96 High 0.0047906462 0.995209354
## 98 High 0.1059035446 0.894096455
## 100 High 0.1131780647 0.886821935
## 104 High 0.9327897176 0.067210282
## 112 High 0.2328162785 0.767183721
## 115 High 0.8387113746 0.161288625
## 119 High 0.1836571689 0.816342831
## 128 High 0.1836571689 0.816342831
## 130 High 0.0190842020 0.980915798
## 139 High 0.0190842020 0.980915798
## 143 High 0.0563130978 0.943686902
## 145 High 0.1531943560 0.846805644
## 146 High 0.1836571689 0.816342831
## 149 High 0.1969074264 0.803092574
## 150 High 0.1674558482 0.832544152
## 152 High 0.1274812074 0.872518793
## 157 High 0.4426510503 0.557348950
## 161 High 0.2961059613 0.703894039
## 162 High 0.0061418555 0.993858144
## 166 High 0.4589405370 0.541059463
## 167 High 0.1909996198 0.809000380
## 173 High 0.1530869135 0.846913086
## 176 High 0.1836571689 0.816342831
## 182 High 0.0613936617 0.938606338
## 187 High 0.0441054546 0.955894545
## 190 High 0.0121865069 0.987813493
## 194 High 0.0576402753 0.942359725
## 195 High 0.2423869901 0.757613010
## 201 High 0.1307021191 0.869297881
## 207 High 0.1726530538 0.827346946
## 208 High 0.4277335519 0.572266448
## 215 High 0.0718016713 0.928198329
## 222 High 0.3047752395 0.695224760
## 224 High 0.2013625086 0.798637491
## 231 High 0.6523674422 0.347632558
## 236 High 0.1398934495 0.860106551
## 237 High 0.0890185417 0.910981458
## 240 High 0.2046581225 0.795341878
## 243 High 0.1307021191 0.869297881
## 248 High 0.2423869901 0.757613010
## 251 High 0.8035890142 0.196410986
## 256 High 0.4398434030 0.560156597
## 258 High 0.2233967470 0.776603253
## 262 High 0.4277335519 0.572266448
## 266 High 0.4628428824 0.537157118
## 272 High 0.4989075140 0.501092486
## 280 High 0.3289791156 0.671020884
## 283 High 0.3828696538 0.617130346
## 286 High 0.4495928586 0.550407141
## 287 High 0.2327138907 0.767286109
## 289 High 0.1292348977 0.870765102
## 290 High 0.3529766973 0.647023303
## 298 High 0.2935585665 0.706441434
## 305 High 0.3346274939 0.665372506
## 306 High 0.1894718660 0.810528134
## 312 High 0.1237091711 0.876290829
## 320 High 0.2893537234 0.710646277
## 325 High 0.2038752816 0.796124718
## 332 High 0.0802850496 0.919714950
## 333 High 0.3890415450 0.610958455
## 335 High 0.2935585665 0.706441434
## 339 High 0.6696503319 0.330349668
## 346 High 0.3986640566 0.601335943
## 347 High 0.0860863900 0.913913610
## 350 High 0.3277438405 0.672256160
## 353 High 0.3910406088 0.608959391
## 358 High 0.3733526855 0.626647315
## 365 High 0.2662005002 0.733799500
## 367 High 0.2182970608 0.781702939
## 370 High 0.0287403751 0.971259625
## 379 High 0.1042586676 0.895741332
## 386 High 0.3129499183 0.687050082
## 394 High 0.5395683832 0.460431617
## 396 High 0.1610214870 0.838978513
## 400 High 0.0890185417 0.910981458
## 404 High 0.0721585562 0.927841444
## 405 High 0.5330445369 0.466955463
## 413 High 0.1852494698 0.814750530
## 415 High 0.4082548314 0.591745169
## 417 High 0.2173598482 0.782640152
## 418 High 0.4530410351 0.546958965
## 423 High 0.2838563728 0.716143627
## 434 High 0.3397469664 0.660253034
## 437 High 0.2188160386 0.781183961
## 440 High 0.3550137284 0.644986272
## 449 High 0.4032385216 0.596761478
## 450 High 0.3044787370 0.695521263
## 457 High 0.4032385216 0.596761478
## 467 High 0.2827750579 0.717224942
## 469 High 0.0818798441 0.918120156
## 474 High 0.9175649757 0.082435024
## 475 High 0.8729656154 0.127034385
## 485 High 0.1139410564 0.886058944
## 504 Low 0.1708923019 0.829107698
## 511 Low 0.6509099069 0.349090093
## 512 Low 0.4402430307 0.559756969
## 517 Low 0.0275983874 0.972401613
## 519 Low 0.7833906241 0.216609376
## 520 Low 0.1216573714 0.878342629
## 522 Low 0.9204361063 0.079563894
## 527 Low 0.6497262336 0.350273766
## 528 Low 0.2689383818 0.731061618
## 529 Low 0.3420093611 0.657990639
## 537 Low 0.1601409934 0.839859007
## 540 Low 0.8991542777 0.100845722
## 541 Low 0.4407363999 0.559263600
## 547 Low 0.8997547622 0.100245238
## 550 Low 0.3840800947 0.615919905
## 555 Low 0.5881995873 0.411800413
## 564 Low 0.0792550489 0.920744951
## 570 Low 0.4185241373 0.581475863
## 573 Low 0.2767634485 0.723236551
## 575 Low 0.4407363999 0.559263600
## 578 Low 0.1997361134 0.800263887
## 581 Low 0.2767634485 0.723236551
## 585 Low 0.3394330523 0.660566948
## 590 Low 0.7555077932 0.244492207
## 601 Low 0.9234128549 0.076587145
## 602 Low 0.6369498782 0.363050122
## 607 Low 0.6234820573 0.376517943
## 610 Low 0.6108630426 0.389136957
## 618 Low 0.7753385721 0.224661428
## 624 Low 0.3394330523 0.660566948
## 626 Low 0.2798585686 0.720141431
## 627 Low 0.3204248034 0.679575197
## 634 Low 0.5983101185 0.401689881
## 640 Low 0.9898595316 0.010140468
## 642 Low 0.1832068271 0.816793173
## 643 Low 0.5081035274 0.491896473
## 644 Low 0.8116432284 0.188356772
## 645 Low 0.7002023850 0.299797615
## 646 Low 0.6507453243 0.349254676
## 647 Low 0.8301016048 0.169898395
## 652 Low 0.2054268163 0.794573184
## 658 Low 0.6607108311 0.339289169
## 659 Low 0.7510505757 0.248949424
## 660 Low 0.9161958942 0.083804106
## 664 Low 0.3408694824 0.659130518
## 666 Low 0.4192476727 0.580752327
## 667 Low 0.8681760956 0.131823904
## 675 Low 0.5081035274 0.491896473
## 680 Low 0.9926469280 0.007353072
## 681 Low 0.9166739748 0.083326025
## 687 Low 0.7797970867 0.220202913
## 694 Low 0.7788178093 0.221182191
## 697 Low 0.4799232251 0.520076775
## 701 Low 0.3033114910 0.696688509
## 705 Low 0.9802413586 0.019758641
## 707 Low 0.6893653986 0.310634601
## 710 Low 0.6311139386 0.368886061
## 716 Low 0.9129372667 0.087062733
## 719 Low 0.9266307028 0.073369297
## 720 Low 0.9743859647 0.025614035
## 725 Low 0.9786074660 0.021392534
## 727 Low 0.3033114910 0.696688509
## 730 Low 0.4471810591 0.552818941
## 738 Low 0.7769124419 0.223087558
## 745 Low 0.5748922115 0.425107789
## 748 Low 0.6952562425 0.304743758
## 751 Low 0.9647413723 0.035258628
## 756 Low 0.7139313964 0.286068604
## 766 Low 0.8661335675 0.133866432
## 769 Low 0.5137458692 0.486254131
## 783 Low 0.8026569527 0.197343047
## 785 Low 0.8373166692 0.162683331
## 790 Low 0.9240417860 0.075958214
## 793 Low 0.8026569527 0.197343047
## 795 Low 0.9935450908 0.006454909
## 796 Low 0.9835769734 0.016423027
## 797 Low 0.6437243296 0.356275670
## 801 Low 0.6958616855 0.304138315
## 811 Low 0.4691432530 0.530856747
## 812 Low 0.9617475976 0.038252402
## 815 Low 0.9376097372 0.062390263
## 816 Low 0.7074523261 0.292547674
## 817 Low 0.9234880845 0.076511916
## 824 Low 0.8349276989 0.165072301
## 825 Low 0.8349276989 0.165072301
## 826 Low 0.8349276989 0.165072301
## 830 Low 0.8513492063 0.148650794
## 837 Low 0.9069592513 0.093040749
## 838 Low 0.7074523261 0.292547674
## 844 Low 0.8819618824 0.118038118
## 845 Low 0.9748569182 0.025143082
## 847 Low 0.9174469864 0.082553014
## 850 Low 0.8637880842 0.136211916
## 852 Low 0.8888561225 0.111143878
## 853 Low 0.8888561225 0.111143878
## 861 Low 0.9101752188 0.089824781
## 868 Low 0.9795950024 0.020404998
## 874 Low 0.9281279972 0.071872003
## 879 High 0.1529569733 0.847043027
## 895 High 0.0463424895 0.953657511
## 899 High 0.0001178774 0.999882123
## 903 High 0.0463424895 0.953657511
## 917 High 0.0423986925 0.957601308
## 927 High 0.0684240570 0.931575943
## 929 High 0.1530869135 0.846913086
## 931 High 0.0684240570 0.931575943
## 933 High 0.4172979708 0.582702029
## 944 High 0.0720404149 0.927959585
## 947 High 0.0962059194 0.903794081
## 949 High 0.2223733535 0.777626646
## 953 High 0.0273184034 0.972681597
## 958 High 0.4446072150 0.555392785
## 961 High 0.0326363542 0.967363646
## 963 High 0.1566911276 0.843308872
## 964 High 0.1307021191 0.869297881
## 973 High 0.0688034909 0.931196509
## 976 High 0.0792550489 0.920744951
## 977 High 0.2423869901 0.757613010
## 980 High 0.2318664434 0.768133557
## 983 High 0.5516380511 0.448361949
## 984 High 0.2423869901 0.757613010
## 986 High 0.1237091711 0.876290829
## 989 High 0.2142824472 0.785717553
## 991 High 0.0363618409 0.963638159
## 996 High 0.0980702886 0.901929711
## 997 High 0.4019257860 0.598074214
## 999 High 0.0643972187 0.935602781
## 1000 High 0.0622947220 0.937705278
## 1003 High 0.0890185417 0.910981458
## 1008 High 0.1343419103 0.865658090
## 1009 High 0.3828696538 0.617130346
## 1014 High 0.0688164915 0.931183509
## 1015 High 0.5341066328 0.465893367
## 1040 High 0.2006168147 0.799383185
## 1042 High 0.3777393739 0.622260626
## 1043 High 0.6482206500 0.351779350
## 1050 High 0.1245951154 0.875404885
## 1052 High 0.2328000692 0.767199931
## 1056 High 0.0302052801 0.969794720
## 1070 High 0.5790424430 0.420957557
## 1073 High 0.4142959793 0.585704021
## 1074 High 0.1993893842 0.800610616
## 1079 High 0.3579641321 0.642035868
## 1080 High 0.4873669707 0.512633029
## 1085 High 0.1155517838 0.884448216
## 1087 High 0.6888614637 0.311138536
## 1096 High 0.9458132723 0.054186728
## 1099 High 0.5089195538 0.491080446
## 1100 High 0.6024802982 0.397519702
## 1102 High 0.0512009711 0.948799029
## 1107 Low 0.3749354744 0.625064526
## 1109 Low 0.7661888954 0.233811105
## 1114 Low 0.4002856069 0.599714393
## 1118 Low 0.4347382476 0.565261752
## 1123 Low 0.4690012962 0.530998704
## 1132 Low 0.8206401102 0.179359890
## 1134 Low 0.6312389825 0.368761018
## 1137 Low 0.3204248034 0.679575197
## 1154 Low 0.3204248034 0.679575197
## 1155 Low 0.6060432019 0.393956798
## 1157 Low 0.8609402930 0.139059707
## 1162 Low 0.5081035274 0.491896473
## 1164 Low 0.1832068271 0.816793173
## 1171 Low 0.9647413723 0.035258628
## 1172 Low 0.4426510503 0.557348950
## 1175 Low 0.7347521726 0.265247827
## 1177 Low 0.5669973713 0.433002629
## 1179 Low 0.9495401865 0.050459814
## 1183 Low 0.2484602589 0.751539741
## 1185 Low 0.9700991083 0.029900892
## 1189 Low 0.9353149148 0.064685085
## 1211 Low 0.7090081559 0.290991844
## 1218 Low 0.9962485352 0.003751465
## 1224 Low 0.3641947857 0.635805214
## 1225 Low 0.3033114910 0.696688509
## 1227 Low 0.9477186443 0.052281356
## 1232 Low 0.9906240752 0.009375925
## 1235 Low 0.7850592628 0.214940737
## 1238 Low 0.8244394114 0.175560589
## 1240 Low 0.8970123273 0.102987673
## 1241 Low 0.7090081559 0.290991844
## 1248 Low 0.8513492063 0.148650794
## 1258 Low 0.7074523261 0.292547674
## 1261 Low 0.8640693634 0.135930637
## 1263 Low 0.8349276989 0.165072301
## 1269 Low 0.9566595486 0.043340451
## 1270 Low 0.9911281573 0.008871843
## 1271 Low 0.8637880842 0.136211916
## 1272 Low 0.8888561225 0.111143878
## 1280 Low 0.9281279972 0.071872003
## 1286 Low 0.9890962672 0.010903733
## 1287 Low 0.9901901712 0.009809829
## 1289 Low 0.9548963849 0.045103615
## 1290 Low 0.8888561225 0.111143878
## 1291 High 0.2132741120 0.786725888
## 1294 High 0.7243151562 0.275684844
## 1305 Low 0.9635883387 0.036411661
## 1308 High 0.8468798982 0.153120102
##################################
# Reporting the independent evaluation results
# for the test set
##################################
LR_Test_ROC <- roc(response = LR_Test$LR_Observed,
predictor = LR_Test$LR_Predicted.High,
levels = rev(levels(LR_Test$LR_Observed)))
(LR_Test_ROCCurveAUC <- auc(LR_Test_ROC)[1])
## [1] 0.8844739
1.5.2 Logistic Regression With Box-Cox Transformation (LR_BCT)
Logistic Regression models the probability of an event (between two outcome levels) by expressing the log-odds of the event as a linear combination of a set of predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood: the fitting procedure iteratively evaluates candidate coefficient values and retains those that maximize the log-likelihood function, which is obtained by taking the conditional probability of each observation's observed outcome, logging these probabilities, and summing them. Given the optimal parameters, the model yields a predicted probability for each observation.
Box-Cox Transformation applies a family of power transformations such that the modified values are a monotonic function of the observations over some admissible range, indexed by an optimal parameter lambda. The process determines the value of lambda that yields the highest correlation between the Box-Cox-transformed values and the z-scores of the observations' ordered indices; the original data set is then recomputed through the same power transformation using the optimized lambda value. The method is restricted to strictly positive values.
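As a minimal illustration of the mechanics (on a simulated, strictly positive vector rather than the actual predictors; skewed_values, bct, and transformed_values are hypothetical names), caret's BoxCoxTrans() estimates lambda and predict() applies the corresponding power transformation, reducing the skewness measured by the moments package:
##################################
# Illustrative sketch only:
# estimating and applying a Box-Cox
# transformation on simulated data
##################################
set.seed(12345678)
skewed_values <- rexp(1000, rate = 1) + 0.01   # strictly positive, right-skewed
bct <- BoxCoxTrans(skewed_values)              # estimates the optimal lambda
transformed_values <- predict(bct, skewed_values)
skewness(skewed_values)       # before: strongly positive
skewness(transformed_values)  # after: closer to zero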
[A] The logistic regression model from the stats package was implemented through the caret package, with Box-Cox transformation applied to treat data skewness but no treatment applied for data outliers.
[B] The model does not contain any hyperparameters.
[C] The cross-validated model performance of the final model is summarized as follows:
[C.1] Final model configuration is fixed due to the absence of a hyperparameter
[C.2] ROC Curve AUC = 0.88878
[D] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows:
[D.1] HydrophilicFactor variable (numeric)
[D.2] NumNonHAtoms variable (numeric)
[D.3] NumAtoms variable (numeric)
[E] The independent test model performance of the final model is summarized as follows:
[E.1] ROC Curve AUC = 0.89676
##################################
# Adding an offset to adjust the
# range of values to only positive values
##################################
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
for (i in 1:(ncol(PMA_PreModelling_Train)-1)){
PMA_PreModelling_Train[,i] <- PMA_PreModelling_Train[,i]+1
}
for (i in 1:(ncol(PMA_PreModelling_Test)-1)){
PMA_PreModelling_Test[,i] <- PMA_PreModelling_Test[,i]+1
}
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)]
##################################
# Treating data skewness
# for the train set
##################################
Transform_BoxCox <- preProcess(PMA_PreModelling_Train_LR, method = c("BoxCox"))
PMA_PreModelling_Train_LR_BCT <- predict(Transform_BoxCox, PMA_PreModelling_Train_LR.Numeric)
PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Train_LR_BCT[,sapply(PMA_PreModelling_Train_LR_BCT, is.numeric)],
y = PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_BCT Train Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Train_LR_BCT[,sapply(PMA_PreModelling_Train_LR_BCT, is.numeric)],
y = PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_BCT Train Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class)))))

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices,
summaryFunction = twoClassSummary,
classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
LR_BCT_Tune <- train(x = PMA_PreModelling_Train_LR_BCT[,!names(PMA_PreModelling_Train_LR_BCT) %in% c("Log_Solubility_Class")],
y = PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class,
method = "glm",
metric = "ROC",
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_BCT_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8887838 0.7281285 0.8283382
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 10.7676 1.3301 1.4474 -5.4764
## NumCarbon
## -0.1236
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 779.5 AIC: 789.5
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8887838 0.7281285 0.8283382 0.03378421 0.07358024 0.06110159
(LR_BCT_Train_ROCCurveAUC <- LR_BCT_Tune$results$ROC)
## [1] 0.8887838
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_BCT_VarImp <- varImp(LR_BCT_Tune, scale = TRUE)
plot(LR_BCT_VarImp,
top=4,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Logistic Regression",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Treating data skewness
# for the test set
##################################
PMA_PreModelling_Test_LR_BCT <- predict(Transform_BoxCox, PMA_PreModelling_Test_LR.Numeric)
PMA_PreModelling_Test_LR_BCT$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Test_LR_BCT[,sapply(PMA_PreModelling_Test_LR_BCT, is.numeric)],
y = PMA_PreModelling_Test_LR_BCT$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_BCT Test Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Test_LR_BCT[,sapply(PMA_PreModelling_Test_LR_BCT, is.numeric)],
y = PMA_PreModelling_Test_LR_BCT$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_BCT Test Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Test_LR_BCT$Log_Solubility_Class)))))

##################################
# Independently evaluating the model
# on the test set
##################################
LR_BCT_Test <- data.frame(LR_BCT_Observed = PMA_PreModelling_Test_LR_BCT$Log_Solubility_Class,
LR_BCT_Predicted = predict(LR_BCT_Tune,
PMA_PreModelling_Test_LR_BCT[,!names(PMA_PreModelling_Test_LR_BCT) %in% c("Log_Solubility_Class")],
type = "prob"))
LR_BCT_Test
## LR_BCT_Observed LR_BCT_Predicted.Low LR_BCT_Predicted.High
## 20 High 0.0106788478 0.989321152
## 21 High 0.0071362537 0.992863746
## 23 High 0.0393459886 0.960654011
## 25 High 0.0121378710 0.987862129
## 28 High 0.0624328061 0.937567194
## 31 High 0.0138721555 0.986127844
## 32 High 0.0247401461 0.975259854
## 33 High 0.0461679379 0.953832062
## 34 High 0.0461679379 0.953832062
## 37 High 0.3571115534 0.642888447
## 38 High 0.3571115534 0.642888447
## 42 High 0.4980215283 0.501978472
## 49 High 0.1438747853 0.856125215
## 54 High 0.0165462619 0.983453738
## 55 High 0.0166769554 0.983323045
## 58 High 0.4550864082 0.544913592
## 60 High 0.0600003953 0.939999605
## 61 High 0.0461679379 0.953832062
## 65 High 0.0447692681 0.955230732
## 69 High 0.5416238123 0.458376188
## 73 High 0.0197460856 0.980253914
## 86 High 0.0422336568 0.957766343
## 90 High 0.0321165091 0.967883491
## 91 High 0.0094919686 0.990508031
## 93 High 0.0321165091 0.967883491
## 96 High 0.0094919686 0.990508031
## 98 High 0.0366929536 0.963307046
## 100 High 0.0542108436 0.945789156
## 104 High 0.8776746097 0.122325390
## 112 High 0.1983421147 0.801657885
## 115 High 0.8831774718 0.116822528
## 119 High 0.1241227426 0.875877257
## 128 High 0.1241227426 0.875877257
## 130 High 0.0181786697 0.981821330
## 139 High 0.0181786697 0.981821330
## 143 High 0.0378887829 0.962111217
## 145 High 0.0730209400 0.926979060
## 146 High 0.1241227426 0.875877257
## 149 High 0.1481173988 0.851882601
## 150 High 0.0964081466 0.903591853
## 152 High 0.0461679379 0.953832062
## 157 High 0.5424164866 0.457583513
## 161 High 0.2169241823 0.783075818
## 162 High 0.0081207937 0.991879206
## 166 High 0.5350267240 0.464973276
## 167 High 0.1238179689 0.876182031
## 173 High 0.0832659662 0.916734034
## 176 High 0.1241227426 0.875877257
## 182 High 0.0051278105 0.994872189
## 187 High 0.0422067189 0.957793281
## 190 High 0.0180259773 0.981974023
## 194 High 0.0157103850 0.984289615
## 195 High 0.1420659840 0.857934016
## 201 High 0.0566739425 0.943326058
## 207 High 0.0923516972 0.907648303
## 208 High 0.5326871755 0.467312825
## 215 High 0.0138721555 0.986127844
## 222 High 0.2009338031 0.799066197
## 224 High 0.2021924279 0.797807572
## 231 High 0.7632973587 0.236702641
## 236 High 0.0948388987 0.905161101
## 237 High 0.0164527338 0.983547266
## 240 High 0.1992633468 0.800736653
## 243 High 0.0566739425 0.943326058
## 248 High 0.1420659840 0.857934016
## 251 High 0.7380831273 0.261916873
## 256 High 0.3579586896 0.642041310
## 258 High 0.1476808290 0.852319171
## 262 High 0.5326871755 0.467312825
## 266 High 0.4699388730 0.530061127
## 272 High 0.5770850405 0.422914959
## 280 High 0.3130800631 0.686919937
## 283 High 0.4439245125 0.556075487
## 286 High 0.4514948386 0.548505161
## 287 High 0.1336742035 0.866325796
## 289 High 0.1193414649 0.880658535
## 290 High 0.3335117710 0.666488229
## 298 High 0.3052569144 0.694743086
## 305 High 0.2800013287 0.719998671
## 306 High 0.2260049236 0.773995076
## 312 High 0.1032698105 0.896730190
## 320 High 0.3076680491 0.692331951
## 325 High 0.1842236116 0.815776388
## 332 High 0.0586131421 0.941386858
## 333 High 0.4769698970 0.523030103
## 335 High 0.3052569144 0.694743086
## 339 High 0.8166001784 0.183399822
## 346 High 0.2935888502 0.706411150
## 347 High 0.0320240534 0.967975947
## 350 High 0.4009958389 0.599004161
## 353 High 0.3131508028 0.686849197
## 358 High 0.2701478279 0.729852172
## 365 High 0.2156848824 0.784315118
## 367 High 0.2112425750 0.788757425
## 370 High 0.0509989361 0.949001064
## 379 High 0.1262417880 0.873758212
## 386 High 0.4443738084 0.555626192
## 394 High 0.5041699492 0.495830051
## 396 High 0.1311385627 0.868861437
## 400 High 0.0164527338 0.983547266
## 404 High 0.0393672403 0.960632760
## 405 High 0.6364127374 0.363587263
## 413 High 0.1147530425 0.885246957
## 415 High 0.3508766249 0.649123375
## 417 High 0.2248338620 0.775166138
## 418 High 0.4108293636 0.589170636
## 423 High 0.2838050184 0.716194982
## 434 High 0.2353807171 0.764619283
## 437 High 0.2990469896 0.700953010
## 440 High 0.3983575610 0.601642439
## 449 High 0.3740547041 0.625945296
## 450 High 0.2800405789 0.719959421
## 457 High 0.3740547041 0.625945296
## 467 High 0.3477870747 0.652212925
## 469 High 0.2411307699 0.758869230
## 474 High 0.9420572222 0.057942778
## 475 High 0.9065732264 0.093426774
## 485 High 0.1233535430 0.876646457
## 504 Low 0.2527906760 0.747209324
## 511 Low 0.6865391505 0.313460849
## 512 Low 0.4047166486 0.595283351
## 517 Low 0.0656989760 0.934301024
## 519 Low 0.6847887241 0.315211276
## 520 Low 0.0419643779 0.958035622
## 522 Low 0.9512569299 0.048743070
## 527 Low 0.5950600444 0.404939956
## 528 Low 0.3666971317 0.633302868
## 529 Low 0.3033258986 0.696674101
## 537 Low 0.0894377504 0.910562250
## 540 Low 0.9299392609 0.070060739
## 541 Low 0.6976605213 0.302339479
## 547 Low 0.9224917546 0.077508245
## 550 Low 0.6120118529 0.387988147
## 555 Low 0.4797523411 0.520247659
## 564 Low 0.0198374990 0.980162501
## 570 Low 0.3404075983 0.659592402
## 573 Low 0.2308988075 0.769101192
## 575 Low 0.6976605213 0.302339479
## 578 Low 0.1980215186 0.801978481
## 581 Low 0.2308988075 0.769101192
## 585 Low 0.3361800443 0.663819956
## 590 Low 0.7115152158 0.288484784
## 601 Low 0.7926652156 0.207334784
## 602 Low 0.6243017798 0.375698220
## 607 Low 0.5287426671 0.471257333
## 610 Low 0.5139990677 0.486000932
## 618 Low 0.7102111890 0.289788811
## 624 Low 0.3361800443 0.663819956
## 626 Low 0.2006616946 0.799338305
## 627 Low 0.3402550529 0.659744947
## 634 Low 0.6211336105 0.378866390
## 640 Low 0.9838055034 0.016194497
## 642 Low 0.1760170355 0.823982965
## 643 Low 0.7851585980 0.214841402
## 644 Low 0.8721155291 0.127884471
## 645 Low 0.7767636349 0.223236365
## 646 Low 0.8881478956 0.111852104
## 647 Low 0.7286976220 0.271302378
## 652 Low 0.1652311255 0.834768874
## 658 Low 0.5649909662 0.435009034
## 659 Low 0.9358548026 0.064145197
## 660 Low 0.8494685582 0.150531442
## 664 Low 0.3462913416 0.653708658
## 666 Low 0.4573944495 0.542605550
## 667 Low 0.8366650170 0.163334983
## 675 Low 0.7851585980 0.214841402
## 680 Low 0.9649799678 0.035020032
## 681 Low 0.8599408367 0.140059163
## 687 Low 0.9460336996 0.053966300
## 694 Low 0.7144664403 0.285533560
## 697 Low 0.5847904454 0.415209555
## 701 Low 0.4828825333 0.517117467
## 705 Low 0.9090396249 0.090960375
## 707 Low 0.6414060837 0.358593916
## 710 Low 0.7840992120 0.215900788
## 716 Low 0.8522709843 0.147729016
## 719 Low 0.9417254613 0.058274539
## 720 Low 0.9812037744 0.018796226
## 725 Low 0.9797183384 0.020281662
## 727 Low 0.4828825333 0.517117467
## 730 Low 0.5419442378 0.458055762
## 738 Low 0.8926103955 0.107389604
## 745 Low 0.8485016772 0.151498323
## 748 Low 0.8291228102 0.170877190
## 751 Low 0.9156090270 0.084390973
## 756 Low 0.6729571755 0.327042824
## 766 Low 0.7161626263 0.283837374
## 769 Low 0.6479837562 0.352016244
## 783 Low 0.8822621938 0.117737806
## 785 Low 0.9637029474 0.036297053
## 790 Low 0.8580730609 0.141926939
## 793 Low 0.8822621938 0.117737806
## 795 Low 0.9888028122 0.011197188
## 796 Low 0.9830892650 0.016910735
## 797 Low 0.5255268720 0.474473128
## 801 Low 0.7934333977 0.206566602
## 811 Low 0.5931980901 0.406801910
## 812 Low 0.9094790547 0.090520945
## 815 Low 0.9737933608 0.026206639
## 816 Low 0.7873874858 0.212612514
## 817 Low 0.9838411522 0.016158848
## 824 Low 0.8906357926 0.109364207
## 825 Low 0.8906357926 0.109364207
## 826 Low 0.8906357926 0.109364207
## 830 Low 0.9674044746 0.032595525
## 837 Low 0.9803346321 0.019665368
## 838 Low 0.7873874858 0.212612514
## 844 Low 0.9746818677 0.025318132
## 845 Low 0.9935410672 0.006458933
## 847 Low 0.9348514760 0.065148524
## 850 Low 0.9033168636 0.096683136
## 852 Low 0.9165077717 0.083492228
## 853 Low 0.9165077717 0.083492228
## 861 Low 0.9287872244 0.071212776
## 868 Low 0.9945213964 0.005478604
## 874 Low 0.9399255679 0.060074432
## 879 High 0.0820381758 0.917961824
## 895 High 0.0075429914 0.992457009
## 899 High 0.0003688008 0.999631199
## 903 High 0.0075429914 0.992457009
## 917 High 0.0512730090 0.948726991
## 927 High 0.0165462619 0.983453738
## 929 High 0.0832659662 0.916734034
## 931 High 0.0165462619 0.983453738
## 933 High 0.3637050472 0.636294953
## 944 High 0.0345478479 0.965452152
## 947 High 0.0321165091 0.967883491
## 949 High 0.1830929077 0.816907092
## 953 High 0.0247953563 0.975204644
## 958 High 0.5752647020 0.424735298
## 961 High 0.0114054875 0.988594513
## 963 High 0.1003959629 0.899604037
## 964 High 0.0566739425 0.943326058
## 973 High 0.0493627059 0.950637294
## 976 High 0.0198374990 0.980162501
## 977 High 0.1420659840 0.857934016
## 980 High 0.2975453104 0.702454690
## 983 High 0.6552675251 0.344732475
## 984 High 0.1420659840 0.857934016
## 986 High 0.1032698105 0.896730190
## 989 High 0.1752033347 0.824796665
## 991 High 0.0216816050 0.978318395
## 996 High 0.0213154376 0.978684562
## 997 High 0.4645359623 0.535464038
## 999 High 0.0393459886 0.960654011
## 1000 High 0.0558764682 0.944123532
## 1003 High 0.0164527338 0.983547266
## 1008 High 0.0923518450 0.907648155
## 1009 High 0.4439245125 0.556075487
## 1014 High 0.0345635445 0.965436455
## 1015 High 0.4759596868 0.524040313
## 1040 High 0.1881135515 0.811886448
## 1042 High 0.3560654524 0.643934548
## 1043 High 0.7350851706 0.264914829
## 1050 High 0.1128103317 0.887189668
## 1052 High 0.1590707746 0.840929225
## 1056 High 0.2601441055 0.739855895
## 1070 High 0.5365314855 0.463468514
## 1073 High 0.5067464368 0.493253563
## 1074 High 0.1359320993 0.864067901
## 1079 High 0.4166247667 0.583375233
## 1080 High 0.6521706797 0.347829320
## 1085 High 0.0914183321 0.908581668
## 1087 High 0.5954058977 0.404594102
## 1096 High 0.8884323804 0.111567620
## 1099 High 0.4356482012 0.564351799
## 1100 High 0.7147323089 0.285267691
## 1102 High 0.0756794028 0.924320597
## 1107 Low 0.5814501628 0.418549837
## 1109 Low 0.6608220214 0.339177979
## 1114 Low 0.6200404052 0.379959595
## 1118 Low 0.3315143332 0.668485667
## 1123 Low 0.4710123894 0.528987611
## 1132 Low 0.9166894611 0.083310539
## 1134 Low 0.5681675652 0.431832435
## 1137 Low 0.3402550529 0.659744947
## 1154 Low 0.3402550529 0.659744947
## 1155 Low 0.5465223806 0.453477619
## 1157 Low 0.7298873873 0.270112613
## 1162 Low 0.7851585980 0.214841402
## 1164 Low 0.1760170355 0.823982965
## 1171 Low 0.9156090270 0.084390973
## 1172 Low 0.5424164866 0.457583513
## 1175 Low 0.6006201592 0.399379841
## 1177 Low 0.7100702299 0.289929770
## 1179 Low 0.9122765249 0.087723475
## 1183 Low 0.3352874269 0.664712573
## 1185 Low 0.9229073005 0.077092699
## 1189 Low 0.7983456906 0.201654309
## 1211 Low 0.9189338153 0.081066185
## 1218 Low 0.9896125086 0.010387491
## 1224 Low 0.4190665046 0.580933495
## 1225 Low 0.4828825333 0.517117467
## 1227 Low 0.8952185285 0.104781471
## 1232 Low 0.9862080090 0.013791991
## 1235 Low 0.7011319853 0.298868015
## 1238 Low 0.6722498554 0.327750145
## 1240 Low 0.8403084604 0.159691540
## 1241 Low 0.9189338153 0.081066185
## 1248 Low 0.9674044746 0.032595525
## 1258 Low 0.7873874858 0.212612514
## 1261 Low 0.8966286234 0.103371377
## 1263 Low 0.8906357926 0.109364207
## 1269 Low 0.9567248742 0.043275126
## 1270 Low 0.9871950642 0.012804936
## 1271 Low 0.9033168636 0.096683136
## 1272 Low 0.9165077717 0.083492228
## 1280 Low 0.9399255679 0.060074432
## 1286 Low 0.9964564442 0.003543556
## 1287 Low 0.9967294062 0.003270594
## 1289 Low 0.9574638600 0.042536140
## 1290 Low 0.9165077717 0.083492228
## 1291 High 0.1740583738 0.825941626
## 1294 High 0.8061455726 0.193854427
## 1305 Low 0.8709712683 0.129028732
## 1308 High 0.7071099477 0.292890052
##################################
# Reporting the independent evaluation results
# for the test set
##################################
LR_BCT_Test_ROC <- roc(response = LR_BCT_Test$LR_BCT_Observed,
predictor = LR_BCT_Test$LR_BCT_Predicted.High,
levels = rev(levels(LR_BCT_Test$LR_BCT_Observed)))
(LR_BCT_Test_ROCCurveAUC <- auc(LR_BCT_Test_ROC)[1])
## [1] 0.8967622
1.5.3 Logistic Regression With Yeo-Johnson Transformation (LR_YJT)
Logistic Regression
models the relationship between the probability of an event (among two
outcome levels) and a set of predictors by expressing the log-odds of
the event as a linear combination of the predictors weighted by their
respective parameter estimates. The parameters are estimated via
maximum likelihood: an iterative optimization evaluates candidate
coefficient values and retains those that maximize the log-likelihood
of the observed responses. Given the optimal parameters, the predicted
probability for each observation is obtained by applying the
inverse-logit function to its fitted linear predictor.
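As a minimal sketch of that final step, the snippet below converts a
linear predictor into an event probability using hypothetical
coefficient and predictor values (not those of any model fitted in
this section):
##################################
# Minimal sketch : log-odds to probability
# (hypothetical coefficients, illustration only)
##################################
beta0 <- -1.2; beta1 <- 0.8; beta2 <- -0.5
x1 <- 2.0; x2 <- 1.5
log_odds <- beta0 + (beta1*x1) + (beta2*x2) # linear combination of predictors
prob_event <- plogis(log_odds) # inverse-logit: exp(z)/(1+exp(z))
prob_event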
Yeo-Johnson
Transformation applies a family of distributions that can be used
without restriction on the sign of the data, extending many of the good
properties of the Box-Cox power family. Similar to the Box-Cox
transformation, the method estimates the optimal value of lambda, but it
can transform both positive and negative values by inflating
low-variance data and deflating high-variance data to create a more
uniform data set. While there are no restrictions on the applicable
values, the transformed values are less interpretable than those
produced by the other methods.
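For reference, the snippet below is a hand-rolled sketch of the
Yeo-Johnson mapping at a fixed lambda; caret's preProcess estimates
lambda internally, so this helper is illustrative only:
##################################
# Illustrative Yeo-Johnson mapping for a fixed lambda
# (caret::preProcess estimates lambda itself; this helper is a sketch)
##################################
yeo_johnson <- function(x, lambda) {
  pos <- x >= 0
  out <- numeric(length(x))
  if (abs(lambda) > 1e-8) {
    out[pos] <- ((x[pos] + 1)^lambda - 1) / lambda # power branch for x >= 0
  } else {
    out[pos] <- log(x[pos] + 1) # lambda = 0 reduces to log(x + 1)
  }
  if (abs(lambda - 2) > 1e-8) {
    out[!pos] <- -(((-x[!pos] + 1)^(2 - lambda) - 1) / (2 - lambda)) # x < 0 branch
  } else {
    out[!pos] <- -log(-x[!pos] + 1) # lambda = 2 reduces to -log(1 - x)
  }
  out
}
yeo_johnson(c(-2, 0, 3), lambda = 0.5)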
[A] The logistic regression model from the stats package was
implemented through the caret package with Yeo-Johnson transformation
applied to treat data skewness but no treatment applied for data
outliers.
[B] The model does not contain any hyperparameter.
[C] The cross-validated model performance of the final
model is summarized as follows:
[C.1] Final model configuration is fixed due to
the absence of a hyperparameter
[C.2] ROC Curve AUC = 0.88070
[D] The model allows for ranking of predictors in terms
of variable importance. The top-performing predictors in the model are
as follows:
[D.1] HydrophilicFactor variable (numeric)
[D.2] NumNonHAtoms variable (numeric)
[D.3] NumAtoms variable (numeric)
[E] The independent test model performance of the final
model is summarized as follows:
[E.1] ROC Curve AUC = 0.89061
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)]
##################################
# Treating data skewness
# for the train set
##################################
Transform_YeoJohnson <- preProcess(PMA_PreModelling_Train_LR, method = c("YeoJohnson"))
PMA_PreModelling_Train_LR_YJT <- predict(Transform_YeoJohnson, PMA_PreModelling_Train_LR.Numeric)
PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Train_LR_YJT[,sapply(PMA_PreModelling_Train_LR_YJT, is.numeric)],
y = PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_YJT Train Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Train_LR_YJT[,sapply(PMA_PreModelling_Train_LR_YJT, is.numeric)],
y = PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_YJT Train Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class)))))

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices,
summaryFunction = twoClassSummary,
classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
LR_YJT_Tune <- train(x = PMA_PreModelling_Train_LR_YJT[,!names(PMA_PreModelling_Train_LR_YJT) %in% c("Log_Solubility_Class")],
y = PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class,
method = "glm",
metric = "ROC",
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_YJT_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8807066 0.7187154 0.8283019
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 9.3048 1.4325 1.2632 -4.0866
## NumCarbon
## -0.4926
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 806.5 AIC: 816.5
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8807066 0.7187154 0.8283019 0.03522898 0.07917814 0.05907546
(LR_YJT_Train_ROCCurveAUC <- LR_YJT_Tune$results$ROC)
## [1] 0.8807066
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_YJT_VarImp <- varImp(LR_YJT_Tune, scale = TRUE)
plot(LR_YJT_VarImp,
top=4,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Logistic Regression",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Treating data skewness
# for the test set
##################################
PMA_PreModelling_Test_LR_YJT <- predict(Transform_YeoJohnson, PMA_PreModelling_Test_LR.Numeric)
PMA_PreModelling_Test_LR_YJT$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Test_LR_YJT[,sapply(PMA_PreModelling_Test_LR_YJT, is.numeric)],
y = PMA_PreModelling_Test_LR_YJT$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_YJT Test Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Test_LR_YJT[,sapply(PMA_PreModelling_Test_LR_YJT, is.numeric)],
y = PMA_PreModelling_Test_LR_YJT$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_YJT Test Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Test_LR_YJT$Log_Solubility_Class)))))

##################################
# Independently evaluating the model
# on the test set
##################################
LR_YJT_Test <- data.frame(LR_YJT_Observed = PMA_PreModelling_Test_LR_YJT$Log_Solubility_Class,
LR_YJT_Predicted = predict(LR_YJT_Tune,
PMA_PreModelling_Test_LR_YJT[,!names(PMA_PreModelling_Test_LR_YJT) %in% c("Log_Solubility_Class")],
type = "prob"))
LR_YJT_Test
## LR_YJT_Observed LR_YJT_Predicted.Low LR_YJT_Predicted.High
## 20 High 0.0111852876 0.98881471
## 21 High 0.0086628052 0.99133719
## 23 High 0.0465501013 0.95344990
## 25 High 0.0150865233 0.98491348
## 28 High 0.0673498546 0.93265015
## 31 High 0.0187893105 0.98121069
## 32 High 0.0385145943 0.96148541
## 33 High 0.0651287215 0.93487128
## 34 High 0.0651287215 0.93487128
## 37 High 0.3795518012 0.62044820
## 38 High 0.3795518012 0.62044820
## 42 High 0.5083619918 0.49163801
## 49 High 0.1725513385 0.82744866
## 54 High 0.0211968888 0.97880311
## 55 High 0.0243796249 0.97562038
## 58 High 0.4639398237 0.53606018
## 60 High 0.0740667556 0.92593324
## 61 High 0.0651287215 0.93487128
## 65 High 0.0454691543 0.95453085
## 69 High 0.5509798165 0.44902018
## 73 High 0.0335583314 0.96644167
## 86 High 0.0553639019 0.94463610
## 90 High 0.0398746182 0.96012538
## 91 High 0.0177295306 0.98227047
## 93 High 0.0398746182 0.96012538
## 96 High 0.0177295306 0.98227047
## 98 High 0.0455680065 0.95443199
## 100 High 0.0607090959 0.93929090
## 104 High 0.8782141160 0.12178588
## 112 High 0.2280878343 0.77191217
## 115 High 0.8818973943 0.11810261
## 119 High 0.1495433520 0.85045665
## 128 High 0.1495433520 0.85045665
## 130 High 0.0265537930 0.97344621
## 139 High 0.0265537930 0.97344621
## 143 High 0.0408285223 0.95917148
## 145 High 0.1010816593 0.89891834
## 146 High 0.1495433520 0.85045665
## 149 High 0.1495628278 0.85043717
## 150 High 0.1223809375 0.87761906
## 152 High 0.0651287215 0.93487128
## 157 High 0.5324789498 0.46752105
## 161 High 0.2353887548 0.76461125
## 162 High 0.0108126332 0.98918737
## 166 High 0.5421256704 0.45787433
## 167 High 0.1557431213 0.84425688
## 173 High 0.1060669672 0.89393303
## 176 High 0.1495433520 0.85045665
## 182 High 0.0089571356 0.99104286
## 187 High 0.0559846274 0.94401537
## 190 High 0.0229926784 0.97700732
## 194 High 0.0164604888 0.98353951
## 195 High 0.1682281293 0.83177187
## 201 High 0.0687190029 0.93128100
## 207 High 0.1097517332 0.89024827
## 208 High 0.5386162406 0.46138376
## 215 High 0.0187893105 0.98121069
## 222 High 0.2334165404 0.76658346
## 224 High 0.1887353848 0.81126462
## 231 High 0.7667001343 0.23329987
## 236 High 0.0956712807 0.90432872
## 237 High 0.0259764904 0.97402351
## 240 High 0.1856240164 0.81437598
## 243 High 0.0687190029 0.93128100
## 248 High 0.1682281293 0.83177187
## 251 High 0.7507400098 0.24925999
## 256 High 0.3828163835 0.61718362
## 258 High 0.1615928275 0.83840717
## 262 High 0.5386162406 0.46138376
## 266 High 0.4414142561 0.55858574
## 272 High 0.5848712015 0.41512880
## 280 High 0.2943400032 0.70566000
## 283 High 0.4656557701 0.53434423
## 286 High 0.4230688180 0.57693118
## 287 High 0.1584876212 0.84151238
## 289 High 0.1001632377 0.89983676
## 290 High 0.3147070899 0.68529291
## 298 High 0.3331168358 0.66688316
## 305 High 0.2845465573 0.71545344
## 306 High 0.2377354619 0.76226454
## 312 High 0.1089806628 0.89101934
## 320 High 0.3282503800 0.67174962
## 325 High 0.1789329323 0.82106707
## 332 High 0.0616433564 0.93835664
## 333 High 0.4825359090 0.51746409
## 335 High 0.3331168358 0.66688316
## 339 High 0.7915253506 0.20847465
## 346 High 0.3337862922 0.66621371
## 347 High 0.0354544470 0.96454555
## 350 High 0.4051825670 0.59481743
## 353 High 0.3366565097 0.66334349
## 358 High 0.3080651712 0.69193483
## 365 High 0.2184421650 0.78155784
## 367 High 0.2078586791 0.79214132
## 370 High 0.0572003924 0.94279961
## 379 High 0.1413219999 0.85867800
## 386 High 0.3773059137 0.62269409
## 394 High 0.4977002808 0.50229972
## 396 High 0.1361516876 0.86384831
## 400 High 0.0259764904 0.97402351
## 404 High 0.0341686706 0.96583133
## 405 High 0.6483833587 0.35161664
## 413 High 0.1298373285 0.87016267
## 415 High 0.3574576992 0.64254230
## 417 High 0.2052920777 0.79470792
## 418 High 0.4180844835 0.58191552
## 423 High 0.3207522384 0.67924776
## 434 High 0.2720066738 0.72799333
## 437 High 0.2932273518 0.70677265
## 440 High 0.3919768236 0.60802318
## 449 High 0.3535353096 0.64646469
## 450 High 0.2675541417 0.73244586
## 457 High 0.3535353096 0.64646469
## 467 High 0.3416031366 0.65839686
## 469 High 0.2412064259 0.75879357
## 474 High 0.9372709045 0.06272910
## 475 High 0.9045788808 0.09542112
## 485 High 0.1400180092 0.85998199
## 504 Low 0.3014552746 0.69854473
## 511 Low 0.7986676581 0.20133234
## 512 Low 0.3948293950 0.60517061
## 517 Low 0.0915588732 0.90844113
## 519 Low 0.6710622179 0.32893778
## 520 Low 0.0593466688 0.94065333
## 522 Low 0.9427595075 0.05724049
## 527 Low 0.6066246995 0.39337530
## 528 Low 0.3047234793 0.69527652
## 529 Low 0.2940689781 0.70593102
## 537 Low 0.1137543356 0.88624566
## 540 Low 0.9257268536 0.07427315
## 541 Low 0.5764299193 0.42357008
## 547 Low 0.9213479392 0.07865206
## 550 Low 0.7714659594 0.22853404
## 555 Low 0.4823762799 0.51762372
## 564 Low 0.0264753239 0.97352468
## 570 Low 0.3415221433 0.65847786
## 573 Low 0.2337758471 0.76622415
## 575 Low 0.5764299193 0.42357008
## 578 Low 0.1801949699 0.81980503
## 581 Low 0.2337758471 0.76622415
## 585 Low 0.3222395378 0.67776046
## 590 Low 0.7000901095 0.29990989
## 601 Low 0.7965076159 0.20349238
## 602 Low 0.5911455454 0.40885445
## 607 Low 0.5745963409 0.42540366
## 610 Low 0.5598294027 0.44017060
## 618 Low 0.6718083378 0.32819166
## 624 Low 0.3222395378 0.67776046
## 626 Low 0.2300137304 0.76998627
## 627 Low 0.3796756641 0.62032434
## 634 Low 0.6772469161 0.32275308
## 640 Low 0.9826668299 0.01733317
## 642 Low 0.1596173546 0.84038265
## 643 Low 0.6603529106 0.33964709
## 644 Low 0.8722361193 0.12776388
## 645 Low 0.7805565258 0.21944347
## 646 Low 0.7944944044 0.20550560
## 647 Low 0.7214006575 0.27859934
## 652 Low 0.1909579025 0.80904210
## 658 Low 0.5578158104 0.44218419
## 659 Low 0.8677054479 0.13229455
## 660 Low 0.8660831750 0.13391682
## 664 Low 0.3154666895 0.68453331
## 666 Low 0.4094602677 0.59053973
## 667 Low 0.8101180279 0.18988197
## 675 Low 0.6603529106 0.33964709
## 680 Low 0.9641799867 0.03582001
## 681 Low 0.8594185600 0.14058144
## 687 Low 0.8801008895 0.11989911
## 694 Low 0.7275971055 0.27240289
## 697 Low 0.5638293124 0.43617069
## 701 Low 0.3818211000 0.61817890
## 705 Low 0.9049764620 0.09502354
## 707 Low 0.6651475934 0.33485241
## 710 Low 0.7664857792 0.23351422
## 716 Low 0.8679648226 0.13203518
## 719 Low 0.9391300604 0.06086994
## 720 Low 0.9749949067 0.02500509
## 725 Low 0.9760911253 0.02390887
## 727 Low 0.3818211000 0.61817890
## 730 Low 0.5141175113 0.48588249
## 738 Low 0.8712602625 0.12873974
## 745 Low 0.7303903111 0.26960969
## 748 Low 0.8143622451 0.18563775
## 751 Low 0.9236735195 0.07632648
## 756 Low 0.6594624920 0.34053751
## 766 Low 0.7257122288 0.27428777
## 769 Low 0.6219413487 0.37805865
## 783 Low 0.8773414681 0.12265853
## 785 Low 0.9119616001 0.08803840
## 790 Low 0.8742200814 0.12577992
## 793 Low 0.8773414681 0.12265853
## 795 Low 0.9873215475 0.01267845
## 796 Low 0.9797901295 0.02020987
## 797 Low 0.5245149624 0.47548504
## 801 Low 0.7819966694 0.21800333
## 811 Low 0.5830613359 0.41693866
## 812 Low 0.9209125651 0.07908743
## 815 Low 0.9579291216 0.04207088
## 816 Low 0.7563111238 0.24368888
## 817 Low 0.9554192278 0.04458077
## 824 Low 0.8910529193 0.10894708
## 825 Low 0.8910529193 0.10894708
## 826 Low 0.8910529193 0.10894708
## 830 Low 0.9206758207 0.07932418
## 837 Low 0.9458135660 0.05418643
## 838 Low 0.7563111238 0.24368888
## 844 Low 0.9344794385 0.06552056
## 845 Low 0.9788289697 0.02117103
## 847 Low 0.9342189025 0.06578110
## 850 Low 0.9042545052 0.09574549
## 852 Low 0.9162961404 0.08370386
## 853 Low 0.9162961404 0.08370386
## 861 Low 0.9270249964 0.07297500
## 868 Low 0.9821061387 0.01789386
## 874 Low 0.9367252603 0.06327474
## 879 High 0.1033547908 0.89664521
## 895 High 0.0101263144 0.98987369
## 899 High 0.0009470188 0.99905298
## 903 High 0.0101263144 0.98987369
## 917 High 0.0647560019 0.93524400
## 927 High 0.0211968888 0.97880311
## 929 High 0.1060669672 0.89393303
## 931 High 0.0211968888 0.97880311
## 933 High 0.3554801761 0.64451982
## 944 High 0.0433148853 0.95668511
## 947 High 0.0398746182 0.96012538
## 949 High 0.1877588381 0.81224116
## 953 High 0.0233211466 0.97667885
## 958 High 0.5637319742 0.43626803
## 961 High 0.0150665037 0.98493350
## 963 High 0.1129960710 0.88700393
## 964 High 0.0687190029 0.93128100
## 973 High 0.0532886213 0.94671138
## 976 High 0.0264753239 0.97352468
## 977 High 0.1682281293 0.83177187
## 980 High 0.2904095993 0.70959040
## 983 High 0.6727220240 0.32727798
## 984 High 0.1682281293 0.83177187
## 986 High 0.1089806628 0.89101934
## 989 High 0.2022144660 0.79778553
## 991 High 0.0240809741 0.97591903
## 996 High 0.0333823806 0.96661762
## 997 High 0.4975165837 0.50248342
## 999 High 0.0465501013 0.95344990
## 1000 High 0.0565000618 0.94349994
## 1003 High 0.0259764904 0.97402351
## 1008 High 0.1025560666 0.89744393
## 1009 High 0.4656557701 0.53434423
## 1014 High 0.0413949131 0.95860509
## 1015 High 0.4555939327 0.54440607
## 1040 High 0.1848342215 0.81516578
## 1042 High 0.3372214149 0.66277859
## 1043 High 0.7410621250 0.25893787
## 1050 High 0.1112632945 0.88873671
## 1052 High 0.1738234646 0.82617654
## 1056 High 0.3235717630 0.67642824
## 1070 High 0.5311755155 0.46882448
## 1073 High 0.5151860040 0.48481400
## 1074 High 0.1702113670 0.82978863
## 1079 High 0.4374952373 0.56250476
## 1080 High 0.6262738215 0.37372618
## 1085 High 0.0855709853 0.91442901
## 1087 High 0.6387004046 0.36129960
## 1096 High 0.8979928982 0.10200710
## 1099 High 0.4266976843 0.57330232
## 1100 High 0.7178153041 0.28218470
## 1102 High 0.0821493064 0.91785069
## 1107 Low 0.4800969559 0.51990304
## 1109 Low 0.6555391529 0.34446085
## 1114 Low 0.5204465731 0.47955343
## 1118 Low 0.3451967905 0.65480321
## 1123 Low 0.4281009860 0.57189901
## 1132 Low 0.8951347736 0.10486523
## 1134 Low 0.5798822776 0.42011772
## 1137 Low 0.3796756641 0.62032434
## 1154 Low 0.3796756641 0.62032434
## 1155 Low 0.5576566154 0.44234338
## 1157 Low 0.7299812013 0.27001880
## 1162 Low 0.6603529106 0.33964709
## 1164 Low 0.1596173546 0.84038265
## 1171 Low 0.9236735195 0.07632648
## 1172 Low 0.5324789498 0.46752105
## 1175 Low 0.6025547990 0.39744520
## 1177 Low 0.7033318635 0.29666814
## 1179 Low 0.8937618468 0.10623815
## 1183 Low 0.2760144412 0.72398556
## 1185 Low 0.9280768920 0.07192311
## 1189 Low 0.8086497299 0.19135027
## 1211 Low 0.8354424008 0.16455760
## 1218 Low 0.9892374352 0.01076256
## 1224 Low 0.4458631939 0.55413681
## 1225 Low 0.3818211000 0.61817890
## 1227 Low 0.8964097761 0.10359022
## 1232 Low 0.9844700142 0.01552999
## 1235 Low 0.7387527307 0.26124727
## 1238 Low 0.6806068698 0.31939313
## 1240 Low 0.8384203887 0.16157961
## 1241 Low 0.8354424008 0.16455760
## 1248 Low 0.9206758207 0.07932418
## 1258 Low 0.7563111238 0.24368888
## 1261 Low 0.8844033428 0.11559666
## 1263 Low 0.8910529193 0.10894708
## 1269 Low 0.9557472206 0.04425278
## 1270 Low 0.9853749858 0.01462501
## 1271 Low 0.9042545052 0.09574549
## 1272 Low 0.9162961404 0.08370386
## 1280 Low 0.9367252603 0.06327474
## 1286 Low 0.9875508520 0.01244915
## 1287 Low 0.9885358662 0.01146413
## 1289 Low 0.9523876595 0.04761234
## 1290 Low 0.9162961404 0.08370386
## 1291 High 0.1783988428 0.82160116
## 1294 High 0.8086918533 0.19130815
## 1305 Low 0.8712582117 0.12874179
## 1308 High 0.7148932523 0.28510675
##################################
# Reporting the independent evaluation results
# for the test set
##################################
LR_YJT_Test_ROC <- roc(response = LR_YJT_Test$LR_YJT_Observed,
predictor = LR_YJT_Test$LR_YJT_Predicted.High,
levels = rev(levels(LR_YJT_Test$LR_YJT_Observed)))
(LR_YJT_Test_ROCCurveAUC <- auc(LR_YJT_Test_ROC)[1])
## [1] 0.8906181
1.5.4 Logistic Regression With Exponential Transformation (LR_ET)
Logistic Regression
models the relationship between the probability of an event (among two
outcome levels) and a set of predictors by expressing the log-odds of
the event as a linear combination of the predictors weighted by their
respective parameter estimates. The parameters are estimated via
maximum likelihood: an iterative optimization evaluates candidate
coefficient values and retains those that maximize the log-likelihood
of the observed responses. Given the optimal parameters, the predicted
probability for each observation is obtained by applying the
inverse-logit function to its fitted linear predictor.
Exponential
Transformation applies a family of exponential distributions that can
be used without restrictions, providing a good alternative to other
power transformation families. While there are no restrictions in terms
of the applicable values, the method’s effectiveness is sensitive to
the quality of the data, as it assumes a common mean and estimates the
transformation parameter by directly maximizing the likelihood.
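The expoTrans method in caret's preProcess follows the exponential
transformation family of Manly (1976); assuming that form, the mapping
at a fixed lambda can be sketched as below (preProcess estimates lambda
itself by maximizing the likelihood):
##################################
# Illustrative Manly (1976)-type exponential mapping for a fixed lambda
# (preProcess(method = "expoTrans") estimates lambda via maximum likelihood;
# this helper is a sketch of the mapping only)
##################################
expo_trans <- function(x, lambda) {
  if (abs(lambda) > 1e-8) {
    (exp(lambda * x) - 1) / lambda # exponential branch
  } else {
    x # lambda = 0 leaves the variable unchanged
  }
}
expo_trans(c(-1, 0, 2), lambda = 0.1)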
[A] The logistic regression model from the stats package was
implemented through the caret package with exponential transformation
applied to treat data skewness but no treatment applied for data
outliers.
[B] The model does not contain any hyperparameter.
[C] The cross-validated model performance of the final
model is summarized as follows:
[C.1] Final model configuration is fixed due to
the absence of a hyperparameter
[C.2] ROC Curve AUC = 0.88053
[D] The model allows for ranking of predictors in terms
of variable importance. The top-performing predictors in the model are
as follows:
[D.1] HydrophilicFactor variable (numeric)
[D.2] NumNonHAtoms variable (numeric)
[D.3] NumAtoms variable (numeric)
[E] The independent test model performance of the final
model is summarized as follows:
[E.1] ROC Curve AUC = 0.89013
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)]
##################################
# Treating data skewness
# for the train set
##################################
Transform_Exponential <- preProcess(PMA_PreModelling_Train_LR, method = c("expoTrans"))
PMA_PreModelling_Train_LR_ET <- predict(Transform_Exponential, PMA_PreModelling_Train_LR.Numeric)
PMA_PreModelling_Train_LR_ET$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Train_LR_ET[,sapply(PMA_PreModelling_Train_LR_ET, is.numeric)],
y = PMA_PreModelling_Train_LR_ET$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_ET Train Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Train_LR_ET[,sapply(PMA_PreModelling_Train_LR_ET, is.numeric)],
y = PMA_PreModelling_Train_LR_ET$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_ET Train Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR_ET$Log_Solubility_Class)))))

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_ET$Log_Solubility_Class,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices,
summaryFunction = twoClassSummary,
classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
LR_ET_Tune <- train(x = PMA_PreModelling_Train_LR_ET[,!names(PMA_PreModelling_Train_LR_ET) %in% c("Log_Solubility_Class")],
y = PMA_PreModelling_Train_LR_ET$Log_Solubility_Class,
method = "glm",
metric = "ROC",
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_ET_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8805333 0.7187154 0.830225
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 6.2403 1.4785 0.1379 -0.7367
## NumCarbon
## -0.1887
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 811.6 AIC: 821.6
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8805333 0.7187154 0.830225 0.03607516 0.07895841 0.06335689
(LR_ET_Train_ROCCurveAUC <- LR_ET_Tune$results$ROC)
## [1] 0.8805333
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_ET_VarImp <- varImp(LR_ET_Tune, scale = TRUE)
plot(LR_ET_VarImp,
top=4,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Logistic Regression",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Treating data skewness
# for the test set
##################################
PMA_PreModelling_Test_LR_ET <- predict(Transform_Exponential, PMA_PreModelling_Test_LR.Numeric)
PMA_PreModelling_Test_LR_ET$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Test_LR_ET[,sapply(PMA_PreModelling_Test_LR_ET, is.numeric)],
y = PMA_PreModelling_Test_LR_ET$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_ET Test Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Test_LR_ET[,sapply(PMA_PreModelling_Test_LR_ET, is.numeric)],
y = PMA_PreModelling_Test_LR_ET$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_ET Test Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Test_LR_ET$Log_Solubility_Class)))))

##################################
# Independently evaluating the model
# on the test set
##################################
LR_ET_Test <- data.frame(LR_ET_Observed = PMA_PreModelling_Test_LR_ET$Log_Solubility_Class,
LR_ET_Predicted = predict(LR_ET_Tune,
PMA_PreModelling_Test_LR_ET[,!names(PMA_PreModelling_Test_LR_ET) %in% c("Log_Solubility_Class")],
type = "prob"))
LR_ET_Test
## LR_ET_Observed LR_ET_Predicted.Low LR_ET_Predicted.High
## 20 High 0.013752867 0.98624713
## 21 High 0.009194143 0.99080586
## 23 High 0.044071687 0.95592831
## 25 High 0.015356098 0.98464390
## 28 High 0.062116193 0.93788381
## 31 High 0.030063697 0.96993630
## 32 High 0.055492297 0.94450770
## 33 High 0.081840730 0.91815927
## 34 High 0.081840730 0.91815927
## 37 High 0.362323531 0.63767647
## 38 High 0.362323531 0.63767647
## 42 High 0.529862825 0.47013717
## 49 High 0.175456910 0.82454309
## 54 High 0.027319430 0.97268057
## 55 High 0.022068424 0.97793158
## 58 High 0.443511260 0.55648874
## 60 High 0.082693666 0.91730633
## 61 High 0.081840730 0.91815927
## 65 High 0.048911337 0.95108866
## 69 High 0.533781158 0.46621884
## 73 High 0.032939911 0.96706009
## 86 High 0.067654064 0.93234594
## 90 High 0.046903055 0.95309695
## 91 High 0.016987566 0.98301243
## 93 High 0.046903055 0.95309695
## 96 High 0.016987566 0.98301243
## 98 High 0.053351489 0.94664851
## 100 High 0.066118585 0.93388142
## 104 High 0.892159099 0.10784090
## 112 High 0.226736762 0.77326324
## 115 High 0.880645514 0.11935449
## 119 High 0.153976304 0.84602370
## 128 High 0.153976304 0.84602370
## 130 High 0.023851968 0.97614803
## 139 High 0.023851968 0.97614803
## 143 High 0.038552905 0.96144709
## 145 High 0.114662957 0.88533704
## 146 High 0.153976304 0.84602370
## 149 High 0.152805307 0.84719469
## 150 High 0.133318136 0.86668186
## 152 High 0.081840730 0.91815927
## 157 High 0.513142137 0.48685786
## 161 High 0.245051293 0.75494871
## 162 High 0.010303091 0.98969691
## 166 High 0.528097208 0.47190279
## 167 High 0.164575958 0.83542404
## 173 High 0.116830681 0.88316932
## 176 High 0.153976304 0.84602370
## 182 High 0.020554918 0.97944508
## 187 High 0.049933006 0.95006699
## 190 High 0.020110335 0.97988967
## 194 High 0.023549684 0.97645032
## 195 High 0.178797120 0.82120288
## 201 High 0.076255775 0.92374422
## 207 High 0.117776492 0.88222351
## 208 High 0.511302373 0.48869763
## 215 High 0.030063697 0.96993630
## 222 High 0.244990513 0.75500949
## 224 High 0.182053122 0.81794688
## 231 High 0.749586694 0.25041331
## 236 High 0.097554294 0.90244571
## 237 High 0.041892008 0.95810799
## 240 High 0.185612595 0.81438741
## 243 High 0.076255775 0.92374422
## 248 High 0.178797120 0.82120288
## 251 High 0.775565527 0.22443447
## 256 High 0.398539794 0.60146021
## 258 High 0.169197351 0.83080265
## 262 High 0.511302373 0.48869763
## 266 High 0.452920897 0.54707910
## 272 High 0.570995637 0.42900436
## 280 High 0.296779366 0.70322063
## 283 High 0.443886349 0.55611365
## 286 High 0.435270439 0.56472956
## 287 High 0.168888318 0.83111168
## 289 High 0.099729418 0.90027058
## 290 High 0.316994422 0.68300558
## 298 High 0.318073613 0.68192639
## 305 High 0.294523101 0.70547690
## 306 High 0.222812248 0.77718775
## 312 High 0.101828758 0.89817124
## 320 High 0.314342671 0.68565733
## 325 High 0.176122072 0.82387793
## 332 High 0.058470118 0.94152988
## 333 High 0.457755060 0.54224494
## 335 High 0.318073613 0.68192639
## 339 High 0.772723681 0.22727632
## 346 High 0.347449045 0.65255096
## 347 High 0.044973673 0.95502633
## 350 High 0.382267109 0.61773289
## 353 High 0.349013643 0.65098636
## 358 High 0.321275416 0.67872458
## 365 High 0.224247273 0.77575273
## 367 High 0.199038406 0.80096159
## 370 High 0.048269146 0.95173085
## 379 High 0.128343126 0.87165687
## 386 High 0.362492091 0.63750791
## 394 High 0.522815273 0.47718473
## 396 High 0.130957795 0.86904220
## 400 High 0.041892008 0.95810799
## 404 High 0.038322544 0.96167746
## 405 High 0.626352870 0.37364713
## 413 High 0.131232803 0.86876720
## 415 High 0.371596531 0.62840347
## 417 High 0.212807558 0.78719244
## 418 High 0.441346753 0.55865325
## 423 High 0.306995054 0.69300495
## 434 High 0.283812634 0.71618737
## 437 High 0.274034772 0.72596523
## 440 High 0.378573196 0.62142680
## 449 High 0.362681763 0.63731824
## 450 High 0.273348151 0.72665185
## 457 High 0.362681763 0.63731824
## 467 High 0.325012206 0.67498779
## 469 High 0.221752671 0.77824733
## 474 High 0.936674227 0.06332577
## 475 High 0.904392341 0.09560766
## 485 High 0.132581065 0.86741894
## 504 Low 0.288558015 0.71144198
## 511 Low 0.780430879 0.21956912
## 512 Low 0.410836585 0.58916341
## 517 Low 0.082425749 0.91757425
## 519 Low 0.689505392 0.31049461
## 520 Low 0.075648339 0.92435166
## 522 Low 0.940550594 0.05944941
## 527 Low 0.635155993 0.36484401
## 528 Low 0.295989467 0.70401053
## 529 Low 0.301900165 0.69809983
## 537 Low 0.124694956 0.87530504
## 540 Low 0.924323450 0.07567655
## 541 Low 0.545190763 0.45480924
## 547 Low 0.920545497 0.07945450
## 550 Low 0.769867948 0.23013205
## 555 Low 0.500850846 0.49914915
## 564 Low 0.037015067 0.96298493
## 570 Low 0.354179129 0.64582087
## 573 Low 0.238387555 0.76161244
## 575 Low 0.545190763 0.45480924
## 578 Low 0.188629199 0.81137080
## 581 Low 0.238387555 0.76161244
## 585 Low 0.323229930 0.67677007
## 590 Low 0.723611110 0.27638889
## 601 Low 0.797056033 0.20294397
## 602 Low 0.613964299 0.38603570
## 607 Low 0.592073516 0.40792648
## 610 Low 0.577538414 0.42246159
## 618 Low 0.685443653 0.31455635
## 624 Low 0.323229930 0.67677007
## 626 Low 0.241740192 0.75825981
## 627 Low 0.357226702 0.64277330
## 634 Low 0.660110865 0.33988913
## 640 Low 0.982153120 0.01784688
## 642 Low 0.168057998 0.83194200
## 643 Low 0.628905923 0.37109408
## 644 Low 0.867026175 0.13297383
## 645 Low 0.773312713 0.22668729
## 646 Low 0.770707712 0.22929229
## 647 Low 0.714109400 0.28589060
## 652 Low 0.191323910 0.80867609
## 658 Low 0.575949760 0.42405024
## 659 Low 0.851385294 0.14861471
## 660 Low 0.880217375 0.11978263
## 664 Low 0.319822683 0.68017732
## 666 Low 0.416063033 0.58393697
## 667 Low 0.827447505 0.17255250
## 675 Low 0.628905923 0.37109408
## 680 Low 0.964633230 0.03536677
## 681 Low 0.874768134 0.12523187
## 687 Low 0.866602938 0.13339706
## 694 Low 0.748302599 0.25169740
## 697 Low 0.559681179 0.44031882
## 701 Low 0.359811881 0.64018812
## 705 Low 0.912000344 0.08799966
## 707 Low 0.700044807 0.29995519
## 710 Low 0.743622998 0.25637700
## 716 Low 0.885887800 0.11411220
## 719 Low 0.939669824 0.06033018
## 720 Low 0.974804926 0.02519507
## 725 Low 0.976039979 0.02396002
## 727 Low 0.359811881 0.64018812
## 730 Low 0.504435382 0.49556462
## 738 Low 0.859253410 0.14074659
## 745 Low 0.702405944 0.29759406
## 748 Low 0.808544204 0.19145580
## 751 Low 0.933147457 0.06685254
## 756 Low 0.685429835 0.31457016
## 766 Low 0.751514843 0.24848516
## 769 Low 0.602517547 0.39748245
## 783 Low 0.869255200 0.13074480
## 785 Low 0.902579681 0.09742032
## 790 Low 0.886834280 0.11316572
## 793 Low 0.869255200 0.13074480
## 795 Low 0.986490055 0.01350995
## 796 Low 0.979675106 0.02032489
## 797 Low 0.555989916 0.44401008
## 801 Low 0.775047694 0.22495231
## 811 Low 0.556377102 0.44362290
## 812 Low 0.929865865 0.07013413
## 815 Low 0.957749970 0.04225003
## 816 Low 0.758028058 0.24197194
## 817 Low 0.951236951 0.04876305
## 824 Low 0.886411056 0.11358894
## 825 Low 0.886411056 0.11358894
## 826 Low 0.886411056 0.11358894
## 830 Low 0.912117000 0.08788300
## 837 Low 0.940840521 0.05915948
## 838 Low 0.758028058 0.24197194
## 844 Low 0.927974648 0.07202535
## 845 Low 0.976894550 0.02310545
## 847 Low 0.933393677 0.06660632
## 850 Low 0.902116294 0.09788371
## 852 Low 0.915857107 0.08414289
## 853 Low 0.915857107 0.08414289
## 861 Low 0.927636110 0.07236389
## 868 Low 0.980340828 0.01965917
## 874 Low 0.937852871 0.06214713
## 879 High 0.113209880 0.88679012
## 895 High 0.014930232 0.98506977
## 899 High 0.001859651 0.99814035
## 903 High 0.014930232 0.98506977
## 917 High 0.056418013 0.94358199
## 927 High 0.027319430 0.97268057
## 929 High 0.116830681 0.88316932
## 931 High 0.027319430 0.97268057
## 933 High 0.362953760 0.63704624
## 944 High 0.043101463 0.95689854
## 947 High 0.046903055 0.95309695
## 949 High 0.184608445 0.81539155
## 953 High 0.021584412 0.97841559
## 958 High 0.536045815 0.46395419
## 961 High 0.015768127 0.98423187
## 963 High 0.119476478 0.88052352
## 964 High 0.076255775 0.92374422
## 973 High 0.049972348 0.95002765
## 976 High 0.037015067 0.96298493
## 977 High 0.178797120 0.82120288
## 980 High 0.275965540 0.72403446
## 983 High 0.650018484 0.34998152
## 984 High 0.178797120 0.82120288
## 986 High 0.101828758 0.89817124
## 989 High 0.202228735 0.79777127
## 991 High 0.023066118 0.97693388
## 996 High 0.050382284 0.94961772
## 997 High 0.470933547 0.52906645
## 999 High 0.044071687 0.95592831
## 1000 High 0.051069687 0.94893031
## 1003 High 0.041892008 0.95810799
## 1008 High 0.099376550 0.90062345
## 1009 High 0.443886349 0.55611365
## 1014 High 0.041232631 0.95876737
## 1015 High 0.470359921 0.52964008
## 1040 High 0.177580468 0.82241953
## 1042 High 0.339492531 0.66050747
## 1043 High 0.730538815 0.26946119
## 1050 High 0.104620962 0.89537904
## 1052 High 0.180674837 0.81932516
## 1056 High 0.333657886 0.66634211
## 1070 High 0.555232608 0.44476739
## 1073 High 0.486744863 0.51325514
## 1074 High 0.177106183 0.82289382
## 1079 High 0.416066054 0.58393395
## 1080 High 0.599211550 0.40078845
## 1085 High 0.082405863 0.91759414
## 1087 High 0.657183051 0.34281695
## 1096 High 0.909565322 0.09043468
## 1099 High 0.440418289 0.55958171
## 1100 High 0.699230090 0.30076991
## 1102 High 0.071892116 0.92810788
## 1107 Low 0.454403473 0.54559653
## 1109 Low 0.675224568 0.32477543
## 1114 Low 0.491991507 0.50800849
## 1118 Low 0.356946854 0.64305315
## 1123 Low 0.432009924 0.56799008
## 1132 Low 0.886272790 0.11372721
## 1134 Low 0.603826999 0.39617300
## 1137 Low 0.357226702 0.64277330
## 1154 Low 0.357226702 0.64277330
## 1155 Low 0.582463212 0.41753679
## 1157 Low 0.765117005 0.23488300
## 1162 Low 0.628905923 0.37109408
## 1164 Low 0.168057998 0.83194200
## 1171 Low 0.933147457 0.06685254
## 1172 Low 0.513142137 0.48685786
## 1175 Low 0.627703257 0.37229674
## 1177 Low 0.687471315 0.31252868
## 1179 Low 0.902338418 0.09766158
## 1183 Low 0.268533199 0.73146680
## 1185 Low 0.934234085 0.06576592
## 1189 Low 0.820187858 0.17981214
## 1211 Low 0.816488095 0.18351190
## 1218 Low 0.987709192 0.01229081
## 1224 Low 0.418627487 0.58137251
## 1225 Low 0.359811881 0.64018812
## 1227 Low 0.906940352 0.09305965
## 1232 Low 0.983937916 0.01606208
## 1235 Low 0.755569564 0.24443044
## 1238 Low 0.713703436 0.28629656
## 1240 Low 0.855212853 0.14478715
## 1241 Low 0.816488095 0.18351190
## 1248 Low 0.912117000 0.08788300
## 1258 Low 0.758028058 0.24197194
## 1261 Low 0.886758010 0.11324199
## 1263 Low 0.886411056 0.11358894
## 1269 Low 0.955630849 0.04436915
## 1270 Low 0.985258898 0.01474110
## 1271 Low 0.902116294 0.09788371
## 1272 Low 0.915857107 0.08414289
## 1280 Low 0.937852871 0.06214713
## 1286 Low 0.986020488 0.01397951
## 1287 Low 0.987076206 0.01292379
## 1289 Low 0.953597078 0.04640292
## 1290 Low 0.915857107 0.08414289
## 1291 High 0.175452898 0.82454710
## 1294 High 0.799908047 0.20009195
## 1305 Low 0.880239111 0.11976089
## 1308 High 0.727385856 0.27261414
##################################
# Reporting the independent evaluation results
# for the test set
##################################
LR_ET_Test_ROC <- roc(response = LR_ET_Test$LR_ET_Observed,
predictor = LR_ET_Test$LR_ET_Predicted.High,
levels = rev(levels(LR_ET_Test$LR_ET_Observed)))
(LR_ET_Test_ROCCurveAUC <- auc(LR_ET_Test_ROC)[1])
## [1] 0.890133
1.5.5 Logistic Regression With Inverse Hyperbolic Sine Transformation (LR_IHST)
Logistic Regression
models the relationship between the probability of an event (among two
outcome levels) and a set of predictors by expressing the log-odds of
the event as a linear combination of the predictors weighted by their
respective parameter estimates. The parameters are estimated via
maximum likelihood: an iterative optimization evaluates candidate
coefficient values and retains those that maximize the log-likelihood
of the observed responses. Given the optimal parameters, the predicted
probability for each observation is obtained by applying the
inverse-logit function to its fitted linear predictor.
Inverse
Hyperbolic Sine Transformation behaves similarly to a logarithmic
transformation for positive values. The inverse hyperbolic sine
function, however, has the added advantage of remaining defined for
zero and negative values.
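A quick sanity check of this function is shown below: the explicit log
form applied in the code that follows is base R's asinh() written out
in closed form.
##################################
# Verifying the closed form of the inverse hyperbolic sine
# asinh(x) = log(x + sqrt(x^2 + 1)), defined for all real x
##################################
x <- c(-10, -1, 0, 1, 10)
all.equal(asinh(x), log(x + sqrt(x^2 + 1))) # both sides agree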
[A] The logistic regression model from the stats package was
implemented through the caret package with inverse hyperbolic sine
transformation applied to treat data skewness but no treatment applied
for data outliers.
[B] The model does not contain any hyperparameter.
[C] The cross-validated model performance of the final
model is summarized as follows:
[C.1] Final model configuration is fixed due to
the absence of a hyperparameter
[C.2] ROC Curve AUC = 0.87107
[D] The model allows for ranking of predictors in terms
of variable importance. The top-performing predictors in the model are
as follows:
[D.1] HydrophilicFactor variable (numeric)
[D.2] NumNonHAtoms variable (numeric)
[D.3] NumAtoms variable (numeric)
[E] The independent test model performance of the final
model is summarized as follows:
[E.1] ROC Curve AUC = 0.87998
##################################
# Applying inverse hyperbolic sine function
##################################
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
# Each numeric predictor x is replaced with asinh(x) = log(x + sqrt(x^2 + 1))
for (i in 1:(ncol(PMA_PreModelling_Train)-1)){
PMA_PreModelling_Train[,i] <- log(PMA_PreModelling_Train[,i]+(((PMA_PreModelling_Train[,i])^2)+1)^(1/2))
}
for (i in 1:(ncol(PMA_PreModelling_Test)-1)){
PMA_PreModelling_Test[,i] <- log(PMA_PreModelling_Test[,i]+(((PMA_PreModelling_Test[,i])^2)+1)^(1/2))
}
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)]
##################################
# Treating data skewness
# for the train set
##################################
# Transformation already applied via the asinh loop above
PMA_PreModelling_Train_LR_IHST <- PMA_PreModelling_Train_LR
PMA_PreModelling_Train_LR_IHST$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Train_LR_IHST[,sapply(PMA_PreModelling_Train_LR_IHST, is.numeric)],
y = PMA_PreModelling_Train_LR_IHST$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_IHST Train Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Train_LR_IHST[,sapply(PMA_PreModelling_Train_LR_IHST, is.numeric)],
y = PMA_PreModelling_Train_LR_IHST$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_IHST Train Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR_IHST$Log_Solubility_Class)))))

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_IHST$Log_Solubility_Class,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices,
summaryFunction = twoClassSummary,
classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
LR_IHST_Tune <- train(x = PMA_PreModelling_Train_LR_IHST[,!names(PMA_PreModelling_Train_LR_IHST) %in% c("Log_Solubility_Class")],
y = PMA_PreModelling_Train_LR_IHST$Log_Solubility_Class,
method = "glm",
metric = "ROC",
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_IHST_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8710722 0.7374862 0.8073657
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 10.721 1.364 1.208 -3.671
## NumCarbon
## -1.128
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 836 AIC: 846
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8710722 0.7374862 0.8073657 0.04019999 0.06659904 0.0606913
(LR_IHST_Train_ROCCurveAUC <- LR_IHST_Tune$results$ROC)
## [1] 0.8710722
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_IHST_VarImp <- varImp(LR_IHST_Tune, scale = TRUE)
plot(LR_IHST_VarImp,
top=4,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Logistic Regression",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Treating data skewness
# for the test set
##################################
# Transformation already applied via the asinh loop above
PMA_PreModelling_Test_LR_IHST <- PMA_PreModelling_Test_LR
PMA_PreModelling_Test_LR_IHST$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Test_LR_IHST[,sapply(PMA_PreModelling_Test_LR_IHST, is.numeric)],
y = PMA_PreModelling_Test_LR_IHST$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_IHST Test Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Test_LR_IHST[,sapply(PMA_PreModelling_Test_LR_IHST, is.numeric)],
y = PMA_PreModelling_Test_LR_IHST$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_IHST Test Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Test_LR_IHST$Log_Solubility_Class)))))

##################################
# Independently evaluating the model
# on the test set
##################################
LR_IHST_Test <- data.frame(LR_IHST_Observed = PMA_PreModelling_Test_LR_IHST$Log_Solubility_Class,
LR_IHST_Predicted = predict(LR_IHST_Tune,
PMA_PreModelling_Test_LR_IHST[,!names(PMA_PreModelling_Test_LR_IHST) %in% c("Log_Solubility_Class")],
type = "prob"))
LR_IHST_Test
## LR_IHST_Observed LR_IHST_Predicted.Low LR_IHST_Predicted.High
## 20 High 0.0099300682 0.99006993
## 21 High 0.0070843515 0.99291565
## 23 High 0.0578716298 0.94212837
## 25 High 0.0182500257 0.98174997
## 28 High 0.0801383023 0.91986170
## 31 High 0.0150075039 0.98499250
## 32 High 0.0292220184 0.97077798
## 33 High 0.0524819225 0.94751808
## 34 High 0.0524819225 0.94751808
## 37 High 0.3152799614 0.68472004
## 38 High 0.3152799614 0.68472004
## 42 High 0.6136673812 0.38633262
## 49 High 0.1598231577 0.84017684
## 54 High 0.0356333544 0.96436665
## 55 High 0.0219177360 0.97808226
## 58 High 0.4549805179 0.54501948
## 60 High 0.1172175407 0.88278246
## 61 High 0.0524819225 0.94751808
## 65 High 0.0609565698 0.93904343
## 69 High 0.5374744295 0.46252557
## 73 High 0.0182159523 0.98178405
## 86 High 0.0487705078 0.95122949
## 90 High 0.0693209815 0.93067902
## 91 High 0.0121654002 0.98783460
## 93 High 0.0693209815 0.93067902
## 96 High 0.0121654002 0.98783460
## 98 High 0.0775264455 0.92247355
## 100 High 0.0904961248 0.90950388
## 104 High 0.8993420391 0.10065796
## 112 High 0.1883413325 0.81165867
## 115 High 0.8526100741 0.14738993
## 119 High 0.1405876713 0.85941233
## 128 High 0.1405876713 0.85941233
## 130 High 0.0236487683 0.97635123
## 139 High 0.0236487683 0.97635123
## 143 High 0.0520388523 0.94796115
## 145 High 0.0793526698 0.92064733
## 146 High 0.1405876713 0.85941233
## 149 High 0.2122457494 0.78775425
## 150 High 0.1009300825 0.89906992
## 152 High 0.0524819225 0.94751808
## 157 High 0.5269860724 0.47301393
## 161 High 0.3337627789 0.66623722
## 162 High 0.0062680045 0.99373200
## 166 High 0.5354668322 0.46453317
## 167 High 0.1260047503 0.87399525
## 173 High 0.0887542160 0.91124578
## 176 High 0.1405876713 0.85941233
## 182 High 0.0061870117 0.99381299
## 187 High 0.0576049610 0.94239504
## 190 High 0.0156622796 0.98433772
## 194 High 0.0105820641 0.98941794
## 195 High 0.2551363208 0.74486368
## 201 High 0.1185994482 0.88140055
## 207 High 0.1831137154 0.81688628
## 208 High 0.4561448837 0.54385512
## 215 High 0.0150075039 0.98499250
## 222 High 0.3377831981 0.66221680
## 224 High 0.2291389419 0.77086106
## 231 High 0.7081685175 0.29183148
## 236 High 0.1339363550 0.86606365
## 237 High 0.0201223973 0.97987760
## 240 High 0.1926954630 0.80730454
## 243 High 0.1185994482 0.88140055
## 248 High 0.2551363208 0.74486368
## 251 High 0.8071616550 0.19283835
## 256 High 0.4973558109 0.50264419
## 258 High 0.2393589007 0.76064110
## 262 High 0.4561448837 0.54385512
## 266 High 0.5304409692 0.46955903
## 272 High 0.5720104543 0.42798955
## 280 High 0.3631225228 0.63687748
## 283 High 0.4275080021 0.57249200
## 286 High 0.5141733831 0.48582662
## 287 High 0.2435576257 0.75644237
## 289 High 0.0848641421 0.91513586
## 290 High 0.3819373209 0.61806268
## 298 High 0.3093894899 0.69061051
## 305 High 0.3858465603 0.61415344
## 306 High 0.2315157545 0.76848425
## 312 High 0.1353064070 0.86469359
## 320 High 0.2745723275 0.72542767
## 325 High 0.2358766959 0.76412330
## 332 High 0.0836961949 0.91630381
## 333 High 0.4080892379 0.59191076
## 335 High 0.3093894899 0.69061051
## 339 High 0.7128384446 0.28716156
## 346 High 0.4472904099 0.55270959
## 347 High 0.0300015916 0.96999841
## 350 High 0.3449964094 0.65500359
## 353 High 0.4462698817 0.55373012
## 358 High 0.4225671930 0.57743281
## 365 High 0.3039458499 0.69605415
## 367 High 0.2495985988 0.75040140
## 370 High 0.0427492613 0.95725074
## 379 High 0.1399298135 0.86007019
## 386 High 0.2689183845 0.73108162
## 394 High 0.6069330289 0.39306697
## 396 High 0.1774691742 0.82253083
## 400 High 0.0201223973 0.97987760
## 404 High 0.0221953778 0.97780462
## 405 High 0.5922002466 0.40779975
## 413 High 0.1989446496 0.80105535
## 415 High 0.4673768642 0.53262314
## 417 High 0.1389459796 0.86105402
## 418 High 0.5397594210 0.46024058
## 423 High 0.2609195742 0.73908043
## 434 High 0.3784020245 0.62159798
## 437 High 0.2626702786 0.73732972
## 440 High 0.3698931740 0.63010683
## 449 High 0.4326697948 0.56733021
## 450 High 0.3544261568 0.64557384
## 457 High 0.4326697948 0.56733021
## 467 High 0.3153110496 0.68468895
## 469 High 0.1527762476 0.84722375
## 474 High 0.9075381438 0.09246186
## 475 High 0.8765506823 0.12344932
## 485 High 0.1579281927 0.84207181
## 504 Low 0.2444521260 0.75554787
## 511 Low 0.5871849577 0.41281504
## 512 Low 0.5018320598 0.49816794
## 517 Low 0.0668902556 0.93310974
## 519 Low 0.7251116360 0.27488836
## 520 Low 0.0482164150 0.95178359
## 522 Low 0.9083535976 0.09164640
## 527 Low 0.6995000622 0.30049994
## 528 Low 0.2166802349 0.78331977
## 529 Low 0.3803802257 0.61961977
## 537 Low 0.0944886922 0.90551131
## 540 Low 0.8937605539 0.10623945
## 541 Low 0.4523082593 0.54769174
## 547 Low 0.8943000868 0.10569991
## 550 Low 0.5168207177 0.48317928
## 555 Low 0.5751405754 0.42485942
## 564 Low 0.0210364403 0.97896356
## 570 Low 0.4411424281 0.55885757
## 573 Low 0.3206488297 0.67935117
## 575 Low 0.4523082593 0.54769174
## 578 Low 0.1228104528 0.87718955
## 581 Low 0.3206488297 0.67935117
## 585 Low 0.4095958799 0.59040412
## 590 Low 0.7649838590 0.23501614
## 601 Low 0.7912963243 0.20870368
## 602 Low 0.6719001646 0.32809984
## 607 Low 0.6636629369 0.33633706
## 610 Low 0.6521049110 0.34789509
## 618 Low 0.7096458155 0.29035418
## 624 Low 0.4095958799 0.59040412
## 626 Low 0.3435856801 0.65641432
## 627 Low 0.3417805970 0.65821940
## 634 Low 0.5277070848 0.47229292
## 640 Low 0.9701320053 0.02986799
## 642 Low 0.1097637588 0.89023624
## 643 Low 0.5390069187 0.46099308
## 644 Low 0.8335923318 0.16640767
## 645 Low 0.7520869618 0.24791304
## 646 Low 0.6851967239 0.31480328
## 647 Low 0.6972416523 0.30275835
## 652 Low 0.1601891216 0.83981088
## 658 Low 0.6323549805 0.36764502
## 659 Low 0.7784373668 0.22156263
## 660 Low 0.8882344180 0.11176558
## 664 Low 0.3808059125 0.61919409
## 666 Low 0.4848211779 0.51517882
## 667 Low 0.8424237841 0.15757622
## 675 Low 0.5390069187 0.46099308
## 680 Low 0.9646714157 0.03532858
## 681 Low 0.8853530190 0.11464698
## 687 Low 0.7979014677 0.20209853
## 694 Low 0.7863428810 0.21365712
## 697 Low 0.5780038485 0.42199615
## 701 Low 0.2845925670 0.71540743
## 705 Low 0.9200100288 0.07998997
## 707 Low 0.7455518887 0.25444811
## 710 Low 0.6831664789 0.31683352
## 716 Low 0.8903703029 0.10962970
## 719 Low 0.9144792544 0.08552075
## 720 Low 0.9537538416 0.04624616
## 725 Low 0.9577964846 0.04220352
## 727 Low 0.2845925670 0.71540743
## 730 Low 0.5180529588 0.48194704
## 738 Low 0.8051588342 0.19484117
## 745 Low 0.6164473390 0.38355266
## 748 Low 0.7636704477 0.23632955
## 751 Low 0.9320117380 0.06798826
## 756 Low 0.7357943834 0.26420562
## 766 Low 0.7940349873 0.20596501
## 769 Low 0.6140644801 0.38593552
## 783 Low 0.8278040363 0.17219596
## 785 Low 0.8433731544 0.15662685
## 790 Low 0.8939502894 0.10604971
## 793 Low 0.8278040363 0.17219596
## 795 Low 0.9757396053 0.02426039
## 796 Low 0.9631790196 0.03682098
## 797 Low 0.6283445387 0.37165546
## 801 Low 0.7663863103 0.23361369
## 811 Low 0.5728956904 0.42710431
## 812 Low 0.9288809270 0.07111907
## 815 Low 0.9245343078 0.07546569
## 816 Low 0.7513710061 0.24862899
## 817 Low 0.9110816646 0.08891834
## 824 Low 0.8539043619 0.14609564
## 825 Low 0.8539043619 0.14609564
## 826 Low 0.8539043619 0.14609564
## 830 Low 0.8557141199 0.14428588
## 837 Low 0.8960597684 0.10394023
## 838 Low 0.7513710061 0.24862899
## 844 Low 0.8776431746 0.12235683
## 845 Low 0.9529465696 0.04705343
## 847 Low 0.9085970121 0.09140299
## 850 Low 0.8755465620 0.12445344
## 852 Low 0.8934471700 0.10655283
## 853 Low 0.8934471700 0.10655283
## 861 Low 0.9082923526 0.09170765
## 868 Low 0.9589272001 0.04107280
## 874 Low 0.9207968258 0.07920317
## 879 High 0.0946843934 0.90531561
## 895 High 0.0154204755 0.98457952
## 899 High 0.0001948401 0.99980516
## 903 High 0.0154204755 0.98457952
## 917 High 0.0596706125 0.94032939
## 927 High 0.0356333544 0.96436665
## 929 High 0.0887542160 0.91124578
## 931 High 0.0356333544 0.96436665
## 933 High 0.4369472072 0.56305279
## 944 High 0.0614810203 0.93851898
## 947 High 0.0693209815 0.93067902
## 949 High 0.2464938138 0.75350619
## 953 High 0.0184962609 0.98150374
## 958 High 0.4853511053 0.51464889
## 961 High 0.0213531141 0.97864689
## 963 High 0.1129852773 0.88701472
## 964 High 0.1185994482 0.88140055
## 973 High 0.0653591674 0.93464083
## 976 High 0.0210364403 0.97896356
## 977 High 0.2551363208 0.74486368
## 980 High 0.2741358706 0.72586413
## 983 High 0.6342816182 0.36571838
## 984 High 0.2551363208 0.74486368
## 986 High 0.1353064070 0.86469359
## 989 High 0.1686883204 0.83131168
## 991 High 0.0281518759 0.97184812
## 996 High 0.0254700687 0.97452993
## 997 High 0.4490879688 0.55091203
## 999 High 0.0578716298 0.94212837
## 1000 High 0.0600594201 0.93994058
## 1003 High 0.0201223973 0.97987760
## 1008 High 0.1412171683 0.85878283
## 1009 High 0.4275080021 0.57249200
## 1014 High 0.0667172124 0.93328279
## 1015 High 0.5346752396 0.46532476
## 1040 High 0.2265614990 0.77343850
## 1042 High 0.4024945490 0.59750545
## 1043 High 0.7138824116 0.28611759
## 1050 High 0.1370627240 0.86293728
## 1052 High 0.2538067860 0.74619321
## 1056 High 0.1534757420 0.84652426
## 1070 High 0.6331959179 0.36680408
## 1073 High 0.5012647881 0.49873521
## 1074 High 0.1370051566 0.86299484
## 1079 High 0.4040490277 0.59595097
## 1080 High 0.5480798185 0.45192018
## 1085 High 0.0730303379 0.92696966
## 1087 High 0.7155593169 0.28444068
## 1096 High 0.9127890547 0.08721095
## 1099 High 0.5132163700 0.48678363
## 1100 High 0.6607261272 0.33927387
## 1102 High 0.0746000948 0.92539991
## 1107 Low 0.3601428127 0.63985719
## 1109 Low 0.7166172165 0.28338278
## 1114 Low 0.3934733313 0.60652667
## 1118 Low 0.4477738160 0.55222618
## 1123 Low 0.4773018833 0.52269812
## 1132 Low 0.8357851715 0.16421483
## 1134 Low 0.6748232479 0.32517675
## 1137 Low 0.3417805970 0.65821940
## 1154 Low 0.3417805970 0.65821940
## 1155 Low 0.6580481264 0.34195187
## 1157 Low 0.7989572984 0.20104270
## 1162 Low 0.5390069187 0.46099308
## 1164 Low 0.1097637588 0.89023624
## 1171 Low 0.9320117380 0.06798826
## 1172 Low 0.5269860724 0.47301393
## 1175 Low 0.6872275852 0.31277241
## 1177 Low 0.6517991969 0.34820080
## 1179 Low 0.9066712497 0.09332875
## 1183 Low 0.1968901528 0.80310985
## 1185 Low 0.9353748426 0.06462516
## 1189 Low 0.8149583318 0.18504167
## 1211 Low 0.7381952599 0.26180474
## 1218 Low 0.9802534524 0.01974655
## 1224 Low 0.4232012115 0.57679879
## 1225 Low 0.2845925670 0.71540743
## 1227 Low 0.9120709631 0.08792904
## 1232 Low 0.9714750094 0.02852499
## 1235 Low 0.7910733456 0.20892665
## 1238 Low 0.7635803175 0.23641968
## 1240 Low 0.8695451356 0.13045486
## 1241 Low 0.7381952599 0.26180474
## 1248 Low 0.8557141199 0.14428588
## 1258 Low 0.7513710061 0.24862899
## 1261 Low 0.8706811127 0.12931889
## 1263 Low 0.8539043619 0.14609564
## 1269 Low 0.9367083202 0.06329168
## 1270 Low 0.9728605279 0.02713947
## 1271 Low 0.8755465620 0.12445344
## 1272 Low 0.8934471700 0.10655283
## 1280 Low 0.9207968258 0.07920317
## 1286 Low 0.9697385070 0.03026149
## 1287 Low 0.9716992194 0.02830078
## 1289 Low 0.9400342381 0.05996576
## 1290 Low 0.8934471700 0.10655283
## 1291 High 0.2366679120 0.76333209
## 1294 High 0.7680428374 0.23195716
## 1305 Low 0.8943028215 0.10569718
## 1308 High 0.7223131307 0.27768687
##################################
# Reporting the independent evaluation results
# for the test set
##################################
LR_IHST_Test_ROC <- roc(response = LR_IHST_Test$LR_IHST_Observed,
predictor = LR_IHST_Test$LR_IHST_Predicted.High,
levels = rev(levels(LR_IHST_Test$LR_IHST_Observed)))
(LR_IHST_Test_ROCCurveAUC <- auc(LR_IHST_Test_ROC)[1])
## [1] 0.8799871
1.5.6 Logistic Regression With Base-10 Logarithm Transformation (LR_LOG10T)
Logistic Regression
models the probability of an event (one of two outcome levels) by
expressing the log-odds of the event as a linear combination of a set of
predictors weighted by their respective parameter estimates. The
parameters are estimated via maximum likelihood estimation, which
iteratively evaluates candidate values to optimize the fit of the
log-odds. Each observation's conditional probability contributes its
logarithm to the log-likelihood function, which logistic regression
maximizes to find the best parameter estimates. Given the optimal
parameters, the predicted probability for each observation can be
calculated directly from its linear predictor.
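To make the log-odds mechanics concrete, the sketch below converts a
linear predictor into a predicted probability using base R's plogis()
(the inverse-logit); all coefficient and predictor values are
hypothetical and are not taken from any fitted model in this report.
##################################
# Illustrative sketch only:
# converting log-odds into a
# predicted probability
##################################
b0 <- -1.5 # hypothetical intercept
b <- c(0.8, -0.4) # hypothetical parameter estimates
x <- c(2.0, 1.0) # hypothetical predictor values for one observation
log_odds <- b0 + sum(b * x) # linear combination of weighted predictors
plogis(log_odds) # inverse-logit: exp(z) / (1 + exp(z))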
Base-10
Logarithm Transformation applies an algebraic transformation that
replaces each variable with its base-10 logarithm. As the logarithm of
zero or any negative number is undefined, a constant must be added to
shift the minimum value of the distribution, preferably to 1.
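As a minimal illustration of why the shift matters, mirroring the +1
adjustment applied in the code further below (values are made up):
##################################
# Illustrative sketch only:
# shifting before the base-10
# logarithm to avoid -Inf
##################################
log10(0) # returns -Inf, unusable as a model input
x <- c(0, 9, 99) # hypothetical predictor containing a zero
log10(x + 1) # shifted values: 0, 1, 2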
[A] The logistic regression model from the
stats
package was implemented through the
caret
package with base-10 logarithm transformation applied to treat data
skewness, but no treatment was applied for data outliers.
[B] The model does not contain any
hyperparameter.
[C] The cross-validated model performance of the final
model is summarized as follows:
[C.1] Final model configuration is fixed due to
the absence of a hyperparameter
[C.2] ROC Curve AUC = 0.89192
[D] The model allows for ranking of predictors in terms
of variable importance. The top-performing predictors in the model are
as follows:
[D.1] HydrophilicFactor variable (numeric)
[D.2] NumNonHAtoms variable (numeric)
[D.3] NumAtoms
variable (numeric)
[E] The independent test model performance of the final
model is summarized as follows:
[E.1] ROC Curve AUC = 0.89882
##################################
# Applying a +1 adjustment
# to eliminate zero values and
# the base-10 logarithm function
##################################
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
for (i in 1:(ncol(PMA_PreModelling_Train)-1)){
PMA_PreModelling_Train[,i] <- PMA_PreModelling_Train[,i] + 1
PMA_PreModelling_Train[,i] <- log10(PMA_PreModelling_Train[,i])
}
for (i in 1:(ncol(PMA_PreModelling_Test)-1)){
PMA_PreModelling_Test[,i] <- PMA_PreModelling_Test[,i] + 1
PMA_PreModelling_Test[,i] <- log10(PMA_PreModelling_Test[,i])
}
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)]
##################################
# Treating data skewness
# for the train set
##################################
PMA_PreModelling_Train_LR_LOG10T <- PMA_PreModelling_Train_LR
PMA_PreModelling_Train_LR_LOG10T$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Train_LR_LOG10T[,sapply(PMA_PreModelling_Train_LR_LOG10T, is.numeric)],
y = PMA_PreModelling_Train_LR_LOG10T$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_LOG10T Train Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Train_LR_LOG10T[,sapply(PMA_PreModelling_Train_LR_LOG10T, is.numeric)],
y = PMA_PreModelling_Train_LR_LOG10T$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_LOG10T Train Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR_LOG10T$Log_Solubility_Class)))))

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_LOG10T$Log_Solubility_Class,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices,
summaryFunction = twoClassSummary,
classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
LR_LOG10T_Tune <- train(x = PMA_PreModelling_Train_LR_LOG10T[,!names(PMA_PreModelling_Train_LR_LOG10T) %in% c("Log_Solubility_Class")],
y = PMA_PreModelling_Train_LR_LOG10T$Log_Solubility_Class,
method = "glm",
metric = "ROC",
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_LOG10T_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.891921 0.7492802 0.8245646
LR_LOG10T_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 11.417 2.928 3.279 -14.187
## NumCarbon
## 0.934
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 767.4 AIC: 777.4
LR_LOG10T_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.891921 0.7492802 0.8245646 0.03393011 0.06711081 0.06719134
(LR_LOG10T_Train_ROCCurveAUC <- LR_LOG10T_Tune$results$ROC)
## [1] 0.891921
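For intuition, the reported coefficients can be traced back to a
predicted probability by hand. The sketch below uses a hypothetical
observation already on the log10(x + 1) scale applied above; because the
glm fit treats the second factor level (High) as the modeled event,
plogis() of the linear predictor returns the predicted probability of
the High class.
##################################
# Illustrative sketch only:
# recovering a predicted probability
# from the reported coefficients
##################################
new_obs <- c(HydrophilicFactor = 0.10, # hypothetical values on the
NumAtoms = 1.40, # log10(x + 1) scale
NumNonHAtoms = 1.10,
NumCarbon = 1.00)
coefs <- c(11.417, 2.928, 3.279, -14.187, 0.934) # from finalModel above
log_odds <- coefs[1] + sum(coefs[-1] * new_obs)
plogis(log_odds) # predicted probability of the 'High' class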
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_LOG10T_VarImp <- varImp(LR_LOG10T_Tune, scale = TRUE)
plot(LR_LOG10T_VarImp,
top=4,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Logistic Regression",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Treating data skewness
# for the test set
##################################
PMA_PreModelling_Test_LR_LOG10T <- PMA_PreModelling_Test_LR
PMA_PreModelling_Test_LR_LOG10T$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Test_LR_LOG10T[,sapply(PMA_PreModelling_Test_LR_LOG10T, is.numeric)],
y = PMA_PreModelling_Test_LR_LOG10T$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_LOG10T Test Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Test_LR_LOG10T[,sapply(PMA_PreModelling_Test_LR_LOG10T, is.numeric)],
y = PMA_PreModelling_Test_LR_LOG10T$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_LOG10T Test Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Test_LR_LOG10T$Log_Solubility_Class)))))

##################################
# Independently evaluating the model
# on the test set
##################################
LR_LOG10T_Test <- data.frame(LR_LOG10T_Observed = PMA_PreModelling_Test_LR_LOG10T$Log_Solubility_Class,
LR_LOG10T_Predicted = predict(LR_LOG10T_Tune,
PMA_PreModelling_Test_LR_LOG10T[,!names(PMA_PreModelling_Test_LR_LOG10T) %in% c("Log_Solubility_Class")],
type = "prob"))
LR_LOG10T_Test
## LR_LOG10T_Observed LR_LOG10T_Predicted.Low LR_LOG10T_Predicted.High
## 20 High 0.0114244517 0.9885755483
## 21 High 0.0080310909 0.9919689091
## 23 High 0.0354283423 0.9645716577
## 25 High 0.0113319188 0.9886680812
## 28 High 0.0603784377 0.9396215623
## 31 High 0.0126421937 0.9873578063
## 32 High 0.0204088121 0.9795911879
## 33 High 0.0401610347 0.9598389653
## 34 High 0.0401610347 0.9598389653
## 37 High 0.3543519894 0.6456480106
## 38 High 0.3543519894 0.6456480106
## 42 High 0.4516439293 0.5483560707
## 49 High 0.1326709535 0.8673290465
## 54 High 0.0127138418 0.9872861582
## 55 High 0.0171425230 0.9828574770
## 58 High 0.4508886389 0.5491113611
## 60 High 0.0458725899 0.9541274101
## 61 High 0.0401610347 0.9598389653
## 65 High 0.0422682755 0.9577317245
## 69 High 0.5360931241 0.4639068759
## 73 High 0.0313344498 0.9686655502
## 86 High 0.0379168490 0.9620831510
## 90 High 0.0246356037 0.9753643963
## 91 High 0.0112519351 0.9887480649
## 93 High 0.0246356037 0.9753643963
## 96 High 0.0112519351 0.9887480649
## 98 High 0.0281163829 0.9718836171
## 100 High 0.0455991395 0.9544008605
## 104 High 0.8596843413 0.1403156587
## 112 High 0.1899203318 0.8100796682
## 115 High 0.8830092589 0.1169907411
## 119 High 0.1145296081 0.8854703919
## 128 High 0.1145296081 0.8854703919
## 130 High 0.0186594977 0.9813405023
## 139 High 0.0186594977 0.9813405023
## 143 High 0.0369097104 0.9630902896
## 145 High 0.0632876797 0.9367123203
## 146 High 0.1145296081 0.8854703919
## 149 High 0.1330597875 0.8669402125
## 150 High 0.0881581152 0.9118418848
## 152 High 0.0401610347 0.9598389653
## 157 High 0.5491594648 0.4508405352
## 161 High 0.1818965999 0.8181034001
## 162 High 0.0114108307 0.9885891693
## 166 High 0.5306213331 0.4693786669
## 167 High 0.1130629002 0.8869370998
## 173 High 0.0762280843 0.9237719157
## 176 High 0.1145296081 0.8854703919
## 182 High 0.0040896523 0.9959103477
## 187 High 0.0411811702 0.9588188298
## 190 High 0.0232842897 0.9767157103
## 194 High 0.0175444271 0.9824555729
## 195 High 0.1108113031 0.8891886969
## 201 High 0.0436230271 0.9563769729
## 207 High 0.0716468222 0.9283531778
## 208 High 0.5395740091 0.4604259909
## 215 High 0.0126421937 0.9873578063
## 222 High 0.1592460118 0.8407539882
## 224 High 0.2064348880 0.7935651120
## 231 High 0.7632446817 0.2367553183
## 236 High 0.0866704378 0.9133295622
## 237 High 0.0136419825 0.9863580175
## 240 High 0.2146524994 0.7853475006
## 243 High 0.0436230271 0.9563769729
## 248 High 0.1108113031 0.8891886969
## 251 High 0.6961143000 0.3038857000
## 256 High 0.3070340842 0.6929659158
## 258 High 0.1230410001 0.8769589999
## 262 High 0.5395740091 0.4604259909
## 266 High 0.4571965619 0.5428034381
## 272 High 0.5720886850 0.4279113150
## 280 High 0.3082654815 0.6917345185
## 283 High 0.4354389857 0.5645610143
## 286 High 0.4391327329 0.5608672671
## 287 High 0.1041477820 0.8958522180
## 289 High 0.1437887272 0.8562112728
## 290 High 0.3282018426 0.6717981574
## 298 High 0.2939848627 0.7060151373
## 305 High 0.2492525888 0.7507474112
## 306 High 0.2398758877 0.7601241123
## 312 High 0.0973381705 0.9026618295
## 320 High 0.3058707588 0.6941292412
## 325 High 0.1771522624 0.8228477376
## 332 High 0.0556061812 0.9443938188
## 333 High 0.4847787185 0.5152212815
## 335 High 0.2939848627 0.7060151373
## 339 High 0.8371427968 0.1628572032
## 346 High 0.2382954634 0.7617045366
## 347 High 0.0327311005 0.9672688995
## 350 High 0.4097345492 0.5902654508
## 353 High 0.2663701600 0.7336298400
## 358 High 0.2182251914 0.7817748086
## 365 High 0.1922987093 0.8077012907
## 367 High 0.2058056954 0.7941943046
## 370 High 0.0651543422 0.9348456578
## 379 High 0.1327838973 0.8672161027
## 386 High 0.5247916818 0.4752083182
## 394 High 0.4697661230 0.5302338770
## 396 High 0.1215635914 0.8784364086
## 400 High 0.0136419825 0.9863580175
## 404 High 0.0493865148 0.9506134852
## 405 High 0.6323780570 0.3676219430
## 413 High 0.0931860048 0.9068139952
## 415 High 0.3133445051 0.6866554949
## 417 High 0.2643581720 0.7356418280
## 418 High 0.3679591512 0.6320408488
## 423 High 0.2714888903 0.7285111097
## 434 High 0.1877329621 0.8122670379
## 437 High 0.3396836347 0.6603163653
## 440 High 0.4302378412 0.5697621588
## 449 High 0.3642274184 0.6357725816
## 450 High 0.2638891235 0.7361108765
## 457 High 0.3642274184 0.6357725816
## 467 High 0.3843344069 0.6156655931
## 469 High 0.3427771468 0.6572228532
## 474 High 0.9448977678 0.0551022322
## 475 High 0.9069691143 0.0930308857
## 485 High 0.1206262001 0.8793737999
## 504 Low 0.2810994647 0.7189005353
## 511 Low 0.7713879404 0.2286120596
## 512 Low 0.3771323094 0.6228676906
## 517 Low 0.0796952208 0.9203047792
## 519 Low 0.6631915667 0.3368084333
## 520 Low 0.0365399927 0.9634600073
## 522 Low 0.9567591567 0.0432408433
## 527 Low 0.5472028728 0.4527971272
## 528 Low 0.4455816601 0.5544183399
## 529 Low 0.2855557521 0.7144442479
## 537 Low 0.0818301918 0.9181698082
## 540 Low 0.9324385799 0.0675614201
## 541 Low 0.7941384873 0.2058615127
## 547 Low 0.9225916586 0.0774083414
## 550 Low 0.7281742163 0.2718257837
## 555 Low 0.4391059978 0.5608940022
## 564 Low 0.0179846866 0.9820153134
## 570 Low 0.3080507810 0.6919492190
## 573 Low 0.2060658806 0.7939341194
## 575 Low 0.7941384873 0.2058615127
## 578 Low 0.2347216890 0.7652783110
## 581 Low 0.2060658806 0.7939341194
## 585 Low 0.3173015329 0.6826984671
## 590 Low 0.6875129427 0.3124870573
## 601 Low 0.7826540228 0.2173459772
## 602 Low 0.6143605885 0.3856394115
## 607 Low 0.4573521726 0.5426478274
## 610 Low 0.4429636059 0.5570363941
## 618 Low 0.7158484958 0.2841515042
## 624 Low 0.3173015329 0.6826984671
## 626 Low 0.1600193605 0.8399806395
## 627 Low 0.3215503018 0.6784496982
## 634 Low 0.6883858742 0.3116141258
## 640 Low 0.9844093361 0.0155906639
## 642 Low 0.2100918801 0.7899081199
## 643 Low 0.8692472243 0.1307527757
## 644 Low 0.8718338373 0.1281661627
## 645 Low 0.7731505226 0.2268494774
## 646 Low 0.9397436656 0.0602563344
## 647 Low 0.7401939612 0.2598060388
## 652 Low 0.1584162203 0.8415837797
## 658 Low 0.5345180440 0.4654819560
## 659 Low 0.9685770972 0.0314229028
## 660 Low 0.8148166346 0.1851833654
## 664 Low 0.3515609710 0.6484390290
## 666 Low 0.4641158585 0.5358841415
## 667 Low 0.8356306242 0.1643693758
## 675 Low 0.8692472243 0.1307527757
## 680 Low 0.9606395556 0.0393604444
## 681 Low 0.8406617830 0.1593382170
## 687 Low 0.9754003752 0.0245996248
## 694 Low 0.6710783028 0.3289216972
## 697 Low 0.5964862294 0.4035137706
## 701 Low 0.5884750688 0.4115249312
## 705 Low 0.8954203536 0.1045796464
## 707 Low 0.5851454633 0.4148545367
## 710 Low 0.8010389126 0.1989610874
## 716 Low 0.8184036632 0.1815963368
## 719 Low 0.9428266773 0.0571733227
## 720 Low 0.9847960378 0.0152039622
## 725 Low 0.9818803020 0.0181196980
## 727 Low 0.5884750688 0.4115249312
## 730 Low 0.5611518183 0.4388481817
## 738 Low 0.9081879879 0.0918120121
## 745 Low 0.9170977223 0.0829022777
## 748 Low 0.8391627813 0.1608372187
## 751 Low 0.8963099718 0.1036900282
## 756 Low 0.6480019501 0.3519980499
## 766 Low 0.6647629476 0.3352370524
## 769 Low 0.6639046057 0.3360953943
## 783 Low 0.8863780609 0.1136219391
## 785 Low 0.9847467994 0.0152532006
## 790 Low 0.8248336297 0.1751663703
## 793 Low 0.8863780609 0.1136219391
## 795 Low 0.9897095855 0.0102904145
## 796 Low 0.9850516420 0.0149483580
## 797 Low 0.4811591379 0.5188408621
## 801 Low 0.7986667815 0.2013332185
## 811 Low 0.5989448209 0.4010551791
## 812 Low 0.8865111374 0.1134888626
## 815 Low 0.9820707051 0.0179292949
## 816 Low 0.8050772583 0.1949227417
## 817 Low 0.9939985174 0.0060014826
## 824 Low 0.8901108481 0.1098891519
## 825 Low 0.8901108481 0.1098891519
## 826 Low 0.8901108481 0.1098891519
## 830 Low 0.9863085584 0.0136914416
## 837 Low 0.9927031149 0.0072968851
## 838 Low 0.8050772583 0.1949227417
## 844 Low 0.9899820408 0.0100179592
## 845 Low 0.9980207841 0.0019792159
## 847 Low 0.9347533178 0.0652466822
## 850 Low 0.9018688685 0.0981311315
## 852 Low 0.9155921637 0.0844078363
## 853 Low 0.9155921637 0.0844078363
## 861 Low 0.9288460568 0.0711539432
## 868 Low 0.9983178675 0.0016821325
## 874 Low 0.9409936465 0.0590063535
## 879 High 0.0743114149 0.9256885851
## 895 High 0.0058470295 0.9941529705
## 899 High 0.0007541913 0.9992458087
## 903 High 0.0058470295 0.9941529705
## 917 High 0.0541474152 0.9458525848
## 927 High 0.0127138418 0.9872861582
## 929 High 0.0762280843 0.9237719157
## 931 High 0.0127138418 0.9872861582
## 933 High 0.3431573442 0.6568426558
## 944 High 0.0283606163 0.9716393837
## 947 High 0.0246356037 0.9753643963
## 949 High 0.1676976167 0.8323023833
## 953 High 0.0325221723 0.9674778277
## 958 High 0.5930878020 0.4069121980
## 961 High 0.0095523084 0.9904476916
## 963 High 0.0969115592 0.9030884408
## 964 High 0.0436230271 0.9563769729
## 973 High 0.0478986779 0.9521013221
## 976 High 0.0179846866 0.9820153134
## 977 High 0.1108113031 0.8891886969
## 980 High 0.3323177208 0.6676822792
## 983 High 0.6441963861 0.3558036139
## 984 High 0.1108113031 0.8891886969
## 986 High 0.0973381705 0.9026618295
## 989 High 0.1679011559 0.8320988441
## 991 High 0.0218893139 0.9781106861
## 996 High 0.0176152778 0.9823847222
## 997 High 0.4484937124 0.5515062876
## 999 High 0.0354283423 0.9645716577
## 1000 High 0.0598930376 0.9401069624
## 1003 High 0.0136419825 0.9863580175
## 1008 High 0.0805914802 0.9194085198
## 1009 High 0.4354389857 0.5645610143
## 1014 High 0.0282625766 0.9717374234
## 1015 High 0.4604541718 0.5395458282
## 1040 High 0.1834826180 0.8165173820
## 1042 High 0.3502196618 0.6497803382
## 1043 High 0.7301353926 0.2698646074
## 1050 High 0.1128584616 0.8871415384
## 1052 High 0.1326642328 0.8673357672
## 1056 High 0.4214253198 0.5785746802
## 1070 High 0.5016341979 0.4983658021
## 1073 High 0.5016785964 0.4983214036
## 1074 High 0.1240863502 0.8759136498
## 1079 High 0.4087109291 0.5912890709
## 1080 High 0.6775015397 0.3224984603
## 1085 High 0.1017757206 0.8982242794
## 1087 High 0.5255391386 0.4744608614
## 1096 High 0.8643165720 0.1356834280
## 1099 High 0.4099125986 0.5900874014
## 1100 High 0.7154999827 0.2845000173
## 1102 High 0.0887503601 0.9112496399
## 1107 Low 0.6788519384 0.3211480616
## 1109 Low 0.6292550701 0.3707449299
## 1114 Low 0.7123656715 0.2876343285
## 1118 Low 0.2889459081 0.7110540919
## 1123 Low 0.4860246521 0.5139753479
## 1132 Low 0.9313431964 0.0686568036
## 1134 Low 0.5204468329 0.4795531671
## 1137 Low 0.3215503018 0.6784496982
## 1154 Low 0.3215503018 0.6784496982
## 1155 Low 0.4988737937 0.5011262063
## 1157 Low 0.6870639639 0.3129360361
## 1162 Low 0.8692472243 0.1307527757
## 1164 Low 0.2100918801 0.7899081199
## 1171 Low 0.8963099718 0.1036900282
## 1172 Low 0.5491594648 0.4508405352
## 1175 Low 0.5561457281 0.4438542719
## 1177 Low 0.7164966875 0.2835033125
## 1179 Low 0.9138172674 0.0861827326
## 1183 Low 0.4123464196 0.5876535804
## 1185 Low 0.9073388166 0.0926611834
## 1189 Low 0.7755333295 0.2244666705
## 1211 Low 0.9600742008 0.0399257992
## 1218 Low 0.9898292244 0.0101707756
## 1224 Low 0.4045598255 0.5954401745
## 1225 Low 0.5884750688 0.4115249312
## 1227 Low 0.8791559614 0.1208440386
## 1232 Low 0.9872441389 0.0127558611
## 1235 Low 0.6380743568 0.3619256432
## 1238 Low 0.6179361327 0.3820638673
## 1240 Low 0.8199182266 0.1800817734
## 1241 Low 0.9600742008 0.0399257992
## 1248 Low 0.9863085584 0.0136914416
## 1258 Low 0.8050772583 0.1949227417
## 1261 Low 0.9027349430 0.0972650570
## 1263 Low 0.8901108481 0.1098891519
## 1269 Low 0.9568830828 0.0431169172
## 1270 Low 0.9882558677 0.0117441323
## 1271 Low 0.9018688685 0.0981311315
## 1272 Low 0.9155921637 0.0844078363
## 1280 Low 0.9409936465 0.0590063535
## 1286 Low 0.9990096848 0.0009903152
## 1287 Low 0.9990849676 0.0009150324
## 1289 Low 0.9599001175 0.0400998825
## 1290 Low 0.9155921637 0.0844078363
## 1291 High 0.1594073766 0.8405926234
## 1294 High 0.8044109182 0.1955890818
## 1305 Low 0.8477696040 0.1522303960
## 1308 High 0.6914165831 0.3085834169
##################################
# Reporting the independent evaluation results
# for the test set
##################################
LR_LOG10T_Test_ROC <- roc(response = LR_LOG10T_Test$LR_LOG10T_Observed,
predictor = LR_LOG10T_Test$LR_LOG10T_Predicted.High,
levels = rev(levels(LR_LOG10T_Test$LR_LOG10T_Observed)))
(LR_LOG10T_Test_ROCCurveAUC <- auc(LR_LOG10T_Test_ROC)[1])
## [1] 0.8988237
1.5.7 Logistic Regression With Natural Logarithm Transformation (LR_LNT)
Logistic Regression
models the probability of an event (one of two outcome levels) by
expressing the log-odds of the event as a linear combination of a set of
predictors weighted by their respective parameter estimates. The
parameters are estimated via maximum likelihood estimation, which
iteratively evaluates candidate values to optimize the fit of the
log-odds. Each observation's conditional probability contributes its
logarithm to the log-likelihood function, which logistic regression
maximizes to find the best parameter estimates. Given the optimal
parameters, the predicted probability for each observation can be
calculated directly from its linear predictor.
Natural
Logarithm Transformation applies an algebraic transformation that
replaces each variable with its natural logarithm, using a base equal to
the constant e (2.7182818). As the logarithm of zero or any negative
number is undefined, a constant must be added to shift the minimum value
of the distribution, preferably to 1.
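A related numerical note: base R's log1p() computes log(1 + x) directly
and matches the manual +1 shift used in the code below; a minimal check
with made-up values:
##################################
# Illustrative sketch only:
# log1p(x) is equivalent to
# log(x + 1)
##################################
x <- c(0, 0.5, 9) # hypothetical non-negative values
all.equal(log(x + 1), log1p(x)) # TRUE; log1p is more accurate for small x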
[A] The logistic regression model from the
stats
package was implemented through the
caret
package with natural logarithm transformation applied to treat data
skewness, but no treatment was applied for data outliers.
[B] The model does not contain any
hyperparameter.
[C] The cross-validated model performance of the final
model is summarized as follows:
[C.1] Final model configuration is fixed due to
the absence of a hyperparameter
[C.2] ROC Curve AUC = 0.89192
[D] The model allows for ranking of predictors in terms
of variable importance. The top-performing predictors in the model are
as follows:
[D.1] HydrophilicFactor variable (numeric)
[D.2] NumNonHAtoms variable (numeric)
[D.3] NumAtoms
variable (numeric)
[E] The independent test model performance of the final
model is summarized as follows:
[E.1] ROC Curve AUC = 0.89882
##################################
# Applying a +1 adjustment
# to eliminate zero values and
# the natural logarithm function
##################################
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
for (i in 1:(ncol(PMA_PreModelling_Train)-1)){
PMA_PreModelling_Train[,i] <- PMA_PreModelling_Train[,i] + 1
PMA_PreModelling_Train[,i] <- log(PMA_PreModelling_Train[,i])
}
for (i in 1:(ncol(PMA_PreModelling_Test)-1)){
PMA_PreModelling_Test[,i] <- PMA_PreModelling_Test[,i] + 1
PMA_PreModelling_Test[,i] <- log(PMA_PreModelling_Test[,i])
}
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)]
##################################
# Treating data skewness
# for the train set
##################################
PMA_PreModelling_Train_LR_LNT <- PMA_PreModelling_Train_LR
PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Train_LR_LNT[,sapply(PMA_PreModelling_Train_LR_LNT, is.numeric)],
y = PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_LNT Train Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Train_LR_LNT[,sapply(PMA_PreModelling_Train_LR_LNT, is.numeric)],
y = PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_LNT Train Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class)))))

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices,
summaryFunction = twoClassSummary,
classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
LR_LNT_Tune <- train(x = PMA_PreModelling_Train_LR_LNT[,!names(PMA_PreModelling_Train_LR_LNT) %in% c("Log_Solubility_Class")],
y = PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class,
method = "glm",
metric = "ROC",
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_LNT_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.891921 0.7492802 0.8245646
LR_LNT_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 11.4167 1.2717 1.4241 -6.1614
## NumCarbon
## 0.4056
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 767.4 AIC: 777.4
LR_LNT_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.891921 0.7492802 0.8245646 0.03393011 0.06711081 0.06719134
(LR_LNT_Train_ROCCurveAUC <- LR_LNT_Tune$results$ROC)
## [1] 0.891921
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_LNT_VarImp <- varImp(LR_LNT_Tune, scale = TRUE)
plot(LR_LNT_VarImp,
top=4,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Logistic Regression",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Treating data skewness
# for the test set
##################################
PMA_PreModelling_Test_LR_LNT <- PMA_PreModelling_Test_LR
PMA_PreModelling_Test_LR_LNT$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Test_LR_LNT[,sapply(PMA_PreModelling_Test_LR_LNT, is.numeric)],
y = PMA_PreModelling_Test_LR_LNT$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_LNT Test Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Test_LR_LNT[,sapply(PMA_PreModelling_Test_LR_LNT, is.numeric)],
y = PMA_PreModelling_Test_LR_LNT$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_LNT Test Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Test_LR_LNT$Log_Solubility_Class)))))

##################################
# Independently evaluating the model
# on the test set
##################################
LR_LNT_Test <- data.frame(LR_LNT_Observed = PMA_PreModelling_Test_LR_LNT$Log_Solubility_Class,
LR_LNT_Predicted = predict(LR_LNT_Tune,
PMA_PreModelling_Test_LR_LNT[,!names(PMA_PreModelling_Test_LR_LNT) %in% c("Log_Solubility_Class")],
type = "prob"))
LR_LNT_Test
## LR_LNT_Observed LR_LNT_Predicted.Low LR_LNT_Predicted.High
## 20 High 0.0114244517 0.9885755483
## 21 High 0.0080310909 0.9919689091
## 23 High 0.0354283423 0.9645716577
## 25 High 0.0113319188 0.9886680812
## 28 High 0.0603784377 0.9396215623
## 31 High 0.0126421937 0.9873578063
## 32 High 0.0204088121 0.9795911879
## 33 High 0.0401610347 0.9598389653
## 34 High 0.0401610347 0.9598389653
## 37 High 0.3543519894 0.6456480106
## 38 High 0.3543519894 0.6456480106
## 42 High 0.4516439293 0.5483560707
## 49 High 0.1326709535 0.8673290465
## 54 High 0.0127138418 0.9872861582
## 55 High 0.0171425230 0.9828574770
## 58 High 0.4508886389 0.5491113611
## 60 High 0.0458725899 0.9541274101
## 61 High 0.0401610347 0.9598389653
## 65 High 0.0422682755 0.9577317245
## 69 High 0.5360931241 0.4639068759
## 73 High 0.0313344498 0.9686655502
## 86 High 0.0379168490 0.9620831510
## 90 High 0.0246356037 0.9753643963
## 91 High 0.0112519351 0.9887480649
## 93 High 0.0246356037 0.9753643963
## 96 High 0.0112519351 0.9887480649
## 98 High 0.0281163829 0.9718836171
## 100 High 0.0455991395 0.9544008605
## 104 High 0.8596843413 0.1403156587
## 112 High 0.1899203318 0.8100796682
## 115 High 0.8830092589 0.1169907411
## 119 High 0.1145296081 0.8854703919
## 128 High 0.1145296081 0.8854703919
## 130 High 0.0186594977 0.9813405023
## 139 High 0.0186594977 0.9813405023
## 143 High 0.0369097104 0.9630902896
## 145 High 0.0632876797 0.9367123203
## 146 High 0.1145296081 0.8854703919
## 149 High 0.1330597875 0.8669402125
## 150 High 0.0881581152 0.9118418848
## 152 High 0.0401610347 0.9598389653
## 157 High 0.5491594648 0.4508405352
## 161 High 0.1818965999 0.8181034001
## 162 High 0.0114108307 0.9885891693
## 166 High 0.5306213331 0.4693786669
## 167 High 0.1130629002 0.8869370998
## 173 High 0.0762280843 0.9237719157
## 176 High 0.1145296081 0.8854703919
## 182 High 0.0040896523 0.9959103477
## 187 High 0.0411811702 0.9588188298
## 190 High 0.0232842897 0.9767157103
## 194 High 0.0175444271 0.9824555729
## 195 High 0.1108113031 0.8891886969
## 201 High 0.0436230271 0.9563769729
## 207 High 0.0716468222 0.9283531778
## 208 High 0.5395740091 0.4604259909
## 215 High 0.0126421937 0.9873578063
## 222 High 0.1592460118 0.8407539882
## 224 High 0.2064348880 0.7935651120
## 231 High 0.7632446817 0.2367553183
## 236 High 0.0866704378 0.9133295622
## 237 High 0.0136419825 0.9863580175
## 240 High 0.2146524994 0.7853475006
## 243 High 0.0436230271 0.9563769729
## 248 High 0.1108113031 0.8891886969
## 251 High 0.6961143000 0.3038857000
## 256 High 0.3070340842 0.6929659158
## 258 High 0.1230410001 0.8769589999
## 262 High 0.5395740091 0.4604259909
## 266 High 0.4571965619 0.5428034381
## 272 High 0.5720886850 0.4279113150
## 280 High 0.3082654815 0.6917345185
## 283 High 0.4354389857 0.5645610143
## 286 High 0.4391327329 0.5608672671
## 287 High 0.1041477820 0.8958522180
## 289 High 0.1437887272 0.8562112728
## 290 High 0.3282018426 0.6717981574
## 298 High 0.2939848627 0.7060151373
## 305 High 0.2492525888 0.7507474112
## 306 High 0.2398758877 0.7601241123
## 312 High 0.0973381705 0.9026618295
## 320 High 0.3058707588 0.6941292412
## 325 High 0.1771522624 0.8228477376
## 332 High 0.0556061812 0.9443938188
## 333 High 0.4847787185 0.5152212815
## 335 High 0.2939848627 0.7060151373
## 339 High 0.8371427968 0.1628572032
## 346 High 0.2382954634 0.7617045366
## 347 High 0.0327311005 0.9672688995
## 350 High 0.4097345492 0.5902654508
## 353 High 0.2663701600 0.7336298400
## 358 High 0.2182251914 0.7817748086
## 365 High 0.1922987093 0.8077012907
## 367 High 0.2058056954 0.7941943046
## 370 High 0.0651543422 0.9348456578
## 379 High 0.1327838973 0.8672161027
## 386 High 0.5247916818 0.4752083182
## 394 High 0.4697661230 0.5302338770
## 396 High 0.1215635914 0.8784364086
## 400 High 0.0136419825 0.9863580175
## 404 High 0.0493865148 0.9506134852
## 405 High 0.6323780570 0.3676219430
## 413 High 0.0931860048 0.9068139952
## 415 High 0.3133445051 0.6866554949
## 417 High 0.2643581720 0.7356418280
## 418 High 0.3679591512 0.6320408488
## 423 High 0.2714888903 0.7285111097
## 434 High 0.1877329621 0.8122670379
## 437 High 0.3396836347 0.6603163653
## 440 High 0.4302378412 0.5697621588
## 449 High 0.3642274184 0.6357725816
## 450 High 0.2638891235 0.7361108765
## 457 High 0.3642274184 0.6357725816
## 467 High 0.3843344069 0.6156655931
## 469 High 0.3427771468 0.6572228532
## 474 High 0.9448977678 0.0551022322
## 475 High 0.9069691143 0.0930308857
## 485 High 0.1206262001 0.8793737999
## 504 Low 0.2810994647 0.7189005353
## 511 Low 0.7713879404 0.2286120596
## 512 Low 0.3771323094 0.6228676906
## 517 Low 0.0796952208 0.9203047792
## 519 Low 0.6631915667 0.3368084333
## 520 Low 0.0365399927 0.9634600073
## 522 Low 0.9567591567 0.0432408433
## 527 Low 0.5472028728 0.4527971272
## 528 Low 0.4455816601 0.5544183399
## 529 Low 0.2855557521 0.7144442479
## 537 Low 0.0818301918 0.9181698082
## 540 Low 0.9324385799 0.0675614201
## 541 Low 0.7941384873 0.2058615127
## 547 Low 0.9225916586 0.0774083414
## 550 Low 0.7281742163 0.2718257837
## 555 Low 0.4391059978 0.5608940022
## 564 Low 0.0179846866 0.9820153134
## 570 Low 0.3080507810 0.6919492190
## 573 Low 0.2060658806 0.7939341194
## 575 Low 0.7941384873 0.2058615127
## 578 Low 0.2347216890 0.7652783110
## 581 Low 0.2060658806 0.7939341194
## 585 Low 0.3173015329 0.6826984671
## 590 Low 0.6875129427 0.3124870573
## 601 Low 0.7826540228 0.2173459772
## 602 Low 0.6143605885 0.3856394115
## 607 Low 0.4573521726 0.5426478274
## 610 Low 0.4429636059 0.5570363941
## 618 Low 0.7158484958 0.2841515042
## 624 Low 0.3173015329 0.6826984671
## 626 Low 0.1600193605 0.8399806395
## 627 Low 0.3215503018 0.6784496982
## 634 Low 0.6883858742 0.3116141258
## 640 Low 0.9844093361 0.0155906639
## 642 Low 0.2100918801 0.7899081199
## 643 Low 0.8692472243 0.1307527757
## 644 Low 0.8718338373 0.1281661627
## 645 Low 0.7731505226 0.2268494774
## 646 Low 0.9397436656 0.0602563344
## 647 Low 0.7401939612 0.2598060388
## 652 Low 0.1584162203 0.8415837797
## 658 Low 0.5345180440 0.4654819560
## 659 Low 0.9685770972 0.0314229028
## 660 Low 0.8148166346 0.1851833654
## 664 Low 0.3515609710 0.6484390290
## 666 Low 0.4641158585 0.5358841415
## 667 Low 0.8356306242 0.1643693758
## 675 Low 0.8692472243 0.1307527757
## 680 Low 0.9606395556 0.0393604444
## 681 Low 0.8406617830 0.1593382170
## 687 Low 0.9754003752 0.0245996248
## 694 Low 0.6710783028 0.3289216972
## 697 Low 0.5964862294 0.4035137706
## 701 Low 0.5884750688 0.4115249312
## 705 Low 0.8954203536 0.1045796464
## 707 Low 0.5851454633 0.4148545367
## 710 Low 0.8010389126 0.1989610874
## 716 Low 0.8184036632 0.1815963368
## 719 Low 0.9428266773 0.0571733227
## 720 Low 0.9847960378 0.0152039622
## 725 Low 0.9818803020 0.0181196980
## 727 Low 0.5884750688 0.4115249312
## 730 Low 0.5611518183 0.4388481817
## 738 Low 0.9081879879 0.0918120121
## 745 Low 0.9170977223 0.0829022777
## 748 Low 0.8391627813 0.1608372187
## 751 Low 0.8963099718 0.1036900282
## 756 Low 0.6480019501 0.3519980499
## 766 Low 0.6647629476 0.3352370524
## 769 Low 0.6639046057 0.3360953943
## 783 Low 0.8863780609 0.1136219391
## 785 Low 0.9847467994 0.0152532006
## 790 Low 0.8248336297 0.1751663703
## 793 Low 0.8863780609 0.1136219391
## 795 Low 0.9897095855 0.0102904145
## 796 Low 0.9850516420 0.0149483580
## 797 Low 0.4811591379 0.5188408621
## 801 Low 0.7986667815 0.2013332185
## 811 Low 0.5989448209 0.4010551791
## 812 Low 0.8865111374 0.1134888626
## 815 Low 0.9820707051 0.0179292949
## 816 Low 0.8050772583 0.1949227417
## 817 Low 0.9939985174 0.0060014826
## 824 Low 0.8901108481 0.1098891519
## 825 Low 0.8901108481 0.1098891519
## 826 Low 0.8901108481 0.1098891519
## 830 Low 0.9863085584 0.0136914416
## 837 Low 0.9927031149 0.0072968851
## 838 Low 0.8050772583 0.1949227417
## 844 Low 0.9899820408 0.0100179592
## 845 Low 0.9980207841 0.0019792159
## 847 Low 0.9347533178 0.0652466822
## 850 Low 0.9018688685 0.0981311315
## 852 Low 0.9155921637 0.0844078363
## 853 Low 0.9155921637 0.0844078363
## 861 Low 0.9288460568 0.0711539432
## 868 Low 0.9983178675 0.0016821325
## 874 Low 0.9409936465 0.0590063535
## 879 High 0.0743114149 0.9256885851
## 895 High 0.0058470295 0.9941529705
## 899 High 0.0007541913 0.9992458087
## 903 High 0.0058470295 0.9941529705
## 917 High 0.0541474152 0.9458525848
## 927 High 0.0127138418 0.9872861582
## 929 High 0.0762280843 0.9237719157
## 931 High 0.0127138418 0.9872861582
## 933 High 0.3431573442 0.6568426558
## 944 High 0.0283606163 0.9716393837
## 947 High 0.0246356037 0.9753643963
## 949 High 0.1676976167 0.8323023833
## 953 High 0.0325221723 0.9674778277
## 958 High 0.5930878020 0.4069121980
## 961 High 0.0095523084 0.9904476916
## 963 High 0.0969115592 0.9030884408
## 964 High 0.0436230271 0.9563769729
## 973 High 0.0478986779 0.9521013221
## 976 High 0.0179846866 0.9820153134
## 977 High 0.1108113031 0.8891886969
## 980 High 0.3323177208 0.6676822792
## 983 High 0.6441963861 0.3558036139
## 984 High 0.1108113031 0.8891886969
## 986 High 0.0973381705 0.9026618295
## 989 High 0.1679011559 0.8320988441
## 991 High 0.0218893139 0.9781106861
## 996 High 0.0176152778 0.9823847222
## 997 High 0.4484937124 0.5515062876
## 999 High 0.0354283423 0.9645716577
## 1000 High 0.0598930376 0.9401069624
## 1003 High 0.0136419825 0.9863580175
## 1008 High 0.0805914802 0.9194085198
## 1009 High 0.4354389857 0.5645610143
## 1014 High 0.0282625766 0.9717374234
## 1015 High 0.4604541718 0.5395458282
## 1040 High 0.1834826180 0.8165173820
## 1042 High 0.3502196618 0.6497803382
## 1043 High 0.7301353926 0.2698646074
## 1050 High 0.1128584616 0.8871415384
## 1052 High 0.1326642328 0.8673357672
## 1056 High 0.4214253198 0.5785746802
## 1070 High 0.5016341979 0.4983658021
## 1073 High 0.5016785964 0.4983214036
## 1074 High 0.1240863502 0.8759136498
## 1079 High 0.4087109291 0.5912890709
## 1080 High 0.6775015397 0.3224984603
## 1085 High 0.1017757206 0.8982242794
## 1087 High 0.5255391386 0.4744608614
## 1096 High 0.8643165720 0.1356834280
## 1099 High 0.4099125986 0.5900874014
## 1100 High 0.7154999827 0.2845000173
## 1102 High 0.0887503601 0.9112496399
## 1107 Low 0.6788519384 0.3211480616
## 1109 Low 0.6292550701 0.3707449299
## 1114 Low 0.7123656715 0.2876343285
## 1118 Low 0.2889459081 0.7110540919
## 1123 Low 0.4860246521 0.5139753479
## 1132 Low 0.9313431964 0.0686568036
## 1134 Low 0.5204468329 0.4795531671
## 1137 Low 0.3215503018 0.6784496982
## 1154 Low 0.3215503018 0.6784496982
## 1155 Low 0.4988737937 0.5011262063
## 1157 Low 0.6870639639 0.3129360361
## 1162 Low 0.8692472243 0.1307527757
## 1164 Low 0.2100918801 0.7899081199
## 1171 Low 0.8963099718 0.1036900282
## 1172 Low 0.5491594648 0.4508405352
## 1175 Low 0.5561457281 0.4438542719
## 1177 Low 0.7164966875 0.2835033125
## 1179 Low 0.9138172674 0.0861827326
## 1183 Low 0.4123464196 0.5876535804
## 1185 Low 0.9073388166 0.0926611834
## 1189 Low 0.7755333295 0.2244666705
## 1211 Low 0.9600742008 0.0399257992
## 1218 Low 0.9898292244 0.0101707756
## 1224 Low 0.4045598255 0.5954401745
## 1225 Low 0.5884750688 0.4115249312
## 1227 Low 0.8791559614 0.1208440386
## 1232 Low 0.9872441389 0.0127558611
## 1235 Low 0.6380743568 0.3619256432
## 1238 Low 0.6179361327 0.3820638673
## 1240 Low 0.8199182266 0.1800817734
## 1241 Low 0.9600742008 0.0399257992
## 1248 Low 0.9863085584 0.0136914416
## 1258 Low 0.8050772583 0.1949227417
## 1261 Low 0.9027349430 0.0972650570
## 1263 Low 0.8901108481 0.1098891519
## 1269 Low 0.9568830828 0.0431169172
## 1270 Low 0.9882558677 0.0117441323
## 1271 Low 0.9018688685 0.0981311315
## 1272 Low 0.9155921637 0.0844078363
## 1280 Low 0.9409936465 0.0590063535
## 1286 Low 0.9990096848 0.0009903152
## 1287 Low 0.9990849676 0.0009150324
## 1289 Low 0.9599001175 0.0400998825
## 1290 Low 0.9155921637 0.0844078363
## 1291 High 0.1594073766 0.8405926234
## 1294 High 0.8044109182 0.1955890818
## 1305 Low 0.8477696040 0.1522303960
## 1308 High 0.6914165831 0.3085834169
##################################
# Reporting the independent evaluation results
# for the test set
##################################
LR_LNT_Test_ROC <- roc(response = LR_LNT_Test$LR_LNT_Observed,
predictor = LR_LNT_Test$LR_LNT_Predicted.High,
levels = rev(levels(LR_LNT_Test$LR_LNT_Observed)))
(LR_LNT_Test_ROCCurveAUC <- auc(LR_LNT_Test_ROC)[1])
## [1] 0.8988237
1.5.8 Logistic Regression With Square Root Transformation (LR_SRT)
Logistic Regression
models the probability of an event (one of two outcome levels) by
expressing the log-odds of the event as a linear combination of a set of
predictors weighted by their respective parameter estimates. The
parameters are estimated via maximum likelihood estimation, which
iteratively evaluates candidate values to optimize the fit of the
log-odds. Each observation's conditional probability contributes its
logarithm to the log-likelihood function, which logistic regression
maximizes to find the best parameter estimates. Given the optimal
parameters, the predicted probability for each observation can be
calculated directly from its linear predictor.
Square Root
Transformation applies an algebraic transformation that replaces each
variable with its square root. While the function can be applied to zero
values, the square root of any negative number is undefined, so a
constant must be added to shift the minimum value of the distribution,
preferably to at least 0.
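As a minimal illustration with made-up values, mirroring the +1
adjustment in the code below that keeps predictors with negative values
(such as HydrophilicFactor) inside the square root's domain:
##################################
# Illustrative sketch only:
# shifting before the square root
# to avoid NaN values
##################################
sqrt(-1) # returns NaN with a warning
x <- c(-0.985, 0, 3) # hypothetical values including a negative
sqrt(x + 1) # shifted so every argument is non-negative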
[A] The logistic regression model from the
stats
package was implemented through the
caret
package with square root transformation applied to treat data skewness,
but no treatment was applied for data outliers.
[B] The model does not contain any
hyperparameter.
[C] The cross-validated model performance of the final
model is summarized as follows:
[C.1] Final model configuration is fixed due to
the absence of a hyperparameter
[C.2] ROC Curve AUC = 0.88441
[D] The model allows for ranking of predictors in terms
of variable importance. The top-performing predictors in the model are
as follows:
[D.1] HydrophilicFactor variable (numeric)
[D.2] NumNonHAtoms variable (numeric)
[D.3] NumAtoms
variable (numeric)
[E] The independent test model performance of the final
model is summarized as follows:
[E.1] ROC Curve AUC = 0.89882
##################################
# Applying a +1 adjustment
# and the square root function
##################################
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
for (i in 1:(ncol(PMA_PreModelling_Train)-1)){
PMA_PreModelling_Train[,i] <- PMA_PreModelling_Train[,i] + 1
PMA_PreModelling_Train[,i] <- sqrt(PMA_PreModelling_Train[,i])
}
for (i in 1:(ncol(PMA_PreModelling_Test)-1)){
PMA_PreModelling_Test[,i] <- PMA_PreModelling_Test[,i] + 1
PMA_PreModelling_Test[,i] <- sqrt(PMA_PreModelling_Test[,i])
}
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)]
##################################
# Treating data skewness
# for the train set
##################################
PMA_PreModelling_Train_LR_SRT <- PMA_PreModelling_Train_LR
PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Train_LR_SRT[,sapply(PMA_PreModelling_Train_LR_SRT, is.numeric)],
y = PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_SRT Train Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Train_LR_SRT[,sapply(PMA_PreModelling_Train_LR_SRT, is.numeric)],
y = PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_SRT Train Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class)))))

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices,
summaryFunction = twoClassSummary,
classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
LR_SRT_Tune <- train(x = PMA_PreModelling_Train_LR_SRT[,!names(PMA_PreModelling_Train_LR_SRT) %in% c("Log_Solubility_Class")],
y = PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class,
method = "glm",
metric = "ROC",
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_SRT_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8844149 0.7163344 0.830225
LR_SRT_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 6.2882 2.7830 0.5132 -2.7168
## NumCarbon
## -0.3034
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 800.3 AIC: 810.3
LR_SRT_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8844149 0.7163344 0.830225 0.03385158 0.07539679 0.06335689
(LR_SRT_Train_ROCCurveAUC <- LR_SRT_Tune$results$ROC)
## [1] 0.8844149
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_SRT_VarImp <- varImp(LR_SRT_Tune, scale = TRUE)
plot(LR_SRT_VarImp,
top=4,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Logistic Regression",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Treating data skewness
# for the test set
##################################
PMA_PreModelling_Test_LR_SRT <- PMA_PreModelling_Test_LR
PMA_PreModelling_Test_LR_SRT$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Test_LR_SRT[,sapply(PMA_PreModelling_Test_LR_SRT, is.numeric)],
y = PMA_PreModelling_Test_LR_SRT$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_SRT Test Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Test_LR_SRT[,sapply(PMA_PreModelling_Test_LR_SRT, is.numeric)],
y = PMA_PreModelling_Test_LR_SRT$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_SRT Test Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Test_LR_SRT$Log_Solubility_Class)))))

##################################
# Independently evaluating the model
# on the test set
##################################
LR_SRT_Test <- data.frame(LR_SRT_Observed = PMA_PreModelling_Test_LR_SRT$Log_Solubility_Class,
LR_SRT_Predicted = predict(LR_SRT_Tune,
PMA_PreModelling_Test_LR_SRT[,!names(PMA_PreModelling_Test_LR_SRT) %in% c("Log_Solubility_Class")],
type = "prob"))
LR_SRT_Test
## LR_SRT_Observed LR_SRT_Predicted.Low LR_SRT_Predicted.High
## 20 High 0.0171776680 0.982822332
## 21 High 0.0094733300 0.990526670
## 23 High 0.0429512262 0.957048774
## 25 High 0.0166962209 0.983303779
## 28 High 0.0615130927 0.938486907
## 31 High 0.0346712284 0.965328772
## 32 High 0.0572263673 0.942773633
## 33 High 0.0870345720 0.912965428
## 34 High 0.0870345720 0.912965428
## 37 High 0.3518170970 0.648182903
## 38 High 0.3518170970 0.648182903
## 42 High 0.5218061033 0.478193897
## 49 High 0.1799027090 0.820097291
## 54 High 0.0328468786 0.967153121
## 55 High 0.0151348891 0.984865111
## 58 High 0.4317935721 0.568206428
## 60 High 0.0847834160 0.915216584
## 61 High 0.0870345720 0.912965428
## 65 High 0.0573837578 0.942616242
## 69 High 0.5187323178 0.481267682
## 73 High 0.0088225257 0.991177474
## 86 High 0.0731684575 0.926831543
## 90 High 0.0538625246 0.946137475
## 91 High 0.0064373829 0.993562617
## 93 High 0.0538625246 0.946137475
## 96 High 0.0064373829 0.993562617
## 98 High 0.0598648467 0.940135153
## 100 High 0.0719361787 0.928063821
## 104 High 0.9099158472 0.090084153
## 112 High 0.2285023414 0.771497659
## 115 High 0.8750053683 0.124994632
## 119 High 0.1622573557 0.837742644
## 128 High 0.1622573557 0.837742644
## 130 High 0.0161060102 0.983893990
## 139 High 0.0161060102 0.983893990
## 143 High 0.0417491435 0.958250856
## 145 High 0.1137434063 0.886256594
## 146 High 0.1622573557 0.837742644
## 149 High 0.1593165890 0.840683411
## 150 High 0.1398798250 0.860120175
## 152 High 0.0870345720 0.912965428
## 157 High 0.5055646637 0.494435336
## 161 High 0.2398674752 0.760132525
## 162 High 0.0066302108 0.993369789
## 166 High 0.5237831608 0.476216839
## 167 High 0.1651313064 0.834868694
## 173 High 0.1259453369 0.874054663
## 176 High 0.1622573557 0.837742644
## 182 High 0.0222998583 0.977700142
## 187 High 0.0370880559 0.962911944
## 190 High 0.0134601255 0.986539874
## 194 High 0.0291152071 0.970884793
## 195 High 0.1750006699 0.824999330
## 201 High 0.0832309264 0.916769074
## 207 High 0.1222726588 0.877727341
## 208 High 0.4877656097 0.512234390
## 215 High 0.0346712284 0.965328772
## 222 High 0.2353885107 0.764611489
## 224 High 0.1922505225 0.807749477
## 231 High 0.7189699421 0.281030058
## 236 High 0.1056715454 0.894328455
## 237 High 0.0456519036 0.954348096
## 240 High 0.2128438968 0.787156103
## 243 High 0.0832309264 0.916769074
## 248 High 0.1750006699 0.824999330
## 251 High 0.7704009245 0.229599076
## 256 High 0.3841407732 0.615859227
## 258 High 0.1701526065 0.829847393
## 262 High 0.4877656097 0.512234390
## 266 High 0.4606389841 0.539361016
## 272 High 0.5616128480 0.438387152
## 280 High 0.3111471891 0.688852811
## 283 High 0.4319894761 0.568010524
## 286 High 0.4462380183 0.553761982
## 287 High 0.1669680937 0.833031906
## 289 High 0.1191373826 0.880862617
## 290 High 0.3312408721 0.668759128
## 298 High 0.3172571574 0.682742843
## 305 High 0.2944486733 0.705551327
## 306 High 0.1943816153 0.805618385
## 312 High 0.0989000206 0.901099979
## 320 High 0.3131330745 0.686866926
## 325 High 0.1824338561 0.817566144
## 332 High 0.0623662697 0.937633730
## 333 High 0.4443815794 0.555618421
## 335 High 0.3172571574 0.682742843
## 339 High 0.7526842928 0.247315707
## 346 High 0.3267166104 0.673283390
## 347 High 0.0533733809 0.946626619
## 350 High 0.3802631772 0.619736823
## 353 High 0.3335196993 0.666480301
## 358 High 0.3045483863 0.695451614
## 365 High 0.2268214855 0.773178514
## 367 High 0.1951627467 0.804837253
## 370 High 0.0351733764 0.964826624
## 379 High 0.1045430079 0.895456992
## 386 High 0.3769454782 0.623054522
## 394 High 0.5261865734 0.473813427
## 396 High 0.1296532429 0.870346757
## 400 High 0.0456519036 0.954348096
## 404 High 0.0482692739 0.951730726
## 405 High 0.5987312318 0.401268768
## 413 High 0.1269081773 0.873091823
## 415 High 0.3684495301 0.631550470
## 417 High 0.2322587979 0.767741202
## 418 High 0.4327304582 0.567269542
## 423 High 0.2911428364 0.708857164
## 434 High 0.2665859841 0.733414016
## 437 High 0.2526126299 0.747387370
## 440 High 0.3719172425 0.628082757
## 449 High 0.3790499525 0.620950048
## 450 High 0.2818076021 0.718192398
## 457 High 0.3790499525 0.620950048
## 467 High 0.3100685455 0.689931454
## 469 High 0.1599358543 0.840064146
## 474 High 0.9353877385 0.064612262
## 475 High 0.9012220999 0.098777900
## 485 High 0.1079410717 0.892058928
## 504 Low 0.1987091431 0.801290857
## 511 Low 0.7040002637 0.295999736
## 512 Low 0.4183441677 0.581655832
## 517 Low 0.0403857448 0.959614255
## 519 Low 0.7193762711 0.280623729
## 520 Low 0.0818263209 0.918173679
## 522 Low 0.9390148916 0.060985108
## 527 Low 0.6225909545 0.377409046
## 528 Low 0.3219492221 0.678050778
## 529 Low 0.3079150489 0.692084951
## 537 Low 0.1326610454 0.867338955
## 540 Low 0.9212090082 0.078790992
## 541 Low 0.5642976565 0.435702343
## 547 Low 0.9188139959 0.081186004
## 550 Low 0.5753601845 0.424639815
## 555 Low 0.5010132937 0.498986706
## 564 Low 0.0412981629 0.958701837
## 570 Low 0.3533886536 0.646611346
## 573 Low 0.2380917144 0.761908286
## 575 Low 0.5642976565 0.435702343
## 578 Low 0.2113833669 0.788616633
## 581 Low 0.2380917144 0.761908286
## 585 Low 0.3217351998 0.678264800
## 590 Low 0.7367369732 0.263263027
## 601 Low 0.8572092918 0.142790708
## 602 Low 0.6397712472 0.360228753
## 607 Low 0.5539816043 0.446018396
## 610 Low 0.5411389615 0.458861038
## 618 Low 0.7487472889 0.251252711
## 624 Low 0.3217351998 0.678264800
## 626 Low 0.2312349798 0.768765020
## 627 Low 0.3353599489 0.664640051
## 634 Low 0.6280533141 0.371946686
## 640 Low 0.9879769391 0.012023061
## 642 Low 0.1928813704 0.807118630
## 643 Low 0.6470732658 0.352926734
## 644 Low 0.8524420814 0.147557919
## 645 Low 0.7576589388 0.242341061
## 646 Low 0.7766825260 0.223317474
## 647 Low 0.7768079181 0.223192082
## 652 Low 0.1992101897 0.800789810
## 658 Low 0.5871468529 0.412853147
## 659 Low 0.8531651594 0.146834841
## 660 Low 0.8805707118 0.119429288
## 664 Low 0.3430474939 0.656952506
## 666 Low 0.4365034540 0.563496546
## 667 Low 0.8629695776 0.137030422
## 675 Low 0.6470732658 0.352926734
## 680 Low 0.9839734149 0.016026585
## 681 Low 0.8925205505 0.107479449
## 687 Low 0.8732490620 0.126750938
## 694 Low 0.7386214676 0.261378532
## 697 Low 0.5743225447 0.425677455
## 701 Low 0.3916949690 0.608305031
## 705 Low 0.9524371969 0.047562803
## 707 Low 0.6657996248 0.334200375
## 710 Low 0.7143156897 0.285684310
## 716 Low 0.8822175146 0.117782485
## 719 Low 0.9406540923 0.059345908
## 720 Low 0.9780703756 0.021929624
## 725 Low 0.9799007961 0.020099204
## 727 Low 0.3916949690 0.608305031
## 730 Low 0.5185168218 0.481483178
## 738 Low 0.8416972687 0.158302731
## 745 Low 0.7186886650 0.281311335
## 748 Low 0.7852185869 0.214781413
## 751 Low 0.9427266890 0.057273311
## 756 Low 0.7000527225 0.299947278
## 766 Low 0.7710747011 0.228925299
## 769 Low 0.5964903375 0.403509663
## 783 Low 0.8502174908 0.149782509
## 785 Low 0.9090787984 0.090921202
## 790 Low 0.8886512925 0.111348707
## 793 Low 0.8502174908 0.149782509
## 795 Low 0.9915592821 0.008440718
## 796 Low 0.9837678710 0.016232129
## 797 Low 0.5608208122 0.439179188
## 801 Low 0.7642200416 0.235779958
## 811 Low 0.5397913514 0.460208649
## 812 Low 0.9363960470 0.063603953
## 815 Low 0.9586825149 0.041317485
## 816 Low 0.7804673310 0.219532669
## 817 Low 0.9566695569 0.043330443
## 824 Low 0.8709603992 0.129039601
## 825 Low 0.8709603992 0.129039601
## 826 Low 0.8709603992 0.129039601
## 830 Low 0.9170635243 0.082936476
## 837 Low 0.9480386701 0.051961330
## 838 Low 0.7804673310 0.219532669
## 844 Low 0.9343242730 0.065675727
## 845 Low 0.9833504460 0.016649554
## 847 Low 0.9315521332 0.068447867
## 850 Low 0.8919647214 0.108035279
## 852 Low 0.9110529484 0.088947052
## 853 Low 0.9110529484 0.088947052
## 861 Low 0.9275671791 0.072432821
## 868 Low 0.9859660498 0.014033950
## 874 Low 0.9416548705 0.058345129
## 879 High 0.1190842356 0.880915764
## 895 High 0.0185473665 0.981452634
## 899 High 0.0002913624 0.999708638
## 903 High 0.0185473665 0.981452634
## 917 High 0.0406554150 0.959344585
## 927 High 0.0328468786 0.967153121
## 929 High 0.1259453369 0.874054663
## 931 High 0.0328468786 0.967153121
## 933 High 0.3687867543 0.631213246
## 944 High 0.0424837446 0.957516255
## 947 High 0.0538625246 0.946137475
## 949 High 0.1819006993 0.818099301
## 953 High 0.0238344707 0.976165529
## 958 High 0.5223971717 0.477602828
## 961 High 0.0171642639 0.982835736
## 963 High 0.1328530189 0.867146981
## 964 High 0.0832309264 0.916769074
## 973 High 0.0517005568 0.948299443
## 976 High 0.0412981629 0.958701837
## 977 High 0.1750006699 0.824999330
## 980 High 0.2633610863 0.736638914
## 983 High 0.6133383461 0.386661654
## 984 High 0.1750006699 0.824999330
## 986 High 0.0989000206 0.901099979
## 989 High 0.2084058286 0.791594171
## 991 High 0.0254199950 0.974580005
## 996 High 0.0528955326 0.947104467
## 997 High 0.4403169291 0.559683071
## 999 High 0.0429512262 0.957048774
## 1000 High 0.0525655462 0.947434454
## 1003 High 0.0456519036 0.954348096
## 1008 High 0.0967426835 0.903257316
## 1009 High 0.4319894761 0.568010524
## 1014 High 0.0429687572 0.957031243
## 1015 High 0.4919848875 0.508015113
## 1040 High 0.1777604857 0.822239514
## 1042 High 0.3526828399 0.647317160
## 1043 High 0.7096112136 0.290388786
## 1050 High 0.1085884359 0.891411564
## 1052 High 0.1792599161 0.820740084
## 1056 High 0.1318850763 0.868114924
## 1070 High 0.5591896277 0.440810372
## 1073 High 0.4657303100 0.534269690
## 1074 High 0.1749873003 0.825012700
## 1079 High 0.4074777876 0.592522212
## 1080 High 0.5848037333 0.415196267
## 1085 High 0.0954182731 0.904581727
## 1087 High 0.6230391933 0.376960807
## 1096 High 0.9185149388 0.081485061
## 1099 High 0.4496281245 0.550371875
## 1100 High 0.6737379747 0.326262025
## 1102 High 0.0589896088 0.941010391
## 1107 Low 0.4722318025 0.527768198
## 1109 Low 0.6900698356 0.309930164
## 1114 Low 0.5024548018 0.497545198
## 1118 Low 0.3450298942 0.654970106
## 1123 Low 0.4711940701 0.528805930
## 1132 Low 0.8749560859 0.125043914
## 1134 Low 0.5948511032 0.405148897
## 1137 Low 0.3353599489 0.664640051
## 1154 Low 0.3353599489 0.664640051
## 1155 Low 0.5731491992 0.426850801
## 1157 Low 0.7851027520 0.214897248
## 1162 Low 0.6470732658 0.352926734
## 1164 Low 0.1928813704 0.807118630
## 1171 Low 0.9427266890 0.057273311
## 1172 Low 0.5055646637 0.494435336
## 1175 Low 0.6405974664 0.359402534
## 1177 Low 0.6626098078 0.337390192
## 1179 Low 0.9406173547 0.059382645
## 1183 Low 0.2979985007 0.702001499
## 1185 Low 0.9489202020 0.051079798
## 1189 Low 0.8642810191 0.135718981
## 1211 Low 0.8242523053 0.175747695
## 1218 Low 0.9938658969 0.006134103
## 1224 Low 0.3956745407 0.604325459
## 1225 Low 0.3916949690 0.608305031
## 1227 Low 0.9249686768 0.075031323
## 1232 Low 0.9889737070 0.011026293
## 1235 Low 0.7256637224 0.274336278
## 1238 Low 0.7264484488 0.273551551
## 1240 Low 0.8721554250 0.127844575
## 1241 Low 0.8242523053 0.175747695
## 1248 Low 0.9170635243 0.082936476
## 1258 Low 0.7804673310 0.219532669
## 1261 Low 0.8974384750 0.102561525
## 1263 Low 0.8709603992 0.129039601
## 1269 Low 0.9605579303 0.039442070
## 1270 Low 0.9897858307 0.010214169
## 1271 Low 0.8919647214 0.108035279
## 1272 Low 0.9110529484 0.088947052
## 1280 Low 0.9416548705 0.058345129
## 1286 Low 0.9913833133 0.008616687
## 1287 Low 0.9920841459 0.007915854
## 1289 Low 0.9625961144 0.037403886
## 1290 Low 0.9110529484 0.088947052
## 1291 High 0.1741556081 0.825844392
## 1294 High 0.7811147653 0.218885235
## 1305 Low 0.9179685932 0.082031407
## 1308 High 0.7657936357 0.234206364
##################################
# Reporting the independent evaluation results
# for the test set
##################################
LR_SRT_Test_ROC <- roc(response = LR_SRT_Test$LR_SRT_Observed,
                       predictor = LR_SRT_Test$LR_SRT_Predicted.High,
                       levels = rev(levels(LR_SRT_Test$LR_SRT_Observed)))
(LR_SRT_Test_ROCCurveAUC <- auc(LR_SRT_Test_ROC)[1])
## [1] 0.8946198
1.5.9 Logistic Regression With Outlier Winsorization Treatment
(LR_WT)
Logistic Regression
models the probability of an event (between two outcome levels) by
expressing the log-odds of the event as a linear combination of a set
of predictors weighted by their respective parameter estimates. The
parameters are estimated via maximum likelihood: candidate values are
evaluated over multiple iterations to find the coefficients that best
fit the observed outcomes. Each candidate set of parameters yields a
value of the log-likelihood function, and logistic regression seeks
the parameter estimates that maximize this function. Given the optimal
parameters, the conditional probability of the observed outcome for
each observation can be calculated, logged, and summed to yield the
model log-likelihood.
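As a minimal, self-contained sketch on simulated data (separate from
the modeling pipeline; all Demo_* object names are hypothetical), the
fitted probabilities from glm() can be recovered as the inverse-logit
of the linear combination of predictors weighted by the estimated
coefficients:
##################################
# Illustrative sketch only:
# predicted probability as the
# inverse-logit of the log-odds
##################################
set.seed(12345678)
Demo_Data <- data.frame(x1 = rnorm(100), x2 = rnorm(100))
Demo_Data$y <- rbinom(100, 1, plogis(-0.5 + 1.2*Demo_Data$x1 - 0.8*Demo_Data$x2))
Demo_Fit <- glm(y ~ x1 + x2, data = Demo_Data, family = binomial)
# Log-odds as a linear combination of predictors weighted by the estimates
Demo_LogOdds <- model.matrix(Demo_Fit) %*% coef(Demo_Fit)
# The inverse-logit (plogis) of the log-odds reproduces predict(type = "response")
all.equal(as.vector(plogis(Demo_LogOdds)),
          as.vector(predict(Demo_Fit, type = "response")))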
Winsorization
Treatment replaces the extreme values of a data set in order to
limit the effect of outliers on the calculations or results obtained
from such data. One variant of this method is an 80% winsorization,
which replaces the bottom and top 10% of the data with the values at
the 10th and 90th percentiles, respectively. The winsorization process
results in a statistical measure of location that is closer to the
bulk of the distribution, as the sketch below illustrates.
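A minimal sketch of an 80% winsorization on simulated data follows
(separate from the modeling pipeline; all Demo_* object names are
hypothetical). Note that the pipeline code further below applies a
variant which only replaces values flagged by the 1.5*IQR outlier rule
rather than clamping a fixed fraction of the data:
##################################
# Illustrative sketch only:
# 80% winsorization by clamping at
# the 10th and 90th percentiles
##################################
set.seed(12345678)
Demo_Values <- c(rnorm(95), rnorm(5, mean = 10))
Demo_Percentile10 <- quantile(Demo_Values, 0.10)
Demo_Percentile90 <- quantile(Demo_Values, 0.90)
Demo_Winsorized <- pmin(pmax(Demo_Values, Demo_Percentile10), Demo_Percentile90)
# The winsorized mean sits closer to the bulk of the distribution
c(Raw = mean(Demo_Values), Winsorized = mean(Demo_Winsorized))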
[A] The logistic regression model from the
stats
package was implemented through the
caret
package with Winsorization treatment applied to address data outliers
but no treatment applied for data skewness.
[B] The model does not contain any
hyperparameters.
[C] The cross-validated model performance of the final
model is summarized as follows:
[C.1] Final model configuration is fixed due to
the absence of a hyperparameter
[C.2] ROC Curve AUC = 0.87428
[D] The model allows for ranking of predictors in terms
of variable importance (see the sketch after this list). The
top-performing predictors in the model are as follows:
[D.1] HydrophilicFactor variable (numeric)
[D.2] NumNonHAtoms variable (numeric)
[D.3] NumAtoms variable (numeric)
[E] The independent test model performance of the final
model is summarized as follows:
[E.1] ROC Curve AUC = 0.88916
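As context for [D], caret's documentation describes variable
importance for (generalized) linear models as the absolute value of
each coefficient's t- (here, z-) statistic. A minimal sketch on
simulated data (separate from the modeling pipeline; all Demo_* object
names are hypothetical):
##################################
# Illustrative sketch only:
# GLM variable importance as the
# absolute coefficient z-statistic
##################################
library(caret)
set.seed(12345678)
Demo_Data <- data.frame(x1 = rnorm(100), x2 = rnorm(100))
Demo_Prob <- plogis(1.5*Demo_Data$x1 - 0.5*Demo_Data$x2)
Demo_Data$y <- factor(ifelse(rbinom(100, 1, Demo_Prob) == 1, "High", "Low"),
                      levels = c("Low", "High"))
Demo_Tune <- train(y ~ x1 + x2, data = Demo_Data, method = "glm")
# Scaled importances; compare the ranking against the |z value| column below
varImp(Demo_Tune)
summary(Demo_Tune$finalModel)$coefficients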
##################################
# Applying winsorization function
##################################
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
for (i in 1:(ncol(PMA_PreModelling_Train)-1)){
  # Deriving the percentile caps and the 1.5*IQR outlier bounds
  # from the train set only
  Predictor_Percentile90 <- quantile(PMA_PreModelling_Train[,i], 0.90)
  Predictor_Percentile10 <- quantile(PMA_PreModelling_Train[,i], 0.10)
  Predictor_Percentile75 <- quantile(PMA_PreModelling_Train[,i], 0.75)
  Predictor_Percentile25 <- quantile(PMA_PreModelling_Train[,i], 0.25)
  Predictor_IQR <- Predictor_Percentile75-Predictor_Percentile25
  Predictor_Outlier_UCL <- Predictor_Percentile75 + (1.5*Predictor_IQR)
  Predictor_Outlier_LCL <- Predictor_Percentile25 - (1.5*Predictor_IQR)
  # Replacing values above the upper bound with the 90th percentile
  # and values below the lower bound with the 10th percentile,
  # applying the train-derived bounds to both the train and test sets
  PMA_PreModelling_Train[,i] <- ifelse(PMA_PreModelling_Train[,i]>Predictor_Outlier_UCL,Predictor_Percentile90,
                                ifelse(PMA_PreModelling_Train[,i]<Predictor_Outlier_LCL,Predictor_Percentile10,
                                       PMA_PreModelling_Train[,i]))
  PMA_PreModelling_Test[,i] <- ifelse(PMA_PreModelling_Test[,i]>Predictor_Outlier_UCL,Predictor_Percentile90,
                               ifelse(PMA_PreModelling_Test[,i]<Predictor_Outlier_LCL,Predictor_Percentile10,
                                      PMA_PreModelling_Test[,i]))
}
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)]
##################################
# Treating data skewness
# for the train set
##################################
# No actions applied
##################################
# Treating data outliers
# for the train set
##################################
PMA_PreModelling_Train_LR_WT <- PMA_PreModelling_Train_LR
PMA_PreModelling_Train_LR_WT$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Train_LR_WT[,sapply(PMA_PreModelling_Train_LR_WT, is.numeric)],
            y = PMA_PreModelling_Train_LR_WT$Log_Solubility_Class,
            plot = "box",
            scales = list(x = list(relation="free", rot = 90),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            main = "LR_WT Train Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Train_LR_WT[,sapply(PMA_PreModelling_Train_LR_WT, is.numeric)],
            y = PMA_PreModelling_Train_LR_WT$Log_Solubility_Class,
            plot = "density",
            scales = list(x = list(relation="free", rot = 90),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            main = "LR_WT Train Set : Numeric Predictor Distribution by Response Level",
            auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR_WT$Log_Solubility_Class)))))

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_WT$Log_Solubility_Class,
                             k = 10,
                             returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
                              index=KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
LR_WT_Tune <- train(x = PMA_PreModelling_Train_LR_WT[,!names(PMA_PreModelling_Train_LR_WT) %in% c("Log_Solubility_Class")],
                    y = PMA_PreModelling_Train_LR_WT$Log_Solubility_Class,
                    method = "glm",
                    metric = "ROC",
                    trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_WT_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8742874 0.7024363 0.8359216
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 4.21806 1.35203 0.05381 -0.30758
## NumCarbon
## -0.11897
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 834.5 AIC: 844.5
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8742874 0.7024363 0.8359216 0.03831246 0.06100711 0.06260559
(LR_WT_Train_ROCCurveAUC <- LR_WT_Tune$results$ROC)
## [1] 0.8742874
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_WT_VarImp <- varImp(LR_WT_Tune, scale = TRUE)
plot(LR_WT_VarImp,
     top=4,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Logistic Regression",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)

##################################
# Treating data skewness
# for the test set
##################################
# No actions applied
##################################
# Treating data outliers
# for the test set
##################################
PMA_PreModelling_Test_LR_WT <- PMA_PreModelling_Test_LR
PMA_PreModelling_Test_LR_WT$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Test_LR_WT[,sapply(PMA_PreModelling_Test_LR_WT, is.numeric)],
            y = PMA_PreModelling_Test_LR_WT$Log_Solubility_Class,
            plot = "box",
            scales = list(x = list(relation="free", rot = 90),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            main = "LR_WT Test Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Test_LR_WT[,sapply(PMA_PreModelling_Test_LR_WT, is.numeric)],
            y = PMA_PreModelling_Test_LR_WT$Log_Solubility_Class,
            plot = "density",
            scales = list(x = list(relation="free", rot = 90),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            main = "LR_WT Test Set : Numeric Predictor Distribution by Response Level",
            auto.key = list(columns = (length(levels(PMA_PreModelling_Test_LR_WT$Log_Solubility_Class)))))

##################################
# Independently evaluating the model
# on the test set
##################################
LR_WT_Test <- data.frame(LR_WT_Observed = PMA_PreModelling_Test_LR_WT$Log_Solubility_Class,
                         LR_WT_Predicted = predict(LR_WT_Tune,
                                                   PMA_PreModelling_Test_LR_WT[,!names(PMA_PreModelling_Test_LR_WT) %in% c("Log_Solubility_Class")],
                                                   type = "prob"))
LR_WT_Test
## LR_WT_Observed LR_WT_Predicted.Low LR_WT_Predicted.High
## 20 High 0.028252639 0.971747361
## 21 High 0.011032277 0.988967723
## 23 High 0.050174667 0.949825333
## 25 High 0.021395292 0.978604708
## 28 High 0.063987262 0.936012738
## 31 High 0.069295112 0.930704888
## 32 High 0.100841511 0.899158489
## 33 High 0.126560560 0.873439440
## 34 High 0.126560560 0.873439440
## 37 High 0.327406425 0.672593575
## 38 High 0.327406425 0.672593575
## 42 High 0.551750285 0.448249715
## 49 High 0.201947838 0.798052162
## 54 High 0.059988529 0.940011471
## 55 High 0.011554148 0.988445852
## 58 High 0.400584308 0.599415692
## 60 High 0.126557798 0.873442202
## 61 High 0.126560560 0.873439440
## 65 High 0.077399182 0.922600818
## 69 High 0.478379139 0.521620861
## 73 High 0.042117210 0.957882790
## 86 High 0.111173578 0.888826422
## 90 High 0.086098117 0.913901883
## 91 High 0.024835727 0.975164273
## 93 High 0.086098117 0.913901883
## 96 High 0.024835727 0.975164273
## 98 High 0.094952344 0.905047656
## 100 High 0.101736634 0.898263366
## 104 High 0.935621948 0.064378052
## 112 High 0.234756885 0.765243115
## 115 High 0.850921886 0.149078114
## 119 High 0.185157545 0.814842455
## 128 High 0.185157545 0.814842455
## 130 High 0.012185144 0.987814856
## 139 High 0.012185144 0.987814856
## 143 High 0.044803695 0.955196305
## 145 High 0.152326220 0.847673780
## 146 High 0.185157545 0.814842455
## 149 High 0.184218497 0.815781503
## 150 High 0.167743600 0.832256400
## 152 High 0.126560560 0.873439440
## 157 High 0.459326567 0.540673433
## 161 High 0.280613265 0.719386735
## 162 High 0.015166466 0.984833534
## 166 High 0.474765084 0.525234916
## 167 High 0.191503675 0.808496325
## 173 High 0.153251009 0.846748991
## 176 High 0.185157545 0.814842455
## 182 High 0.058745459 0.941254541
## 187 High 0.030012739 0.969987261
## 190 High 0.023048545 0.976951455
## 194 High 0.054829148 0.945170852
## 195 High 0.225743072 0.774256928
## 201 High 0.118943078 0.881056922
## 207 High 0.159359817 0.840640183
## 208 High 0.433804137 0.566195863
## 215 High 0.069295112 0.930704888
## 222 High 0.287429665 0.712570335
## 224 High 0.177837300 0.822162700
## 231 High 0.663173031 0.336826969
## 236 High 0.128407370 0.871592630
## 237 High 0.087117926 0.912882074
## 240 High 0.211568207 0.788431793
## 243 High 0.118943078 0.881056922
## 248 High 0.225743072 0.774256928
## 251 High 0.802510963 0.197489037
## 256 High 0.425809264 0.574190736
## 258 High 0.208604234 0.791395766
## 262 High 0.433804137 0.566195863
## 266 High 0.458916481 0.541083519
## 272 High 0.515097736 0.484902264
## 280 High 0.301261733 0.698738267
## 283 High 0.390880675 0.609119325
## 286 High 0.445587091 0.554412909
## 287 High 0.216476703 0.783523297
## 289 High 0.130497736 0.869502264
## 290 High 0.324389101 0.675610899
## 298 High 0.299117365 0.700882635
## 305 High 0.322034003 0.677965997
## 306 High 0.150503800 0.849496200
## 312 High 0.102017831 0.897982169
## 320 High 0.292893182 0.707106818
## 325 High 0.178390162 0.821609838
## 332 High 0.065417688 0.934582312
## 333 High 0.394656156 0.605343844
## 335 High 0.299117365 0.700882635
## 339 High 0.677983251 0.322016749
## 346 High 0.381040169 0.618959831
## 347 High 0.084143549 0.915856451
## 350 High 0.332513268 0.667486732
## 353 High 0.375699877 0.624300123
## 358 High 0.356002112 0.643997888
## 365 High 0.253020718 0.746979282
## 367 High 0.189824624 0.810175376
## 370 High 0.019857926 0.980142074
## 379 High 0.077838519 0.922161481
## 386 High 0.315641390 0.684358610
## 394 High 0.536219674 0.463780326
## 396 High 0.135699295 0.864300705
## 400 High 0.087117926 0.912882074
## 404 High 0.069987179 0.930012821
## 405 High 0.543420114 0.456579886
## 413 High 0.154491627 0.845508373
## 415 High 0.396880018 0.603119982
## 417 High 0.218484683 0.781515317
## 418 High 0.444542179 0.555457821
## 423 High 0.286471132 0.713528868
## 434 High 0.321591910 0.678408090
## 437 High 0.181118453 0.818881547
## 440 High 0.308562671 0.691437329
## 449 High 0.375209738 0.624790262
## 450 High 0.294263105 0.705736895
## 457 High 0.375209738 0.624790262
## 467 High 0.239925904 0.760074096
## 469 High 0.219694251 0.780305749
## 474 High 0.923897112 0.076102888
## 475 High 0.883349944 0.116650056
## 485 High 0.085422969 0.914577031
## 504 Low 0.291557950 0.708442050
## 511 Low 0.648063716 0.351936284
## 512 Low 0.433262622 0.566737378
## 517 Low 0.081862252 0.918137748
## 519 Low 0.768466561 0.231533439
## 520 Low 0.120730626 0.879269374
## 522 Low 0.925451743 0.074548257
## 527 Low 0.645300585 0.354699415
## 528 Low 0.271085584 0.728914416
## 529 Low 0.310908209 0.689091791
## 537 Low 0.160364901 0.839635099
## 540 Low 0.906505047 0.093494953
## 541 Low 0.445392323 0.554607677
## 547 Low 0.908345685 0.091654315
## 550 Low 0.648063716 0.351936284
## 555 Low 0.553749570 0.446250430
## 564 Low 0.076566109 0.923433891
## 570 Low 0.381921892 0.618078108
## 573 Low 0.263325647 0.736674353
## 575 Low 0.445392323 0.554607677
## 578 Low 0.200665198 0.799334802
## 581 Low 0.263325647 0.736674353
## 585 Low 0.328862698 0.671137302
## 590 Low 0.758709382 0.241290618
## 601 Low 0.954499169 0.045500831
## 602 Low 0.642143769 0.357856231
## 607 Low 0.609779376 0.390220624
## 610 Low 0.596902022 0.403097978
## 618 Low 0.770717139 0.229282861
## 624 Low 0.328862698 0.671137302
## 626 Low 0.264394512 0.735605488
## 627 Low 0.325327914 0.674672086
## 634 Low 0.859870031 0.140129969
## 640 Low 0.993333081 0.006666919
## 642 Low 0.183956965 0.816043035
## 643 Low 0.513477586 0.486522414
## 644 Low 0.822581654 0.177418346
## 645 Low 0.716527602 0.283472398
## 646 Low 0.656710655 0.343289345
## 647 Low 0.810727112 0.189272888
## 652 Low 0.207004348 0.792995652
## 658 Low 0.633864017 0.366135983
## 659 Low 0.756657553 0.243342447
## 660 Low 0.915400833 0.084599167
## 664 Low 0.317520700 0.682479300
## 666 Low 0.417931911 0.582068089
## 667 Low 0.877428595 0.122571405
## 675 Low 0.513477586 0.486522414
## 680 Low 0.994852845 0.005147155
## 681 Low 0.919786159 0.080213841
## 687 Low 0.785159445 0.214840555
## 694 Low 0.776291590 0.223708410
## 697 Low 0.504279816 0.495720184
## 701 Low 0.306165429 0.693834571
## 705 Low 0.986170693 0.013829307
## 707 Low 0.683050824 0.316949176
## 710 Low 0.639259875 0.360740125
## 716 Low 0.912237098 0.087762902
## 719 Low 0.933456382 0.066543618
## 720 Low 0.986060028 0.013939972
## 725 Low 0.989191020 0.010808980
## 727 Low 0.306165429 0.693834571
## 730 Low 0.468166324 0.531833676
## 738 Low 0.784293052 0.215706948
## 745 Low 0.580737162 0.419262838
## 748 Low 0.706782117 0.293217883
## 751 Low 0.980575777 0.019424223
## 756 Low 0.717150407 0.282849593
## 766 Low 0.850966796 0.149033204
## 769 Low 0.535328740 0.464671260
## 783 Low 0.811655480 0.188344520
## 785 Low 0.841947001 0.158052999
## 790 Low 0.923371376 0.076628624
## 793 Low 0.811655480 0.188344520
## 795 Low 0.995359452 0.004640548
## 796 Low 0.988726123 0.011273877
## 797 Low 0.613968875 0.386031125
## 801 Low 0.717464025 0.282535975
## 811 Low 0.486145842 0.513854158
## 812 Low 0.961781245 0.038218755
## 815 Low 0.940897618 0.059102382
## 816 Low 0.735876018 0.264123982
## 817 Low 0.926306993 0.073693007
## 824 Low 0.844929335 0.155070665
## 825 Low 0.844929335 0.155070665
## 826 Low 0.844929335 0.155070665
## 830 Low 0.855747468 0.144252532
## 837 Low 0.910197711 0.089802289
## 838 Low 0.735876018 0.264123982
## 844 Low 0.885768710 0.114231290
## 845 Low 0.976017309 0.023982691
## 847 Low 0.924759731 0.075240269
## 850 Low 0.874286786 0.125713214
## 852 Low 0.899365871 0.100634129
## 853 Low 0.899365871 0.100634129
## 861 Low 0.920297779 0.079702221
## 868 Low 0.980571498 0.019428502
## 874 Low 0.937576780 0.062423220
## 879 High 0.152866850 0.847133150
## 895 High 0.039547020 0.960452980
## 899 High 0.006274238 0.993725762
## 903 High 0.039547020 0.960452980
## 917 High 0.029052094 0.970947906
## 927 High 0.059988529 0.940011471
## 929 High 0.153251009 0.846748991
## 931 High 0.059988529 0.940011471
## 933 High 0.383904739 0.616095261
## 944 High 0.056057332 0.943942668
## 947 High 0.086098117 0.913901883
## 949 High 0.191869316 0.808130684
## 953 High 0.021217814 0.978782186
## 958 High 0.451442468 0.548557532
## 961 High 0.024500716 0.975499284
## 963 High 0.158030320 0.841969680
## 964 High 0.118943078 0.881056922
## 973 High 0.054972558 0.945027442
## 976 High 0.076566109 0.923433891
## 977 High 0.225743072 0.774256928
## 980 High 0.194325725 0.805674275
## 983 High 0.564399605 0.435600395
## 984 High 0.225743072 0.774256928
## 986 High 0.102017831 0.897982169
## 989 High 0.215977016 0.784022984
## 991 High 0.028136447 0.971863553
## 996 High 0.096066005 0.903933995
## 997 High 0.409217772 0.590782228
## 999 High 0.050174667 0.949825333
## 1000 High 0.049962301 0.950037699
## 1003 High 0.087117926 0.912882074
## 1008 High 0.110301043 0.889698957
## 1009 High 0.390880675 0.609119325
## 1014 High 0.054342796 0.945657204
## 1015 High 0.507479791 0.492520209
## 1040 High 0.173822737 0.826177263
## 1042 High 0.348406639 0.651593361
## 1043 High 0.664831519 0.335168481
## 1050 High 0.104964385 0.895035615
## 1052 High 0.217627219 0.782372781
## 1056 High 0.430563503 0.569436497
## 1070 High 0.576047259 0.423952741
## 1073 High 0.426680447 0.573319553
## 1074 High 0.199973894 0.800026106
## 1079 High 0.365577914 0.634422086
## 1080 High 0.495056564 0.504943436
## 1085 High 0.114818593 0.885181407
## 1087 High 0.677563755 0.322436245
## 1096 High 0.946509683 0.053490317
## 1099 High 0.476707448 0.523292552
## 1100 High 0.613540731 0.386459269
## 1102 High 0.037272700 0.962727300
## 1107 Low 0.378660320 0.621339680
## 1109 Low 0.746345913 0.253654087
## 1114 Low 0.404291532 0.595708468
## 1118 Low 0.392930424 0.607069576
## 1123 Low 0.449318373 0.550681627
## 1132 Low 0.827326214 0.172673786
## 1134 Low 0.625608459 0.374391541
## 1137 Low 0.325327914 0.674672086
## 1154 Low 0.325327914 0.674672086
## 1155 Low 0.600080216 0.399919784
## 1157 Low 0.848337304 0.151662696
## 1162 Low 0.513477586 0.486522414
## 1164 Low 0.183956965 0.816043035
## 1171 Low 0.980575777 0.019424223
## 1172 Low 0.459326567 0.540673433
## 1175 Low 0.709821865 0.290178135
## 1177 Low 0.578623299 0.421376701
## 1179 Low 0.955896251 0.044103749
## 1183 Low 0.250350289 0.749649711
## 1185 Low 0.971167041 0.028832959
## 1189 Low 0.957922810 0.042077190
## 1211 Low 0.714877986 0.285122014
## 1218 Low 0.994292098 0.005707902
## 1224 Low 0.372416312 0.627583688
## 1225 Low 0.306165429 0.693834571
## 1227 Low 0.950050219 0.049949781
## 1232 Low 0.993656325 0.006343675
## 1235 Low 0.777300182 0.222699818
## 1238 Low 0.805234045 0.194765955
## 1240 Low 0.900348204 0.099651796
## 1241 Low 0.714877986 0.285122014
## 1248 Low 0.855747468 0.144252532
## 1258 Low 0.735876018 0.264123982
## 1261 Low 0.881558385 0.118441615
## 1263 Low 0.844929335 0.155070665
## 1269 Low 0.961633901 0.038366099
## 1270 Low 0.995359452 0.004640548
## 1271 Low 0.874286786 0.125713214
## 1272 Low 0.899365871 0.100634129
## 1280 Low 0.937576780 0.062423220
## 1286 Low 0.983472472 0.016527528
## 1287 Low 0.985133774 0.014866226
## 1289 Low 0.962532737 0.037467263
## 1290 Low 0.899365871 0.100634129
## 1291 High 0.183663572 0.816336428
## 1294 High 0.737553964 0.262446036
## 1305 Low 0.961175729 0.038824271
## 1308 High 0.907626255 0.092373745
##################################
# Reporting the independent evaluation results
# for the test set
##################################
LR_WT_Test_ROC <- roc(response = LR_WT_Test$LR_WT_Observed,
                      predictor = LR_WT_Test$LR_WT_Predicted.High,
                      levels = rev(levels(LR_WT_Test$LR_WT_Observed)))
(LR_WT_Test_ROCCurveAUC <- auc(LR_WT_Test_ROC)[1])
## [1] 0.8891629
1.5.10 Logistic Regression With Outlier Spatial Sign Treatment
(LR_SST)
Logistic Regression
models the probability of an event (between two outcome levels) by
expressing the log-odds of the event as a linear combination of a set
of predictors weighted by their respective parameter estimates. The
parameters are estimated via maximum likelihood: candidate values are
evaluated over multiple iterations to find the coefficients that best
fit the observed outcomes. Each candidate set of parameters yields a
value of the log-likelihood function, and logistic regression seeks
the parameter estimates that maximize this function. Given the optimal
parameters, the conditional probability of the observed outcome for
each observation can be calculated, logged, and summed to yield the
model log-likelihood, as the sketch below illustrates.
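The following minimal sketch on simulated data (separate from the
modeling pipeline; all Demo_* object names are hypothetical) recovers
the maximized log-likelihood by logging and summing the conditional
probabilities of the observed outcomes:
##################################
# Illustrative sketch only:
# the log-likelihood as the sum of
# logged conditional probabilities
##################################
set.seed(12345678)
Demo_Data <- data.frame(x = rnorm(100))
Demo_Data$y <- rbinom(100, 1, plogis(0.7*Demo_Data$x))
Demo_Fit <- glm(y ~ x, data = Demo_Data, family = binomial)
# Fitted probability of the positive class for each observation
Demo_Prob <- predict(Demo_Fit, type = "response")
# Probability assigned to each observed outcome, logged and summed
Demo_LogLikelihood <- sum(log(ifelse(Demo_Data$y == 1, Demo_Prob, 1 - Demo_Prob)))
# Matches the log-likelihood reported by logLik()
all.equal(Demo_LogLikelihood, as.numeric(logLik(Demo_Fit)))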
Spatial Sign
Treatment is a multivariate extension of the concept of sign. The
method projects the centered and scaled variables onto a unit sphere
and is related to global contrast normalization. Spatial signs, when
used as multivariate estimators of covariance structures, are more
robust to outlying observations, since every projected observation
lies at the same unit distance from the origin, as the sketch below
illustrates.
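The following minimal sketch on simulated data (separate from the
modeling pipeline; all Demo_* object names are hypothetical) expresses
the transformation directly, with centering and scaling followed by
division of each observation by its Euclidean norm, mirroring the
projection performed by preProcess(method = "spatialSign"):
##################################
# Illustrative sketch only:
# spatial sign as a projection of the
# centered and scaled data onto the
# unit sphere
##################################
set.seed(12345678)
Demo_Predictors <- data.frame(x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100))
# Centering and scaling each predictor
Demo_Scaled <- scale(Demo_Predictors, center = TRUE, scale = TRUE)
# Dividing each observation (row) by its Euclidean norm
Demo_SpatialSign <- Demo_Scaled / sqrt(rowSums(Demo_Scaled^2))
# Every projected observation now lies on the unit sphere
summary(rowSums(Demo_SpatialSign^2))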
[A] The logistic regression model from the
stats
package was implemented through the
caret
package with Spatial Sign treatment applied to address data outliers
but no treatment applied for data skewness.
[B] The model does not contain any
hyperparameters.
[C] The cross-validated model performance of the final
model is summarized as follows:
[C.1] Final model configuration is fixed due to
the absence of a hyperparameter
[C.2] ROC Curve AUC = 0.86608
[D] The model allows for ranking of predictors in terms
of variable importance. The top-performing predictors in the model are
as follows:
[D.1] HydrophilicFactor variable (numeric)
[D.2] NumNonHAtoms variable (numeric)
[D.3] NumAtoms variable (numeric)
[E] The independent test model performance of the final
model is summarized as follows:
[E.1] ROC Curve AUC = 0.88786
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,sapply(PMA_PreModelling_Test_LR, is.numeric)]
##################################
# Treating data skewness
# for the train set
##################################
# No actions applied
##################################
# Treating data outliers
# for the train set
##################################
Transform_SpatialSign <- preProcess(PMA_PreModelling_Train_LR, method = c("spatialSign"))
PMA_PreModelling_Train_LR_SST <- predict(Transform_SpatialSign, PMA_PreModelling_Train_LR.Numeric)
PMA_PreModelling_Train_LR_SST$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Train_LR_SST[,sapply(PMA_PreModelling_Train_LR_SST, is.numeric)],
            y = PMA_PreModelling_Train_LR_SST$Log_Solubility_Class,
            plot = "box",
            scales = list(x = list(relation="free", rot = 90),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            main = "LR_SST Train Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Train_LR_SST[,sapply(PMA_PreModelling_Train_LR_SST, is.numeric)],
            y = PMA_PreModelling_Train_LR_SST$Log_Solubility_Class,
            plot = "density",
            scales = list(x = list(relation="free", rot = 90),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            main = "LR_SST Train Set : Numeric Predictor Distribution by Response Level",
            auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR_SST$Log_Solubility_Class)))))

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_SST$Log_Solubility_Class,
                             k = 10,
                             returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
                              index=KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
LR_SST_Tune <- train(x = PMA_PreModelling_Train_LR_SST[,!names(PMA_PreModelling_Train_LR_SST) %in% c("Log_Solubility_Class")],
                     y = PMA_PreModelling_Train_LR_SST$Log_Solubility_Class,
                     method = "glm",
                     metric = "ROC",
                     trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_SST_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8660886 0.7375415 0.8493469
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 0.17984 1.38112 0.05357 -1.58797
## NumCarbon
## -1.66022
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 868.7 AIC: 878.7
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8660886 0.7375415 0.8493469 0.04170766 0.06066318 0.05810749
(LR_SST_Train_ROCCurveAUC <- LR_SST_Tune$results$ROC)
## [1] 0.8660886
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_SST_VarImp <- varImp(LR_SST_Tune, scale = TRUE)
plot(LR_SST_VarImp,
     top=4,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Logistic Regression",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)

##################################
# Treating data skewness
# for the test set
##################################
# No actions applied
##################################
# Treating data outliers
# for the test set
##################################
PMA_PreModelling_Test_LR_SST <- predict(Transform_SpatialSign, PMA_PreModelling_Test_LR.Numeric)
PMA_PreModelling_Test_LR_SST$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
featurePlot(x = PMA_PreModelling_Test_LR_SST[,sapply(PMA_PreModelling_Test_LR_SST, is.numeric)],
            y = PMA_PreModelling_Test_LR_SST$Log_Solubility_Class,
            plot = "box",
            scales = list(x = list(relation="free", rot = 90),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            main = "LR_SST Test Set : Numeric Predictor Distribution by Response Level")

featurePlot(x = PMA_PreModelling_Test_LR_SST[,sapply(PMA_PreModelling_Test_LR_SST, is.numeric)],
            y = PMA_PreModelling_Test_LR_SST$Log_Solubility_Class,
            plot = "density",
            scales = list(x = list(relation="free", rot = 90),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            main = "LR_SST Test Set : Numeric Predictor Distribution by Response Level",
            auto.key = list(columns = (length(levels(PMA_PreModelling_Test_LR_SST$Log_Solubility_Class)))))

##################################
# Independently evaluating the model
# on the test set
##################################
LR_SST_Test <- data.frame(LR_SST_Observed = PMA_PreModelling_Test_LR_SST$Log_Solubility_Class,
                          LR_SST_Predicted = predict(LR_SST_Tune,
                                                     PMA_PreModelling_Test_LR_SST[,!names(PMA_PreModelling_Test_LR_SST) %in% c("Log_Solubility_Class")],
                                                     type = "prob"))
LR_SST_Test
## LR_SST_Observed LR_SST_Predicted.Low LR_SST_Predicted.High
## 20 High 0.09601045 0.90398955
## 21 High 0.07124817 0.92875183
## 23 High 0.08708570 0.91291430
## 25 High 0.06240783 0.93759217
## 28 High 0.08655612 0.91344388
## 31 High 0.13801964 0.86198036
## 32 High 0.16206633 0.83793367
## 33 High 0.16608858 0.83391142
## 34 High 0.16608858 0.83391142
## 37 High 0.26722180 0.73277820
## 38 High 0.26722180 0.73277820
## 42 High 0.88124946 0.11875054
## 49 High 0.18230992 0.81769008
## 54 High 0.09206998 0.90793002
## 55 High 0.08258681 0.91741319
## 58 High 0.24825896 0.75174104
## 60 High 0.13669151 0.86330849
## 61 High 0.16608858 0.83391142
## 65 High 0.10668377 0.89331623
## 69 High 0.30827773 0.69172227
## 73 High 0.11783250 0.88216750
## 86 High 0.15534315 0.84465685
## 90 High 0.09330688 0.90669312
## 91 High 0.09693780 0.90306220
## 93 High 0.09330688 0.90669312
## 96 High 0.09693780 0.90306220
## 98 High 0.10148441 0.89851559
## 100 High 0.12149523 0.87850477
## 104 High 0.85645369 0.14354631
## 112 High 0.21358439 0.78641561
## 115 High 0.88435693 0.11564307
## 119 High 0.17266138 0.82733862
## 128 High 0.17266138 0.82733862
## 130 High 0.08503124 0.91496876
## 139 High 0.08503124 0.91496876
## 143 High 0.05944353 0.94055647
## 145 High 0.18313278 0.81686722
## 146 High 0.17266138 0.82733862
## 149 High 0.13740764 0.86259236
## 150 High 0.18130010 0.81869990
## 152 High 0.16608858 0.83391142
## 157 High 0.26658480 0.73341520
## 161 High 0.17711690 0.82288310
## 162 High 0.07695031 0.92304969
## 166 High 0.41489928 0.58510072
## 167 High 0.19476630 0.80523370
## 173 High 0.17346217 0.82653783
## 176 High 0.17266138 0.82733862
## 182 High 0.14009484 0.85990516
## 187 High 0.09412624 0.90587376
## 190 High 0.08209356 0.91790644
## 194 High 0.13023242 0.86976758
## 195 High 0.16037985 0.83962015
## 201 High 0.09832548 0.90167452
## 207 High 0.12012692 0.87987308
## 208 High 0.34709451 0.65290549
## 215 High 0.13801964 0.86198036
## 222 High 0.20060734 0.79939266
## 224 High 0.09514653 0.90485347
## 231 High 0.66717290 0.33282710
## 236 High 0.12558408 0.87441592
## 237 High 0.15067576 0.84932424
## 240 High 0.14905340 0.85094660
## 243 High 0.09832548 0.90167452
## 248 High 0.16037985 0.83962015
## 251 High 0.83891581 0.16108419
## 256 High 0.48090891 0.51909109
## 258 High 0.14809516 0.85190484
## 262 High 0.34709451 0.65290549
## 266 High 0.27148151 0.72851849
## 272 High 0.42076835 0.57923165
## 280 High 0.37148413 0.62851587
## 283 High 0.33866037 0.66133963
## 286 High 0.25364415 0.74635585
## 287 High 0.15032543 0.84967457
## 289 High 0.14473001 0.85526999
## 290 High 0.36443155 0.63556845
## 298 High 0.24314628 0.75685372
## 305 High 0.17653728 0.82346272
## 306 High 0.24087855 0.75912145
## 312 High 0.09619891 0.90380109
## 320 High 0.25289581 0.74710419
## 325 High 0.13114586 0.86885414
## 332 High 0.05741702 0.94258298
## 333 High 0.33447896 0.66552104
## 335 High 0.24314628 0.75685372
## 339 High 0.76354018 0.23645982
## 346 High 0.37652804 0.62347196
## 347 High 0.13821283 0.86178717
## 350 High 0.32057761 0.67942239
## 353 High 0.26481814 0.73518186
## 358 High 0.36381440 0.63618560
## 365 High 0.15607953 0.84392047
## 367 High 0.12195706 0.87804294
## 370 High 0.09000562 0.90999438
## 379 High 0.14337494 0.85662506
## 386 High 0.27698113 0.72301887
## 394 High 0.74716571 0.25283429
## 396 High 0.08716053 0.91283947
## 400 High 0.15067576 0.84932424
## 404 High 0.13313956 0.86686044
## 405 High 0.50020364 0.49979636
## 413 High 0.10227313 0.89772687
## 415 High 0.24406283 0.75593717
## 417 High 0.22429372 0.77570628
## 418 High 0.66553672 0.33446328
## 423 High 0.23928941 0.76071059
## 434 High 0.24070864 0.75929136
## 437 High 0.23599992 0.76400008
## 440 High 0.43499212 0.56500788
## 449 High 0.54702006 0.45297994
## 450 High 0.15824323 0.84175677
## 457 High 0.54702006 0.45297994
## 467 High 0.32837858 0.67162142
## 469 High 0.18858281 0.81141719
## 474 High 0.87686350 0.12313650
## 475 High 0.87672351 0.12327649
## 485 High 0.28334254 0.71665746
## 504 Low 0.33447964 0.66552036
## 511 Low 0.60427805 0.39572195
## 512 Low 0.27333938 0.72666062
## 517 Low 0.13237259 0.86762741
## 519 Low 0.82383869 0.17616131
## 520 Low 0.16215286 0.83784714
## 522 Low 0.89837761 0.10162239
## 527 Low 0.79771009 0.20228991
## 528 Low 0.25999106 0.74000894
## 529 Low 0.49083183 0.50916817
## 537 Low 0.17724045 0.82275955
## 540 Low 0.89009261 0.10990739
## 541 Low 0.42590525 0.57409475
## 547 Low 0.90735268 0.09264732
## 550 Low 0.54923224 0.45076776
## 555 Low 0.76716356 0.23283644
## 564 Low 0.14548176 0.85451824
## 570 Low 0.61515621 0.38484379
## 573 Low 0.16732141 0.83267859
## 575 Low 0.42590525 0.57409475
## 578 Low 0.21645684 0.78354316
## 581 Low 0.16732141 0.83267859
## 585 Low 0.19579720 0.80420280
## 590 Low 0.89369063 0.10630937
## 601 Low 0.77618942 0.22381058
## 602 Low 0.83477322 0.16522678
## 607 Low 0.85385593 0.14614407
## 610 Low 0.86401175 0.13598825
## 618 Low 0.76648549 0.23351451
## 624 Low 0.19579720 0.80420280
## 626 Low 0.39079903 0.60920097
## 627 Low 0.24191991 0.75808009
## 634 Low 0.60202668 0.39797332
## 640 Low 0.85724614 0.14275386
## 642 Low 0.20979096 0.79020904
## 643 Low 0.56251635 0.43748365
## 644 Low 0.90646801 0.09353199
## 645 Low 0.82298391 0.17701609
## 646 Low 0.69510923 0.30489077
## 647 Low 0.74412785 0.25587215
## 652 Low 0.20093976 0.79906024
## 658 Low 0.78748187 0.21251813
## 659 Low 0.78880883 0.21119117
## 660 Low 0.86232533 0.13767467
## 664 Low 0.44522589 0.55477411
## 666 Low 0.21629092 0.78370908
## 667 Low 0.87244386 0.12755614
## 675 Low 0.56251635 0.43748365
## 680 Low 0.85164563 0.14835437
## 681 Low 0.86714281 0.13285719
## 687 Low 0.86569004 0.13430996
## 694 Low 0.91766599 0.08233401
## 697 Low 0.52480798 0.47519202
## 701 Low 0.31589620 0.68410380
## 705 Low 0.81924861 0.18075139
## 707 Low 0.77057558 0.22942442
## 710 Low 0.59761749 0.40238251
## 716 Low 0.82909104 0.17090896
## 719 Low 0.86938439 0.13061561
## 720 Low 0.85732525 0.14267475
## 725 Low 0.85641884 0.14358116
## 727 Low 0.31589620 0.68410380
## 730 Low 0.26046315 0.73953685
## 738 Low 0.83701684 0.16298316
## 745 Low 0.70271329 0.29728671
## 748 Low 0.79057047 0.20942953
## 751 Low 0.84112961 0.15887039
## 756 Low 0.84777333 0.15222667
## 766 Low 0.83906038 0.16093962
## 769 Low 0.29897447 0.70102553
## 783 Low 0.87595878 0.12404122
## 785 Low 0.89290405 0.10709595
## 790 Low 0.87181803 0.12818197
## 793 Low 0.87595878 0.12404122
## 795 Low 0.85791849 0.14208151
## 796 Low 0.85311015 0.14688985
## 797 Low 0.75584701 0.24415299
## 801 Low 0.53488474 0.46511526
## 811 Low 0.28134812 0.71865188
## 812 Low 0.84937551 0.15062449
## 815 Low 0.84844617 0.15155383
## 816 Low 0.68518843 0.31481157
## 817 Low 0.91238717 0.08761283
## 824 Low 0.89683201 0.10316799
## 825 Low 0.89683201 0.10316799
## 826 Low 0.89683201 0.10316799
## 830 Low 0.89256275 0.10743725
## 837 Low 0.90590054 0.09409946
## 838 Low 0.68518843 0.31481157
## 844 Low 0.90510276 0.09489724
## 845 Low 0.90252118 0.09747882
## 847 Low 0.92129203 0.07870797
## 850 Low 0.90778429 0.09221571
## 852 Low 0.91163676 0.08836324
## 853 Low 0.91163676 0.08836324
## 861 Low 0.91093261 0.08906739
## 868 Low 0.91054857 0.08945143
## 874 Low 0.90756578 0.09243422
## 879 High 0.16682367 0.83317633
## 895 High 0.09101760 0.90898240
## 899 High 0.08837534 0.91162466
## 903 High 0.09101760 0.90898240
## 917 High 0.09619149 0.90380851
## 927 High 0.09206998 0.90793002
## 929 High 0.17346217 0.82653783
## 931 High 0.09206998 0.90793002
## 933 High 0.51744048 0.48255952
## 944 High 0.08037883 0.91962117
## 947 High 0.09330688 0.90669312
## 949 High 0.10386583 0.89613417
## 953 High 0.07173711 0.92826289
## 958 High 0.45201022 0.54798978
## 961 High 0.05848371 0.94151629
## 963 High 0.15241824 0.84758176
## 964 High 0.09832548 0.90167452
## 973 High 0.07223382 0.92776618
## 976 High 0.14548176 0.85451824
## 977 High 0.16037985 0.83962015
## 980 High 0.33393088 0.66606912
## 983 High 0.40332216 0.59667784
## 984 High 0.16037985 0.83962015
## 986 High 0.09619891 0.90380109
## 989 High 0.20478385 0.79521615
## 991 High 0.06136135 0.93863865
## 996 High 0.15817455 0.84182545
## 997 High 0.29133806 0.70866194
## 999 High 0.08708570 0.91291430
## 1000 High 0.08096203 0.91903797
## 1003 High 0.15067576 0.84932424
## 1008 High 0.08653032 0.91346968
## 1009 High 0.33866037 0.66133963
## 1014 High 0.06211801 0.93788199
## 1015 High 0.68254254 0.31745746
## 1040 High 0.10197668 0.89802332
## 1042 High 0.36045604 0.63954396
## 1043 High 0.62172343 0.37827657
## 1050 High 0.07340589 0.92659411
## 1052 High 0.15785081 0.84214919
## 1056 High 0.22781432 0.77218568
## 1070 High 0.82152666 0.17847334
## 1073 High 0.26265879 0.73734121
## 1074 High 0.19955598 0.80044402
## 1079 High 0.33568619 0.66431381
## 1080 High 0.60786098 0.39213902
## 1085 High 0.14911681 0.85088319
## 1087 High 0.89957835 0.10042165
## 1096 High 0.85587597 0.14412403
## 1099 High 0.68096864 0.31903136
## 1100 High 0.69132245 0.30867755
## 1102 High 0.09812815 0.90187185
## 1107 Low 0.33306906 0.66693094
## 1109 Low 0.82831781 0.17168219
## 1114 Low 0.34072119 0.65927881
## 1118 Low 0.60266074 0.39733926
## 1123 Low 0.51544312 0.48455688
## 1132 Low 0.89086611 0.10913389
## 1134 Low 0.86196038 0.13803962
## 1137 Low 0.24191991 0.75808009
## 1154 Low 0.24191991 0.75808009
## 1155 Low 0.82419178 0.17580822
## 1157 Low 0.79195710 0.20804290
## 1162 Low 0.56251635 0.43748365
## 1164 Low 0.20979096 0.79020904
## 1171 Low 0.84112961 0.15887039
## 1172 Low 0.26658480 0.73341520
## 1175 Low 0.81770909 0.18229091
## 1177 Low 0.73170491 0.26829509
## 1179 Low 0.86003399 0.13996601
## 1183 Low 0.25318113 0.74681887
## 1185 Low 0.88391319 0.11608681
## 1189 Low 0.77072878 0.22927122
## 1211 Low 0.80815455 0.19184545
## 1218 Low 0.87884779 0.12115221
## 1224 Low 0.24958622 0.75041378
## 1225 Low 0.31589620 0.68410380
## 1227 Low 0.87910711 0.12089289
## 1232 Low 0.85531787 0.14468213
## 1235 Low 0.90784054 0.09215946
## 1238 Low 0.80846734 0.19153266
## 1240 Low 0.88005225 0.11994775
## 1241 Low 0.80815455 0.19184545
## 1248 Low 0.89256275 0.10743725
## 1258 Low 0.68518843 0.31481157
## 1261 Low 0.86946402 0.13053598
## 1263 Low 0.89683201 0.10316799
## 1269 Low 0.89945602 0.10054398
## 1270 Low 0.84133006 0.15866994
## 1271 Low 0.90778429 0.09221571
## 1272 Low 0.91163676 0.08836324
## 1280 Low 0.90756578 0.09243422
## 1286 Low 0.90492295 0.09507705
## 1287 Low 0.90812642 0.09187358
## 1289 Low 0.89739056 0.10260944
## 1290 Low 0.91163676 0.08836324
## 1291 High 0.09497549 0.90502451
## 1294 High 0.86886342 0.13113658
## 1305 Low 0.86138517 0.13861483
## 1308 High 0.75006281 0.24993719
##################################
# Reporting the independent evaluation results
# for the test set
##################################
LR_SST_Test_ROC <- roc(response = LR_SST_Test$LR_SST_Observed,
                       predictor = LR_SST_Test$LR_SST_Predicted.High,
                       levels = rev(levels(LR_SST_Test$LR_SST_Observed)))
(LR_SST_Test_ROCCurveAUC <- auc(LR_SST_Test_ROC)[1])
## [1] 0.8878694
1.6 Model Evaluation Summary
Model performance comparison:
[A] The transformations for skewed data generally
performed better than the treatments for extreme outliers for models
which are sensitive to deviations from the normality assumption
(i.e. logistic regression).
[A.1] LR: Logistic Regression
(stats
package)
[A.1.1] LR_REF: Cross-Validation ROC Curve
AUC = 0.87475, Test ROC Curve AUC = 0.88447
[A.1.2] LR_BCT: Cross-Validation ROC Curve
AUC = 0.88878, Test ROC Curve AUC = 0.89676
[A.1.3] LR_YJT: Cross-Validation ROC Curve
AUC = 0.88070, Test ROC Curve AUC = 0.89061
[A.1.4] LR_ET: Cross-Validation ROC Curve
AUC = 0.88053, Test ROC Curve AUC = 0.89013
[A.1.5] LR_IHST: Cross-Validation ROC Curve
AUC = 0.87107, Test ROC Curve AUC = 0.87998
[A.1.6] LR_LOG10T: Cross-Validation ROC
Curve AUC = 0.89192, Test ROC Curve AUC = 0.89882
[A.1.7] LR_LNT: Cross-Validation ROC Curve
AUC = 0.89192, Test ROC Curve AUC = 0.89882
[A.1.8] LR_SRT: Cross-Validation ROC Curve
AUC = 0.88441, Test ROC Curve AUC = 0.89461
[A.1.9] LR_WT: Cross-Validation ROC Curve
AUC = 0.87428, Test ROC Curve AUC = 0.88916
[A.1.10] LR_SST: Cross-Validation ROC Curve
AUC = 0.86608, Test ROC Curve AUC = 0.88786
##################################
# Consolidating all evaluation results
# for the train and test sets
# using the ROC Curve AUC metric
##################################
Model <- c('LR_REF','LR_BCT','LR_YJT','LR_ET','LR_IHST','LR_LOG10T','LR_LNT','LR_SRT','LR_WT','LR_SST',
           'LR_REF','LR_BCT','LR_YJT','LR_ET','LR_IHST','LR_LOG10T','LR_LNT','LR_SRT','LR_WT','LR_SST')
Set <- c(rep('Cross-Validation',10),rep('Test',10))
ROCCurveAUC <- c(LR_Train_ROCCurveAUC,LR_BCT_Train_ROCCurveAUC,
                 LR_YJT_Train_ROCCurveAUC,LR_ET_Train_ROCCurveAUC,
                 LR_IHST_Train_ROCCurveAUC,LR_LOG10T_Train_ROCCurveAUC,
                 LR_LNT_Train_ROCCurveAUC,LR_SRT_Train_ROCCurveAUC,
                 LR_WT_Train_ROCCurveAUC,LR_SST_Train_ROCCurveAUC,
                 LR_Test_ROCCurveAUC,LR_BCT_Test_ROCCurveAUC,
                 LR_YJT_Test_ROCCurveAUC,LR_ET_Test_ROCCurveAUC,
                 LR_IHST_Test_ROCCurveAUC,LR_LOG10T_Test_ROCCurveAUC,
                 LR_LNT_Test_ROCCurveAUC,LR_SRT_Test_ROCCurveAUC,
                 LR_WT_Test_ROCCurveAUC,LR_SST_Test_ROCCurveAUC)
ROCCurveAUC_Summary <- as.data.frame(cbind(Model,Set,ROCCurveAUC))
ROCCurveAUC_Summary$ROCCurveAUC <- as.numeric(as.character(ROCCurveAUC_Summary$ROCCurveAUC))
ROCCurveAUC_Summary$Set <- factor(ROCCurveAUC_Summary$Set,
                                  levels = c("Cross-Validation",
                                             "Test"))
ROCCurveAUC_Summary$Model <- factor(ROCCurveAUC_Summary$Model,
                                    levels = c('LR_REF',
                                               'LR_BCT',
                                               'LR_YJT',
                                               'LR_ET',
                                               'LR_IHST',
                                               'LR_LOG10T',
                                               'LR_LNT',
                                               'LR_SRT',
                                               'LR_WT',
                                               'LR_SST'))
print(ROCCurveAUC_Summary, row.names=FALSE)
## Model Set ROCCurveAUC
## LR_REF Cross-Validation 0.8747542
## LR_BCT Cross-Validation 0.8887838
## LR_YJT Cross-Validation 0.8807066
## LR_ET Cross-Validation 0.8805333
## LR_IHST Cross-Validation 0.8710722
## LR_LOG10T Cross-Validation 0.8919210
## LR_LNT Cross-Validation 0.8919210
## LR_SRT Cross-Validation 0.8844149
## LR_WT Cross-Validation 0.8742874
## LR_SST Cross-Validation 0.8660886
## LR_REF Test 0.8844739
## LR_BCT Test 0.8967622
## LR_YJT Test 0.8906181
## LR_ET Test 0.8901330
## LR_IHST Test 0.8799871
## LR_LOG10T Test 0.8988237
## LR_LNT Test 0.8988237
## LR_SRT Test 0.8946198
## LR_WT Test 0.8891629
## LR_SST Test 0.8878694
(ROCCurveAUC_Plot <- dotplot(Model ~ ROCCurveAUC,
                             data = ROCCurveAUC_Summary,
                             groups = Set,
                             main = "Classification Model Performance Comparison",
                             ylab = "Model",
                             xlab = "ROC Curve AUC",
                             auto.key = list(adj = 1),
                             type=c("p", "h"),
                             origin = 0,
                             alpha = 0.45,
                             pch = 16,
                             cex = 2))
