1.2 Data Quality Assessment
Data quality assessment findings:
[A] No missing observations noted for any variable.
[B] Low variance observed for 127 variables with First.Second.Mode.Ratio>5 (a caret-based cross-check is sketched after this list):
[B.1]-[B.33] FP013 to FP045 variables (factor)
[B.34]-[B.45] FP048 to FP059 variables (factor)
[B.46] FP114 variable (factor)
[B.47]-[B.50] FP119 to FP122 variables (factor)
[B.51]-[B.88] FP124 to FP161 variables (factor)
[B.89]-[B.118] FP172 to FP201 variables (factor)
[B.119]-[B.124] FP203 to FP208 variables (factor)
[B.125] NumSulfer variable (numeric)
[B.126] NumChlorine variable (numeric)
[B.127] NumHalogen variable (numeric)
[C] Low variance observed for 4 variables with Unique.Count.Ratio<0.01:
[C.1] NumDblBonds variable (numeric)
[C.2] NumNitrogen variable (numeric)
[C.3] NumSulfer variable (numeric)
[C.4] NumRings variable (numeric)
[D] High skewness observed for 3 variables with Skewness>3 or Skewness<(-3):
[D.1] NumSulfer variable (numeric)
[D.2] NumChlorine variable (numeric)
[D.3] HydrophilicFactor variable (numeric)
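As a cross-check on thresholds [B] and [C], caret's nearZeroVar() applies analogous frequency-ratio and unique-percentage screens in a single call; a minimal sketch, assuming the caret package is loaded:
##################################
# Cross-checking the low-variance flags with caret::nearZeroVar
# (freqRatio mirrors First.Second.Mode.Ratio; percentUnique
# mirrors Unique.Count.Ratio expressed as a percentage)
##################################
NZV.Metrics <- nearZeroVar(Solubility_Train, saveMetrics = TRUE)
head(NZV.Metrics[NZV.Metrics$freqRatio > 5 | NZV.Metrics$percentUnique < 1, ])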
##################################
# Loading dataset
##################################
DQA <- Solubility_Train
##################################
# Formulating an overall data quality assessment summary
##################################
(DQA.Summary <- data.frame(
  Column.Index=c(1:length(names(DQA))),
  Column.Name=names(DQA),
  Column.Type=sapply(DQA, function(x) class(x)),
  Row.Count=sapply(DQA, function(x) nrow(DQA)),
  NA.Count=sapply(DQA, function(x) sum(is.na(x))),
  Fill.Rate=sapply(DQA, function(x) format(round((sum(!is.na(x))/nrow(DQA)), 3), nsmall=3)),
  row.names=NULL)
)
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 solTrainY numeric 951 0 1.000
## 2 2 FP001 integer 951 0 1.000
## 3 3 FP002 integer 951 0 1.000
## 4 4 FP003 integer 951 0 1.000
## 5 5 FP004 integer 951 0 1.000
## 6 6 FP005 integer 951 0 1.000
## 7 7 FP006 integer 951 0 1.000
## 8 8 FP007 integer 951 0 1.000
## 9 9 FP008 integer 951 0 1.000
## 10 10 FP009 integer 951 0 1.000
## 11 11 FP010 integer 951 0 1.000
## 12 12 FP011 integer 951 0 1.000
## 13 13 FP012 integer 951 0 1.000
## 14 14 FP013 integer 951 0 1.000
## 15 15 FP014 integer 951 0 1.000
## 16 16 FP015 integer 951 0 1.000
## 17 17 FP016 integer 951 0 1.000
## 18 18 FP017 integer 951 0 1.000
## 19 19 FP018 integer 951 0 1.000
## 20 20 FP019 integer 951 0 1.000
## 21 21 FP020 integer 951 0 1.000
## 22 22 FP021 integer 951 0 1.000
## 23 23 FP022 integer 951 0 1.000
## 24 24 FP023 integer 951 0 1.000
## 25 25 FP024 integer 951 0 1.000
## 26 26 FP025 integer 951 0 1.000
## 27 27 FP026 integer 951 0 1.000
## 28 28 FP027 integer 951 0 1.000
## 29 29 FP028 integer 951 0 1.000
## 30 30 FP029 integer 951 0 1.000
## 31 31 FP030 integer 951 0 1.000
## 32 32 FP031 integer 951 0 1.000
## 33 33 FP032 integer 951 0 1.000
## 34 34 FP033 integer 951 0 1.000
## 35 35 FP034 integer 951 0 1.000
## 36 36 FP035 integer 951 0 1.000
## 37 37 FP036 integer 951 0 1.000
## 38 38 FP037 integer 951 0 1.000
## 39 39 FP038 integer 951 0 1.000
## 40 40 FP039 integer 951 0 1.000
## 41 41 FP040 integer 951 0 1.000
## 42 42 FP041 integer 951 0 1.000
## 43 43 FP042 integer 951 0 1.000
## 44 44 FP043 integer 951 0 1.000
## 45 45 FP044 integer 951 0 1.000
## 46 46 FP045 integer 951 0 1.000
## 47 47 FP046 integer 951 0 1.000
## 48 48 FP047 integer 951 0 1.000
## 49 49 FP048 integer 951 0 1.000
## 50 50 FP049 integer 951 0 1.000
## 51 51 FP050 integer 951 0 1.000
## 52 52 FP051 integer 951 0 1.000
## 53 53 FP052 integer 951 0 1.000
## 54 54 FP053 integer 951 0 1.000
## 55 55 FP054 integer 951 0 1.000
## 56 56 FP055 integer 951 0 1.000
## 57 57 FP056 integer 951 0 1.000
## 58 58 FP057 integer 951 0 1.000
## 59 59 FP058 integer 951 0 1.000
## 60 60 FP059 integer 951 0 1.000
## 61 61 FP060 integer 951 0 1.000
## 62 62 FP061 integer 951 0 1.000
## 63 63 FP062 integer 951 0 1.000
## 64 64 FP063 integer 951 0 1.000
## 65 65 FP064 integer 951 0 1.000
## 66 66 FP065 integer 951 0 1.000
## 67 67 FP066 integer 951 0 1.000
## 68 68 FP067 integer 951 0 1.000
## 69 69 FP068 integer 951 0 1.000
## 70 70 FP069 integer 951 0 1.000
## 71 71 FP070 integer 951 0 1.000
## 72 72 FP071 integer 951 0 1.000
## 73 73 FP072 integer 951 0 1.000
## 74 74 FP073 integer 951 0 1.000
## 75 75 FP074 integer 951 0 1.000
## 76 76 FP075 integer 951 0 1.000
## 77 77 FP076 integer 951 0 1.000
## 78 78 FP077 integer 951 0 1.000
## 79 79 FP078 integer 951 0 1.000
## 80 80 FP079 integer 951 0 1.000
## 81 81 FP080 integer 951 0 1.000
## 82 82 FP081 integer 951 0 1.000
## 83 83 FP082 integer 951 0 1.000
## 84 84 FP083 integer 951 0 1.000
## 85 85 FP084 integer 951 0 1.000
## 86 86 FP085 integer 951 0 1.000
## 87 87 FP086 integer 951 0 1.000
## 88 88 FP087 integer 951 0 1.000
## 89 89 FP088 integer 951 0 1.000
## 90 90 FP089 integer 951 0 1.000
## 91 91 FP090 integer 951 0 1.000
## 92 92 FP091 integer 951 0 1.000
## 93 93 FP092 integer 951 0 1.000
## 94 94 FP093 integer 951 0 1.000
## 95 95 FP094 integer 951 0 1.000
## 96 96 FP095 integer 951 0 1.000
## 97 97 FP096 integer 951 0 1.000
## 98 98 FP097 integer 951 0 1.000
## 99 99 FP098 integer 951 0 1.000
## 100 100 FP099 integer 951 0 1.000
## 101 101 FP100 integer 951 0 1.000
## 102 102 FP101 integer 951 0 1.000
## 103 103 FP102 integer 951 0 1.000
## 104 104 FP103 integer 951 0 1.000
## 105 105 FP104 integer 951 0 1.000
## 106 106 FP105 integer 951 0 1.000
## 107 107 FP106 integer 951 0 1.000
## 108 108 FP107 integer 951 0 1.000
## 109 109 FP108 integer 951 0 1.000
## 110 110 FP109 integer 951 0 1.000
## 111 111 FP110 integer 951 0 1.000
## 112 112 FP111 integer 951 0 1.000
## 113 113 FP112 integer 951 0 1.000
## 114 114 FP113 integer 951 0 1.000
## 115 115 FP114 integer 951 0 1.000
## 116 116 FP115 integer 951 0 1.000
## 117 117 FP116 integer 951 0 1.000
## 118 118 FP117 integer 951 0 1.000
## 119 119 FP118 integer 951 0 1.000
## 120 120 FP119 integer 951 0 1.000
## 121 121 FP120 integer 951 0 1.000
## 122 122 FP121 integer 951 0 1.000
## 123 123 FP122 integer 951 0 1.000
## 124 124 FP123 integer 951 0 1.000
## 125 125 FP124 integer 951 0 1.000
## 126 126 FP125 integer 951 0 1.000
## 127 127 FP126 integer 951 0 1.000
## 128 128 FP127 integer 951 0 1.000
## 129 129 FP128 integer 951 0 1.000
## 130 130 FP129 integer 951 0 1.000
## 131 131 FP130 integer 951 0 1.000
## 132 132 FP131 integer 951 0 1.000
## 133 133 FP132 integer 951 0 1.000
## 134 134 FP133 integer 951 0 1.000
## 135 135 FP134 integer 951 0 1.000
## 136 136 FP135 integer 951 0 1.000
## 137 137 FP136 integer 951 0 1.000
## 138 138 FP137 integer 951 0 1.000
## 139 139 FP138 integer 951 0 1.000
## 140 140 FP139 integer 951 0 1.000
## 141 141 FP140 integer 951 0 1.000
## 142 142 FP141 integer 951 0 1.000
## 143 143 FP142 integer 951 0 1.000
## 144 144 FP143 integer 951 0 1.000
## 145 145 FP144 integer 951 0 1.000
## 146 146 FP145 integer 951 0 1.000
## 147 147 FP146 integer 951 0 1.000
## 148 148 FP147 integer 951 0 1.000
## 149 149 FP148 integer 951 0 1.000
## 150 150 FP149 integer 951 0 1.000
## 151 151 FP150 integer 951 0 1.000
## 152 152 FP151 integer 951 0 1.000
## 153 153 FP152 integer 951 0 1.000
## 154 154 FP153 integer 951 0 1.000
## 155 155 FP154 integer 951 0 1.000
## 156 156 FP155 integer 951 0 1.000
## 157 157 FP156 integer 951 0 1.000
## 158 158 FP157 integer 951 0 1.000
## 159 159 FP158 integer 951 0 1.000
## 160 160 FP159 integer 951 0 1.000
## 161 161 FP160 integer 951 0 1.000
## 162 162 FP161 integer 951 0 1.000
## 163 163 FP162 integer 951 0 1.000
## 164 164 FP163 integer 951 0 1.000
## 165 165 FP164 integer 951 0 1.000
## 166 166 FP165 integer 951 0 1.000
## 167 167 FP166 integer 951 0 1.000
## 168 168 FP167 integer 951 0 1.000
## 169 169 FP168 integer 951 0 1.000
## 170 170 FP169 integer 951 0 1.000
## 171 171 FP170 integer 951 0 1.000
## 172 172 FP171 integer 951 0 1.000
## 173 173 FP172 integer 951 0 1.000
## 174 174 FP173 integer 951 0 1.000
## 175 175 FP174 integer 951 0 1.000
## 176 176 FP175 integer 951 0 1.000
## 177 177 FP176 integer 951 0 1.000
## 178 178 FP177 integer 951 0 1.000
## 179 179 FP178 integer 951 0 1.000
## 180 180 FP179 integer 951 0 1.000
## 181 181 FP180 integer 951 0 1.000
## 182 182 FP181 integer 951 0 1.000
## 183 183 FP182 integer 951 0 1.000
## 184 184 FP183 integer 951 0 1.000
## 185 185 FP184 integer 951 0 1.000
## 186 186 FP185 integer 951 0 1.000
## 187 187 FP186 integer 951 0 1.000
## 188 188 FP187 integer 951 0 1.000
## 189 189 FP188 integer 951 0 1.000
## 190 190 FP189 integer 951 0 1.000
## 191 191 FP190 integer 951 0 1.000
## 192 192 FP191 integer 951 0 1.000
## 193 193 FP192 integer 951 0 1.000
## 194 194 FP193 integer 951 0 1.000
## 195 195 FP194 integer 951 0 1.000
## 196 196 FP195 integer 951 0 1.000
## 197 197 FP196 integer 951 0 1.000
## 198 198 FP197 integer 951 0 1.000
## 199 199 FP198 integer 951 0 1.000
## 200 200 FP199 integer 951 0 1.000
## 201 201 FP200 integer 951 0 1.000
## 202 202 FP201 integer 951 0 1.000
## 203 203 FP202 integer 951 0 1.000
## 204 204 FP203 integer 951 0 1.000
## 205 205 FP204 integer 951 0 1.000
## 206 206 FP205 integer 951 0 1.000
## 207 207 FP206 integer 951 0 1.000
## 208 208 FP207 integer 951 0 1.000
## 209 209 FP208 integer 951 0 1.000
## 210 210 MolWeight numeric 951 0 1.000
## 211 211 NumAtoms integer 951 0 1.000
## 212 212 NumNonHAtoms integer 951 0 1.000
## 213 213 NumBonds integer 951 0 1.000
## 214 214 NumNonHBonds integer 951 0 1.000
## 215 215 NumMultBonds integer 951 0 1.000
## 216 216 NumRotBonds integer 951 0 1.000
## 217 217 NumDblBonds integer 951 0 1.000
## 218 218 NumAromaticBonds integer 951 0 1.000
## 219 219 NumHydrogen integer 951 0 1.000
## 220 220 NumCarbon integer 951 0 1.000
## 221 221 NumNitrogen integer 951 0 1.000
## 222 222 NumOxygen integer 951 0 1.000
## 223 223 NumSulfer integer 951 0 1.000
## 224 224 NumChlorine integer 951 0 1.000
## 225 225 NumHalogen integer 951 0 1.000
## 226 226 NumRings integer 951 0 1.000
## 227 227 HydrophilicFactor numeric 951 0 1.000
## 228 228 SurfaceArea1 numeric 951 0 1.000
## 229 229 SurfaceArea2 numeric 951 0 1.000
##################################
# Listing all predictors
##################################
DQA.Predictors <- DQA[,!names(DQA) %in% c("solTrainY")]
##################################
# Listing all numeric predictors
##################################
DQA.Predictors.Numeric <- DQA.Predictors[,-(grep("FP", names(DQA.Predictors)))]
if (length(names(DQA.Predictors.Numeric)) > 0) {
  print(paste0("There are ",
               length(names(DQA.Predictors.Numeric)),
               " numeric predictor variable(s)."))
} else {
  print("There are no numeric predictor variables.")
}
## [1] "There are 20 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
DQA.Predictors.Factor <- as.data.frame(lapply(DQA.Predictors[grep("FP", names(DQA.Predictors))], factor))
if (length(names(DQA.Predictors.Factor)) > 0) {
  print(paste0("There are ",
               length(names(DQA.Predictors.Factor)),
               " factor predictor variable(s)."))
} else {
  print("There are no factor predictor variables.")
}
## [1] "There are 208 factor predictor variable(s)."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor)) > 0) {
  ##################################
  # Formulating a function to determine the first mode
  ##################################
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }
  ##################################
  # Formulating a function to determine the second mode
  ##################################
  SecondModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    fm <- ux[tab == max(tab)]
    # Drop all first-mode values, then repeat the mode search
    sm <- na.omit(x)[!(na.omit(x) %in% fm)]
    usm <- unique(sm)
    tabsm <- tabulate(match(sm, usm))
    # Return a sentinel value when no second mode exists
    if (length(usm) == 0) return("x")
    usm[tabsm == max(tabsm)]
  }
  (DQA.Predictors.Factor.Summary <- data.frame(
    Column.Name=names(DQA.Predictors.Factor),
    Column.Type=sapply(DQA.Predictors.Factor, function(x) class(x)),
    Unique.Count=sapply(DQA.Predictors.Factor, function(x) length(unique(x))),
    First.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(FirstModes(x)[1])),
    Second.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(SecondModes(x)[1])),
    First.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == FirstModes(x)[1])),
    Second.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == SecondModes(x)[1])),
    Unique.Count.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Factor)), 3), nsmall=3)),
    First.Second.Mode.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])), 3), nsmall=3)),
    row.names=NULL)
  )
}
## Column.Name Column.Type Unique.Count First.Mode.Value Second.Mode.Value
## 1 FP001 factor 2 0 1
## 2 FP002 factor 2 1 0
## 3 FP003 factor 2 0 1
## 4 FP004 factor 2 1 0
## 5 FP005 factor 2 1 0
## 6 FP006 factor 2 0 1
## 7 FP007 factor 2 0 1
## 8 FP008 factor 2 0 1
## 9 FP009 factor 2 0 1
## 10 FP010 factor 2 0 1
## 11 FP011 factor 2 0 1
## 12 FP012 factor 2 0 1
## 13 FP013 factor 2 0 1
## 14 FP014 factor 2 0 1
## 15 FP015 factor 2 1 0
## 16 FP016 factor 2 0 1
## 17 FP017 factor 2 0 1
## 18 FP018 factor 2 0 1
## 19 FP019 factor 2 0 1
## 20 FP020 factor 2 0 1
## 21 FP021 factor 2 0 1
## 22 FP022 factor 2 0 1
## 23 FP023 factor 2 0 1
## 24 FP024 factor 2 0 1
## 25 FP025 factor 2 0 1
## 26 FP026 factor 2 0 1
## 27 FP027 factor 2 0 1
## 28 FP028 factor 2 0 1
## 29 FP029 factor 2 0 1
## 30 FP030 factor 2 0 1
## 31 FP031 factor 2 0 1
## 32 FP032 factor 2 0 1
## 33 FP033 factor 2 0 1
## 34 FP034 factor 2 0 1
## 35 FP035 factor 2 0 1
## 36 FP036 factor 2 0 1
## 37 FP037 factor 2 0 1
## 38 FP038 factor 2 0 1
## 39 FP039 factor 2 0 1
## 40 FP040 factor 2 0 1
## 41 FP041 factor 2 0 1
## 42 FP042 factor 2 0 1
## 43 FP043 factor 2 0 1
## 44 FP044 factor 2 0 1
## 45 FP045 factor 2 0 1
## 46 FP046 factor 2 0 1
## 47 FP047 factor 2 0 1
## 48 FP048 factor 2 0 1
## 49 FP049 factor 2 0 1
## 50 FP050 factor 2 0 1
## 51 FP051 factor 2 0 1
## 52 FP052 factor 2 0 1
## 53 FP053 factor 2 0 1
## 54 FP054 factor 2 0 1
## 55 FP055 factor 2 0 1
## 56 FP056 factor 2 0 1
## 57 FP057 factor 2 0 1
## 58 FP058 factor 2 0 1
## 59 FP059 factor 2 0 1
## 60 FP060 factor 2 0 1
## 61 FP061 factor 2 0 1
## 62 FP062 factor 2 0 1
## 63 FP063 factor 2 0 1
## 64 FP064 factor 2 0 1
## 65 FP065 factor 2 1 0
## 66 FP066 factor 2 1 0
## 67 FP067 factor 2 0 1
## 68 FP068 factor 2 0 1
## 69 FP069 factor 2 0 1
## 70 FP070 factor 2 0 1
## 71 FP071 factor 2 0 1
## 72 FP072 factor 2 1 0
## 73 FP073 factor 2 0 1
## 74 FP074 factor 2 0 1
## 75 FP075 factor 2 0 1
## 76 FP076 factor 2 0 1
## 77 FP077 factor 2 0 1
## 78 FP078 factor 2 0 1
## 79 FP079 factor 2 1 0
## 80 FP080 factor 2 0 1
## 81 FP081 factor 2 0 1
## 82 FP082 factor 2 1 0
## 83 FP083 factor 2 0 1
## 84 FP084 factor 2 0 1
## 85 FP085 factor 2 0 1
## 86 FP086 factor 2 0 1
## 87 FP087 factor 2 1 0
## 88 FP088 factor 2 0 1
## 89 FP089 factor 2 0 1
## 90 FP090 factor 2 0 1
## 91 FP091 factor 2 0 1
## 92 FP092 factor 2 0 1
## 93 FP093 factor 2 0 1
## 94 FP094 factor 2 0 1
## 95 FP095 factor 2 0 1
## 96 FP096 factor 2 0 1
## 97 FP097 factor 2 0 1
## 98 FP098 factor 2 0 1
## 99 FP099 factor 2 0 1
## 100 FP100 factor 2 0 1
## 101 FP101 factor 2 0 1
## 102 FP102 factor 2 0 1
## 103 FP103 factor 2 0 1
## 104 FP104 factor 2 0 1
## 105 FP105 factor 2 0 1
## 106 FP106 factor 2 0 1
## 107 FP107 factor 2 0 1
## 108 FP108 factor 2 0 1
## 109 FP109 factor 2 0 1
## 110 FP110 factor 2 0 1
## 111 FP111 factor 2 0 1
## 112 FP112 factor 2 0 1
## 113 FP113 factor 2 0 1
## 114 FP114 factor 2 0 1
## 115 FP115 factor 2 0 1
## 116 FP116 factor 2 0 1
## 117 FP117 factor 2 0 1
## 118 FP118 factor 2 0 1
## 119 FP119 factor 2 0 1
## 120 FP120 factor 2 0 1
## 121 FP121 factor 2 0 1
## 122 FP122 factor 2 0 1
## 123 FP123 factor 2 0 1
## 124 FP124 factor 2 0 1
## 125 FP125 factor 2 0 1
## 126 FP126 factor 2 0 1
## 127 FP127 factor 2 0 1
## 128 FP128 factor 2 0 1
## 129 FP129 factor 2 0 1
## 130 FP130 factor 2 0 1
## 131 FP131 factor 2 0 1
## 132 FP132 factor 2 0 1
## 133 FP133 factor 2 0 1
## 134 FP134 factor 2 0 1
## 135 FP135 factor 2 0 1
## 136 FP136 factor 2 0 1
## 137 FP137 factor 2 0 1
## 138 FP138 factor 2 0 1
## 139 FP139 factor 2 0 1
## 140 FP140 factor 2 0 1
## 141 FP141 factor 2 0 1
## 142 FP142 factor 2 0 1
## 143 FP143 factor 2 0 1
## 144 FP144 factor 2 0 1
## 145 FP145 factor 2 0 1
## 146 FP146 factor 2 0 1
## 147 FP147 factor 2 0 1
## 148 FP148 factor 2 0 1
## 149 FP149 factor 2 0 1
## 150 FP150 factor 2 0 1
## 151 FP151 factor 2 0 1
## 152 FP152 factor 2 0 1
## 153 FP153 factor 2 0 1
## 154 FP154 factor 2 0 1
## 155 FP155 factor 2 0 1
## 156 FP156 factor 2 0 1
## 157 FP157 factor 2 0 1
## 158 FP158 factor 2 0 1
## 159 FP159 factor 2 0 1
## 160 FP160 factor 2 0 1
## 161 FP161 factor 2 0 1
## 162 FP162 factor 2 0 1
## 163 FP163 factor 2 0 1
## 164 FP164 factor 2 1 0
## 165 FP165 factor 2 0 1
## 166 FP166 factor 2 0 1
## 167 FP167 factor 2 0 1
## 168 FP168 factor 2 1 0
## 169 FP169 factor 2 0 1
## 170 FP170 factor 2 0 1
## 171 FP171 factor 2 0 1
## 172 FP172 factor 2 0 1
## 173 FP173 factor 2 0 1
## 174 FP174 factor 2 0 1
## 175 FP175 factor 2 0 1
## 176 FP176 factor 2 0 1
## 177 FP177 factor 2 0 1
## 178 FP178 factor 2 0 1
## 179 FP179 factor 2 0 1
## 180 FP180 factor 2 0 1
## 181 FP181 factor 2 0 1
## 182 FP182 factor 2 0 1
## 183 FP183 factor 2 0 1
## 184 FP184 factor 2 0 1
## 185 FP185 factor 2 0 1
## 186 FP186 factor 2 0 1
## 187 FP187 factor 2 0 1
## 188 FP188 factor 2 0 1
## 189 FP189 factor 2 0 1
## 190 FP190 factor 2 0 1
## 191 FP191 factor 2 0 1
## 192 FP192 factor 2 0 1
## 193 FP193 factor 2 0 1
## 194 FP194 factor 2 0 1
## 195 FP195 factor 2 0 1
## 196 FP196 factor 2 0 1
## 197 FP197 factor 2 0 1
## 198 FP198 factor 2 0 1
## 199 FP199 factor 2 0 1
## 200 FP200 factor 2 0 1
## 201 FP201 factor 2 0 1
## 202 FP202 factor 2 0 1
## 203 FP203 factor 2 0 1
## 204 FP204 factor 2 0 1
## 205 FP205 factor 2 0 1
## 206 FP206 factor 2 0 1
## 207 FP207 factor 2 0 1
## 208 FP208 factor 2 0 1
## First.Mode.Count Second.Mode.Count Unique.Count.Ratio
## 1 482 469 0.002
## 2 513 438 0.002
## 3 536 415 0.002
## 4 556 395 0.002
## 5 551 400 0.002
## 6 570 381 0.002
## 7 605 346 0.002
## 8 641 310 0.002
## 9 685 266 0.002
## 10 781 170 0.002
## 11 747 204 0.002
## 12 783 168 0.002
## 13 793 158 0.002
## 14 798 153 0.002
## 15 818 133 0.002
## 16 812 139 0.002
## 17 814 137 0.002
## 18 826 125 0.002
## 19 835 116 0.002
## 20 837 114 0.002
## 21 836 115 0.002
## 22 852 99 0.002
## 23 834 117 0.002
## 24 844 107 0.002
## 25 841 110 0.002
## 26 871 80 0.002
## 27 858 93 0.002
## 28 850 101 0.002
## 29 854 97 0.002
## 30 862 89 0.002
## 31 866 85 0.002
## 32 881 70 0.002
## 33 885 66 0.002
## 34 875 76 0.002
## 35 882 69 0.002
## 36 879 72 0.002
## 37 884 67 0.002
## 38 869 82 0.002
## 39 880 71 0.002
## 40 886 65 0.002
## 41 891 60 0.002
## 42 897 54 0.002
## 43 888 63 0.002
## 44 894 57 0.002
## 45 898 53 0.002
## 46 651 300 0.002
## 47 698 253 0.002
## 48 833 118 0.002
## 49 835 116 0.002
## 50 844 107 0.002
## 51 847 104 0.002
## 52 864 87 0.002
## 53 862 89 0.002
## 54 879 72 0.002
## 55 900 51 0.002
## 56 889 62 0.002
## 57 837 114 0.002
## 58 843 108 0.002
## 59 899 52 0.002
## 60 493 458 0.002
## 61 526 425 0.002
## 62 535 416 0.002
## 63 546 405 0.002
## 64 555 396 0.002
## 65 564 387 0.002
## 66 580 371 0.002
## 67 590 361 0.002
## 68 607 344 0.002
## 69 607 344 0.002
## 70 613 338 0.002
## 71 640 311 0.002
## 72 626 325 0.002
## 73 656 295 0.002
## 74 642 309 0.002
## 75 629 322 0.002
## 76 639 312 0.002
## 77 646 305 0.002
## 78 662 289 0.002
## 79 656 295 0.002
## 80 663 288 0.002
## 81 686 265 0.002
## 82 679 272 0.002
## 83 691 260 0.002
## 84 679 272 0.002
## 85 708 243 0.002
## 86 695 256 0.002
## 87 691 260 0.002
## 88 701 250 0.002
## 89 716 235 0.002
## 90 714 237 0.002
## 91 737 214 0.002
## 92 719 232 0.002
## 93 719 232 0.002
## 94 731 220 0.002
## 95 742 209 0.002
## 96 744 207 0.002
## 97 727 224 0.002
## 98 725 226 0.002
## 99 735 216 0.002
## 100 731 220 0.002
## 101 726 225 0.002
## 102 759 192 0.002
## 103 743 208 0.002
## 104 739 212 0.002
## 105 746 205 0.002
## 106 769 182 0.002
## 107 750 201 0.002
## 108 756 195 0.002
## 109 783 168 0.002
## 110 755 196 0.002
## 111 764 187 0.002
## 112 766 185 0.002
## 113 765 186 0.002
## 114 803 148 0.002
## 115 781 170 0.002
## 116 768 183 0.002
## 117 781 170 0.002
## 118 768 183 0.002
## 119 796 155 0.002
## 120 793 158 0.002
## 121 818 133 0.002
## 122 795 156 0.002
## 123 792 159 0.002
## 124 797 154 0.002
## 125 803 148 0.002
## 126 810 141 0.002
## 127 818 133 0.002
## 128 810 141 0.002
## 129 819 132 0.002
## 130 851 100 0.002
## 131 831 120 0.002
## 132 832 119 0.002
## 133 831 120 0.002
## 134 830 121 0.002
## 135 831 120 0.002
## 136 836 115 0.002
## 137 841 110 0.002
## 138 845 106 0.002
## 139 873 78 0.002
## 140 845 106 0.002
## 141 840 111 0.002
## 142 847 104 0.002
## 143 874 77 0.002
## 144 852 99 0.002
## 145 852 99 0.002
## 146 853 98 0.002
## 147 851 100 0.002
## 148 868 83 0.002
## 149 865 86 0.002
## 150 876 75 0.002
## 151 898 53 0.002
## 152 873 78 0.002
## 153 877 74 0.002
## 154 915 36 0.002
## 155 885 66 0.002
## 156 884 67 0.002
## 157 892 59 0.002
## 158 900 51 0.002
## 159 884 67 0.002
## 160 886 65 0.002
## 161 888 63 0.002
## 162 480 471 0.002
## 163 498 453 0.002
## 164 597 354 0.002
## 165 619 332 0.002
## 166 636 315 0.002
## 167 639 312 0.002
## 168 633 318 0.002
## 169 774 177 0.002
## 170 776 175 0.002
## 171 790 161 0.002
## 172 807 144 0.002
## 173 816 135 0.002
## 174 827 124 0.002
## 175 823 128 0.002
## 176 835 116 0.002
## 177 836 115 0.002
## 178 836 115 0.002
## 179 858 93 0.002
## 180 849 102 0.002
## 181 862 89 0.002
## 182 857 94 0.002
## 183 879 72 0.002
## 184 871 80 0.002
## 185 870 81 0.002
## 186 878 73 0.002
## 187 882 69 0.002
## 188 886 65 0.002
## 189 878 73 0.002
## 190 882 69 0.002
## 191 884 67 0.002
## 192 893 58 0.002
## 193 892 59 0.002
## 194 895 56 0.002
## 195 893 58 0.002
## 196 897 54 0.002
## 197 901 50 0.002
## 198 897 54 0.002
## 199 906 45 0.002
## 200 904 47 0.002
## 201 901 50 0.002
## 202 706 245 0.002
## 203 842 109 0.002
## 204 857 94 0.002
## 205 877 74 0.002
## 206 894 57 0.002
## 207 897 54 0.002
## 208 844 107 0.002
## First.Second.Mode.Ratio
## 1 1.028
## 2 1.171
## 3 1.292
## 4 1.408
## 5 1.377
## 6 1.496
## 7 1.749
## 8 2.068
## 9 2.575
## 10 4.594
## 11 3.662
## 12 4.661
## 13 5.019
## 14 5.216
## 15 6.150
## 16 5.842
## 17 5.942
## 18 6.608
## 19 7.198
## 20 7.342
## 21 7.270
## 22 8.606
## 23 7.128
## 24 7.888
## 25 7.645
## 26 10.887
## 27 9.226
## 28 8.416
## 29 8.804
## 30 9.685
## 31 10.188
## 32 12.586
## 33 13.409
## 34 11.513
## 35 12.783
## 36 12.208
## 37 13.194
## 38 10.598
## 39 12.394
## 40 13.631
## 41 14.850
## 42 16.611
## 43 14.095
## 44 15.684
## 45 16.943
## 46 2.170
## 47 2.759
## 48 7.059
## 49 7.198
## 50 7.888
## 51 8.144
## 52 9.931
## 53 9.685
## 54 12.208
## 55 17.647
## 56 14.339
## 57 7.342
## 58 7.806
## 59 17.288
## 60 1.076
## 61 1.238
## 62 1.286
## 63 1.348
## 64 1.402
## 65 1.457
## 66 1.563
## 67 1.634
## 68 1.765
## 69 1.765
## 70 1.814
## 71 2.058
## 72 1.926
## 73 2.224
## 74 2.078
## 75 1.953
## 76 2.048
## 77 2.118
## 78 2.291
## 79 2.224
## 80 2.302
## 81 2.589
## 82 2.496
## 83 2.658
## 84 2.496
## 85 2.914
## 86 2.715
## 87 2.658
## 88 2.804
## 89 3.047
## 90 3.013
## 91 3.444
## 92 3.099
## 93 3.099
## 94 3.323
## 95 3.550
## 96 3.594
## 97 3.246
## 98 3.208
## 99 3.403
## 100 3.323
## 101 3.227
## 102 3.953
## 103 3.572
## 104 3.486
## 105 3.639
## 106 4.225
## 107 3.731
## 108 3.877
## 109 4.661
## 110 3.852
## 111 4.086
## 112 4.141
## 113 4.113
## 114 5.426
## 115 4.594
## 116 4.197
## 117 4.594
## 118 4.197
## 119 5.135
## 120 5.019
## 121 6.150
## 122 5.096
## 123 4.981
## 124 5.175
## 125 5.426
## 126 5.745
## 127 6.150
## 128 5.745
## 129 6.205
## 130 8.510
## 131 6.925
## 132 6.992
## 133 6.925
## 134 6.860
## 135 6.925
## 136 7.270
## 137 7.645
## 138 7.972
## 139 11.192
## 140 7.972
## 141 7.568
## 142 8.144
## 143 11.351
## 144 8.606
## 145 8.606
## 146 8.704
## 147 8.510
## 148 10.458
## 149 10.058
## 150 11.680
## 151 16.943
## 152 11.192
## 153 11.851
## 154 25.417
## 155 13.409
## 156 13.194
## 157 15.119
## 158 17.647
## 159 13.194
## 160 13.631
## 161 14.095
## 162 1.019
## 163 1.099
## 164 1.686
## 165 1.864
## 166 2.019
## 167 2.048
## 168 1.991
## 169 4.373
## 170 4.434
## 171 4.907
## 172 5.604
## 173 6.044
## 174 6.669
## 175 6.430
## 176 7.198
## 177 7.270
## 178 7.270
## 179 9.226
## 180 8.324
## 181 9.685
## 182 9.117
## 183 12.208
## 184 10.887
## 185 10.741
## 186 12.027
## 187 12.783
## 188 13.631
## 189 12.027
## 190 12.783
## 191 13.194
## 192 15.397
## 193 15.119
## 194 15.982
## 195 15.397
## 196 16.611
## 197 18.020
## 198 16.611
## 199 20.133
## 200 19.234
## 201 18.020
## 202 2.882
## 203 7.725
## 204 9.117
## 205 11.851
## 206 15.684
## 207 16.611
## 208 7.888
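To make the mode logic above concrete, a quick illustration of the two helper functions on a toy vector (illustrative only; not part of the assessment):
##################################
# Sanity-checking the mode helpers on a toy vector
##################################
toy <- c(0, 0, 0, 1, 1, 2)
FirstModes(toy)   # 0: the most frequent value
SecondModes(toy)  # 1: the most frequent among the remaining values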
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric)) > 0) {
  ##################################
  # Formulating a function to determine the first mode
  ##################################
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }
  ##################################
  # Formulating a function to determine the second mode
  ##################################
  SecondModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    fm <- ux[tab == max(tab)]
    # Drop all first-mode values, then repeat the mode search
    sm <- na.omit(x)[!(na.omit(x) %in% fm)]
    usm <- unique(sm)
    tabsm <- tabulate(match(sm, usm))
    # Return a small sentinel when no second mode exists
    # (avoids division by zero in First.Second.Mode.Ratio)
    if (length(usm) == 0) return(0.00001)
    usm[tabsm == max(tabsm)]
  }
  ##################################
  # Note: skewness() and kurtosis() are assumed to come from a
  # moments-style package (e.g. moments or e1071) loaded upstream
  ##################################
  (DQA.Predictors.Numeric.Summary <- data.frame(
    Column.Name=names(DQA.Predictors.Numeric),
    Column.Type=sapply(DQA.Predictors.Numeric, function(x) class(x)),
    Unique.Count=sapply(DQA.Predictors.Numeric, function(x) length(unique(x))),
    Unique.Count.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Numeric)), 3), nsmall=3)),
    First.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((FirstModes(x)[1]), 3), nsmall=3)),
    Second.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((SecondModes(x)[1]), 3), nsmall=3)),
    First.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == FirstModes(x)[1])),
    Second.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == SecondModes(x)[1])),
    First.Second.Mode.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])), 3), nsmall=3)),
    Minimum=sapply(DQA.Predictors.Numeric, function(x) format(round(min(x, na.rm = TRUE), 3), nsmall=3)),
    Mean=sapply(DQA.Predictors.Numeric, function(x) format(round(mean(x, na.rm = TRUE), 3), nsmall=3)),
    Median=sapply(DQA.Predictors.Numeric, function(x) format(round(median(x, na.rm = TRUE), 3), nsmall=3)),
    Maximum=sapply(DQA.Predictors.Numeric, function(x) format(round(max(x, na.rm = TRUE), 3), nsmall=3)),
    Skewness=sapply(DQA.Predictors.Numeric, function(x) format(round(skewness(x, na.rm = TRUE), 3), nsmall=3)),
    Kurtosis=sapply(DQA.Predictors.Numeric, function(x) format(round(kurtosis(x, na.rm = TRUE), 3), nsmall=3)),
    Percentile25th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x, probs=0.25, na.rm = TRUE), 3), nsmall=3)),
    Percentile75th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x, probs=0.75, na.rm = TRUE), 3), nsmall=3)),
    row.names=NULL)
  )
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio
## 1 MolWeight numeric 646 0.679
## 2 NumAtoms integer 66 0.069
## 3 NumNonHAtoms integer 36 0.038
## 4 NumBonds integer 72 0.076
## 5 NumNonHBonds integer 39 0.041
## 6 NumMultBonds integer 25 0.026
## 7 NumRotBonds integer 15 0.016
## 8 NumDblBonds integer 8 0.008
## 9 NumAromaticBonds integer 16 0.017
## 10 NumHydrogen integer 41 0.043
## 11 NumCarbon integer 28 0.029
## 12 NumNitrogen integer 7 0.007
## 13 NumOxygen integer 11 0.012
## 14 NumSulfer integer 5 0.005
## 15 NumChlorine integer 11 0.012
## 16 NumHalogen integer 11 0.012
## 17 NumRings integer 8 0.008
## 18 HydrophilicFactor numeric 369 0.388
## 19 SurfaceArea1 numeric 252 0.265
## 20 SurfaceArea2 numeric 287 0.302
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 1 102.200 116.230 16 14
## 2 22.000 24.000 73 51
## 3 8.000 11.000 104 73
## 4 23.000 19.000 69 56
## 5 8.000 7.000 82 66
## 6 0.000 7.000 158 122
## 7 0.000 1.000 272 186
## 8 0.000 1.000 427 268
## 9 0.000 6.000 400 302
## 10 12.000 8.000 83 79
## 11 6.000 7.000 105 97
## 12 0.000 1.000 546 191
## 13 0.000 2.000 325 218
## 14 0.000 1.000 830 96
## 15 0.000 1.000 750 81
## 16 0.000 1.000 685 107
## 17 1.000 0.000 323 260
## 18 -0.828 -0.158 21 20
## 19 0.000 20.230 218 76
## 20 0.000 20.230 211 75
## First.Second.Mode.Ratio Minimum Mean Median Maximum Skewness Kurtosis
## 1 1.143 46.090 201.654 179.230 665.810 0.988 3.945
## 2 1.431 5.000 25.507 22.000 94.000 1.364 5.523
## 3 1.425 2.000 13.161 12.000 47.000 0.993 4.129
## 4 1.232 4.000 25.909 23.000 97.000 1.360 5.408
## 5 1.242 1.000 13.563 12.000 50.000 0.969 3.842
## 6 1.295 0.000 6.148 6.000 25.000 0.670 3.053
## 7 1.462 0.000 2.251 2.000 16.000 1.577 6.437
## 8 1.593 0.000 1.006 1.000 7.000 1.360 4.760
## 9 1.325 0.000 5.121 6.000 25.000 0.796 3.241
## 10 1.051 0.000 12.346 11.000 47.000 1.262 5.261
## 11 1.082 1.000 9.893 9.000 33.000 0.927 3.616
## 12 2.859 0.000 0.813 0.000 6.000 1.554 4.831
## 13 1.491 0.000 1.574 1.000 13.000 1.772 8.494
## 14 8.646 0.000 0.164 0.000 4.000 3.842 21.526
## 15 9.259 0.000 0.556 0.000 10.000 3.178 13.780
## 16 6.402 0.000 0.698 0.000 10.000 2.691 10.808
## 17 1.242 0.000 1.402 1.000 7.000 1.034 3.875
## 18 1.050 -0.985 -0.021 -0.314 13.483 3.404 27.504
## 19 2.868 0.000 36.459 29.100 331.940 1.714 9.714
## 20 2.813 0.000 40.234 33.120 331.940 1.475 7.485
## Percentile25th Percentile75th
## 1 122.605 264.340
## 2 17.000 31.000
## 3 8.000 17.000
## 4 17.000 31.500
## 5 8.000 18.000
## 6 1.000 10.000
## 7 0.000 3.500
## 8 0.000 2.000
## 9 0.000 6.000
## 10 7.000 16.000
## 11 6.000 12.000
## 12 0.000 1.000
## 13 0.000 2.000
## 14 0.000 0.000
## 15 0.000 0.000
## 16 0.000 1.000
## 17 0.000 2.000
## 18 -0.763 0.313
## 19 9.230 53.280
## 20 10.630 60.660
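The Skewness column above drives the screening rule applied further below; for reference, a hand-rolled moment-based sample skewness (a sketch assuming no missing values and the moments-package definition):
##################################
# Hand-rolled moment-based skewness for reference
# (the table above relies on skewness() from a package
# loaded upstream, e.g. moments)
##################################
SkewCheck <- function(x) mean((x - mean(x))^3) / (mean((x - mean(x))^2))^(3/2)
SkewCheck(DQA.Predictors.Numeric$NumSulfer)  # expected to be close to the 3.842 reported above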
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
if (nrow(DQA.Summary[DQA.Summary$NA.Count > 0,]) > 0) {
  print(paste0("Missing observations noted for ",
               nrow(DQA.Summary[DQA.Summary$NA.Count > 0,]),
               " variable(s) with NA.Count>0 and Fill.Rate<1.0."))
  DQA.Summary[DQA.Summary$NA.Count > 0,]
} else {
  print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
if (length(names(DQA.Predictors.Factor)) == 0) {
  print("No factor predictors noted.")
} else if (nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5,]) > 0) {
  print(paste0("Low variance observed for ",
               nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5,]),
               " factor variable(s) with First.Second.Mode.Ratio>5."))
  DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5,]
} else {
  print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "Low variance observed for 124 factor variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count First.Mode.Value Second.Mode.Value
## 13 FP013 factor 2 0 1
## 14 FP014 factor 2 0 1
## 15 FP015 factor 2 1 0
## 16 FP016 factor 2 0 1
## 17 FP017 factor 2 0 1
## 18 FP018 factor 2 0 1
## 19 FP019 factor 2 0 1
## 20 FP020 factor 2 0 1
## 21 FP021 factor 2 0 1
## 22 FP022 factor 2 0 1
## 23 FP023 factor 2 0 1
## 24 FP024 factor 2 0 1
## 25 FP025 factor 2 0 1
## 26 FP026 factor 2 0 1
## 27 FP027 factor 2 0 1
## 28 FP028 factor 2 0 1
## 29 FP029 factor 2 0 1
## 30 FP030 factor 2 0 1
## 31 FP031 factor 2 0 1
## 32 FP032 factor 2 0 1
## 33 FP033 factor 2 0 1
## 34 FP034 factor 2 0 1
## 35 FP035 factor 2 0 1
## 36 FP036 factor 2 0 1
## 37 FP037 factor 2 0 1
## 38 FP038 factor 2 0 1
## 39 FP039 factor 2 0 1
## 40 FP040 factor 2 0 1
## 41 FP041 factor 2 0 1
## 42 FP042 factor 2 0 1
## 43 FP043 factor 2 0 1
## 44 FP044 factor 2 0 1
## 45 FP045 factor 2 0 1
## 48 FP048 factor 2 0 1
## 49 FP049 factor 2 0 1
## 50 FP050 factor 2 0 1
## 51 FP051 factor 2 0 1
## 52 FP052 factor 2 0 1
## 53 FP053 factor 2 0 1
## 54 FP054 factor 2 0 1
## 55 FP055 factor 2 0 1
## 56 FP056 factor 2 0 1
## 57 FP057 factor 2 0 1
## 58 FP058 factor 2 0 1
## 59 FP059 factor 2 0 1
## 114 FP114 factor 2 0 1
## 119 FP119 factor 2 0 1
## 120 FP120 factor 2 0 1
## 121 FP121 factor 2 0 1
## 122 FP122 factor 2 0 1
## 124 FP124 factor 2 0 1
## 125 FP125 factor 2 0 1
## 126 FP126 factor 2 0 1
## 127 FP127 factor 2 0 1
## 128 FP128 factor 2 0 1
## 129 FP129 factor 2 0 1
## 130 FP130 factor 2 0 1
## 131 FP131 factor 2 0 1
## 132 FP132 factor 2 0 1
## 133 FP133 factor 2 0 1
## 134 FP134 factor 2 0 1
## 135 FP135 factor 2 0 1
## 136 FP136 factor 2 0 1
## 137 FP137 factor 2 0 1
## 138 FP138 factor 2 0 1
## 139 FP139 factor 2 0 1
## 140 FP140 factor 2 0 1
## 141 FP141 factor 2 0 1
## 142 FP142 factor 2 0 1
## 143 FP143 factor 2 0 1
## 144 FP144 factor 2 0 1
## 145 FP145 factor 2 0 1
## 146 FP146 factor 2 0 1
## 147 FP147 factor 2 0 1
## 148 FP148 factor 2 0 1
## 149 FP149 factor 2 0 1
## 150 FP150 factor 2 0 1
## 151 FP151 factor 2 0 1
## 152 FP152 factor 2 0 1
## 153 FP153 factor 2 0 1
## 154 FP154 factor 2 0 1
## 155 FP155 factor 2 0 1
## 156 FP156 factor 2 0 1
## 157 FP157 factor 2 0 1
## 158 FP158 factor 2 0 1
## 159 FP159 factor 2 0 1
## 160 FP160 factor 2 0 1
## 161 FP161 factor 2 0 1
## 172 FP172 factor 2 0 1
## 173 FP173 factor 2 0 1
## 174 FP174 factor 2 0 1
## 175 FP175 factor 2 0 1
## 176 FP176 factor 2 0 1
## 177 FP177 factor 2 0 1
## 178 FP178 factor 2 0 1
## 179 FP179 factor 2 0 1
## 180 FP180 factor 2 0 1
## 181 FP181 factor 2 0 1
## 182 FP182 factor 2 0 1
## 183 FP183 factor 2 0 1
## 184 FP184 factor 2 0 1
## 185 FP185 factor 2 0 1
## 186 FP186 factor 2 0 1
## 187 FP187 factor 2 0 1
## 188 FP188 factor 2 0 1
## 189 FP189 factor 2 0 1
## 190 FP190 factor 2 0 1
## 191 FP191 factor 2 0 1
## 192 FP192 factor 2 0 1
## 193 FP193 factor 2 0 1
## 194 FP194 factor 2 0 1
## 195 FP195 factor 2 0 1
## 196 FP196 factor 2 0 1
## 197 FP197 factor 2 0 1
## 198 FP198 factor 2 0 1
## 199 FP199 factor 2 0 1
## 200 FP200 factor 2 0 1
## 201 FP201 factor 2 0 1
## 203 FP203 factor 2 0 1
## 204 FP204 factor 2 0 1
## 205 FP205 factor 2 0 1
## 206 FP206 factor 2 0 1
## 207 FP207 factor 2 0 1
## 208 FP208 factor 2 0 1
## First.Mode.Count Second.Mode.Count Unique.Count.Ratio
## 13 793 158 0.002
## 14 798 153 0.002
## 15 818 133 0.002
## 16 812 139 0.002
## 17 814 137 0.002
## 18 826 125 0.002
## 19 835 116 0.002
## 20 837 114 0.002
## 21 836 115 0.002
## 22 852 99 0.002
## 23 834 117 0.002
## 24 844 107 0.002
## 25 841 110 0.002
## 26 871 80 0.002
## 27 858 93 0.002
## 28 850 101 0.002
## 29 854 97 0.002
## 30 862 89 0.002
## 31 866 85 0.002
## 32 881 70 0.002
## 33 885 66 0.002
## 34 875 76 0.002
## 35 882 69 0.002
## 36 879 72 0.002
## 37 884 67 0.002
## 38 869 82 0.002
## 39 880 71 0.002
## 40 886 65 0.002
## 41 891 60 0.002
## 42 897 54 0.002
## 43 888 63 0.002
## 44 894 57 0.002
## 45 898 53 0.002
## 48 833 118 0.002
## 49 835 116 0.002
## 50 844 107 0.002
## 51 847 104 0.002
## 52 864 87 0.002
## 53 862 89 0.002
## 54 879 72 0.002
## 55 900 51 0.002
## 56 889 62 0.002
## 57 837 114 0.002
## 58 843 108 0.002
## 59 899 52 0.002
## 114 803 148 0.002
## 119 796 155 0.002
## 120 793 158 0.002
## 121 818 133 0.002
## 122 795 156 0.002
## 124 797 154 0.002
## 125 803 148 0.002
## 126 810 141 0.002
## 127 818 133 0.002
## 128 810 141 0.002
## 129 819 132 0.002
## 130 851 100 0.002
## 131 831 120 0.002
## 132 832 119 0.002
## 133 831 120 0.002
## 134 830 121 0.002
## 135 831 120 0.002
## 136 836 115 0.002
## 137 841 110 0.002
## 138 845 106 0.002
## 139 873 78 0.002
## 140 845 106 0.002
## 141 840 111 0.002
## 142 847 104 0.002
## 143 874 77 0.002
## 144 852 99 0.002
## 145 852 99 0.002
## 146 853 98 0.002
## 147 851 100 0.002
## 148 868 83 0.002
## 149 865 86 0.002
## 150 876 75 0.002
## 151 898 53 0.002
## 152 873 78 0.002
## 153 877 74 0.002
## 154 915 36 0.002
## 155 885 66 0.002
## 156 884 67 0.002
## 157 892 59 0.002
## 158 900 51 0.002
## 159 884 67 0.002
## 160 886 65 0.002
## 161 888 63 0.002
## 172 807 144 0.002
## 173 816 135 0.002
## 174 827 124 0.002
## 175 823 128 0.002
## 176 835 116 0.002
## 177 836 115 0.002
## 178 836 115 0.002
## 179 858 93 0.002
## 180 849 102 0.002
## 181 862 89 0.002
## 182 857 94 0.002
## 183 879 72 0.002
## 184 871 80 0.002
## 185 870 81 0.002
## 186 878 73 0.002
## 187 882 69 0.002
## 188 886 65 0.002
## 189 878 73 0.002
## 190 882 69 0.002
## 191 884 67 0.002
## 192 893 58 0.002
## 193 892 59 0.002
## 194 895 56 0.002
## 195 893 58 0.002
## 196 897 54 0.002
## 197 901 50 0.002
## 198 897 54 0.002
## 199 906 45 0.002
## 200 904 47 0.002
## 201 901 50 0.002
## 203 842 109 0.002
## 204 857 94 0.002
## 205 877 74 0.002
## 206 894 57 0.002
## 207 897 54 0.002
## 208 844 107 0.002
## First.Second.Mode.Ratio
## 13 5.019
## 14 5.216
## 15 6.150
## 16 5.842
## 17 5.942
## 18 6.608
## 19 7.198
## 20 7.342
## 21 7.270
## 22 8.606
## 23 7.128
## 24 7.888
## 25 7.645
## 26 10.887
## 27 9.226
## 28 8.416
## 29 8.804
## 30 9.685
## 31 10.188
## 32 12.586
## 33 13.409
## 34 11.513
## 35 12.783
## 36 12.208
## 37 13.194
## 38 10.598
## 39 12.394
## 40 13.631
## 41 14.850
## 42 16.611
## 43 14.095
## 44 15.684
## 45 16.943
## 48 7.059
## 49 7.198
## 50 7.888
## 51 8.144
## 52 9.931
## 53 9.685
## 54 12.208
## 55 17.647
## 56 14.339
## 57 7.342
## 58 7.806
## 59 17.288
## 114 5.426
## 119 5.135
## 120 5.019
## 121 6.150
## 122 5.096
## 124 5.175
## 125 5.426
## 126 5.745
## 127 6.150
## 128 5.745
## 129 6.205
## 130 8.510
## 131 6.925
## 132 6.992
## 133 6.925
## 134 6.860
## 135 6.925
## 136 7.270
## 137 7.645
## 138 7.972
## 139 11.192
## 140 7.972
## 141 7.568
## 142 8.144
## 143 11.351
## 144 8.606
## 145 8.606
## 146 8.704
## 147 8.510
## 148 10.458
## 149 10.058
## 150 11.680
## 151 16.943
## 152 11.192
## 153 11.851
## 154 25.417
## 155 13.409
## 156 13.194
## 157 15.119
## 158 17.647
## 159 13.194
## 160 13.631
## 161 14.095
## 172 5.604
## 173 6.044
## 174 6.669
## 175 6.430
## 176 7.198
## 177 7.270
## 178 7.270
## 179 9.226
## 180 8.324
## 181 9.685
## 182 9.117
## 183 12.208
## 184 10.887
## 185 10.741
## 186 12.027
## 187 12.783
## 188 13.631
## 189 12.027
## 190 12.783
## 191 13.194
## 192 15.397
## 193 15.119
## 194 15.982
## 195 15.397
## 196 16.611
## 197 18.020
## 198 16.611
## 199 20.133
## 200 19.234
## 201 18.020
## 203 7.725
## 204 9.117
## 205 11.851
## 206 15.684
## 207 16.611
## 208 7.888
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5,]) > 0) {
  print(paste0("Low variance observed for ",
               nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5,]),
               " numeric variable(s) with First.Second.Mode.Ratio>5."))
  DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5,]
} else {
  print("No low variance numeric predictors due to high first-second mode ratio noted.")
}
## [1] "Low variance observed for 3 numeric variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 14 NumSulfer integer 5 0.005 0.000
## 15 NumChlorine integer 11 0.012 0.000
## 16 NumHalogen integer 11 0.012 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 14 1.000 830 96 8.646
## 15 1.000 750 81 9.259
## 16 1.000 685 107 6.402
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 14 0.000 0.164 0.000 4.000 3.842 21.526 0.000 0.000
## 15 0.000 0.556 0.000 10.000 3.178 13.780 0.000 0.000
## 16 0.000 0.698 0.000 10.000 2.691 10.808 0.000 1.000
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01,]) > 0) {
  print(paste0("Low variance observed for ",
               nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01,]),
               " numeric variable(s) with Unique.Count.Ratio<0.01."))
  DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01,]
} else {
  print("No low variance numeric predictors due to low unique count ratio noted.")
}
## [1] "Low variance observed for 4 numeric variable(s) with Unique.Count.Ratio<0.01."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 8 NumDblBonds integer 8 0.008 0.000
## 12 NumNitrogen integer 7 0.007 0.000
## 14 NumSulfer integer 5 0.005 0.000
## 17 NumRings integer 8 0.008 1.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 8 1.000 427 268 1.593
## 12 1.000 546 191 2.859
## 14 1.000 830 96 8.646
## 17 0.000 323 260 1.242
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 8 0.000 1.006 1.000 7.000 1.360 4.760 0.000 2.000
## 12 0.000 0.813 0.000 6.000 1.554 4.831 0.000 1.000
## 14 0.000 0.164 0.000 4.000 3.842 21.526 0.000 0.000
## 17 0.000 1.402 1.000 7.000 1.034 3.875 0.000 2.000
##################################
# Checking for skewed predictors
##################################
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) > 3 |
                                               as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) < (-3),]) > 0) {
  print(paste0("High skewness observed for ",
               nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) > 3 |
                                                   as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) < (-3),]),
               " numeric variable(s) with Skewness>3 or Skewness<(-3)."))
  DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) > 3 |
                                 as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) < (-3),]
} else {
  print("No skewed numeric predictors noted.")
}
## [1] "High skewness observed for 3 numeric variable(s) with Skewness>3 or Skewness<(-3)."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio
## 14 NumSulfer integer 5 0.005
## 15 NumChlorine integer 11 0.012
## 18 HydrophilicFactor numeric 369 0.388
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 14 0.000 1.000 830 96
## 15 0.000 1.000 750 81
## 18 -0.828 -0.158 21 20
## First.Second.Mode.Ratio Minimum Mean Median Maximum Skewness Kurtosis
## 14 8.646 0.000 0.164 0.000 4.000 3.842 21.526
## 15 9.259 0.000 0.556 0.000 10.000 3.178 13.780
## 18 1.050 -0.985 -0.021 -0.314 13.483 3.404 27.504
## Percentile25th Percentile75th
## 14 0.000 0.000
## 15 0.000 0.000
## 18 -0.763 0.313
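For handover to the preprocessing stage, the flagged predictor names can be collected from the summary tables above; a minimal sketch (the actual removal is handled during data preprocessing):
##################################
# Collecting the names of all flagged predictors
##################################
LowVariance.Factor <- as.character(DQA.Predictors.Factor.Summary$Column.Name[
  as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5])
LowVariance.Numeric <- as.character(DQA.Predictors.Numeric.Summary$Column.Name[
  as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01])
HighSkew.Numeric <- as.character(DQA.Predictors.Numeric.Summary$Column.Name[
  abs(as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))) > 3])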
1.4 Data Exploration
Exploratory data analysis findings:
[A] Numeric variables which demonstrated linear or non-linear relationships with the Log_Solubility response variable include (a correlation-based ranking is sketched after the scatter plots below):
[A.1] MolWeight variable (numeric)
[A.2] NumCarbon variable (numeric)
[A.3] NumChlorine variable (numeric)
[A.4] NumHalogen variable (numeric)
[A.5] NumMultBonds variable (numeric)
[B] Factor variables which demonstrated relatively better differentiation of the Log_Solubility response variable between its 1 and 0 structure levels include (a t-statistic ranking is sketched after the boxplots below):
[B.1] FP207 variable (factor)
[B.2] FP190 variable (factor)
[B.3] FP197 variable (factor)
[B.4] FP196 variable (factor)
[B.5] FP193 variable (factor)
[B.6] FP184 variable (factor)
[B.7] FP172 variable (factor)
[B.8] FP149 variable (factor)
[B.9] FP112 variable (factor)
[B.10] FP107 variable (factor)
[B.11] FP089 variable (factor)
[B.12] FP076 variable (factor)
[B.13] FP059 variable (factor)
[B.14] FP049 variable (factor)
[B.15] FP044 variable (factor)
[B.16] FP014 variable (factor)
[B.17] FP013 variable (factor)
##################################
# Loading dataset
##################################
EDA <- PMA_PreModelling_Train
##################################
# Listing all predictors
##################################
EDA.Predictors <- EDA[,!names(EDA) %in% c("Log_Solubility")]
##################################
# Listing all numeric predictors
##################################
EDA.Predictors.Numeric <- EDA.Predictors[,sapply(EDA.Predictors, is.numeric)]
ncol(EDA.Predictors.Numeric)
## [1] 15
names(EDA.Predictors.Numeric)
## [1] "MolWeight" "NumBonds" "NumMultBonds"
## [4] "NumRotBonds" "NumDblBonds" "NumCarbon"
## [7] "NumNitrogen" "NumOxygen" "NumSulfer"
## [10] "NumChlorine" "NumHalogen" "NumRings"
## [13] "HydrophilicFactor" "SurfaceArea1" "SurfaceArea2"
##################################
# Listing all factor predictors
##################################
EDA.Predictors.Factor <- EDA.Predictors[,sapply(EDA.Predictors, is.factor)]
ncol(EDA.Predictors.Factor)
## [1] 205
names(EDA.Predictors.Factor)
## [1] "FP001" "FP002" "FP003" "FP004" "FP005" "FP006" "FP007" "FP008" "FP009"
## [10] "FP010" "FP011" "FP012" "FP013" "FP014" "FP015" "FP016" "FP017" "FP018"
## [19] "FP019" "FP020" "FP021" "FP022" "FP023" "FP024" "FP025" "FP026" "FP027"
## [28] "FP028" "FP029" "FP030" "FP031" "FP032" "FP033" "FP034" "FP035" "FP036"
## [37] "FP037" "FP038" "FP039" "FP040" "FP041" "FP042" "FP043" "FP044" "FP045"
## [46] "FP046" "FP047" "FP048" "FP049" "FP050" "FP051" "FP052" "FP053" "FP054"
## [55] "FP055" "FP056" "FP057" "FP058" "FP059" "FP060" "FP061" "FP062" "FP063"
## [64] "FP064" "FP065" "FP066" "FP067" "FP068" "FP069" "FP070" "FP071" "FP072"
## [73] "FP073" "FP074" "FP075" "FP076" "FP077" "FP078" "FP079" "FP080" "FP081"
## [82] "FP082" "FP083" "FP084" "FP085" "FP086" "FP087" "FP088" "FP089" "FP090"
## [91] "FP091" "FP092" "FP093" "FP094" "FP095" "FP096" "FP097" "FP098" "FP099"
## [100] "FP100" "FP101" "FP102" "FP103" "FP104" "FP105" "FP106" "FP107" "FP108"
## [109] "FP109" "FP110" "FP111" "FP112" "FP113" "FP114" "FP115" "FP116" "FP117"
## [118] "FP118" "FP119" "FP120" "FP121" "FP122" "FP123" "FP124" "FP125" "FP126"
## [127] "FP127" "FP128" "FP129" "FP130" "FP131" "FP132" "FP133" "FP134" "FP135"
## [136] "FP136" "FP137" "FP138" "FP139" "FP140" "FP141" "FP142" "FP143" "FP144"
## [145] "FP145" "FP146" "FP147" "FP148" "FP149" "FP150" "FP151" "FP152" "FP153"
## [154] "FP155" "FP156" "FP157" "FP158" "FP159" "FP160" "FP161" "FP162" "FP163"
## [163] "FP164" "FP165" "FP166" "FP167" "FP168" "FP169" "FP170" "FP171" "FP172"
## [172] "FP173" "FP174" "FP175" "FP176" "FP177" "FP178" "FP179" "FP180" "FP181"
## [181] "FP182" "FP183" "FP184" "FP185" "FP186" "FP187" "FP188" "FP189" "FP190"
## [190] "FP191" "FP192" "FP193" "FP194" "FP195" "FP196" "FP197" "FP198" "FP201"
## [199] "FP202" "FP203" "FP204" "FP205" "FP206" "FP207" "FP208"
##################################
# Formulating the scatter plots
##################################
featurePlot(x = EDA.Predictors.Numeric,
            y = EDA$Log_Solubility,
            between = list(x = 1, y = 1),
            type = c("g", "p", "smooth"),
            labels = rep("", 2))
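To complement the visual inspection, the numeric predictors can be ranked by rank correlation with the response; a minimal sketch:
##################################
# Ranking numeric predictors by absolute Spearman correlation
# with Log_Solubility (a numeric companion to the scatter plots)
##################################
EDA.Numeric.Correlations <- sapply(EDA.Predictors.Numeric,
                                   function(x) cor(x, EDA$Log_Solubility, method = "spearman"))
sort(abs(EDA.Numeric.Correlations), decreasing = TRUE)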

##################################
# Restructuring the dataset
# for boxplot analysis
##################################
# Using the response column from the EDA object loaded above
Log_Solubility <- EDA$Log_Solubility
EDA.Boxplot.Source <- cbind(Log_Solubility,
                            EDA.Predictors.Factor)
EDA.Boxplot.Gathered.Group1 <- gather(EDA.Boxplot.Source,
'FP001','FP002','FP003','FP004','FP005',
'FP006','FP007','FP008','FP009','FP010',
'FP011','FP012','FP013','FP014','FP015',
'FP016','FP017','FP018','FP019','FP020',
'FP021','FP022','FP023','FP024','FP025',
'FP026','FP027','FP028','FP029','FP030',
'FP031','FP032','FP033','FP034','FP035',
'FP036','FP037','FP038','FP039','FP040',
key="Descriptor",
value="Structure")
EDA.Boxplot.Gathered.Group2 <- gather(EDA.Boxplot.Source,
'FP041','FP042','FP043','FP044','FP045',
'FP046','FP047','FP048','FP049','FP050',
'FP051','FP052','FP053','FP054','FP055',
'FP056','FP057','FP058','FP059','FP060',
'FP061','FP062','FP063','FP064','FP065',
'FP066','FP067','FP068','FP069','FP070',
'FP071','FP072','FP073','FP074','FP075',
'FP076','FP077','FP078','FP079','FP080',
key="Descriptor",
value="Structure")
EDA.Boxplot.Gathered.Group3 <- gather(EDA.Boxplot.Source,
'FP081','FP082','FP083','FP084','FP085',
'FP086','FP087','FP088','FP089','FP090',
'FP091','FP092','FP093','FP094','FP095',
'FP096','FP097','FP098','FP099','FP100',
'FP101','FP102','FP103','FP104','FP105',
'FP106','FP107','FP108','FP109','FP110',
'FP111','FP112','FP113','FP114','FP115',
'FP116','FP117','FP118','FP119','FP120',
key="Descriptor",
value="Structure")
EDA.Boxplot.Gathered.Group4 <- gather(EDA.Boxplot.Source,
'FP121','FP122','FP123','FP124','FP125',
'FP126','FP127','FP128','FP129','FP130',
'FP131','FP132','FP133','FP134','FP135',
'FP136','FP137','FP138','FP139','FP140',
'FP141','FP142','FP143','FP144','FP145',
'FP146','FP147','FP148','FP149','FP150',
'FP151','FP152','FP153','FP155','FP156',
'FP157','FP158','FP159','FP160','FP161',
key="Descriptor",
value="Structure")
EDA.Boxplot.Gathered.Group5 <- gather(EDA.Boxplot.Source,
'FP162','FP163','FP164','FP165','FP166',
'FP167','FP168','FP169','FP170','FP171',
'FP172','FP173','FP174','FP175','FP176',
'FP177','FP178','FP179','FP180','FP181',
'FP182','FP183','FP184','FP185','FP186',
'FP187','FP188','FP189','FP190','FP191',
'FP192','FP193','FP194','FP195','FP196',
'FP197','FP198','FP201','FP202','FP203',
'FP204','FP205','FP206','FP207','FP208',
key="Descriptor",
value="Structure")
bwplot(Log_Solubility ~ Structure | Descriptor,
       data = EDA.Boxplot.Gathered.Group5,
       ylab = "Log Solubility",
       xlab = "Structure",
       layout = c(9,5))

bwplot(Log_Solubility ~ Structure | Descriptor,
       data = EDA.Boxplot.Gathered.Group4,
       ylab = "Log Solubility",
       xlab = "Structure",
       layout = c(9,5))

bwplot(Log_Solubility ~ Structure | Descriptor,
       data = EDA.Boxplot.Gathered.Group3,
       ylab = "Log Solubility",
       xlab = "Structure",
       layout = c(9,5))

bwplot(Log_Solubility ~ Structure | Descriptor,
       data = EDA.Boxplot.Gathered.Group2,
       ylab = "Log Solubility",
       xlab = "Structure",
       layout = c(9,5))

bwplot(Log_Solubility ~ Structure | Descriptor,
       data = EDA.Boxplot.Gathered.Group1,
       ylab = "Log Solubility",
       xlab = "Structure",
       layout = c(9,5))
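The differentiation visible in the boxplots can likewise be quantified; a sketch ranking the factor predictors by the absolute Welch t-statistic of Log_Solubility between the two structure levels:
##################################
# Ranking factor predictors by the absolute t-statistic of
# Log_Solubility between structure levels 1 and 0
# (a numeric companion to the boxplots)
##################################
EDA.Factor.TStats <- sapply(EDA.Predictors.Factor,
                            function(f) abs(unname(t.test(EDA$Log_Solubility ~ f)$statistic)))
head(sort(EDA.Factor.TStats, decreasing = TRUE), 17)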

1.5 Predictive Model Development
1.5.1 Linear Regression (LR)
[A] The linear regression model from the stats package was implemented through the caret package.
[B] The model contains 1 hyperparameter:
[B.1] intercept = TRUE (held constant; no tuning grid required)
[C] The cross-validated performance of the final model is summarized as follows:
[C.1] Final model configuration involves intercept=TRUE
[C.2] Root-Mean-Square Error = 0.68719
[C.3] R-Squared = 0.88629
[D] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows:
[D.1] NumRotBonds variable (numeric)
[D.2] MolWeight variable (numeric)
[D.3] FP072 (Structure=1) variable (factor)
[D.4] NumOxygen variable (numeric)
[D.5] NumMultBonds variable (numeric)
[E] The independent test performance of the final model, reproducible via the postResample sketch at the end of this section, is summarized as follows:
[E.1] Root-Mean-Square Error = 0.77258
[E.2] R-Squared = 0.86439
##################################
# Creating a local object
# for the train set
##################################
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR$Log_Solubility,
                             k = 10,
                             returnTrain = TRUE)
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices)
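##################################
# Sanity check (a sketch): each fold's training index should
# cover roughly 90% of the 951 compounds, matching the
# "Summary of sample sizes" reported by caret further below
##################################
sapply(KFold_Indices, length)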
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted:
# the single hyperparameter (intercept) is fixed to TRUE
##################################
# Running the linear regression model
# by setting the caret method to 'lm'
##################################
set.seed(12345678)
LR_Tune <- train(x = PMA_PreModelling_Train_LR[,!names(PMA_PreModelling_Train_LR) %in% c("Log_Solubility")],
                 y = PMA_PreModelling_Train_LR$Log_Solubility,
                 method = "lm",
                 trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_Tune
## Linear Regression
##
## 951 samples
## 220 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.6871912 0.8862948 0.5149368
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Coefficients:
## (Intercept) FP0011 FP0021 FP0031
## -4.066000 0.203162 0.010415 -0.083340
## FP0041 FP0051 FP0061 FP0071
## -0.371758 -0.377219 0.087430 0.021977
## FP0081 FP0091 FP0101 FP0111
## -0.006028 -0.596654 0.617362 0.073236
## FP0121 FP0131 FP0141 FP0151
## -0.080938 -0.677007 0.318195 -0.340700
## FP0161 FP0171 FP0181 FP0191
## -0.087389 -0.132981 -0.501254 0.182633
## FP0201 FP0211 FP0221 FP0231
## -0.068780 0.055915 0.362833 -0.254665
## FP0241 FP0251 FP0261 FP0271
## -0.427291 0.754962 0.357680 0.015031
## FP0281 FP0291 FP0301 FP0311
## 0.083591 -0.005925 -0.203563 0.263579
## FP0321 FP0331 FP0341 FP0351
## -1.142536 0.991357 -0.383055 -0.274878
## FP0361 FP0371 FP0381 FP0391
## 0.024001 0.228933 0.349438 -0.262034
## FP0401 FP0411 FP0421 FP0431
## 0.623611 -0.338887 -0.353304 -0.012580
## FP0441 FP0451 FP0461 FP0471
## -0.318968 0.093365 -0.082762 -0.004447
## FP0481 FP0491 FP0501 FP0511
## 0.303096 -0.022564 -0.171177 0.238581
## FP0521 FP0531 FP0541 FP0551
## -0.315815 0.313428 -0.137279 -0.411489
## FP0561 FP0571 FP0581 FP0591
## -0.118550 0.050641 0.116467 -0.088098
## FP0601 FP0611 FP0621 FP0631
## 0.327283 0.059817 -0.159297 0.839179
## FP0641 FP0651 FP0661 FP0671
## 0.301279 -0.100770 0.210044 -0.240241
## FP0681 FP0691 FP0701 FP0711
## 0.250900 -0.013854 0.107883 0.213760
## FP0721 FP0731 FP0741 FP0751
## 0.941298 -0.534633 0.169600 0.198712
## FP0761 FP0771 FP0781 FP0791
## 0.370435 0.137173 -0.451360 0.644861
## FP0801 FP0811 FP0821 FP0831
## 0.288844 -0.424812 0.080198 -0.486889
## FP0841 FP0851 FP0861 FP0871
## 0.292106 -0.471828 0.003212 -0.200902
## FP0881 FP0891 FP0901 FP0911
## 0.245689 0.241490 -0.130453 0.260164
## FP0921 FP0931 FP0941 FP0951
## 0.253515 0.322089 -0.297586 0.009927
## FP0961 FP0971 FP0981 FP0991
## -0.040920 -0.257427 -0.273002 0.302075
## FP1001 FP1011 FP1021 FP1031
## -0.567162 0.010766 0.101139 -0.120334
## FP1041 FP1051 FP1061 FP1071
## -0.126282 -0.172136 0.017081 0.273796
## FP1081 FP1091 FP1101 FP1111
## -0.280250 0.552120 0.343563 -0.535813
## FP1121 FP1131 FP1141 FP1151
## 0.282474 0.178287 -0.169603 -0.078620
## FP1161 FP1171 FP1181 FP1191
## 0.035097 0.354961 -0.202152 0.517463
## FP1201 FP1211 FP1221 FP1231
## -0.205402 -0.256731 0.229096 -0.090138
## FP1241 FP1251 FP1261 FP1271
## 0.197173 0.026748 -0.559281 -0.557537
## FP1281 FP1291 FP1301 FP1311
## -0.236470 -0.017449 -0.323162 0.205262
## FP1321 FP1331 FP1341 FP1351
## 0.049595 -0.275143 -0.286978 0.216807
## FP1361 FP1371 FP1381 FP1391
## 0.061525 -0.065240 -0.083785 -0.527359
## FP1401 FP1411 FP1421 FP1431
## 0.181520 0.320718 0.581450 0.682509
## FP1441 FP1451 FP1461 FP1471
## 0.376819 -0.414159 0.036687 0.208845
## FP1481 FP1491 FP1501 FP1511
## -0.149271 0.011702 0.128825 0.218887
## FP1521 FP1531 FP1551 FP1561
## -0.270079 -0.029909 0.289836 -0.599602
## FP1571 FP1581 FP1591 FP1601
## -0.665030 0.147579 0.094096 0.063586
## FP1611 FP1621 FP1631 FP1641
## -0.291284 0.137654 0.436455 0.545472
## FP1651 FP1661 FP1671 FP1681
## 0.418963 -0.079093 -0.656888 -0.061270
## FP1691 FP1701 FP1711 FP1721
## -0.104876 -0.358562 0.486741 -0.144907
## FP1731 FP1741 FP1751 FP1761
## 0.547299 -0.167793 0.062189 0.390344
## FP1771 FP1781 FP1791 FP1801
## -0.031023 0.296177 -0.207167 -1.163053
## FP1811 FP1821 FP1831 FP1841
## 0.270999 -0.063148 0.093444 0.436498
## FP1851 FP1861 FP1871 FP1881
## -0.267287 -0.166962 0.398059 0.120847
## FP1891 FP1901 FP1911 FP1921
## -0.367590 -0.023868 0.084442 -0.105360
## FP1931 FP1941 FP1951 FP1961
## 0.134538 0.361550 -0.204425 0.042328
## FP1971 FP1981 FP2011 FP2021
## 0.059010 0.035856 -0.389306 0.538293
## FP2031 FP2041 FP2051 FP2061
## 0.147608 -0.205335 0.109915 0.017044
## FP2071 FP2081 MolWeight NumBonds
## 0.037643 -0.371259 -0.524968 -0.717804
## NumMultBonds NumRotBonds NumDblBonds NumCarbon
## -0.740106 -0.523744 0.016234 -0.079046
## NumNitrogen NumOxygen NumSulfer NumChlorine
## 0.920733 1.593098 0.338963 -0.470240
## NumHalogen NumRings HydrophilicFactor SurfaceArea1
## -0.168289 -0.504171 0.160828 -0.030703
## SurfaceArea2
## -1.160557
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.6871912 0.8862948 0.5149368 0.05361825 0.02595212 0.04393313
(LR_Train_RMSE <- LR_Tune$results$RMSE)
## [1] 0.6871912
(LR_Train_Rsquared <- LR_Tune$results$Rsquared)
## [1] 0.8862948
##################################
# Identifying and plotting the
# best model predictors
##################################
LR_VarImp <- varImp(LR_Tune, scale = TRUE)
plot(LR_VarImp,
     top = 25,
     scales = list(y = list(cex = .95)),
     main = "Ranked Variable Importance : Linear Regression",
     xlab = "Scaled Variable Importance Metrics",
     ylab = "Predictors",
     cex = 2,
     origin = 0,
     alpha = 0.45)

##################################
# Independently evaluating the model
# on the test set
##################################
LR_Test <- data.frame(LR_Observed = PMA_PreModelling_Test$Log_Solubility,
                      LR_Predicted = predict(LR_Tune,
                                             PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Log_Solubility")]))
LR_Test
## LR_Observed LR_Predicted
## 20 0.93 0.85008095
## 21 0.85 0.22645838
## 23 0.81 -0.42222262
## 25 0.74 1.00132885
## 28 0.61 -0.18792964
## 31 0.58 1.57625536
## 32 0.57 0.46177128
## 33 0.56 0.57563083
## 34 0.52 0.12433631
## 37 0.45 -0.94825875
## 38 0.40 -0.67248930
## 42 0.36 -0.77359940
## 49 0.22 -0.06846625
## 54 0.08 -0.42768142
## 55 0.07 -1.18852676
## 58 0.02 -0.52352868
## 60 0.00 -0.23033399
## 61 -0.01 0.22225962
## 65 -0.07 -0.25802638
## 69 -0.12 -0.88885630
## 73 -0.17 0.71388503
## 86 -0.29 -0.05023695
## 90 -0.38 -0.79845706
## 91 -0.38 -0.79758079
## 93 -0.39 -1.11064695
## 96 -0.42 -0.84166972
## 98 -0.44 -0.82750408
## 100 -0.46 1.50757611
## 104 -0.48 -2.54181529
## 112 -0.60 -1.14251902
## 115 -0.63 -2.48231692
## 119 -0.66 -0.71467927
## 128 -0.72 -0.68561681
## 130 -0.72 -0.20123827
## 139 -0.80 0.38583368
## 143 -0.80 -1.26130642
## 145 -0.82 0.33086174
## 146 -0.82 -0.53799096
## 149 -0.84 0.24312506
## 150 -0.85 -0.77028093
## 152 -0.85 -0.45603900
## 157 -0.87 -1.83556260
## 161 -0.89 -1.11249644
## 162 -0.90 -0.06891881
## 166 -0.96 -1.20868589
## 167 -0.96 -0.71372802
## 173 -0.99 -0.37558064
## 176 -1.01 -0.78452740
## 182 -1.09 -1.21901070
## 187 -1.12 -0.41097472
## 190 -1.14 -0.23496326
## 194 -1.17 -1.79694522
## 195 -1.19 -1.64619814
## 201 -1.22 -1.40479017
## 207 -1.27 -2.05825298
## 208 -1.28 -1.27095362
## 215 -1.32 -1.18288381
## 222 -1.38 -1.33751727
## 224 -1.39 -1.56123858
## 231 -1.42 -1.31682939
## 236 -1.47 -0.91354257
## 237 -1.47 -1.58466302
## 240 -1.50 -0.41486011
## 243 -1.52 -1.20648193
## 248 -1.54 -1.23470906
## 251 -1.55 -2.04200996
## 256 -1.56 -3.35582923
## 258 -1.57 -1.85961077
## 262 -1.60 -1.63923610
## 266 -1.60 -2.54563385
## 272 -1.62 -1.31290546
## 280 -1.64 -2.41199921
## 283 -1.67 -1.67272479
## 286 -1.70 -3.50280923
## 287 -1.70 -2.01667176
## 289 -1.71 -1.87972128
## 290 -1.71 -2.28330830
## 298 -1.75 -1.77512870
## 305 -1.78 -1.84911044
## 306 -1.78 -2.47902069
## 312 -1.82 -1.68083809
## 320 -1.87 -1.85877716
## 325 -1.89 -2.09870742
## 332 -1.92 -1.92592002
## 333 -1.92 -1.37779054
## 335 -1.92 -1.34410458
## 339 -1.94 -3.26845478
## 346 -1.99 -2.89788882
## 347 -2.00 -2.25386758
## 350 -2.05 -2.20365376
## 353 -2.06 -1.35415444
## 358 -2.08 -2.06119614
## 365 -2.10 -2.50159849
## 367 -2.11 -1.50546116
## 370 -2.12 -0.56780235
## 379 -2.17 -2.09303346
## 386 -2.21 -1.88611582
## 394 -2.24 -3.67377079
## 396 -2.24 -1.68153752
## 400 -2.29 -2.40037711
## 404 -2.31 -2.31411050
## 405 -2.32 -2.08239282
## 413 -2.35 -2.56805312
## 415 -2.35 -2.44532458
## 417 -2.36 -2.46699221
## 418 -2.36 -2.46215229
## 423 -2.38 -2.36633963
## 434 -2.42 -2.63030569
## 437 -2.43 -2.88959719
## 440 -2.44 -3.15345313
## 449 -2.52 -2.24372454
## 450 -2.53 -2.99815343
## 457 -2.57 -3.20624071
## 467 -2.62 -2.84173397
## 469 -2.62 -1.94249851
## 474 -2.64 -2.91609082
## 475 -2.64 -2.54094314
## 485 -2.70 -2.24480771
## 504 -2.82 -1.78714328
## 511 -2.88 -2.82931574
## 512 -2.89 -2.33100039
## 517 -2.92 -1.43326292
## 519 -2.93 -3.84688550
## 520 -2.96 -2.85464686
## 522 -2.98 -2.33742088
## 527 -3.01 -2.92251938
## 528 -3.01 -3.36800030
## 529 -3.02 -4.40700551
## 537 -3.07 -3.56054879
## 540 -3.09 -2.98470384
## 541 -3.11 -3.09498026
## 547 -3.13 -3.72220144
## 550 -3.14 -1.94158777
## 555 -3.15 -3.45614360
## 564 -3.22 -2.57629523
## 570 -3.26 -3.30790592
## 573 -3.27 -2.90082040
## 575 -3.27 -2.87420511
## 578 -3.30 -3.05437242
## 581 -3.31 -2.41393690
## 585 -3.33 -2.18265117
## 590 -3.37 -2.40989770
## 601 -3.43 -3.38729357
## 602 -3.43 -2.35270924
## 607 -3.48 -2.99973092
## 610 -3.51 -3.17370073
## 618 -3.59 -2.17873527
## 624 -3.61 -2.51468217
## 626 -3.63 -3.51037615
## 627 -3.63 -3.21922593
## 634 -3.68 -2.80532421
## 640 -3.71 -3.07071758
## 642 -3.74 -2.40778100
## 643 -3.75 -3.79963926
## 644 -3.75 -2.29284417
## 645 -3.77 -3.17179970
## 646 -3.77 -4.15491500
## 647 -3.78 -5.11516765
## 652 -3.81 -3.82159114
## 658 -3.95 -4.53359831
## 659 -3.96 -5.20777439
## 660 -3.96 -4.11333917
## 664 -4.00 -3.28372401
## 666 -4.02 -5.01239248
## 667 -4.04 -4.22264635
## 675 -4.12 -3.59150010
## 680 -4.15 -4.38735138
## 681 -4.16 -3.23757225
## 687 -4.17 -4.47628097
## 694 -4.21 -4.86055552
## 697 -4.23 -3.76913678
## 701 -4.25 -3.41876819
## 705 -4.30 -3.75385720
## 707 -4.31 -5.68574464
## 710 -4.35 -4.69689933
## 716 -4.40 -3.92156254
## 719 -4.40 -4.15434815
## 720 -4.43 -4.79176377
## 725 -4.46 -4.45676917
## 727 -4.47 -3.26014350
## 730 -4.51 -4.89667861
## 738 -4.60 -3.69687365
## 745 -4.64 -4.67322230
## 748 -4.69 -4.91476196
## 751 -4.71 -4.14614635
## 756 -4.77 -3.76066662
## 766 -4.95 -4.14144414
## 769 -4.98 -4.95029159
## 783 -5.21 -5.69185088
## 785 -5.22 -5.27570230
## 790 -5.28 -4.51928303
## 793 -5.31 -2.82904775
## 795 -5.35 -4.87015999
## 796 -5.37 -4.94167346
## 797 -5.40 -4.52018485
## 801 -5.43 -4.32651144
## 811 -5.65 -5.43467584
## 812 -5.66 -4.20061155
## 815 -6.70 -4.83502856
## 816 -5.72 -5.24964473
## 817 -6.00 -6.99661140
## 824 -6.25 -6.31671321
## 825 -6.26 -6.26377712
## 826 -6.27 -6.35209719
## 830 -6.35 -5.84116193
## 837 -6.57 -6.13080915
## 838 -6.62 -5.19880527
## 844 -6.96 -6.09270325
## 845 -7.02 -7.86766325
## 847 -7.20 -7.30608069
## 850 -7.28 -7.06630010
## 852 -7.32 -7.58425651
## 853 -7.39 -7.68733990
## 861 -7.82 -8.24338048
## 868 -8.23 -8.86554341
## 874 -8.94 -8.46264055
## 879 1.07 -0.06836081
## 895 0.43 0.09145293
## 899 0.32 -0.18874752
## 903 0.00 0.10236587
## 917 -0.40 -0.87210988
## 927 -0.52 -0.56040022
## 929 -0.55 -0.64123656
## 931 -0.60 -0.79452125
## 933 -0.62 -2.65915478
## 944 -0.85 -1.24615891
## 947 -0.89 -0.73849423
## 949 -0.93 -0.47034398
## 953 -0.96 -0.36298026
## 958 -1.06 -2.05945241
## 961 -1.10 -1.48846796
## 963 -1.12 -1.06870665
## 964 -1.15 -0.72843534
## 973 -1.28 -0.66198581
## 976 -1.30 -1.76851393
## 977 -1.31 -1.18177298
## 980 -1.35 -3.02865129
## 983 -1.39 -1.98416265
## 984 -1.41 -1.64619814
## 986 -1.41 -1.72732694
## 989 -1.42 -0.66925460
## 991 -1.46 -1.44897044
## 996 -1.50 -1.76647533
## 997 -1.50 -1.73791710
## 999 -1.52 -1.53294349
## 1000 -1.52 -0.78304337
## 1003 -1.59 -1.73368722
## 1008 -1.61 -0.90572068
## 1009 -1.63 -1.22653769
## 1014 -1.71 -2.25389642
## 1015 -1.83 -2.10325323
## 1040 -2.05 -2.44739048
## 1042 -2.06 -2.32809439
## 1043 -2.07 -3.91495362
## 1050 -2.15 -2.73691095
## 1052 -2.16 -0.89823401
## 1056 -1.99 0.43630827
## 1070 -2.36 -1.75081513
## 1073 -2.38 -3.74589678
## 1074 -2.39 -1.48538378
## 1079 -2.46 -2.18137666
## 1080 -2.49 -2.21816817
## 1085 -2.54 -2.62996167
## 1087 -2.55 -2.95995956
## 1096 -2.63 -2.40473884
## 1099 -2.64 -1.49712170
## 1100 -2.67 -2.57217130
## 1102 -2.68 -2.40322416
## 1107 -2.77 -2.61897426
## 1109 -2.78 -3.09105143
## 1114 -2.82 -2.71897133
## 1118 -2.92 -3.45235955
## 1123 -3.03 -3.05502619
## 1132 -3.12 -3.50894894
## 1134 -3.16 -3.23220453
## 1137 -3.19 -2.99156103
## 1154 -3.54 -3.42070439
## 1155 -3.54 -2.37419582
## 1157 -3.59 -3.61119266
## 1162 -3.66 -2.94788569
## 1164 -3.68 -2.49781612
## 1171 -3.75 -3.87066896
## 1172 -3.76 -4.12315551
## 1175 -3.78 -3.64460328
## 1177 -3.80 -3.99045861
## 1179 -3.80 -3.34874930
## 1183 -3.85 -3.26377127
## 1185 -3.89 -3.24457394
## 1189 -3.95 -4.13454233
## 1211 -4.29 -4.81876716
## 1218 -4.42 -3.57301825
## 1224 -4.48 -4.18721344
## 1225 -4.48 -3.37011104
## 1227 -4.53 -4.89976101
## 1232 -4.63 -4.38145043
## 1235 -4.73 -3.93459358
## 1238 -4.84 -4.01093250
## 1240 -4.89 -3.79325854
## 1241 -4.89 -4.85088996
## 1248 -5.26 -5.73628615
## 1258 -6.09 -5.06155374
## 1261 -6.29 -5.76189270
## 1263 -6.29 -6.29644038
## 1269 -6.89 -5.15367692
## 1270 -6.96 -6.85306121
## 1271 -7.00 -6.79203297
## 1272 -7.05 -7.63440381
## 1280 -8.30 -8.78984377
## 1286 -8.66 -9.26339711
## 1287 -9.03 -10.09374869
## 1289 -10.41 -10.10907441
## 1290 -7.89 -7.36013668
## 1291 -2.32 -1.68789497
## 1294 0.39 -3.02945536
## 1305 -2.90 -5.11098058
## 1308 -2.47 -5.09654935
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(LR_Test_Metrics <- postResample(LR_Test[,2], LR_Test[,1]))
## RMSE Rsquared MAE
## 0.7725809 0.8643929 0.5711292
(LR_Test_RMSE <- LR_Test_Metrics[1])
## RMSE
## 0.7725809
(LR_Test_Rsquared <- LR_Test_Metrics[2])
## Rsquared
## 0.8643929
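As a quick visual check on the tabulated test set predictions above, the following minimal sketch (assuming the LR_Test data frame formulated earlier and base R graphics) plots the observed against the predicted log-solubility values, with a reference line marking perfect agreement.
##################################
# Plotting the observed versus predicted
# test set values as a visual diagnostic
##################################
plot(LR_Test$LR_Predicted,
     LR_Test$LR_Observed,
     xlab = "Predicted Log Solubility",
     ylab = "Observed Log Solubility",
     main = "Observed Versus Predicted : Linear Regression")
# Points on this line correspond to perfect predictions
abline(a = 0, b = 1, col = "red")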
1.5.2 Penalized Linear Regression - Ridge (PLR_R)
[A] The penalized linear regression (ridge) model from the elasticnet package was implemented through the caret package.
[B] The model contains 1 hyperparameter:
[B.1] lambda = weight decay, varied across a range of values from 0.00 to 0.10
[C] The cross-validated performance of the final model is summarized as follows:
[C.1] Final model configuration involves lambda=0.025
[C.2] Root-Mean-Square Error = 0.65275
[C.3] R-Squared = 0.89684
[D] The model does not allow for ranking of predictors in terms of variable importance, although a coefficient-magnitude sketch is included in the code below.
[E] The independent test-set performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.74148
[E.2] R-Squared = 0.87517
##################################
# Transforming factor predictors
# as required by the nature of the model
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_PLR_R <- as.data.frame(lapply(PMA_PreModelling_Train, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Train_PLR_R)
## [1] 951 221
PMA_PreModelling_Test_PLR_R <- as.data.frame(lapply(PMA_PreModelling_Test, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Test_PLR_R)
## [1] 316 221
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_PLR_R$Log_Solubility,
                             k = 10,
                             returnTrain = TRUE)
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
PLR_R_Grid = expand.grid(lambda = seq(0, 0.10, length = 5))
##################################
# Running the penalized linear regression (ridge) model
# by setting the caret method to 'ridge'
##################################
set.seed(12345678)
PLR_R_Tune <- train(x = PMA_PreModelling_Train_PLR_R[,!names(PMA_PreModelling_Train_PLR_R) %in% c("Log_Solubility")],
                    y = PMA_PreModelling_Train_PLR_R$Log_Solubility,
                    method = "ridge",
                    tuneGrid = PLR_R_Grid,
                    trControl = KFold_Control,
                    preProc = c("center", "scale"))
##################################
# Reporting the cross-validation results
# for the train set
##################################
PLR_R_Tune
## Ridge Regression
##
## 951 samples
## 220 predictors
##
## Pre-processing: centered (220), scaled (220)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000 0.6871897 0.8862951 0.5149301
## 0.025 0.6527539 0.8968424 0.4976606
## 0.050 0.6590355 0.8957857 0.5027891
## 0.075 0.6700575 0.8937331 0.5124512
## 0.100 0.6831128 0.8915190 0.5240739
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.025.
##
## Call:
## elasticnet::enet(x = as.matrix(x), y = y, lambda = param$lambda)
## Sequence of moves:
## MolWeight NumCarbon NumChlorine FP044 NumHalogen FP089 FP072
## Var 206 211 215 44 216 89 72
## Step 1 2 3 4 5 6 7
## HydrophilicFactor NumMultBonds SurfaceArea1 FP063 FP059 FP142 FP206 FP135
## Var 218 208 219 63 59 142 203 135
## Step 8 9 10 11 12 13 14 15
## FP084 FP204 FP147 FP089 FP074 FP073 FP116 FP040 NumOxygen FP039 FP043
## Var 84 201 147 -89 74 73 116 40 213 39 43
## Step 16 17 18 19 20 21 22 23 24 25 26
## NumSulfer FP094 FP124 FP011 FP193 FP111 FP050 FP198 FP175 NumRotBonds
## Var 214 94 124 11 192 111 50 197 174 209
## Step 27 28 29 30 31 32 33 34 35 36
## FP172 FP080 NumBonds FP137 FP101 FP088 FP203 FP136 FP123 FP122 FP085 FP081
## Var 171 80 207 137 101 88 200 136 123 122 85 81
## Step 37 38 39 40 41 42 43 44 45 46 47 48
## FP042 FP070 FP202 FP126 HydrophilicFactor NumRings FP188 FP065 FP145 FP128
## Var 42 70 199 126 -218 217 187 65 145 128
## Step 49 50 51 52 53 54 55 56 57 58
## FP162 FP138 FP026 FP031 FP175 FP102 FP075 FP127 FP187 FP033 FP176 FP002
## Var 161 138 26 31 -174 102 75 127 186 33 175 2
## Step 59 60 61 62 63 64 65 66 67 68 69 70
## FP037 FP171 FP173 FP207 FP053 HydrophilicFactor FP166 FP099 FP003 FP023
## Var 37 170 172 204 53 218 165 99 3 23
## Step 71 72 73 74 75 76 77 78 79 80
## FP113 FP169 FP164 FP013 FP133 FP091 FP141 FP078 FP034 FP131 FP064 FP022
## Var 113 168 163 13 133 91 141 78 34 131 64 22
## Step 81 82 83 84 85 86 87 88 89 90 91 92
## FP100 FP201 FP184 FP004 FP159 FP103 FP054 FP036 FP083 HydrophilicFactor
## Var 100 198 183 4 158 103 54 36 83 -218
## Step 93 94 95 96 97 98 99 100 101 102
## FP049 FP104 FP015 FP149 FP109 FP150 FP012 FP163 FP168 FP098 FP186 FP060
## Var 49 104 15 149 109 150 12 162 167 98 185 60
## Step 103 104 105 106 107 108 109 110 111 112 113 114
## FP144 FP174 NumNitrogen FP073 FP016 FP071 FP018 FP155 FP105 FP185 FP115
## Var 144 173 212 -73 16 71 18 154 105 184 115
## Step 115 116 117 118 119 120 121 122 123 124 125
## FP079 FP148 FP027 FP181 FP157 FP204 FP167 FP152 FP161 FP087 FP205 FP017
## Var 79 148 27 180 156 -201 166 152 160 87 202 17
## Step 126 127 128 129 130 131 132 133 134 135 136 137
## FP066 FP089 FP038 FP143 FP134 FP035 FP009 FP090 FP160 FP180 FP119 FP170
## Var 66 89 38 143 134 35 9 90 159 179 119 169
## Step 138 139 140 141 142 143 144 145 146 147 148 149
## FP093 FP146 FP190 FP028 FP191 FP095 HydrophilicFactor FP140 FP068 FP069
## Var 93 146 189 28 190 95 218 140 68 69
## Step 150 151 152 153 154 155 156 157 158 159
## FP153 FP102 FP120 FP036 NumDblBonds FP048 FP077 FP158 FP076 FP156 FP021
## Var 153 -102 120 -36 210 48 77 157 76 155 21
## Step 160 161 162 163 164 165 166 167 168 169 170
## FP030 FP055 FP118 FP139 FP007 FP001 FP130 FP057 FP082 FP195 FP192 FP024
## Var 30 55 118 139 7 1 130 57 82 194 191 24
## Step 171 172 173 174 175 176 177 178 179 180 181 182
## FP051 FP036 FP032 FP067 FP045 FP008 FP102 FP092 FP121 FP151 FP046 FP097
## Var 51 36 32 67 45 8 102 92 121 151 46 97
## Step 183 184 185 186 187 188 189 190 191 192 193 194
## FP073 FP106 FP189 FP114 FP086 FP125 FP182 FP020 FP010 FP149 FP108 FP062
## Var 73 106 188 114 86 125 181 20 10 -149 108 62
## Step 195 196 197 198 199 200 201 202 203 204 205 206
## FP019 FP129 FP041 FP141 FP005 FP196 FP204 FP178 FP052 FP120 FP056 FP197
## Var 19 129 41 -141 5 195 201 177 52 -120 56 196
## Step 207 208 209 210 211 212 213 214 215 216 217 218
## FP112 SurfaceArea2 FP179 FP047 FP110 FP117 FP096 FP175 FP097 FP166 FP165
## Var 112 220 178 47 110 117 96 174 -97 -165 164
## Step 219 220 221 222 223 224 225 226 227 228 229
## FP207 FP107 FP149 FP177 FP025 FP183 FP029 FP141 FP061 FP208 FP132 FP006
## Var -204 107 149 176 25 182 29 141 61 205 132 6
## Step 230 231 232 233 234 235 236 237 238 239 240 241
## FP120 FP058 FP137 FP014 FP097 FP207 FP166 FP137 FP194
## Var 120 58 -137 14 97 204 165 137 193 251
## Step 242 243 244 245 246 247 248 249 250 251
## lambda RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.000 0.6871897 0.8862951 0.5149301 0.05361883 0.02595243 0.04394202
## 2 0.025 0.6527539 0.8968424 0.4976606 0.05176540 0.02292189 0.03626743
## 3 0.050 0.6590355 0.8957857 0.5027891 0.05352912 0.02244986 0.03887249
## 4 0.075 0.6700575 0.8937331 0.5124512 0.05597932 0.02224904 0.04211679
## 5 0.100 0.6831128 0.8915190 0.5240739 0.05894427 0.02219650 0.04416185
(PLR_R_Train_RMSE <- PLR_R_Tune$results[PLR_R_Tune$results$lambda==PLR_R_Tune$bestTune$lambda,
                                        c("RMSE")])
## [1] 0.6527539
(PLR_R_Train_Rsquared <- PLR_R_Tune$results[PLR_R_Tune$results$lambda==PLR_R_Tune$bestTune$lambda,
                                            c("Rsquared")])
## [1] 0.8968424
##################################
# Identifying and plotting the
# best model predictors
##################################
# model does not support variable importance measurement
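Although varImp does not apply here, the magnitudes of the final model's coefficients offer a rough proxy for predictor influence, since all predictors were centered and scaled prior to fitting. The following is a minimal sketch, assuming the PLR_R_Tune object above and the coefficient-extraction interface of elasticnet::predict.enet.
##################################
# Sketching a rough proxy for predictor influence
# from the scaled coefficient magnitudes
# of the final ridge model
##################################
# s = 1 with mode = "fraction" requests the full (ridge) solution
# along the fitted elastic net path
PLR_R_Coefficients <- predict(PLR_R_Tune$finalModel,
                              s = 1,
                              mode = "fraction",
                              type = "coefficients")$coefficients
# Largest absolute coefficients on the standardized predictors
head(sort(abs(PLR_R_Coefficients), decreasing = TRUE), 10)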
##################################
# Independently evaluating the model
# on the test set
##################################
PLR_R_Test <- data.frame(PLR_R_Observed = PMA_PreModelling_Test$Log_Solubility,
                         PLR_R_Predicted = predict(PLR_R_Tune,
                                                   PMA_PreModelling_Test_PLR_R[,!names(PMA_PreModelling_Test_PLR_R) %in% c("Log_Solubility")]))
PLR_R_Test
## PLR_R_Observed PLR_R_Predicted
## 1 0.93 6.655157e-01
## 2 0.85 3.336029e-01
## 3 0.81 -6.083225e-01
## 4 0.74 8.432412e-01
## 5 0.61 5.954138e-02
## 6 0.58 1.483561e+00
## 7 0.57 4.786292e-01
## 8 0.56 4.678409e-01
## 9 0.52 2.720858e-01
## 10 0.45 -8.469761e-01
## 11 0.40 -6.716471e-01
## 12 0.36 -9.064858e-01
## 13 0.22 6.878584e-02
## 14 0.08 -3.781082e-01
## 15 0.07 -1.027910e+00
## 16 0.02 -8.082206e-01
## 17 0.00 -2.666937e-01
## 18 -0.01 1.018631e-01
## 19 -0.07 1.770377e-01
## 20 -0.12 -1.038402e+00
## 21 -0.17 5.798321e-01
## 22 -0.29 -1.063792e-01
## 23 -0.38 -7.242987e-01
## 24 -0.38 -8.165523e-01
## 25 -0.39 -1.039680e+00
## 26 -0.42 -7.110655e-01
## 27 -0.44 -7.865257e-01
## 28 -0.46 1.330282e+00
## 29 -0.48 -2.228559e+00
## 30 -0.60 -1.277362e+00
## 31 -0.63 -2.498312e+00
## 32 -0.66 -7.541777e-01
## 33 -0.72 -6.822375e-01
## 34 -0.72 -8.540290e-02
## 35 -0.80 3.837001e-01
## 36 -0.80 -1.120133e+00
## 37 -0.82 3.754147e-01
## 38 -0.82 -6.354979e-01
## 39 -0.84 1.906519e-01
## 40 -0.85 -8.368592e-01
## 41 -0.85 -5.689798e-01
## 42 -0.87 -1.972862e+00
## 43 -0.89 -1.240478e+00
## 44 -0.90 1.884632e-01
## 45 -0.96 -1.496469e+00
## 46 -0.96 -7.421295e-01
## 47 -0.99 -4.135176e-01
## 48 -1.01 -8.128175e-01
## 49 -1.09 -1.054425e+00
## 50 -1.12 -4.603601e-01
## 51 -1.14 -3.738747e-01
## 52 -1.17 -1.692647e+00
## 53 -1.19 -1.575112e+00
## 54 -1.22 -1.276192e+00
## 55 -1.27 -1.911069e+00
## 56 -1.28 -1.231313e+00
## 57 -1.32 -1.249962e+00
## 58 -1.38 -1.462001e+00
## 59 -1.39 -1.609215e+00
## 60 -1.42 -1.659667e+00
## 61 -1.47 -9.900123e-01
## 62 -1.47 -1.547422e+00
## 63 -1.50 -9.496653e-01
## 64 -1.52 -1.278062e+00
## 65 -1.54 -1.349782e+00
## 66 -1.55 -2.178760e+00
## 67 -1.56 -3.033758e+00
## 68 -1.57 -1.865990e+00
## 69 -1.60 -1.333912e+00
## 70 -1.60 -2.694416e+00
## 71 -1.62 -1.586501e+00
## 72 -1.64 -2.551495e+00
## 73 -1.67 -1.718912e+00
## 74 -1.70 -3.265934e+00
## 75 -1.70 -2.062026e+00
## 76 -1.71 -2.223605e+00
## 77 -1.71 -2.401191e+00
## 78 -1.75 -1.933350e+00
## 79 -1.78 -1.615718e+00
## 80 -1.78 -2.427334e+00
## 81 -1.82 -1.311474e+00
## 82 -1.87 -1.853234e+00
## 83 -1.89 -2.114620e+00
## 84 -1.92 -1.962674e+00
## 85 -1.92 -1.295332e+00
## 86 -1.92 -1.414480e+00
## 87 -1.94 -3.361142e+00
## 88 -1.99 -2.646690e+00
## 89 -2.00 -2.222978e+00
## 90 -2.05 -2.324948e+00
## 91 -2.06 -1.582547e+00
## 92 -2.08 -2.143243e+00
## 93 -2.10 -2.561933e+00
## 94 -2.11 -1.463349e+00
## 95 -2.12 -6.012312e-01
## 96 -2.17 -2.061051e+00
## 97 -2.21 -1.837801e+00
## 98 -2.24 -2.787275e+00
## 99 -2.24 -1.527907e+00
## 100 -2.29 -2.241510e+00
## 101 -2.31 -2.241156e+00
## 102 -2.32 -2.172746e+00
## 103 -2.35 -2.589437e+00
## 104 -2.35 -2.246173e+00
## 105 -2.36 -2.552548e+00
## 106 -2.36 -1.982080e+00
## 107 -2.38 -2.371507e+00
## 108 -2.42 -2.633605e+00
## 109 -2.43 -3.251050e+00
## 110 -2.44 -3.328829e+00
## 111 -2.52 -2.431058e+00
## 112 -2.53 -2.940652e+00
## 113 -2.57 -3.084709e+00
## 114 -2.62 -3.066213e+00
## 115 -2.62 -2.772446e+00
## 116 -2.64 -3.044999e+00
## 117 -2.64 -3.066690e+00
## 118 -2.70 -2.221458e+00
## 119 -2.82 -2.524769e+00
## 120 -2.88 -2.669586e+00
## 121 -2.89 -2.263681e+00
## 122 -2.92 -1.157740e+00
## 123 -2.93 -3.512809e+00
## 124 -2.96 -2.763717e+00
## 125 -2.98 -2.632943e+00
## 126 -3.01 -2.737061e+00
## 127 -3.01 -3.437836e+00
## 128 -3.02 -3.596187e+00
## 129 -3.07 -3.450439e+00
## 130 -3.09 -3.062580e+00
## 131 -3.11 -3.013319e+00
## 132 -3.13 -3.807036e+00
## 133 -3.14 -1.895637e+00
## 134 -3.15 -3.611763e+00
## 135 -3.22 -2.416420e+00
## 136 -3.26 -3.400673e+00
## 137 -3.27 -2.819815e+00
## 138 -3.27 -2.848596e+00
## 139 -3.30 -2.957609e+00
## 140 -3.31 -2.433027e+00
## 141 -3.33 -2.267365e+00
## 142 -3.37 -2.244878e+00
## 143 -3.43 -3.533220e+00
## 144 -3.43 -2.508858e+00
## 145 -3.48 -3.007281e+00
## 146 -3.51 -3.457121e+00
## 147 -3.59 -2.391230e+00
## 148 -3.61 -2.605758e+00
## 149 -3.63 -3.498944e+00
## 150 -3.63 -3.458830e+00
## 151 -3.68 -2.013209e+00
## 152 -3.71 -3.511889e+00
## 153 -3.74 -2.378672e+00
## 154 -3.75 -3.673353e+00
## 155 -3.75 -2.741505e+00
## 156 -3.77 -3.312758e+00
## 157 -3.77 -4.323747e+00
## 158 -3.78 -5.341718e+00
## 159 -3.81 -3.765972e+00
## 160 -3.95 -4.451062e+00
## 161 -3.96 -5.456749e+00
## 162 -3.96 -4.163823e+00
## 163 -4.00 -3.563417e+00
## 164 -4.02 -4.766735e+00
## 165 -4.04 -4.490979e+00
## 166 -4.12 -3.526970e+00
## 167 -4.15 -5.014687e+00
## 168 -4.16 -3.767517e+00
## 169 -4.17 -4.519733e+00
## 170 -4.21 -4.771142e+00
## 171 -4.23 -4.326197e+00
## 172 -4.25 -3.514275e+00
## 173 -4.30 -3.626576e+00
## 174 -4.31 -5.475649e+00
## 175 -4.35 -4.889637e+00
## 176 -4.40 -4.027219e+00
## 177 -4.40 -4.357594e+00
## 178 -4.43 -4.812655e+00
## 179 -4.46 -4.606169e+00
## 180 -4.47 -3.111667e+00
## 181 -4.51 -5.073588e+00
## 182 -4.60 -3.961640e+00
## 183 -4.64 -4.699197e+00
## 184 -4.69 -4.796990e+00
## 185 -4.71 -4.016558e+00
## 186 -4.77 -3.675313e+00
## 187 -4.95 -4.503789e+00
## 188 -4.98 -4.647916e+00
## 189 -5.21 -5.850469e+00
## 190 -5.22 -5.651693e+00
## 191 -5.28 -4.406219e+00
## 192 -5.31 -2.990854e+00
## 193 -5.35 -4.756430e+00
## 194 -5.37 -5.112303e+00
## 195 -5.40 -4.660381e+00
## 196 -5.43 -4.627488e+00
## 197 -5.65 -5.654802e+00
## 198 -5.66 -4.361465e+00
## 199 -6.70 -4.951949e+00
## 200 -5.72 -5.252434e+00
## 201 -6.00 -7.373908e+00
## 202 -6.25 -6.481595e+00
## 203 -6.26 -6.359866e+00
## 204 -6.27 -6.655393e+00
## 205 -6.35 -5.786245e+00
## 206 -6.57 -6.003233e+00
## 207 -6.62 -5.319672e+00
## 208 -6.96 -5.961618e+00
## 209 -7.02 -7.442019e+00
## 210 -7.20 -7.175934e+00
## 211 -7.28 -7.284850e+00
## 212 -7.32 -7.510870e+00
## 213 -7.39 -7.915024e+00
## 214 -7.82 -8.410622e+00
## 215 -8.23 -8.962363e+00
## 216 -8.94 -8.512390e+00
## 217 1.07 -1.246604e-01
## 218 0.43 1.819635e-01
## 219 0.32 -2.103867e-01
## 220 0.00 3.000252e-02
## 221 -0.40 -8.028867e-01
## 222 -0.52 -5.428799e-01
## 223 -0.55 -7.255459e-01
## 224 -0.60 -8.720942e-01
## 225 -0.62 -2.578015e+00
## 226 -0.85 -1.233755e+00
## 227 -0.89 -7.755708e-01
## 228 -0.93 -8.709553e-01
## 229 -0.96 -3.004478e-04
## 230 -1.06 -2.097683e+00
## 231 -1.10 -1.627420e+00
## 232 -1.12 -1.042399e+00
## 233 -1.15 -8.097424e-01
## 234 -1.28 -3.394287e-01
## 235 -1.30 -1.611365e+00
## 236 -1.31 -1.228053e+00
## 237 -1.35 -2.935202e+00
## 238 -1.39 -1.949023e+00
## 239 -1.41 -1.575112e+00
## 240 -1.41 -1.367658e+00
## 241 -1.42 -6.161983e-01
## 242 -1.46 -1.949181e+00
## 243 -1.50 -1.646122e+00
## 244 -1.50 -2.341926e+00
## 245 -1.52 -1.558616e+00
## 246 -1.52 -6.113012e-01
## 247 -1.59 -1.526668e+00
## 248 -1.61 -7.851878e-01
## 249 -1.63 -1.126933e+00
## 250 -1.71 -2.365064e+00
## 251 -1.83 -2.131320e+00
## 252 -2.05 -1.966495e+00
## 253 -2.06 -2.341654e+00
## 254 -2.07 -3.859841e+00
## 255 -2.15 -2.627314e+00
## 256 -2.16 -9.795253e-01
## 257 -1.99 -1.015377e-01
## 258 -2.36 -1.884174e+00
## 259 -2.38 -4.009556e+00
## 260 -2.39 -1.575245e+00
## 261 -2.46 -2.268956e+00
## 262 -2.49 -2.345026e+00
## 263 -2.54 -2.776700e+00
## 264 -2.55 -3.045244e+00
## 265 -2.63 -2.509177e+00
## 266 -2.64 -1.597537e+00
## 267 -2.67 -2.667330e+00
## 268 -2.68 -2.027918e+00
## 269 -2.77 -2.595922e+00
## 270 -2.78 -2.919825e+00
## 271 -2.82 -2.662614e+00
## 272 -2.92 -3.597443e+00
## 273 -3.03 -3.447847e+00
## 274 -3.12 -3.483471e+00
## 275 -3.16 -3.082106e+00
## 276 -3.19 -3.263342e+00
## 277 -3.54 -3.554744e+00
## 278 -3.54 -2.429122e+00
## 279 -3.59 -3.697451e+00
## 280 -3.66 -2.985546e+00
## 281 -3.68 -2.316387e+00
## 282 -3.75 -3.907819e+00
## 283 -3.76 -3.946207e+00
## 284 -3.78 -3.908796e+00
## 285 -3.80 -4.103146e+00
## 286 -3.80 -4.615035e+00
## 287 -3.85 -3.349860e+00
## 288 -3.89 -3.742668e+00
## 289 -3.95 -4.270295e+00
## 290 -4.29 -4.954343e+00
## 291 -4.42 -4.604252e+00
## 292 -4.48 -4.326483e+00
## 293 -4.48 -3.248538e+00
## 294 -4.53 -4.955238e+00
## 295 -4.63 -4.477200e+00
## 296 -4.73 -4.094596e+00
## 297 -4.84 -4.174860e+00
## 298 -4.89 -3.892153e+00
## 299 -4.89 -4.934865e+00
## 300 -5.26 -5.652802e+00
## 301 -6.09 -5.179791e+00
## 302 -6.29 -5.970943e+00
## 303 -6.29 -6.355769e+00
## 304 -6.89 -5.674112e+00
## 305 -6.96 -6.876594e+00
## 306 -7.00 -7.023199e+00
## 307 -7.05 -7.793294e+00
## 308 -8.30 -8.895771e+00
## 309 -8.66 -8.719002e+00
## 310 -9.03 -9.331363e+00
## 311 -10.41 -1.001937e+01
## 312 -7.89 -7.531643e+00
## 313 -2.32 -1.692328e+00
## 314 0.39 -2.853248e+00
## 315 -2.90 -4.930309e+00
## 316 -2.47 -4.911651e+00
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(PLR_R_Test_Metrics <- postResample(PLR_R_Test[,2], PLR_R_Test[,1]))
## RMSE Rsquared MAE
## 0.7414774 0.8751709 0.5526653
(PLR_R_Test_RMSE <- PLR_R_Test_Metrics[1])
## RMSE
## 0.7414774
(PLR_R_Test_Rsquared <- PLR_R_Test_Metrics[2])
## Rsquared
## 0.8751709
1.5.3 Penalized Linear Regression - Lasso (PLR_L)
[A] The penalized linear regression (lasso) model from the elasticnet package was implemented through the caret package.
[B] The model contains 1 hyperparameter:
[B.1] fraction = fraction of the full solution, varied across a range of values from 0.05 to 1.00
[C] The cross-validated performance of the final model is summarized as follows:
[C.1] Final model configuration involves fraction=0.525
[C.2] Root-Mean-Square Error = 0.64896
[C.3] R-Squared = 0.89763
[D] The model does not allow for ranking of predictors in terms of variable importance, although a sparsity-based sketch is included in the code below.
[E] The independent test-set performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.73891
[E.2] R-Squared = 0.87468
##################################
# Transforming factor predictors
# as required by the nature of the model
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_PLR_L <- as.data.frame(lapply(PMA_PreModelling_Train, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Train_PLR_L)
## [1] 951 221
PMA_PreModelling_Test_PLR_L <- as.data.frame(lapply(PMA_PreModelling_Test, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Test_PLR_L)
## [1] 316 221
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_PLR_L$Log_Solubility,
                             k = 10,
                             returnTrain = TRUE)
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
PLR_L_Grid = expand.grid(fraction = seq(0.05, 1.00, length = 5))
##################################
# Running the penalized linear regression (lasso) model
# by setting the caret method to 'lasso'
##################################
set.seed(12345678)
PLR_L_Tune <- train(x = PMA_PreModelling_Train_PLR_L[,!names(PMA_PreModelling_Train_PLR_L) %in% c("Log_Solubility")],
                    y = PMA_PreModelling_Train_PLR_L$Log_Solubility,
                    method = "lasso",
                    tuneGrid = PLR_L_Grid,
                    trControl = KFold_Control,
                    preProc = c("center", "scale"))
##################################
# Reporting the cross-validation results
# for the train set
##################################
PLR_L_Tune
## The lasso
##
## 951 samples
## 220 predictors
##
## Pre-processing: centered (220), scaled (220)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.0500 1.3714216 0.6930427 1.0614377
## 0.2875 0.6613587 0.8945923 0.5086228
## 0.5250 0.6489629 0.8976271 0.4924596
## 0.7625 0.6630176 0.8933415 0.4997783
## 1.0000 0.6871897 0.8862951 0.5149301
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.525.
##
## Call:
## elasticnet::enet(x = as.matrix(x), y = y, lambda = 0)
## Cp statistics of the Lasso fit
## Cp: 12147.631 8960.184 8880.588 8659.618 7297.388 7224.150 6934.162 5253.116 4620.595 3482.230 3096.590 2843.585 2618.559 2533.345 2518.911 2391.936 2092.306 2058.456 2054.991 2047.146 1898.978 1865.980 1696.256 1646.751 1520.984 1499.405 1455.990 1351.810 1235.766 1236.051 1204.983 1143.443 1141.487 1122.554 1124.274 1108.768 1065.801 987.005 956.350 951.645 953.350 933.746 924.143 885.546 854.090 855.297 827.167 804.988 798.345 793.217 783.876 773.862 747.774 748.174 712.944 682.511 671.203 628.058 624.066 605.453 603.143 604.207 595.627 581.308 573.393 561.422 558.363 558.929 558.062 523.230 502.926 483.996 477.690 455.641 449.095 429.235 430.321 418.363 417.038 416.781 418.042 417.637 418.832 419.515 420.611 399.826 396.865 396.332 397.607 386.916 380.295 377.827 378.699 379.500 369.926 371.149 361.951 357.389 351.252 348.606 347.668 349.403 340.928 338.847 332.763 332.305 332.025 323.501 318.564 314.492 313.714 310.364 305.040 304.899 305.952 298.866 297.875 299.313 299.862 301.455 302.825 304.260 305.011 304.244 304.082 298.960 300.959 299.493 296.499 295.133 287.224 280.270 280.103 280.995 275.558 272.528 274.168 275.842 275.606 268.698 269.525 268.020 257.321 257.816 256.343 248.587 246.926 244.175 242.235 243.180 243.526 244.893 244.625 240.380 240.615 235.903 231.627 232.361 231.969 223.326 218.912 215.456 214.262 212.596 213.554 212.821 210.162 212.086 213.966 214.766 214.858 212.604 211.935 212.224 212.416 214.050 214.374 214.753 214.384 211.681 210.300 211.432 210.741 211.079 212.362 214.246 214.047 212.754 213.634 213.982 211.562 210.798 210.145 210.053 209.230 208.898 207.942 208.718 210.453 211.166 211.176 212.857 210.926 209.684 210.157 206.758 206.758 208.234 209.798 204.682 206.103 204.992 203.600 202.250 203.742 205.446 202.423 203.462 201.459 202.813 201.771 199.699 198.260 197.356 196.158 195.899 196.748 197.919 195.392 190.400 190.763 189.951 190.155 190.096 185.087 185.045 185.878 186.544 188.461 189.805 191.098 192.053 190.526 187.096 186.957 185.210 186.771 188.733 187.974 184.653 186.614 188.064 187.687 189.630 189.168 188.555 188.585 189.835 191.736 193.491 194.387 195.915 197.833 199.497 197.044 195.553 195.200 194.979 196.883 198.001 198.171 196.941 194.709 192.111 188.874 188.548 190.043 189.789 189.356 191.213 193.153 194.556 196.220 197.921 197.855 197.460 199.219 201.202 202.831 204.209 205.756 207.727 209.678 207.674 209.511 211.417 213.384 215.295 217.232 219.046 221.000
## DF: 1 2 3 4 5 6 7 8 9 10 11 12 12 12 13 14 15 16 17 18 19 19 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 43 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 63 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 78 78 79 80 80 80 81 82 83 84 85 86 86 86 87 88 89 90 91 92 93 94 95 96 97 96 97 98 99 99 99 100 101 102 103 104 105 106 107 108 109 110 110 110 110 110 111 112 112 112 113 114 114 114 115 116 117 118 119 120 120 119 119 120 121 122 123 124 125 125 125 126 127 127 127 128 129 130 131 131 131 132 133 134 135 136 137 138 139 140 141 142 142 142 143 144 145 146 147 148 148 148 149 150 151 151 151 151 151 151 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 166 166 167 168 169 170 171 172 173 174 174 174 174 174 175 176 177 178 179 180 181 181 180 180 181 182 183 184 185 186 186 185 185 186 187 188 188 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 202 202 202 202 203 204 205 205 204 203 202 202 203 203 203 204 205 206 207 208 208 208 209 210 211 212 213 214 215 214 215 216 217 218 219 220 221
## Sequence of moves:
## MolWeight NumCarbon NumChlorine FP044 FP089 NumHalogen FP072
## Var 206 211 215 44 89 216 72
## Step 1 2 3 4 5 6 7
## HydrophilicFactor NumMultBonds SurfaceArea1 FP063 FP059 FP089 FP142 FP135
## Var 218 208 219 63 59 -89 142 135
## Step 8 9 10 11 12 13 14 15
## FP206 FP204 FP084 FP074 FP147 FP073 HydrophilicFactor FP116 FP040
## Var 203 201 84 74 147 73 -218 116 40
## Step 16 17 18 19 20 21 22 23 24
## NumOxygen FP043 FP039 NumSulfer FP111 FP094 FP124 FP011 FP198 FP193 FP137
## Var 213 43 39 214 111 94 124 11 197 192 137
## Step 25 26 27 28 29 30 31 32 33 34 35
## FP050 FP175 FP080 FP101 FP088 FP122 FP203 NumRotBonds FP136 FP085 FP123
## Var 50 174 80 101 88 122 200 209 136 85 123
## Step 36 37 38 39 40 41 42 43 44 45 46
## FP042 FP084 FP202 FP126 FP162 FP081 FP128 FP145 FP026 FP188 FP138 FP031
## Var 42 -84 199 126 161 81 128 145 26 187 138 31
## Step 47 48 49 50 51 52 53 54 55 56 57 58
## FP002 FP187 FP176 FP065 FP037 FP075 FP033 FP127 FP171 FP207 FP102 FP175
## Var 2 186 175 65 37 75 33 127 170 204 102 -174
## Step 59 60 61 62 63 64 65 66 67 68 69 70
## FP053 NumRings FP173 FP164 FP113 FP083 FP099 FP023 FP166 FP003 FP004 FP091
## Var 53 217 172 163 113 83 99 23 165 3 4 91
## Step 71 72 73 74 75 76 77 78 79 80 81 82
## FP100 FP064 FP131 FP133 FP080 FP169 FP078 FP012 FP073 FP201 FP184 FP172
## Var 100 64 131 133 -80 168 78 12 -73 198 183 171
## Step 83 84 85 86 87 88 89 90 91 92 93 94
## FP149 FP098 FP013 FP159 FP204 FP150 FP015 FP186 FP109 FP079 FP104 FP036
## Var 149 98 13 158 -201 150 15 185 109 79 104 36
## Step 95 96 97 98 99 100 101 102 103 104 105 106
## FP180 FP148 FP163 FP054 FP102 FP089 FP174 FP141 FP021 FP011 FP016
## Var 179 148 162 54 -102 89 173 141 21 -11 16
## Step 107 108 109 110 111 112 113 114 115 116 117
## NumNitrogen FP144 FP011 FP049 FP152 FP161 FP027 FP157 FP084 FP076 FP155
## Var 212 144 11 49 152 160 27 156 84 76 154
## Step 118 119 120 121 122 123 124 125 126 127 128
## FP123 FP103 FP166 FP018 FP167 FP071 FP116 FP185 FP034 FP191 FP180 NumBonds
## Var -123 103 -165 18 166 71 -116 184 34 190 -179 207
## Step 129 130 131 132 133 134 135 136 137 138 139 140
## FP017 FP181 FP170 FP022 FP080 FP035 SurfaceArea1 FP021 FP146 FP048 FP093
## Var 17 180 169 22 80 35 -219 -21 146 48 93
## Step 141 142 143 144 145 146 147 148 149 150 151
## FP038 FP130 FP119 FP129 FP036 FP090 FP087 FP140 FP203 FP180 FP082 FP105
## Var 38 130 119 129 -36 90 87 140 -200 179 82 105
## Step 152 153 154 155 156 157 158 159 160 161 162 163
## NumDblBonds FP055 FP011 FP009 FP153 FP067 FP139 FP077 FP066 FP158 FP156
## Var 210 55 -11 9 153 67 139 77 66 157 155
## Step 164 165 166 167 168 169 170 171 172 173 174
## FP168 FP118 FP114 FP060 FP129 FP143 FP073 FP045 FP068 FP024 FP095 FP069
## Var 167 118 114 60 -129 143 73 45 68 24 95 69
## Step 175 176 177 178 179 180 181 182 183 184 185 186
## FP043 FP134 FP030 FP051 FP032 FP022 FP160 FP149 FP092 FP141 FP121 FP190
## Var -43 134 30 51 32 -22 159 -149 92 -141 121 189
## Step 187 188 189 190 191 192 193 194 195 196 197 198
## FP165 FP178 FP007 FP070 FP132 FP028 FP189 FP108 FP116 FP117 FP195 FP010
## Var 164 177 7 70 132 28 188 108 116 117 194 10
## Step 199 200 201 202 203 204 205 206 207 208 209 210
## FP021 FP052 FP193 FP096 FP194 FP151 FP182 FP022 FP129 FP205 SurfaceArea2
## Var 21 52 -192 96 193 151 181 22 129 202 220
## Step 211 212 213 214 215 216 217 218 219 220 221
## FP011 FP069 FP179 FP137 FP123 FP097 FP208 FP107 FP175 FP110 FP041 FP120
## Var 11 -69 178 -137 123 97 205 107 174 110 41 120
## Step 222 223 224 225 226 227 228 229 230 231 232 233
## FP153 FP160 FP166 FP115 FP025 FP177 FP204 FP193 FP061 NumSulfer FP205
## Var -153 -159 165 115 25 176 201 192 61 -214 -202
## Step 234 235 236 237 238 239 240 241 242 243 244
## FP005 FP008 FP001 FP125 FP042 FP141 FP196 FP029 FP197 FP112 FP056
## Var 5 8 1 125 -42 141 195 29 196 112 56
## Step 245 246 247 248 249 250 251 252 253 254 255
## NumSulfer FP106 HydrophilicFactor FP057 FP205 FP020 FP019 FP062 FP014
## Var 214 106 218 57 202 20 19 62 14
## Step 256 257 258 259 260 261 262 263 264
## FP206 FP069 FP138 FP192 FP102 FP006 FP160 FP129 FP007 FP190 FP029 FP138
## Var -203 69 -138 191 102 6 159 -129 -7 -189 -29 138
## Step 265 266 267 268 269 270 271 272 273 274 275 276
## FP043 FP101 FP203 FP137 FP153 FP042 FP046 FP101 FP177 FP149 FP007 FP036
## Var 43 -101 200 137 153 42 46 101 -176 149 7 36
## Step 277 278 279 280 281 282 283 284 285 286 287 288
## FP206 FP183 FP047 FP086 FP049 FP058 FP177 FP129 FP190 FP049 FP029
## Var 203 182 47 86 -49 58 176 129 189 49 29
## Step 289 290 291 292 293 294 295 296 297 298 299
## SurfaceArea1
## Var 219 301
## Step 300 301
## fraction RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.0500 1.3714216 0.6930427 1.0614377 0.11094901 0.06769221 0.06732090
## 2 0.2875 0.6613587 0.8945923 0.5086228 0.05318164 0.02052497 0.04380793
## 3 0.5250 0.6489629 0.8976271 0.4924596 0.04674210 0.02063368 0.03218482
## 4 0.7625 0.6630176 0.8933415 0.4997783 0.05238428 0.02426442 0.04144706
## 5 1.0000 0.6871897 0.8862951 0.5149301 0.05361883 0.02595243 0.04394202
(PLR_L_Train_RMSE <- PLR_L_Tune$results[PLR_L_Tune$results$fraction==PLR_L_Tune$bestTune$fraction,
                                        c("RMSE")])
## [1] 0.6489629
(PLR_L_Train_Rsquared <- PLR_L_Tune$results[PLR_L_Tune$results$fraction==PLR_L_Tune$bestTune$fraction,
                                            c("Rsquared")])
## [1] 0.8976271
##################################
# Identifying and plotting the
# best model predictors
##################################
# model does not support variable importance measurement
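For the lasso, the fraction hyperparameter directly controls sparsity, so a useful substitute for a variable importance ranking is to inspect which coefficients remain nonzero at the selected fraction. The following is a minimal sketch, assuming the PLR_L_Tune object above and elasticnet::predict.enet.
##################################
# Sketching the predictors retained
# (nonzero coefficients) by the final lasso model
# at the selected fraction
##################################
PLR_L_Coefficients <- predict(PLR_L_Tune$finalModel,
                              s = PLR_L_Tune$bestTune$fraction,
                              mode = "fraction",
                              type = "coefficients")$coefficients
# Number of predictors retained by the lasso penalty
sum(PLR_L_Coefficients != 0)
# Names of the retained predictors
names(PLR_L_Coefficients)[PLR_L_Coefficients != 0]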
##################################
# Independently evaluating the model
# on the test set
##################################
PLR_L_Test <- data.frame(PLR_L_Observed = PMA_PreModelling_Test$Log_Solubility,
                         PLR_L_Predicted = predict(PLR_L_Tune,
                                                   PMA_PreModelling_Test_PLR_L[,!names(PMA_PreModelling_Test_PLR_L) %in% c("Log_Solubility")]))
PLR_L_Test
## PLR_L_Observed PLR_L_Predicted
## 1 0.93 0.696775055
## 2 0.85 0.279350920
## 3 0.81 -0.531764425
## 4 0.74 0.738995667
## 5 0.61 -0.099258728
## 6 0.58 1.510972708
## 7 0.57 0.508081340
## 8 0.56 0.435049171
## 9 0.52 0.246122439
## 10 0.45 -0.923412219
## 11 0.40 -0.777923856
## 12 0.36 -0.883130465
## 13 0.22 -0.025862114
## 14 0.08 -0.348931025
## 15 0.07 -1.116015074
## 16 0.02 -0.696506527
## 17 0.00 -0.258151380
## 18 -0.01 0.009412014
## 19 -0.07 0.113617626
## 20 -0.12 -0.997069045
## 21 -0.17 0.723927170
## 22 -0.29 -0.084301471
## 23 -0.38 -0.717614771
## 24 -0.38 -0.886545809
## 25 -0.39 -1.062025692
## 26 -0.42 -0.781678345
## 27 -0.44 -0.734472594
## 28 -0.46 1.290539498
## 29 -0.48 -2.226162570
## 30 -0.60 -1.235241992
## 31 -0.63 -2.284735542
## 32 -0.66 -0.684830137
## 33 -0.72 -0.740096111
## 34 -0.72 -0.144164115
## 35 -0.80 0.330368697
## 36 -0.80 -1.213167987
## 37 -0.82 0.537007867
## 38 -0.82 -0.613165183
## 39 -0.84 0.221321155
## 40 -0.85 -0.904408794
## 41 -0.85 -0.426444653
## 42 -0.87 -1.735769138
## 43 -0.89 -1.221130929
## 44 -0.90 0.105981135
## 45 -0.96 -1.416375399
## 46 -0.96 -0.664104071
## 47 -0.99 -0.358741235
## 48 -1.01 -0.773201608
## 49 -1.09 -1.062745723
## 50 -1.12 -0.584113671
## 51 -1.14 -0.692465482
## 52 -1.17 -1.783095082
## 53 -1.19 -1.621653363
## 54 -1.22 -1.331617002
## 55 -1.27 -1.818780418
## 56 -1.28 -1.284108210
## 57 -1.32 -1.322170819
## 58 -1.38 -1.507263453
## 59 -1.39 -1.626772250
## 60 -1.42 -1.654953819
## 61 -1.47 -0.905182149
## 62 -1.47 -1.589744171
## 63 -1.50 -0.847287439
## 64 -1.52 -1.282428348
## 65 -1.54 -1.465053294
## 66 -1.55 -2.196145729
## 67 -1.56 -3.010647775
## 68 -1.57 -1.867221330
## 69 -1.60 -1.415277213
## 70 -1.60 -2.494036964
## 71 -1.62 -1.616449068
## 72 -1.64 -2.600897255
## 73 -1.67 -1.787197025
## 74 -1.70 -3.122240066
## 75 -1.70 -2.054005171
## 76 -1.71 -2.178463062
## 77 -1.71 -2.376952658
## 78 -1.75 -1.965728963
## 79 -1.78 -1.510121181
## 80 -1.78 -2.423276082
## 81 -1.82 -1.166566385
## 82 -1.87 -1.773312149
## 83 -1.89 -2.124444746
## 84 -1.92 -2.018171402
## 85 -1.92 -1.341373271
## 86 -1.92 -1.444321376
## 87 -1.94 -3.223994709
## 88 -1.99 -2.526462965
## 89 -2.00 -2.341432766
## 90 -2.05 -2.278845551
## 91 -2.06 -1.688365418
## 92 -2.08 -2.209833149
## 93 -2.10 -2.645390502
## 94 -2.11 -1.342539676
## 95 -2.12 -0.631788199
## 96 -2.17 -2.214863047
## 97 -2.21 -1.825444785
## 98 -2.24 -2.867260223
## 99 -2.24 -1.674835486
## 100 -2.29 -2.308687604
## 101 -2.31 -2.343324523
## 102 -2.32 -2.078619420
## 103 -2.35 -2.844175708
## 104 -2.35 -2.127668590
## 105 -2.36 -2.533484084
## 106 -2.36 -1.982062937
## 107 -2.38 -2.370288096
## 108 -2.42 -2.535807307
## 109 -2.43 -3.189706478
## 110 -2.44 -3.334444895
## 111 -2.52 -2.443131161
## 112 -2.53 -2.926965061
## 113 -2.57 -3.003896482
## 114 -2.62 -2.977492389
## 115 -2.62 -2.760198669
## 116 -2.64 -3.325236403
## 117 -2.64 -3.439032258
## 118 -2.70 -2.497946653
## 119 -2.82 -2.389403771
## 120 -2.88 -2.552102479
## 121 -2.89 -2.217966043
## 122 -2.92 -1.231861218
## 123 -2.93 -3.332496852
## 124 -2.96 -2.729673533
## 125 -2.98 -2.572734302
## 126 -3.01 -2.667790719
## 127 -3.01 -3.289083464
## 128 -3.02 -3.563604497
## 129 -3.07 -3.323972216
## 130 -3.09 -3.026689084
## 131 -3.11 -3.060210671
## 132 -3.13 -3.650220790
## 133 -3.14 -1.921849837
## 134 -3.15 -3.545390714
## 135 -3.22 -2.445678825
## 136 -3.26 -3.289116670
## 137 -3.27 -2.749382957
## 138 -3.27 -2.860445693
## 139 -3.30 -2.818830343
## 140 -3.31 -2.335638374
## 141 -3.33 -2.258790537
## 142 -3.37 -2.077745169
## 143 -3.43 -3.376236288
## 144 -3.43 -2.279780514
## 145 -3.48 -2.983772712
## 146 -3.51 -3.513288148
## 147 -3.59 -2.367631912
## 148 -3.61 -2.628943992
## 149 -3.63 -3.473330160
## 150 -3.63 -3.455159204
## 151 -3.68 -2.156604057
## 152 -3.71 -3.643893919
## 153 -3.74 -2.362066916
## 154 -3.75 -3.756466588
## 155 -3.75 -2.302796005
## 156 -3.77 -3.287034135
## 157 -3.77 -4.174210302
## 158 -3.78 -5.060426176
## 159 -3.81 -3.670615283
## 160 -3.95 -4.349909530
## 161 -3.96 -5.212693942
## 162 -3.96 -4.112299153
## 163 -4.00 -3.617061437
## 164 -4.02 -4.732237138
## 165 -4.04 -4.230150688
## 166 -4.12 -3.558513200
## 167 -4.15 -4.725952338
## 168 -4.16 -3.587352086
## 169 -4.17 -4.432884802
## 170 -4.21 -4.640913412
## 171 -4.23 -4.378043113
## 172 -4.25 -3.314015947
## 173 -4.30 -3.465419039
## 174 -4.31 -5.493632641
## 175 -4.35 -4.744344413
## 176 -4.40 -3.969478451
## 177 -4.40 -4.257997370
## 178 -4.43 -4.745523504
## 179 -4.46 -4.510586518
## 180 -4.47 -3.023754977
## 181 -4.51 -5.111993398
## 182 -4.60 -3.744522580
## 183 -4.64 -4.603617398
## 184 -4.69 -4.796472832
## 185 -4.71 -3.976400093
## 186 -4.77 -3.614641382
## 187 -4.95 -4.669930662
## 188 -4.98 -4.179255492
## 189 -5.21 -5.744332441
## 190 -5.22 -5.403651244
## 191 -5.28 -4.337172918
## 192 -5.31 -2.997916771
## 193 -5.35 -4.725606653
## 194 -5.37 -4.939665955
## 195 -5.40 -4.564707412
## 196 -5.43 -4.413729687
## 197 -5.65 -5.565050656
## 198 -5.66 -4.287482105
## 199 -6.70 -4.958690917
## 200 -5.72 -5.086849288
## 201 -6.00 -7.260717366
## 202 -6.25 -6.392068718
## 203 -6.26 -6.283502155
## 204 -6.27 -6.492722706
## 205 -6.35 -5.761142444
## 206 -6.57 -6.040895637
## 207 -6.62 -5.144977476
## 208 -6.96 -5.948936913
## 209 -7.02 -7.610741404
## 210 -7.20 -7.182641266
## 211 -7.28 -7.157697592
## 212 -7.32 -7.432934428
## 213 -7.39 -7.774820740
## 214 -7.82 -8.269199698
## 215 -8.23 -8.950711629
## 216 -8.94 -8.386654302
## 217 1.07 -0.214622812
## 218 0.43 0.208940637
## 219 0.32 -0.399368721
## 220 0.00 0.056561994
## 221 -0.40 -0.769290741
## 222 -0.52 -0.531669275
## 223 -0.55 -0.651485152
## 224 -0.60 -0.783950637
## 225 -0.62 -2.499473761
## 226 -0.85 -1.275268018
## 227 -0.89 -0.765145393
## 228 -0.93 -0.911002770
## 229 -0.96 -0.159062530
## 230 -1.06 -1.993965890
## 231 -1.10 -1.607910909
## 232 -1.12 -0.971075346
## 233 -1.15 -0.773830102
## 234 -1.28 -0.461914844
## 235 -1.30 -1.620774151
## 236 -1.31 -1.356486731
## 237 -1.35 -3.174400411
## 238 -1.39 -1.982371446
## 239 -1.41 -1.621653363
## 240 -1.41 -1.223902977
## 241 -1.42 -0.631093637
## 242 -1.46 -2.080658160
## 243 -1.50 -1.599089375
## 244 -1.50 -2.193001514
## 245 -1.52 -1.571032900
## 246 -1.52 -0.655223537
## 247 -1.59 -1.540810054
## 248 -1.61 -0.855785977
## 249 -1.63 -1.182040133
## 250 -1.71 -2.454445326
## 251 -1.83 -2.177896548
## 252 -2.05 -1.699500123
## 253 -2.06 -2.416523823
## 254 -2.07 -3.506056620
## 255 -2.15 -2.615868774
## 256 -2.16 -1.037187955
## 257 -1.99 -0.146522845
## 258 -2.36 -1.852814607
## 259 -2.38 -3.866103631
## 260 -2.39 -1.537484755
## 261 -2.46 -2.322784403
## 262 -2.49 -2.273558877
## 263 -2.54 -2.814918988
## 264 -2.55 -2.981684173
## 265 -2.63 -2.353472726
## 266 -2.64 -1.849508656
## 267 -2.67 -2.685489208
## 268 -2.68 -2.084766386
## 269 -2.77 -2.598438681
## 270 -2.78 -2.941925145
## 271 -2.82 -2.613379667
## 272 -2.92 -3.639098035
## 273 -3.03 -3.417266607
## 274 -3.12 -3.416567545
## 275 -3.16 -3.056557840
## 276 -3.19 -3.276541774
## 277 -3.54 -3.550729684
## 278 -3.54 -2.352647917
## 279 -3.59 -3.577961892
## 280 -3.66 -3.046180768
## 281 -3.68 -2.300947101
## 282 -3.75 -3.972119547
## 283 -3.76 -3.919947751
## 284 -3.78 -3.874151756
## 285 -3.80 -4.106195549
## 286 -3.80 -4.299403485
## 287 -3.85 -3.177916766
## 288 -3.89 -3.935682549
## 289 -3.95 -4.246928411
## 290 -4.29 -4.838309442
## 291 -4.42 -4.405006321
## 292 -4.48 -4.286944982
## 293 -4.48 -3.200053206
## 294 -4.53 -4.887325199
## 295 -4.63 -4.463809867
## 296 -4.73 -4.110392416
## 297 -4.84 -4.054421694
## 298 -4.89 -3.863573933
## 299 -4.89 -4.804951825
## 300 -5.26 -5.652575880
## 301 -6.09 -4.991967282
## 302 -6.29 -5.890490453
## 303 -6.29 -6.276045708
## 304 -6.89 -5.650652209
## 305 -6.96 -6.780581597
## 306 -7.00 -6.900928547
## 307 -7.05 -7.666254176
## 308 -8.30 -8.751989911
## 309 -8.66 -8.914147984
## 310 -9.03 -9.524766011
## 311 -10.41 -9.831681435
## 312 -7.89 -7.409485131
## 313 -2.32 -1.615257561
## 314 0.39 -2.659349386
## 315 -2.90 -4.948707174
## 316 -2.47 -5.115499794
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(PLR_L_Test_Metrics <- postResample(PLR_L_Test[,2], PLR_L_Test[,1]))
## RMSE Rsquared MAE
## 0.7389135 0.8746818 0.5538049
(PLR_L_Test_RMSE <- PLR_L_Test_Metrics[1])
## RMSE
## 0.7389135
(PLR_L_Test_Rsquared <- PLR_L_Test_Metrics[2])
## Rsquared
## 0.8746818
1.5.4 Penalized Linear Regression - ElasticNet (PLR_E)
[A] The penalized linear regression (elasticnet) model from the elasticnet package was implemented through the caret package.
[B] The model contains 2 hyperparameters:
[B.1] lambda = weight decay, varied across a range of values from 0.00 to 0.10
[B.2] fraction = fraction of the full solution, varied across a range of values from 0.05 to 1.00
[C] The cross-validated performance of the final model is summarized as follows:
[C.1] Final model configuration involves lambda=0.010 and fraction=0.7625
[C.2] Root-Mean-Square Error = 0.64716
[C.3] R-Squared = 0.89822
[D] The model does not allow for ranking of predictors in terms of variable importance.
[E] The independent test-set performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.73519
[E.2] R-Squared = 0.87618
##################################
# Transforming factor predictors
# as required by the nature of the model
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_PLR_E <- as.data.frame(lapply(PMA_PreModelling_Train, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Train_PLR_E)
## [1] 951 221
PMA_PreModelling_Test_PLR_E <- as.data.frame(lapply(PMA_PreModelling_Test, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Test_PLR_E)
## [1] 316 221
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_PLR_E$Log_Solubility,
                             k = 10,
                             returnTrain = TRUE)
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
PLR_E_Grid = expand.grid(lambda = c(0, 0.01, 0.10),
                         fraction = seq(0.05, 1.00, length = 5))
##################################
# Running the penalized linear regression (elasticnet) model
# by setting the caret method to 'enet'
##################################
set.seed(12345678)
PLR_E_Tune <- train(x = PMA_PreModelling_Train_PLR_E[,!names(PMA_PreModelling_Train_PLR_E) %in% c("Log_Solubility")],
                    y = PMA_PreModelling_Train_PLR_E$Log_Solubility,
                    method = "enet",
                    tuneGrid = PLR_E_Grid,
                    trControl = KFold_Control,
                    preProc = c("center", "scale"))
##################################
# Reporting the cross-validation results
# for the train set
##################################
PLR_E_Tune
## Elasticnet
##
## 951 samples
## 220 predictors
##
## Pre-processing: centered (220), scaled (220)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.00 0.0500 1.3714216 0.6930427 1.0614377
## 0.00 0.2875 0.6613587 0.8945923 0.5086228
## 0.00 0.5250 0.6489629 0.8976271 0.4924596
## 0.00 0.7625 0.6630176 0.8933415 0.4997783
## 0.00 1.0000 0.6871897 0.8862951 0.5149301
## 0.01 0.0500 1.5421085 0.6130789 1.1969314
## 0.01 0.2875 0.7137466 0.8789934 0.5468178
## 0.01 0.5250 0.6480166 0.8981058 0.4949997
## 0.01 0.7625 0.6471642 0.8982164 0.4916267
## 0.01 1.0000 0.6574773 0.8951502 0.4997426
## 0.10 0.0500 1.6926512 0.5217008 1.3038723
## 0.10 0.2875 0.8539515 0.8340789 0.6556144
## 0.10 0.5250 0.7069247 0.8808840 0.5402746
## 0.10 0.7625 0.6864824 0.8893575 0.5267061
## 0.10 1.0000 0.6831128 0.8915190 0.5240739
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.7625 and lambda = 0.01.
##
## Call:
## elasticnet::enet(x = as.matrix(x), y = y, lambda = param$lambda)
## Sequence of moves:
## MolWeight NumCarbon NumChlorine FP044 NumHalogen FP089 FP072
## Var 206 211 215 44 216 89 72
## Step 1 2 3 4 5 6 7
## HydrophilicFactor NumMultBonds SurfaceArea1 FP063 FP059 FP142 FP089 FP135
## Var 218 208 219 63 59 142 -89 135
## Step 8 9 10 11 12 13 14 15
## FP206 FP084 FP204 FP147 FP074 FP073 FP116 FP040 HydrophilicFactor
## Var 203 84 201 147 74 73 116 40 -218
## Step 16 17 18 19 20 21 22 23 24
## NumOxygen FP039 FP043 NumSulfer FP094 FP111 FP011 FP124 FP193 FP050 FP198
## Var 213 39 43 214 94 111 11 124 192 50 197
## Step 25 26 27 28 29 30 31 32 33 34 35
## FP175 FP137 FP080 NumRotBonds FP101 FP088 FP122 FP203 FP136 FP123 FP085
## Var 174 137 80 209 101 88 122 200 136 123 85
## Step 36 37 38 39 40 41 42 43 44 45 46
## FP042 FP081 FP202 FP126 FP084 FP162 FP145 FP128 FP188 FP026 FP065 FP138
## Var 42 81 199 126 -84 161 145 128 187 26 65 138
## Step 47 48 49 50 51 52 53 54 55 56 57 58
## NumBonds NumRings FP031 FP002 FP127 FP187 FP175 FP033 FP075 FP176 FP102
## Var 207 217 31 2 127 186 -174 33 75 175 102
## Step 59 60 61 62 63 64 65 66 67 68 69
## FP037 FP207 FP171 FP053 FP173 FP164 FP113 FP003 FP172 FP099 FP166 FP023
## Var 37 204 170 53 172 163 113 3 171 99 165 23
## Step 70 71 72 73 74 75 76 77 78 79 80 81
## FP091 FP100 FP131 FP133 FP064 FP169 FP078 FP083 FP004 FP013 FP201 FP184
## Var 91 100 131 133 64 168 78 83 4 13 198 183
## Step 82 83 84 85 86 87 88 89 90 91 92 93
## FP084 FP022 FP036 FP149 FP141 FP159 FP012 FP150 FP073 FP015 FP098 FP204
## Var 84 22 36 149 141 158 12 150 -73 15 98 -201
## Step 94 95 96 97 98 99 100 101 102 103 104 105
## FP186 FP054 FP104 FP034 FP109 FP049 FP103 FP163 FP021 FP174 FP148 FP144
## Var 185 54 104 34 109 49 103 162 21 173 148 144
## Step 106 107 108 109 110 111 112 113 114 115 116 117
## FP155 FP016 FP079 NumNitrogen FP089 FP168 FP161 FP071 FP185 FP157 FP166
## Var 154 16 79 212 89 167 160 71 184 156 -165
## Step 118 119 120 121 122 123 124 125 126 127 128
## FP027 FP152 FP167 FP060 FP018 FP076 FP102 FP087 FP017 FP021 FP181 FP035
## Var 27 152 166 60 18 76 -102 87 17 -21 180 35
## Step 129 130 131 132 133 134 135 136 137 138 139 140
## FP170 FP038 FP119 FP093 FP191 FP146 FP115 FP180 FP066 FP090 FP048 FP140
## Var 169 38 119 93 190 146 115 179 66 90 48 140
## Step 141 142 143 144 145 146 147 148 149 150 151 152
## FP036 FP009 FP105 FP134 FP077 FP069 FP153 FP160 NumDblBonds FP158 FP095
## Var -36 9 105 134 77 69 153 159 210 157 95
## Step 153 154 155 156 157 158 159 160 161 162 163
## FP068 FP139 FP055 FP203 FP082 FP143 FP156 FP130 FP118 FP030 FP190 FP011
## Var 68 139 55 -200 82 143 155 130 118 30 189 -11
## Step 164 165 166 167 168 169 170 171 172 173 174 175
## FP067 FP028 FP114 FP045 FP024 FP007 FP205 FP073 FP032 FP195 FP051 FP092
## Var 67 28 114 45 24 7 202 73 32 194 51 92
## Step 176 177 178 179 180 181 182 183 184 185 186 187
## FP149 FP141 FP021 FP115 FP132 FP189 FP121 FP178 FP151 FP001 FP192 FP010
## Var -149 -141 21 -115 132 188 121 177 151 1 191 10
## Step 188 189 190 191 192 193 194 195 196 197 198 199
## FP108 FP106 FP052 FP125 FP008 FP182 FP011 FP165 FP117 FP041 FP179 FP102
## Var 108 106 52 125 8 181 11 164 117 41 178 102
## Step 200 201 202 203 204 205 206 207 208 209 210 211
## FP070 FP196 FP043 FP129 FP204 FP096 FP005 FP132 FP061 FP086 FP057 FP036
## Var 70 195 -43 129 201 96 5 -132 61 86 57 36
## Step 212 213 214 215 216 217 218 219 220 221 222 223
## FP110 FP097 FP020 FP107 FP197 FP019 FP036 FP194 FP043 FP029 FP175 FP208
## Var 110 97 20 107 196 19 -36 193 43 29 174 205
## Step 224 225 226 227 228 229 230 231 232 233 234 235
## FP193 FP137 FP025 FP141 FP056 FP120 FP036 FP166 FP047 FP177 FP057
## Var -192 -137 25 141 56 120 36 165 47 176 -57
## Step 236 237 238 239 240 241 242 243 244 245 246
## SurfaceArea2 FP042 FP193 FP115 FP207 FP046 FP014 FP112 FP203 FP137 FP194
## Var 220 -42 192 115 -204 46 14 112 200 137 -193
## Step 247 248 249 250 251 252 253 254 255 256 257
## FP183 FP149 FP057 FP042 FP132 FP006 FP058 FP194 FP207 FP062
## Var 182 149 57 42 132 6 58 193 204 62
## Step 258 259 260 261 262 263 264 265 266 267
## HydrophilicFactor
## Var 218 269
## Step 268 269
## lambda fraction      RMSE  Rsquared       MAE     RMSESD RsquaredSD      MAESD
## 1    0.00   0.0500 1.3714216 0.6930427 1.0614377 0.11094901 0.06769221 0.06732090
## 6    0.01   0.0500 1.5421085 0.6130789 1.1969314 0.12408312 0.07335623 0.07068810
## 11   0.10   0.0500 1.6926512 0.5217008 1.3038723 0.14784070 0.06347022 0.07940188
## 2    0.00   0.2875 0.6613587 0.8945923 0.5086228 0.05318164 0.02052497 0.04380793
## 7    0.01   0.2875 0.7137466 0.8789934 0.5468178 0.05698646 0.02622341 0.05465912
## 12   0.10   0.2875 0.8539515 0.8340789 0.6556144 0.07507936 0.03225781 0.06437932
## 3    0.00   0.5250 0.6489629 0.8976271 0.4924596 0.04674210 0.02063368 0.03218482
## 8    0.01   0.5250 0.6480166 0.8981058 0.4949997 0.05119816 0.02039617 0.03932769
## 13   0.10   0.5250 0.7069247 0.8808840 0.5402746 0.06940379 0.02444560 0.05812490
## 4    0.00   0.7625 0.6630176 0.8933415 0.4997783 0.05238428 0.02426442 0.04144706
## 9    0.01   0.7625 0.6471642 0.8982164 0.4916267 0.04859629 0.02155089 0.03273336
## 14   0.10   0.7625 0.6864824 0.8893575 0.5267061 0.06181857 0.02187491 0.04620786
## 5    0.00   1.0000 0.6871897 0.8862951 0.5149301 0.05361883 0.02595243 0.04394202
## 10   0.01   1.0000 0.6574773 0.8951502 0.4997426 0.05079803 0.02346660 0.03674911
## 15   0.10   1.0000 0.6831128 0.8915190 0.5240739 0.05894427 0.02219650 0.04416185
(PLR_E_Train_RMSE <- PLR_E_Tune$results[PLR_E_Tune$results$fraction==PLR_E_Tune$bestTune$fraction &
PLR_E_Tune$results$lambda==PLR_E_Tune$bestTune$lambda,
c("RMSE")])
## [1] 0.6471642
(PLR_E_Train_Rsquared <- PLR_E_Tune$results[PLR_E_Tune$results$fraction==PLR_E_Tune$bestTune$fraction &
PLR_E_Tune$results$lambda==PLR_E_Tune$bestTune$lambda,
c("Rsquared")])
## [1] 0.8982164
##################################
# Identifying and plotting the
# best model predictors
##################################
# model does not support variable importance measurement
##################################
# Independently evaluating the model
# on the test set
##################################
PLR_E_Test <- data.frame(PLR_E_Observed = PMA_PreModelling_Test$Log_Solubility,
PLR_E_Predicted = predict(PLR_E_Tune,
PMA_PreModelling_Test_PLR_E[,!names(PMA_PreModelling_Test_PLR_E) %in% c("Log_Solubility")]))
PLR_E_Test
## PLR_E_Observed PLR_E_Predicted
## 1 0.93 0.668846414
## 2 0.85 0.280893855
## 3 0.81 -0.579788791
## 4 0.74 0.771484158
## 5 0.61 -0.013116890
## 6 0.58 1.479188924
## 7 0.57 0.502045330
## 8 0.56 0.448134311
## 9 0.52 0.251150345
## 10 0.45 -0.922999335
## 11 0.40 -0.742385879
## 12 0.36 -0.927601984
## 13 0.22 -0.004649436
## 14 0.08 -0.378770481
## 15 0.07 -1.050183627
## 16 0.02 -0.738223827
## 17 0.00 -0.274711927
## 18 -0.01 0.001544168
## 19 -0.07 0.108071952
## 20 -0.12 -1.011201885
## 21 -0.17 0.642921463
## 22 -0.29 -0.115824840
## 23 -0.38 -0.724252834
## 24 -0.38 -0.878659858
## 25 -0.39 -1.050871441
## 26 -0.42 -0.773758490
## 27 -0.44 -0.770921665
## 28 -0.46 1.297609515
## 29 -0.48 -2.192540266
## 30 -0.60 -1.295327248
## 31 -0.63 -2.389036533
## 32 -0.66 -0.735581228
## 33 -0.72 -0.723246430
## 34 -0.72 -0.139643440
## 35 -0.80 0.335696333
## 36 -0.80 -1.182475461
## 37 -0.82 0.495335271
## 38 -0.82 -0.643228865
## 39 -0.84 0.185982859
## 40 -0.85 -0.889442552
## 41 -0.85 -0.504311722
## 42 -0.87 -1.835530536
## 43 -0.89 -1.244830228
## 44 -0.90 0.125969758
## 45 -0.96 -1.469282542
## 46 -0.96 -0.710138624
## 47 -0.99 -0.371042731
## 48 -1.01 -0.817195183
## 49 -1.09 -1.066168666
## 50 -1.12 -0.531973147
## 51 -1.14 -0.583582968
## 52 -1.17 -1.717398270
## 53 -1.19 -1.590696623
## 54 -1.22 -1.313860912
## 55 -1.27 -1.842553202
## 56 -1.28 -1.265173240
## 57 -1.32 -1.296518556
## 58 -1.38 -1.515897942
## 59 -1.39 -1.601633238
## 60 -1.42 -1.701851539
## 61 -1.47 -0.954899980
## 62 -1.47 -1.590463809
## 63 -1.50 -0.886858920
## 64 -1.52 -1.301509087
## 65 -1.54 -1.413138971
## 66 -1.55 -2.151822552
## 67 -1.56 -3.011443132
## 68 -1.57 -1.871043817
## 69 -1.60 -1.358903499
## 70 -1.60 -2.613933357
## 71 -1.62 -1.609950590
## 72 -1.64 -2.561940491
## 73 -1.67 -1.775451129
## 74 -1.70 -3.203143374
## 75 -1.70 -2.077848246
## 76 -1.71 -2.233098156
## 77 -1.71 -2.364623435
## 78 -1.75 -1.955760913
## 79 -1.78 -1.542622423
## 80 -1.78 -2.431109191
## 81 -1.82 -1.240546243
## 82 -1.87 -1.820485168
## 83 -1.89 -2.147924955
## 84 -1.92 -1.999416661
## 85 -1.92 -1.327066552
## 86 -1.92 -1.433184818
## 87 -1.94 -3.269375771
## 88 -1.99 -2.589352811
## 89 -2.00 -2.273981852
## 90 -2.05 -2.302609526
## 91 -2.06 -1.663985414
## 92 -2.08 -2.182090708
## 93 -2.10 -2.621346081
## 94 -2.11 -1.409511259
## 95 -2.12 -0.642149752
## 96 -2.17 -2.167288852
## 97 -2.21 -1.839903543
## 98 -2.24 -2.785209151
## 99 -2.24 -1.623901457
## 100 -2.29 -2.291456043
## 101 -2.31 -2.265371507
## 102 -2.32 -2.138037930
## 103 -2.35 -2.745688927
## 104 -2.35 -2.177792437
## 105 -2.36 -2.529500157
## 106 -2.36 -1.974312737
## 107 -2.38 -2.374156805
## 108 -2.42 -2.584850961
## 109 -2.43 -3.206856905
## 110 -2.44 -3.335026534
## 111 -2.52 -2.431242093
## 112 -2.53 -2.927480976
## 113 -2.57 -3.000359881
## 114 -2.62 -3.006403423
## 115 -2.62 -2.761609613
## 116 -2.64 -3.192093734
## 117 -2.64 -3.280482058
## 118 -2.70 -2.402088359
## 119 -2.82 -2.441730022
## 120 -2.88 -2.619264510
## 121 -2.89 -2.238521978
## 122 -2.92 -1.207154981
## 123 -2.93 -3.407337263
## 124 -2.96 -2.736808247
## 125 -2.98 -2.612069749
## 126 -3.01 -2.686772847
## 127 -3.01 -3.348956822
## 128 -3.02 -3.562040426
## 129 -3.07 -3.389024422
## 130 -3.09 -3.035041735
## 131 -3.11 -3.052142590
## 132 -3.13 -3.736591421
## 133 -3.14 -1.951100229
## 134 -3.15 -3.593327284
## 135 -3.22 -2.422479505
## 136 -3.26 -3.338648268
## 137 -3.27 -2.776129799
## 138 -3.27 -2.858582546
## 139 -3.30 -2.858808641
## 140 -3.31 -2.371606102
## 141 -3.33 -2.278769366
## 142 -3.37 -2.148336286
## 143 -3.43 -3.447573479
## 144 -3.43 -2.401650761
## 145 -3.48 -3.001259480
## 146 -3.51 -3.485969896
## 147 -3.59 -2.371321077
## 148 -3.61 -2.631247926
## 149 -3.63 -3.484346764
## 150 -3.63 -3.454265835
## 151 -3.68 -2.084504271
## 152 -3.71 -3.635895548
## 153 -3.74 -2.375746346
## 154 -3.75 -3.729406546
## 155 -3.75 -2.509516118
## 156 -3.77 -3.294687607
## 157 -3.77 -4.229586310
## 158 -3.78 -5.129443694
## 159 -3.81 -3.719102343
## 160 -3.95 -4.400027684
## 161 -3.96 -5.325231779
## 162 -3.96 -4.155973946
## 163 -4.00 -3.596455497
## 164 -4.02 -4.715188665
## 165 -4.04 -4.360562477
## 166 -4.12 -3.555720514
## 167 -4.15 -4.897037201
## 168 -4.16 -3.667023519
## 169 -4.17 -4.478220876
## 170 -4.21 -4.679344505
## 171 -4.23 -4.351502415
## 172 -4.25 -3.391447378
## 173 -4.30 -3.550395193
## 174 -4.31 -5.472567427
## 175 -4.35 -4.788203840
## 176 -4.40 -4.033696610
## 177 -4.40 -4.303364468
## 178 -4.43 -4.796155370
## 179 -4.46 -4.559413357
## 180 -4.47 -3.033733714
## 181 -4.51 -5.099405157
## 182 -4.60 -3.861458738
## 183 -4.64 -4.654397665
## 184 -4.69 -4.787021541
## 185 -4.71 -4.000790062
## 186 -4.77 -3.642986811
## 187 -4.95 -4.641108156
## 188 -4.98 -4.379311504
## 189 -5.21 -5.784468493
## 190 -5.22 -5.484909171
## 191 -5.28 -4.358137201
## 192 -5.31 -3.008072879
## 193 -5.35 -4.710948076
## 194 -5.37 -5.019460188
## 195 -5.40 -4.602930142
## 196 -5.43 -4.503639544
## 197 -5.65 -5.583949140
## 198 -5.66 -4.331119192
## 199 -6.70 -5.001635662
## 200 -5.72 -5.169796328
## 201 -6.00 -7.308435337
## 202 -6.25 -6.425975129
## 203 -6.26 -6.330091400
## 204 -6.27 -6.581465756
## 205 -6.35 -5.762684428
## 206 -6.57 -6.024074235
## 207 -6.62 -5.241527363
## 208 -6.96 -5.937208040
## 209 -7.02 -7.529659549
## 210 -7.20 -7.157945756
## 211 -7.28 -7.226209242
## 212 -7.32 -7.466705657
## 213 -7.39 -7.830991434
## 214 -7.82 -8.326389255
## 215 -8.23 -8.941071007
## 216 -8.94 -8.438086944
## 217 1.07 -0.212847918
## 218 0.43 0.186677380
## 219 0.32 -0.348378759
## 220 0.00 0.045434632
## 221 -0.40 -0.789167867
## 222 -0.52 -0.524621279
## 223 -0.55 -0.675962619
## 224 -0.60 -0.836243522
## 225 -0.62 -2.550028488
## 226 -0.85 -1.263393122
## 227 -0.89 -0.771067860
## 228 -0.93 -0.911434866
## 229 -0.96 -0.140652368
## 230 -1.06 -2.042335464
## 231 -1.10 -1.627873124
## 232 -1.12 -0.996695840
## 233 -1.15 -0.781726838
## 234 -1.28 -0.424568431
## 235 -1.30 -1.604207221
## 236 -1.31 -1.317255241
## 237 -1.35 -3.054247411
## 238 -1.39 -1.963387525
## 239 -1.41 -1.590696623
## 240 -1.41 -1.291910598
## 241 -1.42 -0.635717732
## 242 -1.46 -2.049227667
## 243 -1.50 -1.617025122
## 244 -1.50 -2.293466532
## 245 -1.52 -1.588327062
## 246 -1.52 -0.624424177
## 247 -1.59 -1.531116090
## 248 -1.61 -0.821311299
## 249 -1.63 -1.163621054
## 250 -1.71 -2.414921248
## 251 -1.83 -2.162002049
## 252 -2.05 -1.823105489
## 253 -2.06 -2.368208866
## 254 -2.07 -3.630586856
## 255 -2.15 -2.654775056
## 256 -2.16 -1.040564271
## 257 -1.99 -0.197735310
## 258 -2.36 -1.894851343
## 259 -2.38 -3.937286149
## 260 -2.39 -1.554269377
## 261 -2.46 -2.307514014
## 262 -2.49 -2.300420441
## 263 -2.54 -2.803495218
## 264 -2.55 -3.017964093
## 265 -2.63 -2.416237494
## 266 -2.64 -1.752190120
## 267 -2.67 -2.701718146
## 268 -2.68 -2.054472439
## 269 -2.77 -2.609142993
## 270 -2.78 -2.930664874
## 271 -2.82 -2.639804601
## 272 -2.92 -3.614277595
## 273 -3.03 -3.453234661
## 274 -3.12 -3.447334096
## 275 -3.16 -3.048998810
## 276 -3.19 -3.285417898
## 277 -3.54 -3.555808300
## 278 -3.54 -2.382536191
## 279 -3.59 -3.655298560
## 280 -3.66 -3.034395192
## 281 -3.68 -2.300203903
## 282 -3.75 -3.959871364
## 283 -3.76 -3.918213256
## 284 -3.78 -3.868384096
## 285 -3.80 -4.110286865
## 286 -3.80 -4.426213518
## 287 -3.85 -3.242958707
## 288 -3.89 -3.842185232
## 289 -3.95 -4.259076603
## 290 -4.29 -4.888379114
## 291 -4.42 -4.549047883
## 292 -4.48 -4.285249844
## 293 -4.48 -3.218975882
## 294 -4.53 -4.908688084
## 295 -4.63 -4.482653232
## 296 -4.73 -4.110587825
## 297 -4.84 -4.130849965
## 298 -4.89 -3.882265342
## 299 -4.89 -4.837162144
## 300 -5.26 -5.651126226
## 301 -6.09 -5.095823433
## 302 -6.29 -5.951733556
## 303 -6.29 -6.314009294
## 304 -6.89 -5.649162041
## 305 -6.96 -6.806439862
## 306 -7.00 -6.949428220
## 307 -7.05 -7.735107705
## 308 -8.30 -8.810751696
## 309 -8.66 -8.808557379
## 310 -9.03 -9.425311508
## 311 -10.41 -9.895151102
## 312 -7.89 -7.458326683
## 313 -2.32 -1.656390580
## 314 0.39 -2.761746386
## 315 -2.90 -4.937502427
## 316 -2.47 -4.999453735
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(PLR_E_Test_Metrics <- postResample(PLR_E_Test[,2], PLR_E_Test[,1]))
## RMSE Rsquared MAE
## 0.7351873 0.8761762 0.5504336
(PLR_E_Test_RMSE <- PLR_E_Test_Metrics[1])
## RMSE
## 0.7351873
(PLR_E_Test_Rsquared <- PLR_E_Test_Metrics[2])
## Rsquared
## 0.8761762
1.5.5 Principal Component Regression (PCR)
[A] The principal component regression model from the pls package was implemented through the caret package (a minimal sketch of the underlying computation follows this summary).
[B] The model contains 1 hyperparameter:
[B.1] ncomp = number of components, varied across integer values from 1 to 35
[C] The cross-validated performance of the final model is summarized as follows:
[C.1] Final model configuration involves ncomp=34
[C.2] Root-Mean-Square Error = 0.74260
[C.3] R-Squared = 0.86772
[D] The model does not allow for ranking of predictors in terms of variable importance.
[E] The independent test performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.84483
[E.2] R-Squared = 0.83516
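To make the mechanics concrete, the sketch below reproduces the essence of the pcr method under simple assumptions: project the centered predictors onto their leading principal components, then regress the response on the component scores. The names manual_pcr, X_train and y_train are hypothetical and are not objects created in this analysis.
# Minimal principal component regression by hand (illustrative only)
manual_pcr <- function(X, y, ncomp) {
pca <- prcomp(X, center = TRUE)            # PCA via singular value decomposition
scores <- pca$x[, 1:ncomp, drop = FALSE]   # scores on the leading components
lm(y ~ scores)                             # ordinary least squares on the scores
}
# e.g. manual_pcr(as.matrix(X_train), y_train, ncomp = 34)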
##################################
# Creating a local object
# for the train set
##################################
PMA_PreModelling_Train_PCR <- PMA_PreModelling_Train
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_PCR$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
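# (The same seed and fold index list are regenerated before each model in
# this section, so the cross-validation partitions are identical across
# methods and their resampled metrics are directly comparable.)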
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
PCR_Grid = expand.grid(ncomp = 1:35)
##################################
# Running the principal component regression model
# by setting the caret method to 'pcr'
##################################
set.seed(12345678)
PCR_Tune <- train(x = PMA_PreModelling_Train_PCR[,!names(PMA_PreModelling_Train_PCR) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_PCR$Log_Solubility,
method = "pcr",
tuneGrid = PCR_Grid,
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
PCR_Tune
## Principal Component Analysis
##
## 951 samples
## 220 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 2.0296889 0.01442414 1.5675994
## 2 1.6567709 0.34667895 1.3084248
## 3 1.3399066 0.56959232 1.0554564
## 4 1.3365025 0.57098448 1.0523863
## 5 1.1077389 0.70727215 0.8560733
## 6 1.0934315 0.71523392 0.8396607
## 7 1.0918104 0.71620758 0.8392329
## 8 1.0822762 0.72034455 0.8355520
## 9 0.9758776 0.77338983 0.7648700
## 10 0.9193282 0.79797036 0.7191366
## 11 0.9100087 0.80274346 0.7098640
## 12 0.9008101 0.80711914 0.7026402
## 13 0.8994621 0.80774236 0.7012849
## 14 0.8997422 0.80755616 0.7022250
## 15 0.8972326 0.80815361 0.6971850
## 16 0.8724231 0.81941761 0.6786738
## 17 0.8479644 0.82828435 0.6684450
## 18 0.8468783 0.82878646 0.6677195
## 19 0.8238325 0.83755774 0.6506819
## 20 0.8210854 0.83853781 0.6496191
## 21 0.8170708 0.84039342 0.6407918
## 22 0.7981577 0.84802017 0.6241175
## 23 0.7935049 0.84967647 0.6207379
## 24 0.7936238 0.84963015 0.6215020
## 25 0.7950743 0.84910027 0.6229236
## 26 0.7959497 0.84890735 0.6232711
## 27 0.7917201 0.85064965 0.6207289
## 28 0.7856018 0.85294606 0.6116164
## 29 0.7868587 0.85160687 0.6102646
## 30 0.7686928 0.85854250 0.5979590
## 31 0.7598560 0.86182884 0.5950581
## 32 0.7544523 0.86381691 0.5880242
## 33 0.7506357 0.86488656 0.5850940
## 34 0.7426083 0.86771537 0.5751721
## 35 0.7453166 0.86670191 0.5772234
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 34.
## Principal component regression , fitted with the singular value decomposition algorithm.
## Call:
## pcr(formula = .outcome ~ ., ncomp = ncomp, data = dat)
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 2.0296889 0.01442414 1.5675994 0.18519012 0.01370650 0.10605601
## 2 2 1.6567709 0.34667895 1.3084248 0.11847283 0.10128844 0.10795416
## 3 3 1.3399066 0.56959232 1.0554564 0.09770998 0.06849443 0.07696745
## 4 4 1.3365025 0.57098448 1.0523863 0.09650563 0.07030819 0.07599880
## 5 5 1.1077389 0.70727215 0.8560733 0.08184295 0.05070045 0.06351824
## 6 6 1.0934315 0.71523392 0.8396607 0.07559229 0.04492993 0.06175559
## 7 7 1.0918104 0.71620758 0.8392329 0.07511865 0.04349455 0.06138873
## 8 8 1.0822762 0.72034455 0.8355520 0.07326621 0.04682361 0.06128337
## 9 9 0.9758776 0.77338983 0.7648700 0.06964522 0.03464898 0.06535156
## 10 10 0.9193282 0.79797036 0.7191366 0.07361984 0.03602854 0.06431757
## 11 11 0.9100087 0.80274346 0.7098640 0.07141421 0.03037915 0.06273034
## 12 12 0.9008101 0.80711914 0.7026402 0.06848519 0.02973063 0.05972693
## 13 13 0.8994621 0.80774236 0.7012849 0.06984533 0.02920963 0.06106845
## 14 14 0.8997422 0.80755616 0.7022250 0.06850319 0.02895750 0.05863619
## 15 15 0.8972326 0.80815361 0.6971850 0.06764988 0.02917955 0.05763866
## 16 16 0.8724231 0.81941761 0.6786738 0.07191361 0.02836615 0.06605188
## 17 17 0.8479644 0.82828435 0.6684450 0.07312473 0.03029092 0.06153839
## 18 18 0.8468783 0.82878646 0.6677195 0.07541100 0.02945746 0.06126364
## 19 19 0.8238325 0.83755774 0.6506819 0.07286795 0.02830103 0.05844023
## 20 20 0.8210854 0.83853781 0.6496191 0.07502121 0.02738817 0.05717632
## 21 21 0.8170708 0.84039342 0.6407918 0.07248873 0.02467572 0.05691329
## 22 22 0.7981577 0.84802017 0.6241175 0.07722684 0.02299189 0.06220102
## 23 23 0.7935049 0.84967647 0.6207379 0.07379642 0.02348702 0.05821540
## 24 24 0.7936238 0.84963015 0.6215020 0.07342546 0.02316486 0.05802904
## 25 25 0.7950743 0.84910027 0.6229236 0.07377674 0.02344697 0.05852246
## 26 26 0.7959497 0.84890735 0.6232711 0.07207635 0.02257987 0.05798154
## 27 27 0.7917201 0.85064965 0.6207289 0.07105699 0.02176341 0.05815307
## 28 28 0.7856018 0.85294606 0.6116164 0.07237138 0.02341986 0.06013560
## 29 29 0.7868587 0.85160687 0.6102646 0.06768926 0.02704719 0.05771990
## 30 30 0.7686928 0.85854250 0.5979590 0.06442948 0.02674887 0.05450236
## 31 31 0.7598560 0.86182884 0.5950581 0.05592123 0.02580600 0.04900977
## 32 32 0.7544523 0.86381691 0.5880242 0.06025075 0.02663350 0.05237288
## 33 33 0.7506357 0.86488656 0.5850940 0.05823760 0.02791928 0.04824653
## 34 34 0.7426083 0.86771537 0.5751721 0.05377055 0.02667267 0.04901977
## 35 35 0.7453166 0.86670191 0.5772234 0.05224404 0.02655696 0.04814371
(PCR_Train_RMSE <- PCR_Tune$results[PCR_Tune$results$ncomp==PCR_Tune$bestTune$ncomp,
c("RMSE")])
## [1] 0.7426083
(PCR_Train_Rsquared <- PCR_Tune$results[PCR_Tune$results$ncomp==PCR_Tune$bestTune$ncomp,
c("Rsquared")])
## [1] 0.8677154
##################################
# Identifying and plotting the
# best model predictors
##################################
# model does not support variable importance measurement
##################################
# Independently evaluating the model
# on the test set
##################################
PCR_Test <- data.frame(PCR_Observed = PMA_PreModelling_Test$Log_Solubility,
PCR_Predicted = predict(PCR_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Log_Solubility")]))
PCR_Test
## PCR_Observed PCR_Predicted
## 1 0.93 0.462182268
## 2 0.85 0.316580448
## 3 0.81 -0.945880837
## 4 0.74 -0.293259569
## 5 0.61 0.404635957
## 6 0.58 0.733217245
## 7 0.57 -0.028483379
## 8 0.56 0.315707814
## 9 0.52 0.114526282
## 10 0.45 -0.986935760
## 11 0.40 -0.689009516
## 12 0.36 -2.179352029
## 13 0.22 -0.153574606
## 14 0.08 0.177580181
## 15 0.07 -1.379675327
## 16 0.02 -1.739622050
## 17 0.00 -0.776967168
## 18 -0.01 -0.591326254
## 19 -0.07 0.045838747
## 20 -0.12 -2.240547697
## 21 -0.17 0.327118691
## 22 -0.29 -0.322805421
## 23 -0.38 -0.340975848
## 24 -0.38 -1.313602576
## 25 -0.39 -0.861727039
## 26 -0.42 -1.451680481
## 27 -0.44 -0.474179666
## 28 -0.46 0.804611679
## 29 -0.48 -1.677483235
## 30 -0.60 -0.968405139
## 31 -0.63 -3.188136735
## 32 -0.66 -0.623192401
## 33 -0.72 -0.463389333
## 34 -0.72 -0.587561005
## 35 -0.80 -0.534014873
## 36 -0.80 -0.554172780
## 37 -0.82 -0.122282875
## 38 -0.82 -0.659049111
## 39 -0.84 0.570252763
## 40 -0.85 -1.478444252
## 41 -0.85 -0.539779301
## 42 -0.87 -2.395308530
## 43 -0.89 -1.192540375
## 44 -0.90 0.223560105
## 45 -0.96 -1.749350382
## 46 -0.96 -0.872753931
## 47 -0.99 -0.726103159
## 48 -1.01 -0.898962066
## 49 -1.09 -1.097879461
## 50 -1.12 -0.772302099
## 51 -1.14 -0.307991271
## 52 -1.17 -1.627889193
## 53 -1.19 -1.941436108
## 54 -1.22 -1.248394890
## 55 -1.27 -0.938117980
## 56 -1.28 -1.488884776
## 57 -1.32 -1.378055930
## 58 -1.38 -1.353955783
## 59 -1.39 -1.928209622
## 60 -1.42 -2.607835558
## 61 -1.47 -0.921378405
## 62 -1.47 -1.190814712
## 63 -1.50 -1.902176513
## 64 -1.52 -1.187231141
## 65 -1.54 -1.946745163
## 66 -1.55 -2.275539452
## 67 -1.56 -2.587219183
## 68 -1.57 -2.358002072
## 69 -1.60 -0.952500446
## 70 -1.60 -2.636971744
## 71 -1.62 -2.943902005
## 72 -1.64 -3.139592131
## 73 -1.67 -1.611531503
## 74 -1.70 -2.754256197
## 75 -1.70 -2.265948203
## 76 -1.71 -2.491616937
## 77 -1.71 -2.550917657
## 78 -1.75 -1.955587957
## 79 -1.78 -1.193975692
## 80 -1.78 -1.692866557
## 81 -1.82 -0.881504064
## 82 -1.87 -1.148979452
## 83 -1.89 -2.610859665
## 84 -1.92 -1.897992545
## 85 -1.92 -1.510320673
## 86 -1.92 -1.077079670
## 87 -1.94 -3.213930262
## 88 -1.99 -2.533788513
## 89 -2.00 -2.256290306
## 90 -2.05 -2.626307120
## 91 -2.06 -2.001038378
## 92 -2.08 -2.707752970
## 93 -2.10 -2.565556218
## 94 -2.11 -1.344135086
## 95 -2.12 -1.385845736
## 96 -2.17 -1.925358660
## 97 -2.21 -1.973684177
## 98 -2.24 -2.326429801
## 99 -2.24 -1.409145654
## 100 -2.29 -1.739801390
## 101 -2.31 -2.072667721
## 102 -2.32 -1.975447594
## 103 -2.35 -3.163667428
## 104 -2.35 -1.688681672
## 105 -2.36 -2.312923262
## 106 -2.36 -2.196530460
## 107 -2.38 -2.328140438
## 108 -2.42 -3.238598289
## 109 -2.43 -2.901984183
## 110 -2.44 -2.937477131
## 111 -2.52 -3.048212697
## 112 -2.53 -2.695754747
## 113 -2.57 -2.773728454
## 114 -2.62 -2.807118420
## 115 -2.62 -2.737497203
## 116 -2.64 -2.736818053
## 117 -2.64 -3.441255512
## 118 -2.70 -2.749881922
## 119 -2.82 -3.523453889
## 120 -2.88 -2.357894077
## 121 -2.89 -2.301650028
## 122 -2.92 -1.180398464
## 123 -2.93 -3.054401715
## 124 -2.96 -2.531055760
## 125 -2.98 -3.754526428
## 126 -3.01 -3.112725817
## 127 -3.01 -2.975724090
## 128 -3.02 -2.916900908
## 129 -3.07 -3.009323008
## 130 -3.09 -3.372478938
## 131 -3.11 -3.074854971
## 132 -3.13 -3.571721771
## 133 -3.14 -1.719787580
## 134 -3.15 -3.995569084
## 135 -3.22 -2.067589470
## 136 -3.26 -3.074966588
## 137 -3.27 -2.264274749
## 138 -3.27 -2.564007920
## 139 -3.30 -1.900229061
## 140 -3.31 -1.954089261
## 141 -3.33 -2.297730426
## 142 -3.37 -1.586020086
## 143 -3.43 -3.709588935
## 144 -3.43 -2.735537334
## 145 -3.48 -3.497607438
## 146 -3.51 -4.067987387
## 147 -3.59 -3.380665801
## 148 -3.61 -2.586232322
## 149 -3.63 -3.559068847
## 150 -3.63 -3.209219549
## 151 -3.68 -2.044011728
## 152 -3.71 -4.340950545
## 153 -3.74 -1.944809652
## 154 -3.75 -3.352402046
## 155 -3.75 -3.578456815
## 156 -3.77 -3.819627738
## 157 -3.77 -4.442839703
## 158 -3.78 -4.835774030
## 159 -3.81 -3.524309500
## 160 -3.95 -4.879064966
## 161 -3.96 -4.792206612
## 162 -3.96 -4.234330789
## 163 -4.00 -3.219178215
## 164 -4.02 -3.830199697
## 165 -4.04 -4.358420780
## 166 -4.12 -2.948752067
## 167 -4.15 -5.501447290
## 168 -4.16 -3.549054346
## 169 -4.17 -4.340408429
## 170 -4.21 -4.415403498
## 171 -4.23 -4.715752594
## 172 -4.25 -2.824363852
## 173 -4.30 -3.698913819
## 174 -4.31 -5.411467048
## 175 -4.35 -4.627827233
## 176 -4.40 -3.908063678
## 177 -4.40 -4.054909207
## 178 -4.43 -4.923837515
## 179 -4.46 -4.651534851
## 180 -4.47 -2.218006110
## 181 -4.51 -5.524697405
## 182 -4.60 -3.946655342
## 183 -4.64 -4.057825025
## 184 -4.69 -4.704944822
## 185 -4.71 -3.978170057
## 186 -4.77 -4.011425397
## 187 -4.95 -5.042474931
## 188 -4.98 -5.053214221
## 189 -5.21 -5.714703733
## 190 -5.22 -5.419091321
## 191 -5.28 -4.396016452
## 192 -5.31 -3.291142485
## 193 -5.35 -4.983434872
## 194 -5.37 -5.038913691
## 195 -5.40 -4.442449730
## 196 -5.43 -4.736215470
## 197 -5.65 -5.240423743
## 198 -5.66 -4.351381058
## 199 -6.70 -5.040054389
## 200 -5.72 -4.615187285
## 201 -6.00 -6.587120785
## 202 -6.25 -6.532152807
## 203 -6.26 -6.390327981
## 204 -6.27 -6.377405240
## 205 -6.35 -5.439261305
## 206 -6.57 -5.957509355
## 207 -6.62 -4.676276071
## 208 -6.96 -5.736849435
## 209 -7.02 -7.072417082
## 210 -7.20 -7.000998973
## 211 -7.28 -7.029806037
## 212 -7.32 -7.520922791
## 213 -7.39 -7.633211006
## 214 -7.82 -8.083969547
## 215 -8.23 -7.831992798
## 216 -8.94 -8.515673674
## 217 1.07 0.219276294
## 218 0.43 0.330756116
## 219 0.32 0.453690461
## 220 0.00 -0.122893337
## 221 -0.40 -0.897191014
## 222 -0.52 0.006136031
## 223 -0.55 -0.761653529
## 224 -0.60 -0.979753251
## 225 -0.62 -2.635726297
## 226 -0.85 -1.837743804
## 227 -0.89 -1.148776455
## 228 -0.93 -1.384262415
## 229 -0.96 0.523815330
## 230 -1.06 -1.915401741
## 231 -1.10 -1.892408601
## 232 -1.12 -1.556460179
## 233 -1.15 -0.667150802
## 234 -1.28 -1.031054071
## 235 -1.30 -1.420232370
## 236 -1.31 -1.804920337
## 237 -1.35 -3.166523244
## 238 -1.39 -1.568456582
## 239 -1.41 -1.941436108
## 240 -1.41 -0.861592267
## 241 -1.42 -0.724916742
## 242 -1.46 -2.256991358
## 243 -1.50 -1.447087059
## 244 -1.50 -2.804344982
## 245 -1.52 -2.111224594
## 246 -1.52 -1.306281870
## 247 -1.59 -1.592576468
## 248 -1.61 -1.376918862
## 249 -1.63 -0.998306905
## 250 -1.71 -2.696795839
## 251 -1.83 -2.420000757
## 252 -2.05 -1.441688717
## 253 -2.06 -2.758186386
## 254 -2.07 -4.042500416
## 255 -2.15 -2.431994838
## 256 -2.16 -2.275531571
## 257 -1.99 -0.173377342
## 258 -2.36 -2.336237964
## 259 -2.38 -3.442324638
## 260 -2.39 -1.900260379
## 261 -2.46 -2.462622866
## 262 -2.49 -2.104211599
## 263 -2.54 -2.672144704
## 264 -2.55 -3.216310658
## 265 -2.63 -2.828185945
## 266 -2.64 -2.753887744
## 267 -2.67 -1.966441007
## 268 -2.68 -1.967121062
## 269 -2.77 -2.450328328
## 270 -2.78 -3.303406099
## 271 -2.82 -2.256717032
## 272 -2.92 -4.145055031
## 273 -3.03 -2.939164798
## 274 -3.12 -3.522588040
## 275 -3.16 -1.951175133
## 276 -3.19 -3.282460244
## 277 -3.54 -3.271416451
## 278 -3.54 -2.612759118
## 279 -3.59 -3.605429789
## 280 -3.66 -2.620727432
## 281 -3.68 -1.924573732
## 282 -3.75 -4.004126140
## 283 -3.76 -3.058300701
## 284 -3.78 -4.147347143
## 285 -3.80 -3.963159969
## 286 -3.80 -5.772243733
## 287 -3.85 -2.254368698
## 288 -3.89 -3.951555216
## 289 -3.95 -4.618859163
## 290 -4.29 -4.867327525
## 291 -4.42 -5.030249065
## 292 -4.48 -3.903435024
## 293 -4.48 -2.226098113
## 294 -4.53 -4.998385380
## 295 -4.63 -4.686001225
## 296 -4.73 -4.639582512
## 297 -4.84 -4.091800836
## 298 -4.89 -3.972934607
## 299 -4.89 -4.716591126
## 300 -5.26 -5.324474783
## 301 -6.09 -4.557018150
## 302 -6.29 -6.652659640
## 303 -6.29 -6.310292077
## 304 -6.89 -6.191608905
## 305 -6.96 -6.431595674
## 306 -7.00 -7.161323096
## 307 -7.05 -7.491386180
## 308 -8.30 -8.525981441
## 309 -8.66 -7.858370455
## 310 -9.03 -8.052868044
## 311 -10.41 -9.426263256
## 312 -7.89 -7.622903239
## 313 -2.32 -1.836654177
## 314 0.39 -3.006531686
## 315 -2.90 -5.038923278
## 316 -2.47 -5.208029998
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(PCR_Test_Metrics <- postResample(PCR_Test[,2], PCR_Test[,1]))
## RMSE Rsquared MAE
## 0.8448324 0.8351614 0.6337370
(PCR_Test_RMSE <- PCR_Test_Metrics[1])
## RMSE
## 0.8448324
(PCR_Test_Rsquared <- PCR_Test_Metrics[2])
## Rsquared
## 0.8351614
1.5.6 Partial Least Squares (PLS)
[A] The partial least squares model from the pls package was implemented through the caret package.
[B] The model contains 1 hyperparameter:
[B.1] ncomp = number of components, varied across integer values from 1 to 35
[C] The cross-validated performance of the final model is summarized as follows:
[C.1] Final model configuration involves ncomp=16
[C.2] Root-Mean-Square Error = 0.64404
[C.3] R-Squared = 0.89921
[D] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows (a minimal extraction sketch follows this summary):
[D.1] MolWeight variable (numeric)
[D.2] NumCarbon variable (numeric)
[D.3] NumMultBonds variable (numeric)
[D.4] NumChlorine variable (numeric)
[D.5] NumHalogen variable (numeric)
[E] The independent test performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.76473
[E.2] R-Squared = 0.86706
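As a usage note, the ranked predictors listed under [D] can be pulled directly out of caret's importance object; a minimal sketch, assuming the Overall column that caret produces for regression models:
# Sort the scaled importance scores and keep the top five predictors
PLS_Imp <- varImp(PLS_Tune, scale = TRUE)$importance
head(PLS_Imp[order(-PLS_Imp$Overall), , drop = FALSE], 5)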
##################################
# Creating a local object
# for the train set
##################################
PMA_PreModelling_Train_PLS <- PMA_PreModelling_Train
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_PLS$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
PLS_Grid = expand.grid(ncomp = 1:35)
##################################
# Running the partial least squares model
# by setting the caret method to 'pls'
##################################
set.seed(12345678)
PLS_Tune <- train(x = PMA_PreModelling_Train_PLS[,!names(PMA_PreModelling_Train_PLS) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_PLS$Log_Solubility,
method = "pls",
tuneGrid = PLS_Grid,
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
PLS_Tune
## Partial Least Squares
##
## 951 samples
## 220 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.1257315 0.6941062 0.8684986
## 2 0.9541662 0.7835253 0.7540264
## 3 0.8676624 0.8221432 0.6810922
## 4 0.7921963 0.8491996 0.6109496
## 5 0.7436770 0.8677605 0.5784938
## 6 0.7023456 0.8816418 0.5418892
## 7 0.6895688 0.8852670 0.5314509
## 8 0.6793308 0.8880567 0.5174052
## 9 0.6711184 0.8907599 0.5076668
## 10 0.6646817 0.8930123 0.5044063
## 11 0.6569810 0.8951394 0.5041779
## 12 0.6527074 0.8965189 0.4971062
## 13 0.6490005 0.8976341 0.4989624
## 14 0.6459830 0.8983443 0.4957764
## 15 0.6444142 0.8990045 0.4914714
## 16 0.6440406 0.8992123 0.4908844
## 17 0.6455088 0.8988869 0.4922801
## 18 0.6485785 0.8979029 0.4952112
## 19 0.6526084 0.8968343 0.4974662
## 20 0.6544471 0.8963104 0.4997403
## 21 0.6562105 0.8955441 0.5010076
## 22 0.6566659 0.8954656 0.4994650
## 23 0.6578121 0.8952035 0.5008882
## 24 0.6601307 0.8946705 0.5025501
## 25 0.6629940 0.8936550 0.5035348
## 26 0.6656543 0.8927872 0.5051810
## 27 0.6662820 0.8925428 0.5053838
## 28 0.6684003 0.8919317 0.5061196
## 29 0.6719706 0.8908891 0.5075432
## 30 0.6732406 0.8905865 0.5087067
## 31 0.6743116 0.8902882 0.5089104
## 32 0.6757629 0.8898430 0.5095610
## 33 0.6763937 0.8897242 0.5095410
## 34 0.6782967 0.8891499 0.5096571
## 35 0.6795773 0.8887826 0.5098305
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 16.
## Partial least squares regression , fitted with the orthogonal scores algorithm.
## Call:
## plsr(formula = .outcome ~ ., ncomp = ncomp, data = dat, method = "oscorespls")
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 1.1257315 0.6941062 0.8684986 0.08785965 0.06920539 0.06592163
## 2 2 0.9541662 0.7835253 0.7540264 0.08585840 0.03093668 0.07734570
## 3 3 0.8676624 0.8221432 0.6810922 0.07131188 0.02840465 0.06528264
## 4 4 0.7921963 0.8491996 0.6109496 0.06551658 0.02829300 0.05655513
## 5 5 0.7436770 0.8677605 0.5784938 0.06205082 0.02270654 0.04622536
## 6 6 0.7023456 0.8816418 0.5418892 0.05533813 0.01983644 0.04436123
## 7 7 0.6895688 0.8852670 0.5314509 0.05168219 0.02098024 0.04557589
## 8 8 0.6793308 0.8880567 0.5174052 0.04862126 0.02287536 0.04283374
## 9 9 0.6711184 0.8907599 0.5076668 0.04887598 0.02256537 0.04288068
## 10 10 0.6646817 0.8930123 0.5044063 0.05256807 0.02149819 0.04451135
## 11 11 0.6569810 0.8951394 0.5041779 0.05600095 0.02217211 0.04357013
## 12 12 0.6527074 0.8965189 0.4971062 0.05628616 0.02180750 0.04248493
## 13 13 0.6490005 0.8976341 0.4989624 0.05179806 0.02023778 0.03802353
## 14 14 0.6459830 0.8983443 0.4957764 0.05257732 0.02157699 0.03864626
## 15 15 0.6444142 0.8990045 0.4914714 0.05353578 0.02210539 0.03907895
## 16 16 0.6440406 0.8992123 0.4908844 0.05262752 0.02252339 0.03717169
## 17 17 0.6455088 0.8988869 0.4922801 0.05442467 0.02280276 0.03862195
## 18 18 0.6485785 0.8979029 0.4952112 0.05175533 0.02231314 0.04019243
## 19 19 0.6526084 0.8968343 0.4974662 0.05203807 0.02266594 0.03979290
## 20 20 0.6544471 0.8963104 0.4997403 0.05135115 0.02278692 0.03930285
## 21 21 0.6562105 0.8955441 0.5010076 0.04871939 0.02355241 0.03845644
## 22 22 0.6566659 0.8954656 0.4994650 0.04830068 0.02328930 0.03860606
## 23 23 0.6578121 0.8952035 0.5008882 0.04807952 0.02302174 0.03898917
## 24 24 0.6601307 0.8946705 0.5025501 0.04699277 0.02251990 0.03689860
## 25 25 0.6629940 0.8936550 0.5035348 0.04584397 0.02342117 0.03614755
## 26 26 0.6656543 0.8927872 0.5051810 0.04439360 0.02383698 0.03507013
## 27 27 0.6662820 0.8925428 0.5053838 0.04543114 0.02436476 0.03742399
## 28 28 0.6684003 0.8919317 0.5061196 0.04281628 0.02448916 0.03635414
## 29 29 0.6719706 0.8908891 0.5075432 0.04376714 0.02457799 0.03647787
## 30 30 0.6732406 0.8905865 0.5087067 0.04412259 0.02433861 0.03722085
## 31 31 0.6743116 0.8902882 0.5089104 0.04473642 0.02446474 0.03718359
## 32 32 0.6757629 0.8898430 0.5095610 0.04642356 0.02453491 0.03931718
## 33 33 0.6763937 0.8897242 0.5095410 0.04587084 0.02431574 0.03883305
## 34 34 0.6782967 0.8891499 0.5096571 0.04440711 0.02378521 0.03836758
## 35 35 0.6795773 0.8887826 0.5098305 0.04409193 0.02361950 0.03882448
(PLS_Train_RMSE <- PLS_Tune$results[PLS_Tune$results$ncomp==PLS_Tune$bestTune$ncomp,
c("RMSE")])
## [1] 0.6440406
(PLS_Train_Rsquared <- PLS_Tune$results[PLS_Tune$results$ncomp==PLS_Tune$bestTune$ncomp,
c("Rsquared")])
## [1] 0.8992123
##################################
# Identifying and plotting the
# best model predictors
##################################
PLS_VarImp <- varImp(PLS_Tune, scale = TRUE)
plot(PLS_VarImp,
top=25,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Partial Least Squares",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Independently evaluating the model
# on the test set
##################################
PLS_Test <- data.frame(PLS_Observed = PMA_PreModelling_Test$Log_Solubility,
PLS_Predicted = predict(PLS_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Log_Solubility")]))
PLS_Test
## PLS_Observed PLS_Predicted
## 1 0.93 0.74516623
## 2 0.85 0.32286457
## 3 0.81 -0.68678635
## 4 0.74 0.69592854
## 5 0.61 -0.02063332
## 6 0.58 1.46184911
## 7 0.57 0.44414519
## 8 0.56 0.60745934
## 9 0.52 0.28515046
## 10 0.45 -0.82667942
## 11 0.40 -0.81267451
## 12 0.36 -0.64850163
## 13 0.22 -0.14771117
## 14 0.08 -0.39819158
## 15 0.07 -1.13005975
## 16 0.02 -0.54342796
## 17 0.00 -0.20980394
## 18 -0.01 0.11378822
## 19 -0.07 0.37618124
## 20 -0.12 -0.92441367
## 21 -0.17 0.43228521
## 22 -0.29 0.00811489
## 23 -0.38 -0.77028890
## 24 -0.38 -0.93219690
## 25 -0.39 -0.97794310
## 26 -0.42 -0.77065466
## 27 -0.44 -0.75690855
## 28 -0.46 1.24295736
## 29 -0.48 -2.17274777
## 30 -0.60 -1.30587845
## 31 -0.63 -2.21544824
## 32 -0.66 -0.80024408
## 33 -0.72 -0.62072649
## 34 -0.72 -0.09828913
## 35 -0.80 0.34290909
## 36 -0.80 -1.18886889
## 37 -0.82 0.66221622
## 38 -0.82 -0.62813729
## 39 -0.84 0.20935707
## 40 -0.85 -0.73127603
## 41 -0.85 -0.62471779
## 42 -0.87 -1.74166827
## 43 -0.89 -1.09236575
## 44 -0.90 0.01620119
## 45 -0.96 -1.39335311
## 46 -0.96 -0.74195691
## 47 -0.99 -0.21944765
## 48 -1.01 -0.83784671
## 49 -1.09 -1.04732099
## 50 -1.12 -0.46500547
## 51 -1.14 -0.65423556
## 52 -1.17 -1.70787162
## 53 -1.19 -1.60155096
## 54 -1.22 -1.27737980
## 55 -1.27 -1.90933656
## 56 -1.28 -1.13950216
## 57 -1.32 -1.29914009
## 58 -1.38 -1.36423658
## 59 -1.39 -1.54617665
## 60 -1.42 -1.55490958
## 61 -1.47 -1.02147287
## 62 -1.47 -1.56736833
## 63 -1.50 -0.85767316
## 64 -1.52 -1.28141641
## 65 -1.54 -1.48046202
## 66 -1.55 -2.34552576
## 67 -1.56 -3.01893991
## 68 -1.57 -1.89674619
## 69 -1.60 -1.19042598
## 70 -1.60 -2.30994380
## 71 -1.62 -1.26556707
## 72 -1.64 -2.39106649
## 73 -1.67 -1.76352007
## 74 -1.70 -3.02979969
## 75 -1.70 -2.14120973
## 76 -1.71 -2.30199354
## 77 -1.71 -2.32738510
## 78 -1.75 -1.82784190
## 79 -1.78 -1.67857037
## 80 -1.78 -2.43084787
## 81 -1.82 -1.39535781
## 82 -1.87 -2.06608800
## 83 -1.89 -2.13478150
## 84 -1.92 -1.95828264
## 85 -1.92 -1.43848151
## 86 -1.92 -1.40018560
## 87 -1.94 -3.15152921
## 88 -1.99 -2.58836079
## 89 -2.00 -2.43350116
## 90 -2.05 -2.16143503
## 91 -2.06 -1.43677439
## 92 -2.08 -2.29766191
## 93 -2.10 -2.48008812
## 94 -2.11 -1.24175529
## 95 -2.12 -0.57595713
## 96 -2.17 -2.11092901
## 97 -2.21 -1.83978590
## 98 -2.24 -2.75462929
## 99 -2.24 -1.53929965
## 100 -2.29 -2.26545288
## 101 -2.31 -2.41770577
## 102 -2.32 -2.13386632
## 103 -2.35 -2.77068700
## 104 -2.35 -2.27630694
## 105 -2.36 -2.53423634
## 106 -2.36 -1.91198678
## 107 -2.38 -2.41441602
## 108 -2.42 -2.53089810
## 109 -2.43 -3.33749556
## 110 -2.44 -3.19036414
## 111 -2.52 -2.27545717
## 112 -2.53 -3.04173664
## 113 -2.57 -3.08080431
## 114 -2.62 -2.80366660
## 115 -2.62 -2.48223715
## 116 -2.64 -3.50451066
## 117 -2.64 -3.61872227
## 118 -2.70 -2.53747690
## 119 -2.82 -2.61439893
## 120 -2.88 -2.81441783
## 121 -2.89 -2.17109119
## 122 -2.92 -1.22392862
## 123 -2.93 -3.39327798
## 124 -2.96 -2.64427695
## 125 -2.98 -2.21989078
## 126 -3.01 -2.56257240
## 127 -3.01 -3.48867159
## 128 -3.02 -3.55059224
## 129 -3.07 -3.35503650
## 130 -3.09 -2.93492570
## 131 -3.11 -3.00807576
## 132 -3.13 -3.65770226
## 133 -3.14 -2.00219329
## 134 -3.15 -3.56664342
## 135 -3.22 -2.66783119
## 136 -3.26 -3.34287167
## 137 -3.27 -2.88657518
## 138 -3.27 -2.90118907
## 139 -3.30 -3.05460695
## 140 -3.31 -2.36959803
## 141 -3.33 -2.25720169
## 142 -3.37 -2.16115690
## 143 -3.43 -3.34354398
## 144 -3.43 -2.38472019
## 145 -3.48 -2.86222931
## 146 -3.51 -3.51097380
## 147 -3.59 -1.90045149
## 148 -3.61 -2.64203809
## 149 -3.63 -3.51315051
## 150 -3.63 -3.44342931
## 151 -3.68 -1.86708149
## 152 -3.71 -3.76968428
## 153 -3.74 -2.30299100
## 154 -3.75 -3.64762358
## 155 -3.75 -2.62428090
## 156 -3.77 -3.13696015
## 157 -3.77 -4.22015240
## 158 -3.78 -5.69040930
## 159 -3.81 -3.67272006
## 160 -3.95 -4.42082593
## 161 -3.96 -5.36110080
## 162 -3.96 -4.18819513
## 163 -4.00 -3.57001838
## 164 -4.02 -5.01716681
## 165 -4.04 -4.31607551
## 166 -4.12 -3.53587473
## 167 -4.15 -4.93376854
## 168 -4.16 -3.64282935
## 169 -4.17 -4.48596196
## 170 -4.21 -4.62731700
## 171 -4.23 -4.37316805
## 172 -4.25 -3.45146883
## 173 -4.30 -3.52342335
## 174 -4.31 -5.70580727
## 175 -4.35 -4.64309157
## 176 -4.40 -4.14102773
## 177 -4.40 -4.09464184
## 178 -4.43 -4.69360246
## 179 -4.46 -4.53047986
## 180 -4.47 -3.24594717
## 181 -4.51 -5.10145111
## 182 -4.60 -3.59325502
## 183 -4.64 -4.64792161
## 184 -4.69 -4.94079150
## 185 -4.71 -4.02841192
## 186 -4.77 -3.40564565
## 187 -4.95 -4.62714984
## 188 -4.98 -3.72037252
## 189 -5.21 -5.69086767
## 190 -5.22 -5.71349252
## 191 -5.28 -4.32241689
## 192 -5.31 -2.97633559
## 193 -5.35 -4.73049637
## 194 -5.37 -5.01320884
## 195 -5.40 -4.61985993
## 196 -5.43 -4.49627474
## 197 -5.65 -5.49894785
## 198 -5.66 -4.31998082
## 199 -6.70 -5.03126968
## 200 -5.72 -4.99345717
## 201 -6.00 -7.44493866
## 202 -6.25 -6.38632161
## 203 -6.26 -6.30658011
## 204 -6.27 -6.61828828
## 205 -6.35 -5.78215846
## 206 -6.57 -6.04996054
## 207 -6.62 -5.08427265
## 208 -6.96 -6.11228466
## 209 -7.02 -7.73073818
## 210 -7.20 -7.14859732
## 211 -7.28 -7.08864886
## 212 -7.32 -7.35627710
## 213 -7.39 -7.69737633
## 214 -7.82 -8.21120559
## 215 -8.23 -9.28917253
## 216 -8.94 -8.34390493
## 217 1.07 -0.18673697
## 218 0.43 0.21920251
## 219 0.32 -0.80294519
## 220 0.00 0.25049656
## 221 -0.40 -0.82164221
## 222 -0.52 -0.48163966
## 223 -0.55 -0.55600576
## 224 -0.60 -0.76670069
## 225 -0.62 -2.48616925
## 226 -0.85 -1.38484047
## 227 -0.89 -0.61468811
## 228 -0.93 -0.75031525
## 229 -0.96 -0.26052538
## 230 -1.06 -1.95631799
## 231 -1.10 -1.70410907
## 232 -1.12 -0.99095724
## 233 -1.15 -0.72691439
## 234 -1.28 -0.44749129
## 235 -1.30 -1.82920071
## 236 -1.31 -1.40072052
## 237 -1.35 -2.84117775
## 238 -1.39 -2.32994756
## 239 -1.41 -1.60155096
## 240 -1.41 -1.35598735
## 241 -1.42 -0.55969822
## 242 -1.46 -2.03712578
## 243 -1.50 -1.64216074
## 244 -1.50 -1.96940438
## 245 -1.52 -1.53726702
## 246 -1.52 -0.47282000
## 247 -1.59 -1.48907422
## 248 -1.61 -0.73245535
## 249 -1.63 -1.32416935
## 250 -1.71 -2.46798249
## 251 -1.83 -2.17059291
## 252 -2.05 -1.64450839
## 253 -2.06 -2.34123762
## 254 -2.07 -3.32784866
## 255 -2.15 -2.52800080
## 256 -2.16 -0.90783766
## 257 -1.99 0.54163534
## 258 -2.36 -1.84641829
## 259 -2.38 -3.81334635
## 260 -2.39 -1.70383078
## 261 -2.46 -2.18438819
## 262 -2.49 -2.31226148
## 263 -2.54 -2.98101664
## 264 -2.55 -2.88699504
## 265 -2.63 -2.33250034
## 266 -2.64 -1.50217898
## 267 -2.67 -2.66499236
## 268 -2.68 -2.15374558
## 269 -2.77 -2.56856285
## 270 -2.78 -2.98994252
## 271 -2.82 -2.76635710
## 272 -2.92 -3.59929134
## 273 -3.03 -3.45320339
## 274 -3.12 -3.35485037
## 275 -3.16 -3.29895149
## 276 -3.19 -3.21661282
## 277 -3.54 -3.62961130
## 278 -3.54 -2.34156420
## 279 -3.59 -3.79258018
## 280 -3.66 -3.13048096
## 281 -3.68 -2.35661151
## 282 -3.75 -3.87439692
## 283 -3.76 -3.94072391
## 284 -3.78 -4.10197539
## 285 -3.80 -4.12056709
## 286 -3.80 -4.27716205
## 287 -3.85 -3.38186479
## 288 -3.89 -4.06169949
## 289 -3.95 -4.21471603
## 290 -4.29 -4.77518396
## 291 -4.42 -4.53990613
## 292 -4.48 -4.44961229
## 293 -4.48 -3.43841877
## 294 -4.53 -4.95817112
## 295 -4.63 -4.35759291
## 296 -4.73 -4.05065023
## 297 -4.84 -4.22520600
## 298 -4.89 -3.85329860
## 299 -4.89 -4.80375460
## 300 -5.26 -5.63390877
## 301 -6.09 -4.92519297
## 302 -6.29 -5.80307805
## 303 -6.29 -6.03815212
## 304 -6.89 -5.48895866
## 305 -6.96 -6.73734679
## 306 -7.00 -6.79982974
## 307 -7.05 -7.61763483
## 308 -8.30 -8.71246554
## 309 -8.66 -9.04631273
## 310 -9.03 -9.68264256
## 311 -10.41 -9.78744938
## 312 -7.89 -7.32881571
## 313 -2.32 -1.67527618
## 314 0.39 -2.65782329
## 315 -2.90 -5.22722039
## 316 -2.47 -5.38039716
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(PLS_Test_Metrics <- postResample(PLS_Test[,2], PLS_Test[,1]))
## RMSE Rsquared MAE
## 0.7647343 0.8670618 0.5743195
(PLS_Test_RMSE <- PLS_Test_Metrics[1])
## RMSE
## 0.7647343
(PLS_Test_Rsquared <- PLS_Test_Metrics[2])
## Rsquared
## 0.8670618
1.5.7 Averaged Neural Network (AVNN)
[A] The averaged neural network model from the nnet package was implemented through the caret package.
[B] The model contains 3 hyperparameters:
[B.1] size = number of hidden units, varied across values from 1 to 13 (a network weight-count check follows this summary)
[B.2] decay = weight decay, varied across values from 0.00 to 0.10
[B.3] bag = bagging, held constant at a value of FALSE
[C] The cross-validated performance of the final model is summarized as follows:
[C.1] Final model configuration involves size=13, decay=0.01 and bag=FALSE
[C.2] Root-Mean-Square Error = 1.06088
[C.3] R-Squared = 0.74802
[D] The model does not allow for ranking of predictors in terms of variable importance.
[E] The independent test performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.98624
[E.2] R-Squared = 0.78293
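Because avNNet averages single-hidden-layer nnet fits, the MaxNWts argument in the training call below must cover the largest candidate network: a network with p inputs, H hidden units and one linear output carries H*(p+1) hidden-layer weights (including biases) plus (H+1) output-layer weights. A quick check with the values from this section:
p <- 220; H <- 13              # predictors and hidden units of the final model
H * (p + 1) + (H + 1)          # total weight count
## [1] 2887
# matching the "220-13-1 network with 2887 weights" reported further below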
##################################
# Transforming factor predictors
# as required by the nature of the model
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_AVNN <- as.data.frame(lapply(PMA_PreModelling_Train, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Train_AVNN)
## [1] 951 221
PMA_PreModelling_Test_AVNN <- as.data.frame(lapply(PMA_PreModelling_Test, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Test_AVNN)
## [1] 316 221
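# (nnet requires a purely numeric design matrix, so the binary factor
# fingerprints are recoded to 0/1 numerics above before being passed to
# the centering and scaling pre-processing.)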
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_AVNN$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
AVNN_Grid = expand.grid(decay = c(0.00, 0.01, 0.10),
size = c(1, 5, 9, 13),
bag = FALSE)
maxSize <- max(AVNN_Grid$size)
##################################
# Running the averaged neural network model
# by setting the caret method to 'avNNet'
##################################
set.seed(12345678)
AVNN_Tune <- train(x = PMA_PreModelling_Train_AVNN[,!names(PMA_PreModelling_Train_AVNN) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_AVNN$Log_Solubility,
method = "avNNet",
tuneGrid = AVNN_Grid,
trControl = KFold_Control,
preProc = c("center", "scale"),
linout = TRUE,
trace = FALSE,
MaxNWts = maxSize * ((ncol(PMA_PreModelling_Train_AVNN)-1) + 1) + maxSize + 1,
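# MaxNWts above raises nnet's cap on total network weights so the largest
# size in the grid fits; note that maxit = 5 below keeps each network's
# training very short (nnet's default is 100 iterations), which likely
# trades some fit quality for run time.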
maxit = 5,
allowParallel = FALSE)
##################################
# Reporting the cross-validation results
# for the train set
##################################
AVNN_Tune
## Model Averaged Neural Network
##
## 951 samples
## 220 predictors
##
## Pre-processing: centered (220), scaled (220)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 1.454694 0.5172018 1.1109574
## 0.00 5 1.347390 0.6971736 1.0964081
## 0.00 9 1.792702 0.6637279 1.4885554
## 0.00 13 1.088251 0.7302139 0.8277263
## 0.01 1 1.448632 0.5272961 1.1222274
## 0.01 5 1.315333 0.7115074 1.0725117
## 0.01 9 1.777012 0.6453049 1.4544647
## 0.01 13 1.060884 0.7480237 0.8047786
## 0.10 1 1.478537 0.5099087 1.1292415
## 0.10 5 1.308401 0.6962179 1.0545170
## 0.10 9 1.807650 0.6393351 1.4786651
## 0.10 13 1.115552 0.7253052 0.8510722
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 13, decay = 0.01 and bag
## = FALSE.
## Model Averaged Neural Network with 5 Repeats
##
## a 220-13-1 network with 2887 weights
## options were - linear output units decay=0.01
## decay size   bag     RMSE  Rsquared       MAE     RMSESD RsquaredSD      MAESD
## 1   0.00    1 FALSE 1.454694 0.5172018 1.1109574 0.12683175 0.08309555 0.08554343
## 5   0.01    1 FALSE 1.448632 0.5272961 1.1222274 0.17580426 0.06594086 0.13029101
## 9   0.10    1 FALSE 1.478537 0.5099087 1.1292415 0.11601214 0.08200610 0.08595082
## 2   0.00    5 FALSE 1.347390 0.6971736 1.0964081 0.12991586 0.04513138 0.11431945
## 6   0.01    5 FALSE 1.315333 0.7115074 1.0725117 0.13161757 0.04596683 0.10837426
## 10  0.10    5 FALSE 1.308401 0.6962179 1.0545170 0.09774587 0.03100244 0.09158530
## 3   0.00    9 FALSE 1.792702 0.6637279 1.4885554 0.22327568 0.06591481 0.21618322
## 7   0.01    9 FALSE 1.777012 0.6453049 1.4544647 0.22293972 0.07472578 0.21689288
## 11  0.10    9 FALSE 1.807650 0.6393351 1.4786651 0.25501136 0.04410773 0.23327268
## 4   0.00   13 FALSE 1.088251 0.7302139 0.8277263 0.13964203 0.03951261 0.09673945
## 8   0.01   13 FALSE 1.060884 0.7480237 0.8047786 0.10299708 0.02500384 0.08015089
## 12  0.10   13 FALSE 1.115552 0.7253052 0.8510722 0.12135148 0.02874047 0.07083277
(AVNN_Train_RMSE <- AVNN_Tune$results[AVNN_Tune$results$decay==AVNN_Tune$bestTune$decay &
AVNN_Tune$results$size==AVNN_Tune$bestTune$size,
c("RMSE")])
## [1] 1.060884
(AVNN_Train_Rsquared <- AVNN_Tune$results[AVNN_Tune$results$decay==AVNN_Tune$bestTune$decay &
AVNN_Tune$results$size==AVNN_Tune$bestTune$size,
c("Rsquared")])
## [1] 0.7480237
##################################
# Identifying and plotting the
# best model predictors
##################################
# model does not support variable importance measurement
##################################
# Independently evaluating the model
# on the test set
##################################
AVNN_Test <- data.frame(AVNN_Observed = PMA_PreModelling_Test$Log_Solubility,
AVNN_Predicted = predict(AVNN_Tune,
PMA_PreModelling_Test_AVNN[,!names(PMA_PreModelling_Test_AVNN) %in% c("Log_Solubility")]))
AVNN_Test
## AVNN_Observed AVNN_Predicted
## 1 0.93 0.00835390
## 2 0.85 0.78571273
## 3 0.81 -0.51048960
## 4 0.74 -0.61831608
## 5 0.61 0.68492837
## 6 0.58 0.58222525
## 7 0.57 0.11210291
## 8 0.56 0.12698662
## 9 0.52 -0.12960828
## 10 0.45 -0.49082527
## 11 0.40 -0.40533715
## 12 0.36 -1.13928008
## 13 0.22 0.22318452
## 14 0.08 -0.70202613
## 15 0.07 -1.20477469
## 16 0.02 -1.03719974
## 17 0.00 -0.62075796
## 18 -0.01 -0.49328897
## 19 -0.07 0.66286832
## 20 -0.12 -0.69904157
## 21 -0.17 0.09073153
## 22 -0.29 -1.35399935
## 23 -0.38 -1.03811354
## 24 -0.38 -0.82194203
## 25 -0.39 -0.17005757
## 26 -0.42 -1.05297811
## 27 -0.44 -1.02337861
## 28 -0.46 0.38382574
## 29 -0.48 -2.09529708
## 30 -0.60 -1.24206210
## 31 -0.63 -2.60578058
## 32 -0.66 -0.38872809
## 33 -0.72 -0.77382880
## 34 -0.72 -0.43385534
## 35 -0.80 0.04545028
## 36 -0.80 -0.14925624
## 37 -0.82 0.23297104
## 38 -0.82 -0.80013641
## 39 -0.84 -0.27710142
## 40 -0.85 -1.70345814
## 41 -0.85 -0.76783426
## 42 -0.87 -3.02947397
## 43 -0.89 -1.51468908
## 44 -0.90 1.12761796
## 45 -0.96 -1.11731956
## 46 -0.96 -0.85399216
## 47 -0.99 -1.39298469
## 48 -1.01 -0.72232209
## 49 -1.09 -1.19918137
## 50 -1.12 0.18221554
## 51 -1.14 0.51859602
## 52 -1.17 -1.33064481
## 53 -1.19 -0.83994275
## 54 -1.22 0.08354657
## 55 -1.27 -1.27824935
## 56 -1.28 -1.09442111
## 57 -1.32 -1.31255854
## 58 -1.38 -1.08411707
## 59 -1.39 -1.19379705
## 60 -1.42 -2.04316995
## 61 -1.47 -0.56252323
## 62 -1.47 -1.60297815
## 63 -1.50 -2.80359125
## 64 -1.52 -0.87796640
## 65 -1.54 -0.96890154
## 66 -1.55 -3.15772190
## 67 -1.56 -1.31225645
## 68 -1.57 -2.04907753
## 69 -1.60 -1.89059589
## 70 -1.60 -2.98244610
## 71 -1.62 -2.29821948
## 72 -1.64 -2.69202325
## 73 -1.67 -2.38916972
## 74 -1.70 -3.22556202
## 75 -1.70 -1.61334157
## 76 -1.71 -2.05994845
## 77 -1.71 -2.19948977
## 78 -1.75 -2.13743135
## 79 -1.78 -0.79153623
## 80 -1.78 -1.64050145
## 81 -1.82 -0.51074680
## 82 -1.87 -2.07706563
## 83 -1.89 -2.50249476
## 84 -1.92 -1.72910335
## 85 -1.92 -1.55732946
## 86 -1.92 -1.03687112
## 87 -1.94 -3.62249029
## 88 -1.99 -2.18226488
## 89 -2.00 -1.98415246
## 90 -2.05 -3.52952353
## 91 -2.06 -2.46322259
## 92 -2.08 -1.53820408
## 93 -2.10 -3.07132848
## 94 -2.11 -1.97194029
## 95 -2.12 -0.68008252
## 96 -2.17 -1.10143571
## 97 -2.21 -1.72757518
## 98 -2.24 -2.23971064
## 99 -2.24 -0.70717337
## 100 -2.29 -1.67267736
## 101 -2.31 -1.95867771
## 102 -2.32 -1.91660905
## 103 -2.35 -2.47109392
## 104 -2.35 -0.94056709
## 105 -2.36 -2.24959670
## 106 -2.36 -1.44652998
## 107 -2.38 -3.02820191
## 108 -2.42 -3.78064848
## 109 -2.43 -2.44915089
## 110 -2.44 -2.80783086
## 111 -2.52 -2.43404320
## 112 -2.53 -1.91786300
## 113 -2.57 -2.15716850
## 114 -2.62 -2.59147642
## 115 -2.62 -4.28861584
## 116 -2.64 -2.44080054
## 117 -2.64 -2.82150146
## 118 -2.70 -0.96438386
## 119 -2.82 -3.07901330
## 120 -2.88 -3.62774695
## 121 -2.89 -3.14358987
## 122 -2.92 -0.76696761
## 123 -2.93 -2.04099107
## 124 -2.96 -2.36877417
## 125 -2.98 -3.36636463
## 126 -3.01 -2.25293743
## 127 -3.01 -3.06256241
## 128 -3.02 -1.84133063
## 129 -3.07 -3.07513117
## 130 -3.09 -3.80076282
## 131 -3.11 -2.38144290
## 132 -3.13 -3.63954367
## 133 -3.14 -2.61244568
## 134 -3.15 -2.98572459
## 135 -3.22 -1.72694956
## 136 -3.26 -4.26707540
## 137 -3.27 -2.90162584
## 138 -3.27 -2.23245869
## 139 -3.30 -2.21278855
## 140 -3.31 -2.61904022
## 141 -3.33 -1.55138367
## 142 -3.37 -2.76559371
## 143 -3.43 -4.41158035
## 144 -3.43 -2.85181196
## 145 -3.48 -3.20631535
## 146 -3.51 -4.08806369
## 147 -3.59 -2.41230699
## 148 -3.61 -1.72312668
## 149 -3.63 -2.81197400
## 150 -3.63 -3.93062803
## 151 -3.68 -1.49254880
## 152 -3.71 -3.60868789
## 153 -3.74 -2.29882160
## 154 -3.75 -2.74608985
## 155 -3.75 -2.96813956
## 156 -3.77 -4.70066581
## 157 -3.77 -4.75519430
## 158 -3.78 -5.00876028
## 159 -3.81 -3.37272557
## 160 -3.95 -3.70714823
## 161 -3.96 -5.86251786
## 162 -3.96 -4.55209739
## 163 -4.00 -2.58911154
## 164 -4.02 -2.51963442
## 165 -4.04 -4.50543716
## 166 -4.12 -2.79170717
## 167 -4.15 -4.99284968
## 168 -4.16 -2.99591311
## 169 -4.17 -3.84198527
## 170 -4.21 -4.56846868
## 171 -4.23 -3.93101395
## 172 -4.25 -3.35808714
## 173 -4.30 -3.78313824
## 174 -4.31 -3.74132143
## 175 -4.35 -5.79697188
## 176 -4.40 -3.92745774
## 177 -4.40 -4.85531112
## 178 -4.43 -4.51974115
## 179 -4.46 -4.57412102
## 180 -4.47 -2.58763321
## 181 -4.51 -4.36878843
## 182 -4.60 -4.70521062
## 183 -4.64 -4.31725299
## 184 -4.69 -4.03858514
## 185 -4.71 -4.14580224
## 186 -4.77 -4.61006469
## 187 -4.95 -2.71045438
## 188 -4.98 -5.08058226
## 189 -5.21 -5.98762635
## 190 -5.22 -5.22465904
## 191 -5.28 -4.50680156
## 192 -5.31 -3.71066709
## 193 -5.35 -4.60543848
## 194 -5.37 -4.54512108
## 195 -5.40 -4.88559019
## 196 -5.43 -4.31730992
## 197 -5.65 -5.16448836
## 198 -5.66 -4.33687084
## 199 -6.70 -4.54416493
## 200 -5.72 -5.23912914
## 201 -6.00 -6.16503175
## 202 -6.25 -6.08588672
## 203 -6.26 -6.08485674
## 204 -6.27 -6.26339342
## 205 -6.35 -5.58991667
## 206 -6.57 -5.40606700
## 207 -6.62 -5.19018394
## 208 -6.96 -5.22897476
## 209 -7.02 -5.65820122
## 210 -7.20 -5.66316999
## 211 -7.28 -6.33206179
## 212 -7.32 -6.11924582
## 213 -7.39 -6.30081062
## 214 -7.82 -6.32123413
## 215 -8.23 -6.29083688
## 216 -8.94 -6.28434163
## 217 1.07 0.23295450
## 218 0.43 0.28706554
## 219 0.32 0.43424401
## 220 0.00 -0.54398087
## 221 -0.40 -0.98778263
## 222 -0.52 0.06707668
## 223 -0.55 -0.49434335
## 224 -0.60 -1.19571555
## 225 -0.62 -1.61603359
## 226 -0.85 -1.46371481
## 227 -0.89 -1.24752620
## 228 -0.93 -0.23322076
## 229 -0.96 0.49310055
## 230 -1.06 -2.90980832
## 231 -1.10 -1.43823249
## 232 -1.12 -1.05774573
## 233 -1.15 -0.94895261
## 234 -1.28 -0.24414870
## 235 -1.30 -1.69181220
## 236 -1.31 -1.15680076
## 237 -1.35 -1.34345376
## 238 -1.39 -2.43615548
## 239 -1.41 -0.83994275
## 240 -1.41 -0.62713819
## 241 -1.42 -0.42126656
## 242 -1.46 -0.92502808
## 243 -1.50 -1.54331764
## 244 -1.50 -2.53425009
## 245 -1.52 -3.28944231
## 246 -1.52 -0.75450253
## 247 -1.59 -1.39755119
## 248 -1.61 -0.80812713
## 249 -1.63 -1.08335055
## 250 -1.71 -1.69518803
## 251 -1.83 -2.44577086
## 252 -2.05 -0.71095293
## 253 -2.06 -1.88001256
## 254 -2.07 -2.68440533
## 255 -2.15 -1.55437927
## 256 -2.16 -2.42083797
## 257 -1.99 -1.32568853
## 258 -2.36 -1.85484082
## 259 -2.38 -3.97836785
## 260 -2.39 -0.19994597
## 261 -2.46 -2.51584092
## 262 -2.49 -2.81503326
## 263 -2.54 -2.08462296
## 264 -2.55 -2.82351411
## 265 -2.63 -3.31583908
## 266 -2.64 -2.52634719
## 267 -2.67 -2.89598839
## 268 -2.68 -0.56201302
## 269 -2.77 -1.92602730
## 270 -2.78 -2.85209098
## 271 -2.82 -2.13056602
## 272 -2.92 -3.77351396
## 273 -3.03 -3.70074705
## 274 -3.12 -3.86003705
## 275 -3.16 -1.63053815
## 276 -3.19 -4.36452441
## 277 -3.54 -3.67372725
## 278 -3.54 -2.11935050
## 279 -3.59 -3.77519251
## 280 -3.66 -2.76360588
## 281 -3.68 -1.80986450
## 282 -3.75 -4.12951274
## 283 -3.76 -3.94918060
## 284 -3.78 -3.87609352
## 285 -3.80 -3.83140382
## 286 -3.80 -4.76717737
## 287 -3.85 -3.06788830
## 288 -3.89 -3.19657712
## 289 -3.95 -3.81279846
## 290 -4.29 -5.56382993
## 291 -4.42 -3.61559076
## 292 -4.48 -4.45091217
## 293 -4.48 -3.07783558
## 294 -4.53 -4.67087856
## 295 -4.63 -4.45586703
## 296 -4.73 -4.10678904
## 297 -4.84 -3.82555739
## 298 -4.89 -4.27538138
## 299 -4.89 -4.90839493
## 300 -5.26 -5.77900486
## 301 -6.09 -5.16890905
## 302 -6.29 -5.49249058
## 303 -6.29 -5.97719748
## 304 -6.89 -4.84655705
## 305 -6.96 -5.19380937
## 306 -7.00 -6.17027777
## 307 -7.05 -6.36064957
## 308 -8.30 -6.33323369
## 309 -8.66 -6.17292319
## 310 -9.03 -6.19240516
## 311 -10.41 -6.37723082
## 312 -7.89 -6.21823503
## 313 -2.32 -1.67539765
## 314 0.39 -2.24823570
## 315 -2.90 -4.04380850
## 316 -2.47 -3.52349712
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(AVNN_Test_Metrics <- postResample(AVNN_Test[,2], AVNN_Test[,1]))
## RMSE Rsquared MAE
## 0.9862466 0.7829311 0.7664094
(AVNN_Test_RMSE <- AVNN_Test_Metrics[1])
## RMSE
## 0.9862466
(AVNN_Test_Rsquared <- AVNN_Test_Metrics[2])
## Rsquared
## 0.7829311
1.5.8 Multivariate Adaptive Regression Splines (MARS)
[A] The multivariate adaptive regression splines model from the earth package was implemented through the caret package (a short illustration of MARS hinge terms follows this summary).
[B] The model contains 2 hyperparameters:
[B.1] nprune = maximum number of retained terms, varied across values from 5 to 20
[B.2] degree = maximum product degree, varied across integer values from 1 to 5
[C] The cross-validated performance of the final model is summarized as follows:
[C.1] Final model configuration involves nprune=20 and degree=3
[C.2] Root-Mean-Square Error = 0.70348
[C.3] R-Squared = 0.88230
[D] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows:
[D.1] MolWeight variable (numeric)
[D.2] SurfaceArea2 variable (numeric)
[D.3] NumCarbon variable (numeric)
[D.4] SurfaceArea1 variable (numeric)
[D.5] FP142 (Structure=1) variable (factor)
[E] The independent test performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.75804
[E.2] R-Squared = 0.86891
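For intuition, MARS builds each term from hinge (piecewise-linear) basis functions and, at degree > 1, from products of hinges; a minimal illustration with hypothetical knot values:
h_right <- function(x, c) pmax(0, x - c)   # right hinge: max(0, x - c)
h_left  <- function(x, c) pmax(0, c - x)   # mirrored hinge: max(0, c - x)
# A degree-2 MARS term multiplies two hinges, for example
# h_right(MolWeight, 5.2) * h_left(NumCarbon, 3)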
##################################
# Creating a local object
# for the train set
##################################
PMA_PreModelling_Train_MARS <- PMA_PreModelling_Train
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_MARS$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
MARS_Grid = expand.grid(degree = 1:5, nprune = seq(5, 20, length = 4))
##################################
# Running the multivariate adaptive regression splines model
# by setting the caret method to 'earth'
##################################
set.seed(12345678)
MARS_Tune <- train(x = PMA_PreModelling_Train_MARS[,!names(PMA_PreModelling_Train_MARS) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_MARS$Log_Solubility,
method = "earth",
tuneGrid = MARS_Grid,
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
MARS_Tune
## Multivariate Adaptive Regression Spline
##
## 951 samples
## 220 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 5 0.9885345 0.7679877 0.7535049
## 1 10 0.8378705 0.8331533 0.6482106
## 1 15 0.7724676 0.8571440 0.5925168
## 1 20 0.7502562 0.8647265 0.5766279
## 2 5 1.0173780 0.7495592 0.7685247
## 2 10 0.8369038 0.8332926 0.6483745
## 2 15 0.7536825 0.8642340 0.5753097
## 2 20 0.7104771 0.8789853 0.5399575
## 3 5 1.0054768 0.7579555 0.7661151
## 3 10 0.8394402 0.8318432 0.6384177
## 3 15 0.7409402 0.8693494 0.5689538
## 3 20 0.7034801 0.8823049 0.5417897
## 4 5 1.0036472 0.7577222 0.7660022
## 4 10 0.8420512 0.8317263 0.6388350
## 4 15 0.7358285 0.8720765 0.5599648
## 4 20 0.7096746 0.8802622 0.5419139
## 5 5 1.0036472 0.7577222 0.7660022
## 5 10 0.8420512 0.8317263 0.6388350
## 5 15 0.7358285 0.8720765 0.5599648
## 5 20 0.7096746 0.8802622 0.5419139
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 20 and degree = 3.
## Selected 20 of 54 terms, and 14 of 220 predictors (nprune=20)
## Termination condition: RSq changed by less than 0.001 at 54 terms
## Importance: MolWeight, NumCarbon, SurfaceArea1, NumRotBonds, FP1421, ...
## Number of terms at each degree of interaction: 1 8 10 1
## GCV 0.4295012 RSS 367.8563 GRSq 0.8975708 RSq 0.9075576
## degree nprune RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 5 0.9885345 0.7679877 0.7535049 0.06217594 0.02392790 0.04607850
## 5 2 5 1.0173780 0.7495592 0.7685247 0.05718989 0.06196692 0.05106240
## 9 3 5 1.0054768 0.7579555 0.7661151 0.06779247 0.04314324 0.05183156
## 13 4 5 1.0036472 0.7577222 0.7660022 0.09683141 0.04816271 0.07672106
## 17 5 5 1.0036472 0.7577222 0.7660022 0.09683141 0.04816271 0.07672106
## 2 1 10 0.8378705 0.8331533 0.6482106 0.06679188 0.02230525 0.05226945
## 6 2 10 0.8369038 0.8332926 0.6483745 0.08936033 0.03616057 0.05782491
## 10 3 10 0.8394402 0.8318432 0.6384177 0.07319923 0.02514998 0.05911625
## 14 4 10 0.8420512 0.8317263 0.6388350 0.05710409 0.02418435 0.05010629
## 18 5 10 0.8420512 0.8317263 0.6388350 0.05710409 0.02418435 0.05010629
## 3 1 15 0.7724676 0.8571440 0.5925168 0.05872201 0.02799201 0.05136498
## 7 2 15 0.7536825 0.8642340 0.5753097 0.06302002 0.02684197 0.05073034
## 11 3 15 0.7409402 0.8693494 0.5689538 0.09111186 0.02236058 0.07082359
## 15 4 15 0.7358285 0.8720765 0.5599648 0.07979965 0.02056922 0.06118755
## 19 5 15 0.7358285 0.8720765 0.5599648 0.07979965 0.02056922 0.06118755
## 4 1 20 0.7502562 0.8647265 0.5766279 0.05189451 0.02990680 0.04122031
## 8 2 20 0.7104771 0.8789853 0.5399575 0.06605725 0.02465061 0.04947483
## 12 3 20 0.7034801 0.8823049 0.5417897 0.08445692 0.01978894 0.05903665
## 16 4 20 0.7096746 0.8802622 0.5419139 0.07222965 0.02381439 0.04888709
## 20 5 20 0.7096746 0.8802622 0.5419139 0.07222965 0.02381439 0.04888709
(MARS_Train_RMSE <- MARS_Tune$results[MARS_Tune$results$nprune==MARS_Tune$bestTune$nprune &
MARS_Tune$results$degree==MARS_Tune$bestTune$degree,
c("RMSE")])
## [1] 0.7034801
(MARS_Train_Rsquared <- MARS_Tune$results[MARS_Tune$results$nprune==MARS_Tune$bestTune$nprune &
MARS_Tune$results$degree==MARS_Tune$bestTune$degree,
c("Rsquared")])
## [1] 0.8823049
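A more compact alternative worth noting (assuming a caret version that exports the helper): getTrainPerf() returns the resampled performance of the best tune directly, avoiding the manual row-matching above.
##################################
# Alternative sketch: retrieving the best-tune resampled metrics
# via caret::getTrainPerf() instead of filtering the results table
##################################
getTrainPerf(MARS_Tune)
# expected to return a one-row data frame with TrainRMSE,
# TrainRsquared, TrainMAE and the method name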
##################################
# Identifying and plotting the
# best model predictors
##################################
MARS_VarImp <- varImp(MARS_Tune, scale = TRUE)
plot(MARS_VarImp,
top=25,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Multivariate Adaptive Regression Splines",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Independently evaluating the model
# on the test set
##################################
MARS_Test <- data.frame(MARS_Observed = PMA_PreModelling_Test$Log_Solubility,
MARS_Predicted = predict(MARS_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Log_Solubility")]))
MARS_Test
## MARS_Observed y
## 1 0.93 0.76892387
## 2 0.85 0.14312439
## 3 0.81 -0.34585499
## 4 0.74 -0.33314461
## 5 0.61 -0.45292704
## 6 0.58 0.97274206
## 7 0.57 0.48607461
## 8 0.56 0.73627468
## 9 0.52 -0.03806131
## 10 0.45 -0.14649919
## 11 0.40 -0.14649919
## 12 0.36 -1.79679729
## 13 0.22 -0.41538986
## 14 0.08 -0.40552544
## 15 0.07 -1.17892314
## 16 0.02 -0.49870512
## 17 0.00 -0.59427338
## 18 -0.01 -0.22958003
## 19 -0.07 0.25523921
## 20 -0.12 -0.83910301
## 21 -0.17 0.54520397
## 22 -0.29 0.61508860
## 23 -0.38 -0.91923808
## 24 -0.38 -0.74344480
## 25 -0.39 -0.91923808
## 26 -0.42 -0.74344480
## 27 -0.44 -0.59721214
## 28 -0.46 0.36577945
## 29 -0.48 -2.61004959
## 30 -0.60 -0.65168062
## 31 -0.63 -3.01395024
## 32 -0.66 -0.73554105
## 33 -0.72 -0.73554105
## 34 -0.72 -0.54774008
## 35 -0.80 -0.54774008
## 36 -0.80 -0.99254612
## 37 -0.82 0.09111766
## 38 -0.82 -0.73554105
## 39 -0.84 -0.62663374
## 40 -0.85 -0.80027030
## 41 -0.85 -0.34900714
## 42 -0.87 -1.55934291
## 43 -0.89 -1.37553750
## 44 -0.90 0.17140988
## 45 -0.96 -2.27526868
## 46 -0.96 0.32437683
## 47 -0.99 -0.23488426
## 48 -1.01 -0.73554105
## 49 -1.09 -1.24426320
## 50 -1.12 -1.38447918
## 51 -1.14 0.13754052
## 52 -1.17 -1.13685295
## 53 -1.19 -1.26101447
## 54 -1.22 -1.22800620
## 55 -1.27 -1.50775967
## 56 -1.28 -1.55867097
## 57 -1.32 -1.64040574
## 58 -1.38 -1.89603447
## 59 -1.39 -1.50959168
## 60 -1.42 -2.36130718
## 61 -1.47 -0.96904918
## 62 -1.47 -1.32309870
## 63 -1.50 -0.57828240
## 64 -1.52 -1.36259436
## 65 -1.54 -1.26101447
## 66 -1.55 -2.03224428
## 67 -1.56 -2.11989280
## 68 -1.57 -1.65889136
## 69 -1.60 -1.53401452
## 70 -1.60 -1.59059428
## 71 -1.62 -2.40166859
## 72 -1.64 -2.05873294
## 73 -1.67 -1.46853229
## 74 -1.70 -2.55740066
## 75 -1.70 -1.94955288
## 76 -1.71 -2.02070510
## 77 -1.71 -1.98783184
## 78 -1.75 -1.57432194
## 79 -1.78 -1.27949961
## 80 -1.78 -1.29043367
## 81 -1.82 -0.67092628
## 82 -1.87 -0.99664060
## 83 -1.89 -2.48329490
## 84 -1.92 -1.81942060
## 85 -1.92 -1.04037012
## 86 -1.92 -1.45328259
## 87 -1.94 -2.24827541
## 88 -1.99 -1.79937600
## 89 -2.00 -1.74270366
## 90 -2.05 -1.93979298
## 91 -2.06 -1.75787050
## 92 -2.08 -1.91837147
## 93 -2.10 -2.39028501
## 94 -2.11 -0.91090821
## 95 -2.12 0.60380783
## 96 -2.17 -2.01635355
## 97 -2.21 -2.47326982
## 98 -2.24 -1.94735591
## 99 -2.24 -1.45372800
## 100 -2.29 -2.08876198
## 101 -2.31 -1.50952526
## 102 -2.32 -1.96220573
## 103 -2.35 -2.14600574
## 104 -2.35 -1.61565689
## 105 -2.36 -3.73129268
## 106 -2.36 -1.96986410
## 107 -2.38 -2.85306228
## 108 -2.42 -2.39234978
## 109 -2.43 -3.76412015
## 110 -2.44 -3.24071824
## 111 -2.52 -2.24803538
## 112 -2.53 -2.28315411
## 113 -2.57 -2.24803538
## 114 -2.62 -3.10362983
## 115 -2.62 -3.00158745
## 116 -2.64 -2.96767777
## 117 -2.64 -3.35548368
## 118 -2.70 -4.25241319
## 119 -2.82 -3.45751861
## 120 -2.88 -1.93823961
## 121 -2.89 -2.39963380
## 122 -2.92 -1.39749682
## 123 -2.93 -3.08595702
## 124 -2.96 -2.54235854
## 125 -2.98 -3.93108321
## 126 -3.01 -2.35307133
## 127 -3.01 -4.50500549
## 128 -3.02 -2.51929331
## 129 -3.07 -2.69557749
## 130 -3.09 -2.83001114
## 131 -3.11 -3.25768913
## 132 -3.13 -4.10603182
## 133 -3.14 -1.23622872
## 134 -3.15 -3.84144279
## 135 -3.22 -2.76710780
## 136 -3.26 -2.80002180
## 137 -3.27 -1.98653550
## 138 -3.27 -3.25768913
## 139 -3.30 -3.65416530
## 140 -3.31 -1.60991855
## 141 -3.33 -2.53601483
## 142 -3.37 -2.75739769
## 143 -3.43 -3.39312647
## 144 -3.43 -2.43097739
## 145 -3.48 -3.15509827
## 146 -3.51 -3.62621348
## 147 -3.59 -3.51911679
## 148 -3.61 -3.15196542
## 149 -3.63 -4.03201026
## 150 -3.63 -3.44261322
## 151 -3.68 -1.63449819
## 152 -3.71 -4.50720229
## 153 -3.74 -4.02097476
## 154 -3.75 -3.64465652
## 155 -3.75 -3.88199138
## 156 -3.77 -3.52026993
## 157 -3.77 -4.23892372
## 158 -3.78 -4.45394764
## 159 -3.81 -3.08318538
## 160 -3.95 -4.27149407
## 161 -3.96 -4.59515852
## 162 -3.96 -3.65674380
## 163 -4.00 -3.26110326
## 164 -4.02 -3.78080404
## 165 -4.04 -4.24459971
## 166 -4.12 -3.64465652
## 167 -4.15 -4.19144272
## 168 -4.16 -3.80882138
## 169 -4.17 -5.02012232
## 170 -4.21 -4.76251871
## 171 -4.23 -4.43674661
## 172 -4.25 -5.05130369
## 173 -4.30 -3.87942381
## 174 -4.31 -5.65090762
## 175 -4.35 -4.59463177
## 176 -4.40 -3.75658431
## 177 -4.40 -4.45887269
## 178 -4.43 -4.95174143
## 179 -4.46 -4.45327859
## 180 -4.47 -4.91254849
## 181 -4.51 -4.99182180
## 182 -4.60 -3.62483206
## 183 -4.64 -4.09282596
## 184 -4.69 -5.31534355
## 185 -4.71 -4.05147739
## 186 -4.77 -3.82532960
## 187 -4.95 -4.65121197
## 188 -4.98 -4.37807284
## 189 -5.21 -5.75852554
## 190 -5.22 -5.04248295
## 191 -5.28 -3.78029468
## 192 -5.31 -3.02722452
## 193 -5.35 -5.02512514
## 194 -5.37 -4.90962912
## 195 -5.40 -4.27856689
## 196 -5.43 -4.78357040
## 197 -5.65 -5.69232707
## 198 -5.66 -4.26545272
## 199 -6.70 -5.64662620
## 200 -5.72 -4.88488055
## 201 -6.00 -6.45403152
## 202 -6.25 -6.33246176
## 203 -6.26 -6.33246176
## 204 -6.27 -6.33246176
## 205 -6.35 -5.59520463
## 206 -6.57 -6.07694265
## 207 -6.62 -4.70193613
## 208 -6.96 -5.80074158
## 209 -7.02 -7.72534637
## 210 -7.20 -7.34543990
## 211 -7.28 -6.86581992
## 212 -7.32 -7.36767153
## 213 -7.39 -7.36767153
## 214 -7.82 -8.17260423
## 215 -8.23 -8.11165366
## 216 -8.94 -8.76562128
## 217 1.07 0.14964568
## 218 0.43 0.01165068
## 219 0.32 0.12220789
## 220 0.00 -0.17657276
## 221 -0.40 -0.90002866
## 222 -0.52 -0.57307431
## 223 -0.55 -0.75852623
## 224 -0.60 -0.74062318
## 225 -0.62 -2.83283995
## 226 -0.85 -1.08794655
## 227 -0.89 -1.06917180
## 228 -0.93 -1.38582368
## 229 -0.96 -0.82392306
## 230 -1.06 -1.80039862
## 231 -1.10 -1.39937965
## 232 -1.12 -0.42597283
## 233 -1.15 -1.22800620
## 234 -1.28 -0.27842908
## 235 -1.30 -1.41326697
## 236 -1.31 -1.26101447
## 237 -1.35 -1.37918726
## 238 -1.39 -1.17035624
## 239 -1.41 -1.26101447
## 240 -1.41 -0.67092628
## 241 -1.42 -0.97892080
## 242 -1.46 -2.31873453
## 243 -1.50 -1.72223274
## 244 -1.50 -1.74317544
## 245 -1.52 -1.60327296
## 246 -1.52 -0.59809889
## 247 -1.59 -1.72955934
## 248 -1.61 -1.43551515
## 249 -1.63 -1.42629212
## 250 -1.71 -2.03463763
## 251 -1.83 -2.90210899
## 252 -2.05 -0.94393986
## 253 -2.06 -2.04450697
## 254 -2.07 -3.48517443
## 255 -2.15 -2.40954067
## 256 -2.16 -0.90605765
## 257 -1.99 -0.74041817
## 258 -2.36 -2.19175245
## 259 -2.38 -2.76552215
## 260 -2.39 -0.92341877
## 261 -2.46 -2.16624833
## 262 -2.49 -2.46461783
## 263 -2.54 -2.29690705
## 264 -2.55 -3.44594353
## 265 -2.63 -3.01538128
## 266 -2.64 -2.77620169
## 267 -2.67 -2.27927439
## 268 -2.68 -2.03810034
## 269 -2.77 -2.96249574
## 270 -2.78 -3.12735599
## 271 -2.82 -3.00441986
## 272 -2.92 -3.80173414
## 273 -3.03 -3.47481071
## 274 -3.12 -3.64366980
## 275 -3.16 -2.45235060
## 276 -3.19 -3.44261322
## 277 -3.54 -3.51752975
## 278 -3.54 -2.36402441
## 279 -3.59 -3.46086523
## 280 -3.66 -3.52725326
## 281 -3.68 -4.02097476
## 282 -3.75 -4.06139824
## 283 -3.76 -3.67602690
## 284 -3.78 -3.94705267
## 285 -3.80 -4.36861267
## 286 -3.80 -4.72117277
## 287 -3.85 -4.35910681
## 288 -3.89 -4.64471393
## 289 -3.95 -4.61902812
## 290 -4.29 -4.58222895
## 291 -4.42 -5.79748198
## 292 -4.48 -4.19672434
## 293 -4.48 -4.91254849
## 294 -4.53 -5.22016678
## 295 -4.63 -4.72479212
## 296 -4.73 -4.04285744
## 297 -4.84 -3.48882864
## 298 -4.89 -4.17422747
## 299 -4.89 -4.48292982
## 300 -5.26 -5.59520463
## 301 -6.09 -4.70193613
## 302 -6.29 -5.56779875
## 303 -6.29 -6.33246176
## 304 -6.89 -5.88920901
## 305 -6.96 -5.46080954
## 306 -7.00 -6.86581992
## 307 -7.05 -7.36767153
## 308 -8.30 -8.76562128
## 309 -8.66 -9.14422945
## 310 -9.03 -9.26828995
## 311 -10.41 -9.95165538
## 312 -7.89 -7.36767153
## 313 -2.32 -1.81686774
## 314 0.39 -2.98832962
## 315 -2.90 -5.35876149
## 316 -2.47 -3.26304651
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(MARS_Test_Metrics <- postResample(MARS_Test[,2], MARS_Test[,1]))
## RMSE Rsquared MAE
## 0.758042 0.868910 0.562053
(MARS_Test_RMSE <- MARS_Test_Metrics[1])
## RMSE
## 0.758042
(MARS_Test_Rsquared <- MARS_Test_Metrics[2])
## Rsquared
## 0.86891
1.5.9 Support Vector Machine - Radial Basis Function Kernel (SVM_R)
[A] The support vector machine (radial basis function kernel) model from the kernlab package was implemented through the caret package. (An illustrative evaluation of the radial basis kernel follows this summary.)
[B] The model contains 2 hyperparameters:
[B.1] sigma = kernel width, held constant at a value of 0.00285
[B.2] C = cost, made to vary across a range of 14 default values
[C] The cross-validated model performance of the final model is summarized as follows:
[C.1] Final model configuration involves sigma=0.00285 and C=16
[C.2] Root-Mean-Square Error = 0.59505
[C.3] R-Squared = 0.91551
[D] The model does not allow for ranking of predictors in terms of variable importance.
[E] The independent test model performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.62742
[E.2] R-Squared = 0.90989
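For reference, the radial basis kernel evaluated by kernlab is k(x, x') = exp(-sigma * ||x - x'||^2), so sigma controls how quickly similarity decays with distance. The sketch below (illustrative only, on two arbitrary vectors) compares a manual evaluation against kernlab::rbfdot().
##################################
# Illustrative sketch only: evaluating the Gaussian radial basis
# kernel manually and via kernlab::rbfdot()
##################################
library(kernlab)
x1 <- c(1, 2, 3)
x2 <- c(2, 0, 1)
sigma <- 0.00285                   # kernel width from the tuned model
exp(-sigma * sum((x1 - x2)^2))     # manual evaluation
rbf <- rbfdot(sigma = sigma)
rbf(x1, x2)                        # kernlab evaluation (should match)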
##################################
# Transforming factor predictors
# as required by the nature of the model
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_SVM_R <- as.data.frame(lapply(PMA_PreModelling_Train, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Train_SVM_R)
## [1] 951 221
PMA_PreModelling_Test_SVM_R <- as.data.frame(lapply(PMA_PreModelling_Test, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Test_SVM_R)
## [1] 316 221
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_SVM_R$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# used a range of default values
##################################
# Running the support vector machine (radial basis function kernel) model
# by setting the caret method to 'svmRadial'
##################################
set.seed(12345678)
SVM_R_Tune <- train(x = PMA_PreModelling_Train_SVM_R[,!names(PMA_PreModelling_Train_SVM_R) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_SVM_R$Log_Solubility,
method = "svmRadial",
tuneLength = 14,
trControl = KFold_Control,
preProc = c("center", "scale"))
##################################
# Reporting the cross-validation results
# for the train set
##################################
SVM_R_Tune
## Support Vector Machines with Radial Basis Function Kernel
##
## 951 samples
## 220 predictors
##
## Pre-processing: centered (220), scaled (220)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.8096652 0.8648241 0.6037022
## 0.50 0.7082927 0.8880984 0.5294633
## 1.00 0.6537354 0.9006737 0.4849819
## 2.00 0.6221070 0.9086287 0.4586863
## 4.00 0.6102493 0.9116037 0.4481332
## 8.00 0.5981789 0.9147527 0.4394685
## 16.00 0.5950500 0.9155098 0.4386376
## 32.00 0.5964935 0.9151700 0.4401950
## 64.00 0.5970088 0.9150461 0.4394875
## 128.00 0.5988087 0.9145520 0.4408041
## 256.00 0.6002748 0.9141568 0.4416319
## 512.00 0.6023160 0.9135943 0.4428558
## 1024.00 0.6054508 0.9127164 0.4447153
## 2048.00 0.6093996 0.9116221 0.4483096
##
## Tuning parameter 'sigma' was held constant at a value of 0.002858301
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.002858301 and C = 16.
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 16
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.00285830098890164
##
## Number of Support Vectors : 619
##
## Objective Function Value : -275.9914
## Training error : 0.009889
## sigma C RMSE Rsquared MAE RMSESD RsquaredSD
## 1 0.002858301 0.25 0.8096652 0.8648241 0.6037022 0.07449474 0.02540316
## 2 0.002858301 0.50 0.7082927 0.8880984 0.5294633 0.05703847 0.02138058
## 3 0.002858301 1.00 0.6537354 0.9006737 0.4849819 0.05072661 0.01960991
## 4 0.002858301 2.00 0.6221070 0.9086287 0.4586863 0.04872850 0.01682004
## 5 0.002858301 4.00 0.6102493 0.9116037 0.4481332 0.04980713 0.01505246
## 6 0.002858301 8.00 0.5981789 0.9147527 0.4394685 0.05030562 0.01557070
## 7 0.002858301 16.00 0.5950500 0.9155098 0.4386376 0.04905689 0.01634957
## 8 0.002858301 32.00 0.5964935 0.9151700 0.4401950 0.04954128 0.01652382
## 9 0.002858301 64.00 0.5970088 0.9150461 0.4394875 0.05073287 0.01635371
## 10 0.002858301 128.00 0.5988087 0.9145520 0.4408041 0.05126175 0.01595339
## 11 0.002858301 256.00 0.6002748 0.9141568 0.4416319 0.05073171 0.01574943
## 12 0.002858301 512.00 0.6023160 0.9135943 0.4428558 0.04940859 0.01564832
## 13 0.002858301 1024.00 0.6054508 0.9127164 0.4447153 0.04790476 0.01530957
## 14 0.002858301 2048.00 0.6093996 0.9116221 0.4483096 0.04652339 0.01490467
## MAESD
## 1 0.05859916
## 2 0.04672074
## 3 0.04096699
## 4 0.03486431
## 5 0.03220601
## 6 0.03251493
## 7 0.03264716
## 8 0.03314391
## 9 0.03421506
## 10 0.03518379
## 11 0.03497268
## 12 0.03490923
## 13 0.03404608
## 14 0.03416602
SVM_R_Train_RMSE <- SVM_R_Tune$results[SVM_R_Tune$results$C==SVM_R_Tune$bestTune$C,
c("RMSE")]
SVM_R_Train_Rsquared <- SVM_R_Tune$results[SVM_R_Tune$results$C==SVM_R_Tune$bestTune$C,
c("Rsquared")]
##################################
# Identifying and plotting the
# best model predictors
##################################
# model does not support variable importance measurement
##################################
# Independently evaluating the model
# on the test set
##################################
SVM_R_Test <- data.frame(SVM_R_Observed = PMA_PreModelling_Test$Log_Solubility,
SVM_R_Predicted = predict(SVM_R_Tune,
PMA_PreModelling_Test_SVM_R[,!names(PMA_PreModelling_Test_SVM_R) %in% c("Log_Solubility")]))
SVM_R_Test
## SVM_R_Observed SVM_R_Predicted
## 1 0.93 0.377022305
## 2 0.85 0.615378735
## 3 0.81 -0.495500158
## 4 0.74 0.999716875
## 5 0.61 -0.273021690
## 6 0.58 1.144789709
## 7 0.57 0.776186139
## 8 0.56 0.351278512
## 9 0.52 0.164746867
## 10 0.45 -0.313673146
## 11 0.40 0.215267762
## 12 0.36 -1.444072128
## 13 0.22 -0.173733882
## 14 0.08 -0.107742817
## 15 0.07 -0.977523710
## 16 0.02 -0.367507982
## 17 0.00 0.005440399
## 18 -0.01 -0.043270643
## 19 -0.07 -0.534124545
## 20 -0.12 -0.314219905
## 21 -0.17 0.800729693
## 22 -0.29 -0.249230909
## 23 -0.38 -0.497962627
## 24 -0.38 -0.283376194
## 25 -0.39 -1.009446033
## 26 -0.42 -0.531411164
## 27 -0.44 -0.470814324
## 28 -0.46 0.650988718
## 29 -0.48 -2.253412310
## 30 -0.60 -1.023406891
## 31 -0.63 -2.192023433
## 32 -0.66 -0.685469901
## 33 -0.72 -0.470897408
## 34 -0.72 -0.193187463
## 35 -0.80 -0.099438571
## 36 -0.80 -0.851745808
## 37 -0.82 -0.117707185
## 38 -0.82 -0.707807306
## 39 -0.84 -0.250509212
## 40 -0.85 -0.718555103
## 41 -0.85 -0.457840413
## 42 -0.87 -1.605336586
## 43 -0.89 -1.266233178
## 44 -0.90 0.202970718
## 45 -0.96 -1.300590230
## 46 -0.96 -1.113669532
## 47 -0.99 -0.788372992
## 48 -1.01 -0.879754535
## 49 -1.09 -1.166775522
## 50 -1.12 -0.782898076
## 51 -1.14 -1.016515705
## 52 -1.17 -1.053804117
## 53 -1.19 -1.524217223
## 54 -1.22 -1.153215847
## 55 -1.27 -1.581349539
## 56 -1.28 -1.412468806
## 57 -1.32 -1.400032827
## 58 -1.38 -1.249607292
## 59 -1.39 -1.898505458
## 60 -1.42 -1.644696329
## 61 -1.47 -1.469277638
## 62 -1.47 -1.431606890
## 63 -1.50 -1.391886851
## 64 -1.52 -1.455094629
## 65 -1.54 -1.651900342
## 66 -1.55 -2.357220471
## 67 -1.56 -3.116362973
## 68 -1.57 -2.050972286
## 69 -1.60 -1.891256352
## 70 -1.60 -2.605498896
## 71 -1.62 -1.653764049
## 72 -1.64 -2.578493119
## 73 -1.67 -1.623587385
## 74 -1.70 -3.407635951
## 75 -1.70 -2.209337572
## 76 -1.71 -2.578630647
## 77 -1.71 -2.295490696
## 78 -1.75 -1.767491468
## 79 -1.78 -2.115841996
## 80 -1.78 -1.844700866
## 81 -1.82 -1.545720948
## 82 -1.87 -1.147509663
## 83 -1.89 -2.238332142
## 84 -1.92 -1.752672883
## 85 -1.92 -1.601158692
## 86 -1.92 -1.414138108
## 87 -1.94 -2.688640423
## 88 -1.99 -2.657220311
## 89 -2.00 -2.286036951
## 90 -2.05 -2.230578068
## 91 -2.06 -1.896853833
## 92 -2.08 -2.472398752
## 93 -2.10 -2.888216605
## 94 -2.11 -1.105576542
## 95 -2.12 -1.587921526
## 96 -2.17 -2.382127845
## 97 -2.21 -1.899554335
## 98 -2.24 -2.595826372
## 99 -2.24 -1.136841757
## 100 -2.29 -2.335588764
## 101 -2.31 -2.118959614
## 102 -2.32 -2.118769515
## 103 -2.35 -2.269277663
## 104 -2.35 -2.134777212
## 105 -2.36 -2.665765474
## 106 -2.36 -1.463534992
## 107 -2.38 -2.234915921
## 108 -2.42 -2.052935091
## 109 -2.43 -3.391729050
## 110 -2.44 -2.712549116
## 111 -2.52 -2.702197745
## 112 -2.53 -2.929447598
## 113 -2.57 -2.836040487
## 114 -2.62 -2.400751009
## 115 -2.62 -2.851182824
## 116 -2.64 -2.901480332
## 117 -2.64 -2.916113108
## 118 -2.70 -2.814932392
## 119 -2.82 -2.542155181
## 120 -2.88 -2.885532840
## 121 -2.89 -3.025616643
## 122 -2.92 -1.152556489
## 123 -2.93 -4.064703841
## 124 -2.96 -2.712008375
## 125 -2.98 -2.686608036
## 126 -3.01 -2.103323633
## 127 -3.01 -3.341049576
## 128 -3.02 -3.383212873
## 129 -3.07 -3.300387203
## 130 -3.09 -3.167463222
## 131 -3.11 -3.540992536
## 132 -3.13 -3.269241486
## 133 -3.14 -2.997857623
## 134 -3.15 -3.587759298
## 135 -3.22 -2.105110561
## 136 -3.26 -3.615825340
## 137 -3.27 -2.972121814
## 138 -3.27 -3.289672245
## 139 -3.30 -3.281242551
## 140 -3.31 -2.652632885
## 141 -3.33 -2.451874650
## 142 -3.37 -2.673141107
## 143 -3.43 -3.652189262
## 144 -3.43 -2.963520099
## 145 -3.48 -2.734132679
## 146 -3.51 -3.759421710
## 147 -3.59 -3.117634995
## 148 -3.61 -2.865708744
## 149 -3.63 -3.890887067
## 150 -3.63 -3.615553852
## 151 -3.68 -2.238188670
## 152 -3.71 -3.544321782
## 153 -3.74 -3.436753466
## 154 -3.75 -4.157748720
## 155 -3.75 -3.624836042
## 156 -3.77 -3.725467014
## 157 -3.77 -4.404887276
## 158 -3.78 -3.772025829
## 159 -3.81 -3.651606763
## 160 -3.95 -3.971767610
## 161 -3.96 -4.901292941
## 162 -3.96 -4.401856116
## 163 -4.00 -3.561658526
## 164 -4.02 -4.154945749
## 165 -4.04 -4.404172979
## 166 -4.12 -4.188957510
## 167 -4.15 -4.221349408
## 168 -4.16 -3.184883381
## 169 -4.17 -4.796306020
## 170 -4.21 -4.807876135
## 171 -4.23 -4.916891572
## 172 -4.25 -4.046710509
## 173 -4.30 -4.268222102
## 174 -4.31 -5.634051251
## 175 -4.35 -4.651869392
## 176 -4.40 -4.049443466
## 177 -4.40 -4.471285305
## 178 -4.43 -4.666848995
## 179 -4.46 -4.579655203
## 180 -4.47 -3.631150297
## 181 -4.51 -4.891675705
## 182 -4.60 -4.025784390
## 183 -4.64 -4.849530170
## 184 -4.69 -5.299473994
## 185 -4.71 -4.172402729
## 186 -4.77 -4.086780949
## 187 -4.95 -4.150190525
## 188 -4.98 -4.065105341
## 189 -5.21 -6.030279827
## 190 -5.22 -5.164765070
## 191 -5.28 -4.373875317
## 192 -5.31 -3.379565405
## 193 -5.35 -4.814728090
## 194 -5.37 -4.568884126
## 195 -5.40 -4.750906721
## 196 -5.43 -4.047652853
## 197 -5.65 -4.990274486
## 198 -5.66 -4.366499903
## 199 -6.70 -4.602288174
## 200 -5.72 -5.114658476
## 201 -6.00 -6.653135205
## 202 -6.25 -6.546154814
## 203 -6.26 -6.254078169
## 204 -6.27 -6.587388705
## 205 -6.35 -6.436542986
## 206 -6.57 -6.157770811
## 207 -6.62 -5.255748105
## 208 -6.96 -6.197333500
## 209 -7.02 -7.653849775
## 210 -7.20 -7.445634469
## 211 -7.28 -7.275692515
## 212 -7.32 -7.760467059
## 213 -7.39 -7.833836642
## 214 -7.82 -8.377927995
## 215 -8.23 -8.704717870
## 216 -8.94 -8.486476898
## 217 1.07 0.033206407
## 218 0.43 0.342412722
## 219 0.32 -0.296738855
## 220 0.00 0.237151659
## 221 -0.40 -1.289882629
## 222 -0.52 -0.460806067
## 223 -0.55 -0.640695572
## 224 -0.60 -0.618080635
## 225 -0.62 -2.157719490
## 226 -0.85 -1.178257625
## 227 -0.89 -0.645907140
## 228 -0.93 -1.064454849
## 229 -0.96 -0.920571854
## 230 -1.06 -1.825633046
## 231 -1.10 -1.258264981
## 232 -1.12 -0.932496194
## 233 -1.15 -0.811202310
## 234 -1.28 -0.673083860
## 235 -1.30 -0.938499257
## 236 -1.31 -1.714428660
## 237 -1.35 -2.157742946
## 238 -1.39 -2.303111346
## 239 -1.41 -1.524217223
## 240 -1.41 -1.617521856
## 241 -1.42 -1.160491567
## 242 -1.46 -1.757300522
## 243 -1.50 -1.529587296
## 244 -1.50 -1.866686517
## 245 -1.52 -1.888299299
## 246 -1.52 -1.127109450
## 247 -1.59 -1.878042961
## 248 -1.61 -1.516062571
## 249 -1.63 -1.401531925
## 250 -1.71 -2.823961805
## 251 -1.83 -2.238115337
## 252 -2.05 -2.523258936
## 253 -2.06 -2.137902734
## 254 -2.07 -3.462960502
## 255 -2.15 -2.381700375
## 256 -2.16 -0.839919439
## 257 -1.99 -1.667208513
## 258 -2.36 -2.330953460
## 259 -2.38 -3.077290007
## 260 -2.39 -1.738152693
## 261 -2.46 -2.128524036
## 262 -2.49 -2.239051608
## 263 -2.54 -2.598974846
## 264 -2.55 -2.843872131
## 265 -2.63 -2.765634259
## 266 -2.64 -2.302265487
## 267 -2.67 -2.827141104
## 268 -2.68 -1.771335226
## 269 -2.77 -2.577234554
## 270 -2.78 -2.934429670
## 271 -2.82 -2.189159005
## 272 -2.92 -3.434728199
## 273 -3.03 -3.704745246
## 274 -3.12 -3.846550176
## 275 -3.16 -3.276068182
## 276 -3.19 -3.389947805
## 277 -3.54 -3.503200610
## 278 -3.54 -2.566182583
## 279 -3.59 -3.612913661
## 280 -3.66 -3.576761405
## 281 -3.68 -3.308960113
## 282 -3.75 -3.987304108
## 283 -3.76 -3.381057144
## 284 -3.78 -4.052769004
## 285 -3.80 -3.991689749
## 286 -3.80 -4.430228082
## 287 -3.85 -3.657422448
## 288 -3.89 -3.507907547
## 289 -3.95 -4.066589608
## 290 -4.29 -5.236341193
## 291 -4.42 -4.871003795
## 292 -4.48 -3.812777461
## 293 -4.48 -3.429953859
## 294 -4.53 -4.907051173
## 295 -4.63 -4.742097842
## 296 -4.73 -4.529025529
## 297 -4.84 -4.558342063
## 298 -4.89 -4.141407629
## 299 -4.89 -5.203295643
## 300 -5.26 -5.974658488
## 301 -6.09 -5.137652342
## 302 -6.29 -6.240607037
## 303 -6.29 -6.558637603
## 304 -6.89 -5.904143016
## 305 -6.96 -6.618614467
## 306 -7.00 -6.871813984
## 307 -7.05 -7.892004727
## 308 -8.30 -8.882113937
## 309 -8.66 -9.854624215
## 310 -9.03 -9.525977965
## 311 -10.41 -9.898151813
## 312 -7.89 -7.446218949
## 313 -2.32 -1.934078811
## 314 0.39 -2.198779112
## 315 -2.90 -4.558402779
## 316 -2.47 -3.747752017
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(SVM_R_Test_Metrics <- postResample(SVM_R_Test[,2], SVM_R_Test[,1]))
## RMSE Rsquared MAE
## 0.6274210 0.9098906 0.4664314
(SVM_R_Test_RMSE <- SVM_R_Test_Metrics[1])
## RMSE
## 0.627421
(SVM_R_Test_Rsquared <- SVM_R_Test_Metrics[2])
## Rsquared
## 0.9098906
1.5.10 Support Vector Machine - Polynomial Kernel (SVM_P)
[A] The support vector machine (polynomial kernel) model from the kernlab package was implemented through the caret package. (An illustrative evaluation of the polynomial kernel follows this summary.)
[B] The model contains 3 hyperparameters:
[B.1] degree = polynomial degree, made to vary across a range of values equal to 1 to 2
[B.2] scale = kernel scale, made to vary across a range of values equal to 0.001 to 0.010
[B.3] C = cost, made to vary across a range of values equal to 0.25 to 32.00
[C] The cross-validated model performance of the final model is summarized as follows:
[C.1] Final model configuration involves degree=2, scale=0.001 and C=8
[C.2] Root-Mean-Square Error = 0.60281
[C.3] R-Squared = 0.91167
[D] The model does not allow for ranking of predictors in terms of variable importance.
[E] The independent test model performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.63778
[E.2] R-Squared = 0.90623
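For reference, the polynomial kernel evaluated by kernlab is k(x, x') = (scale * <x, x'> + offset)^degree, with offset = 1 in the tuned model reported below. The sketch that follows (illustrative only, on two arbitrary vectors) compares a manual evaluation against kernlab::polydot().
##################################
# Illustrative sketch only: evaluating the polynomial kernel
# manually and via kernlab::polydot()
##################################
library(kernlab)
x1 <- c(1, 2, 3)
x2 <- c(2, 0, 1)
(0.001 * sum(x1 * x2) + 1)^2       # manual evaluation (degree=2, scale=0.001)
poly_k <- polydot(degree = 2, scale = 0.001, offset = 1)
poly_k(x1, x2)                     # kernlab evaluation (should match)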
##################################
# Transforming factor predictors
# as required by the nature of the model
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_SVM_P <- as.data.frame(lapply(PMA_PreModelling_Train, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Train_SVM_P)
## [1] 951 221
PMA_PreModelling_Test_SVM_P <- as.data.frame(lapply(PMA_PreModelling_Test, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Test_SVM_P)
## [1] 316 221
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_SVM_P$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
SVM_P_Grid = expand.grid(degree = 1:2,
scale = c(0.01, 0.005, 0.001),
C = 2^(-2:5))
##################################
# Running the support vector machine (polynomial kernel) model
# by setting the caret method to 'svmPoly'
##################################
set.seed(12345678)
SVM_P_Tune <- train(x = PMA_PreModelling_Train_SVM_P[,!names(PMA_PreModelling_Train_SVM_P) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_SVM_P$Log_Solubility,
method = "svmPoly",
tuneGrid = SVM_P_Grid,
trControl = KFold_Control,
preProc = c("center", "scale"))
##################################
# Reporting the cross-validation results
# for the train set
##################################
SVM_P_Tune
## Support Vector Machines with Polynomial Kernel
##
## 951 samples
## 220 predictors
##
## Pre-processing: centered (220), scaled (220)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## degree scale C RMSE Rsquared MAE
## 1 0.001 0.25 1.0813115 0.7829667 0.8202879
## 1 0.001 0.50 0.8997673 0.8307038 0.6850280
## 1 0.001 1.00 0.7854557 0.8607263 0.5964276
## 1 0.001 2.00 0.7203591 0.8781068 0.5463177
## 1 0.001 4.00 0.6833017 0.8874576 0.5181085
## 1 0.001 8.00 0.6597666 0.8939662 0.4989184
## 1 0.001 16.00 0.6604655 0.8934007 0.4990358
## 1 0.001 32.00 0.6648932 0.8925153 0.4974638
## 1 0.005 0.25 0.7626772 0.8669115 0.5790854
## 1 0.005 0.50 0.7036714 0.8824261 0.5344290
## 1 0.005 1.00 0.6744750 0.8897958 0.5120373
## 1 0.005 2.00 0.6599982 0.8937877 0.4978656
## 1 0.005 4.00 0.6632663 0.8925820 0.4992507
## 1 0.005 8.00 0.6667905 0.8921659 0.4973640
## 1 0.005 16.00 0.6711989 0.8909501 0.5015729
## 1 0.005 32.00 0.6773182 0.8892810 0.5029691
## 1 0.010 0.25 0.7036730 0.8824235 0.5344304
## 1 0.010 0.50 0.6744566 0.8898008 0.5120301
## 1 0.010 1.00 0.6600157 0.8937816 0.4978646
## 1 0.010 2.00 0.6632819 0.8925722 0.4992309
## 1 0.010 4.00 0.6667228 0.8921874 0.4972902
## 1 0.010 8.00 0.6711336 0.8909682 0.5014820
## 1 0.010 16.00 0.6772383 0.8892925 0.5030500
## 1 0.010 32.00 0.6829954 0.8876055 0.5038941
## 2 0.001 0.25 0.8787346 0.8384794 0.6664104
## 2 0.001 0.50 0.7649968 0.8687093 0.5800609
## 2 0.001 1.00 0.6886037 0.8889442 0.5220816
## 2 0.001 2.00 0.6441548 0.9002961 0.4845796
## 2 0.001 4.00 0.6170665 0.9076174 0.4619635
## 2 0.001 8.00 0.6028074 0.9116780 0.4470343
## 2 0.001 16.00 0.6097426 0.9099771 0.4521128
## 2 0.001 32.00 0.6166312 0.9084466 0.4569551
## 2 0.005 0.25 0.6443821 0.9013243 0.4859189
## 2 0.005 0.50 0.6143918 0.9090246 0.4577171
## 2 0.005 1.00 0.6054694 0.9116588 0.4482625
## 2 0.005 2.00 0.6034269 0.9124624 0.4481158
## 2 0.005 4.00 0.6064033 0.9117054 0.4502996
## 2 0.005 8.00 0.6122424 0.9102944 0.4538997
## 2 0.005 16.00 0.6234769 0.9073413 0.4595375
## 2 0.005 32.00 0.6357322 0.9038510 0.4675284
## 2 0.010 0.25 0.6166474 0.9087067 0.4557365
## 2 0.010 0.50 0.6091353 0.9109400 0.4503510
## 2 0.010 1.00 0.6107104 0.9103522 0.4513637
## 2 0.010 2.00 0.6122546 0.9100855 0.4540873
## 2 0.010 4.00 0.6201277 0.9080690 0.4580899
## 2 0.010 8.00 0.6313489 0.9048271 0.4639982
## 2 0.010 16.00 0.6348806 0.9041682 0.4680582
## 2 0.010 32.00 0.6388360 0.9030783 0.4712983
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were degree = 2, scale = 0.001 and C = 8.
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 8
##
## Polynomial kernel function.
## Hyperparameters : degree = 2 scale = 0.001 offset = 1
##
## Number of Support Vectors : 628
##
## Objective Function Value : -592.802
## Training error : 0.026533
## degree scale C RMSE Rsquared MAE RMSESD RsquaredSD
## 1 1 0.001 0.25 1.0813115 0.7829667 0.8202879 0.10939801 0.04515491
## 2 1 0.001 0.50 0.8997673 0.8307038 0.6850280 0.08809186 0.03576289
## 3 1 0.001 1.00 0.7854557 0.8607263 0.5964276 0.07371384 0.02888902
## 4 1 0.001 2.00 0.7203591 0.8781068 0.5463177 0.06343423 0.02581250
## 5 1 0.001 4.00 0.6833017 0.8874576 0.5181085 0.05363954 0.02466819
## 6 1 0.001 8.00 0.6597666 0.8939662 0.4989184 0.05297037 0.02374453
## 7 1 0.001 16.00 0.6604655 0.8934007 0.4990358 0.05463004 0.02511122
## 8 1 0.001 32.00 0.6648932 0.8925153 0.4974638 0.05144032 0.02330463
## 9 1 0.005 0.25 0.7626772 0.8669115 0.5790854 0.06862855 0.02732628
## 10 1 0.005 0.50 0.7036714 0.8824261 0.5344290 0.06006912 0.02468980
## 11 1 0.005 1.00 0.6744750 0.8897958 0.5120373 0.05409251 0.02448250
## 12 1 0.005 2.00 0.6599982 0.8937877 0.4978656 0.05073733 0.02385446
## 13 1 0.005 4.00 0.6632663 0.8925820 0.4992507 0.05207502 0.02474687
## 14 1 0.005 8.00 0.6667905 0.8921659 0.4973640 0.05375424 0.02280333
## 15 1 0.005 16.00 0.6711989 0.8909501 0.5015729 0.05781357 0.02425717
## 16 1 0.005 32.00 0.6773182 0.8892810 0.5029691 0.05519991 0.02351653
## 17 1 0.010 0.25 0.7036730 0.8824235 0.5344304 0.06008214 0.02469073
## 18 1 0.010 0.50 0.6744566 0.8898008 0.5120301 0.05411201 0.02448914
## 19 1 0.010 1.00 0.6600157 0.8937816 0.4978646 0.05078104 0.02386388
## 20 1 0.010 2.00 0.6632819 0.8925722 0.4992309 0.05212358 0.02477361
## 21 1 0.010 4.00 0.6667228 0.8921874 0.4972902 0.05380113 0.02278799
## 22 1 0.010 8.00 0.6711336 0.8909682 0.5014820 0.05780016 0.02429656
## 23 1 0.010 16.00 0.6772383 0.8892925 0.5030500 0.05553416 0.02360975
## 24 1 0.010 32.00 0.6829954 0.8876055 0.5038941 0.05472078 0.02395011
## 25 2 0.001 0.25 0.8787346 0.8384794 0.6664104 0.08461575 0.03382786
## 26 2 0.001 0.50 0.7649968 0.8687093 0.5800609 0.06959155 0.02694008
## 27 2 0.001 1.00 0.6886037 0.8889442 0.5220816 0.05754124 0.02350010
## 28 2 0.001 2.00 0.6441548 0.9002961 0.4845796 0.04827107 0.02132828
## 29 2 0.001 4.00 0.6170665 0.9076174 0.4619635 0.04262720 0.01917478
## 30 2 0.001 8.00 0.6028074 0.9116780 0.4470343 0.04458988 0.01880295
## 31 2 0.001 16.00 0.6097426 0.9099771 0.4521128 0.04611867 0.01893301
## 32 2 0.001 32.00 0.6166312 0.9084466 0.4569551 0.03957519 0.01822714
## 33 2 0.005 0.25 0.6443821 0.9013243 0.4859189 0.04519281 0.01997288
## 34 2 0.005 0.50 0.6143918 0.9090246 0.4577171 0.04259492 0.01815389
## 35 2 0.005 1.00 0.6054694 0.9116588 0.4482625 0.04607185 0.01594568
## 36 2 0.005 2.00 0.6034269 0.9124624 0.4481158 0.04239118 0.01468544
## 37 2 0.005 4.00 0.6064033 0.9117054 0.4502996 0.04296603 0.01535264
## 38 2 0.005 8.00 0.6122424 0.9102944 0.4538997 0.05051189 0.01698474
## 39 2 0.005 16.00 0.6234769 0.9073413 0.4595375 0.05670491 0.01788708
## 40 2 0.005 32.00 0.6357322 0.9038510 0.4675284 0.05855575 0.01830176
## 41 2 0.010 0.25 0.6166474 0.9087067 0.4557365 0.04262273 0.01611519
## 42 2 0.010 0.50 0.6091353 0.9109400 0.4503510 0.04367407 0.01533147
## 43 2 0.010 1.00 0.6107104 0.9103522 0.4513637 0.04211634 0.01597261
## 44 2 0.010 2.00 0.6122546 0.9100855 0.4540873 0.04469450 0.01705732
## 45 2 0.010 4.00 0.6201277 0.9080690 0.4580899 0.05017426 0.01789721
## 46 2 0.010 8.00 0.6313489 0.9048271 0.4639982 0.05311794 0.01854614
## 47 2 0.010 16.00 0.6348806 0.9041682 0.4680582 0.05566059 0.01761767
## 48 2 0.010 32.00 0.6388360 0.9030783 0.4712983 0.05467329 0.01787180
## MAESD
## 1 0.07848841
## 2 0.07058536
## 3 0.05952693
## 4 0.04868334
## 5 0.03486260
## 6 0.03460661
## 7 0.03935118
## 8 0.03698402
## 9 0.05594710
## 10 0.04363187
## 11 0.03477481
## 12 0.03526787
## 13 0.03712135
## 14 0.04031082
## 15 0.04322886
## 16 0.04122167
## 17 0.04366243
## 18 0.03480054
## 19 0.03530389
## 20 0.03711223
## 21 0.04046490
## 22 0.04318967
## 23 0.04156588
## 24 0.04262717
## 25 0.06830106
## 26 0.05869496
## 27 0.04616486
## 28 0.03587886
## 29 0.03160052
## 30 0.03497487
## 31 0.03428316
## 32 0.03065615
## 33 0.03897937
## 34 0.03553405
## 35 0.03289566
## 36 0.02924431
## 37 0.03212803
## 38 0.03858799
## 39 0.04030368
## 40 0.04059176
## 41 0.03300444
## 42 0.03211784
## 43 0.03032351
## 44 0.03471796
## 45 0.03644517
## 46 0.03732003
## 47 0.03885197
## 48 0.03859199
(SVM_P_Train_RMSE <- SVM_P_Tune$results[SVM_P_Tune$results$degree==SVM_P_Tune$bestTune$degree &
SVM_P_Tune$results$scale==SVM_P_Tune$bestTune$scale &
SVM_P_Tune$results$C==SVM_P_Tune$bestTune$C,
c("RMSE")])
## [1] 0.6028074
(SVM_P_Train_Rsquared <- SVM_P_Tune$results[SVM_P_Tune$results$degree==SVM_P_Tune$bestTune$degree &
SVM_P_Tune$results$scale==SVM_P_Tune$bestTune$scale &
SVM_P_Tune$results$C==SVM_P_Tune$bestTune$C,
c("Rsquared")])
## [1] 0.911678
##################################
# Identifying and plotting the
# best model predictors
##################################
# model does not support variable importance measurement
##################################
# Independently evaluating the model
# on the test set
##################################
SVM_P_Test <- data.frame(SVM_P_Observed = PMA_PreModelling_Test$Log_Solubility,
SVM_P_Predicted = predict(SVM_P_Tune,
PMA_PreModelling_Test_SVM_P[,!names(PMA_PreModelling_Test_SVM_P) %in% c("Log_Solubility")]))
SVM_P_Test
## SVM_P_Observed SVM_P_Predicted
## 1 0.93 0.698154063
## 2 0.85 0.427078854
## 3 0.81 -0.480331714
## 4 0.74 1.151652132
## 5 0.61 -0.191366205
## 6 0.58 1.272369495
## 7 0.57 0.621837390
## 8 0.56 0.275252184
## 9 0.52 0.244099962
## 10 0.45 -0.267458399
## 11 0.40 0.184350760
## 12 0.36 -1.143304605
## 13 0.22 -0.052647414
## 14 0.08 -0.187367223
## 15 0.07 -1.042317495
## 16 0.02 -0.594978656
## 17 0.00 0.016467639
## 18 -0.01 0.046589001
## 19 -0.07 -0.031929988
## 20 -0.12 -0.584611961
## 21 -0.17 0.349572884
## 22 -0.29 -0.199967293
## 23 -0.38 -0.559361582
## 24 -0.38 -0.736243197
## 25 -0.39 -1.039990157
## 26 -0.42 -0.675090910
## 27 -0.44 -0.904872764
## 28 -0.46 0.925867013
## 29 -0.48 -2.220979516
## 30 -0.60 -1.064812332
## 31 -0.63 -1.949547589
## 32 -0.66 -0.588381454
## 33 -0.72 -0.599637631
## 34 -0.72 0.149512404
## 35 -0.80 0.258555219
## 36 -0.80 -0.855727981
## 37 -0.82 0.038362282
## 38 -0.82 -0.738151291
## 39 -0.84 -0.126856464
## 40 -0.85 -0.730214678
## 41 -0.85 -0.451254859
## 42 -0.87 -1.799953374
## 43 -0.89 -1.232357250
## 44 -0.90 0.264729516
## 45 -0.96 -1.278004345
## 46 -0.96 -0.879195975
## 47 -0.99 -0.739438079
## 48 -1.01 -0.645786383
## 49 -1.09 -1.094881375
## 50 -1.12 -0.352566521
## 51 -1.14 -0.628898470
## 52 -1.17 -1.519992371
## 53 -1.19 -1.560170436
## 54 -1.22 -1.209797042
## 55 -1.27 -1.697206808
## 56 -1.28 -1.484302846
## 57 -1.32 -1.367706155
## 58 -1.38 -1.265292190
## 59 -1.39 -1.804276350
## 60 -1.42 -1.536563345
## 61 -1.47 -1.300796908
## 62 -1.47 -1.529248110
## 63 -1.50 -1.175006846
## 64 -1.52 -1.455412061
## 65 -1.54 -1.444841289
## 66 -1.55 -2.049066378
## 67 -1.56 -3.081381839
## 68 -1.57 -1.798677086
## 69 -1.60 -1.621986790
## 70 -1.60 -2.494002048
## 71 -1.62 -1.646953336
## 72 -1.64 -2.344908987
## 73 -1.67 -1.814674132
## 74 -1.70 -3.435314666
## 75 -1.70 -2.090599602
## 76 -1.71 -2.272610716
## 77 -1.71 -2.190820408
## 78 -1.75 -1.873007420
## 79 -1.78 -1.872382431
## 80 -1.78 -1.837311351
## 81 -1.82 -1.339788975
## 82 -1.87 -1.392772803
## 83 -1.89 -2.290974799
## 84 -1.92 -1.816823997
## 85 -1.92 -1.513558557
## 86 -1.92 -1.288063863
## 87 -1.94 -2.646280728
## 88 -1.99 -2.344567911
## 89 -2.00 -2.069970796
## 90 -2.05 -2.057227142
## 91 -2.06 -1.743731941
## 92 -2.08 -2.314880404
## 93 -2.10 -2.681167340
## 94 -2.11 -1.493674031
## 95 -2.12 -1.161436770
## 96 -2.17 -2.218950117
## 97 -2.21 -2.005823739
## 98 -2.24 -2.775730625
## 99 -2.24 -1.250804918
## 100 -2.29 -2.260919022
## 101 -2.31 -2.062057472
## 102 -2.32 -2.320875841
## 103 -2.35 -2.322571028
## 104 -2.35 -2.108682186
## 105 -2.36 -2.676862543
## 106 -2.36 -1.914040871
## 107 -2.38 -2.312320983
## 108 -2.42 -2.654536455
## 109 -2.43 -3.768718391
## 110 -2.44 -3.084686127
## 111 -2.52 -2.405670623
## 112 -2.53 -2.864252823
## 113 -2.57 -2.953668235
## 114 -2.62 -2.520714593
## 115 -2.62 -2.769419605
## 116 -2.64 -2.719666526
## 117 -2.64 -3.116224248
## 118 -2.70 -2.771851109
## 119 -2.82 -2.899431080
## 120 -2.88 -2.750914478
## 121 -2.89 -2.798036236
## 122 -2.92 -1.400209082
## 123 -2.93 -3.866448530
## 124 -2.96 -2.657928905
## 125 -2.98 -2.753468234
## 126 -3.01 -2.653648873
## 127 -3.01 -3.352905368
## 128 -3.02 -3.316902204
## 129 -3.07 -3.220233692
## 130 -3.09 -2.501155681
## 131 -3.11 -3.145705535
## 132 -3.13 -3.586748709
## 133 -3.14 -2.221559956
## 134 -3.15 -3.773571121
## 135 -3.22 -2.270276557
## 136 -3.26 -3.510190238
## 137 -3.27 -2.746469265
## 138 -3.27 -3.105914288
## 139 -3.30 -3.103940938
## 140 -3.31 -2.404622858
## 141 -3.33 -2.388056547
## 142 -3.37 -2.210729887
## 143 -3.43 -3.521228417
## 144 -3.43 -2.657084657
## 145 -3.48 -2.679578828
## 146 -3.51 -3.587751838
## 147 -3.59 -3.206702384
## 148 -3.61 -2.750732435
## 149 -3.63 -3.681777890
## 150 -3.63 -3.463384813
## 151 -3.68 -2.388806642
## 152 -3.71 -3.664334298
## 153 -3.74 -2.679977435
## 154 -3.75 -3.680145916
## 155 -3.75 -3.575739380
## 156 -3.77 -3.287900999
## 157 -3.77 -4.304201535
## 158 -3.78 -4.177008484
## 159 -3.81 -3.573058902
## 160 -3.95 -4.325395477
## 161 -3.96 -5.129282203
## 162 -3.96 -4.173591468
## 163 -4.00 -3.471539684
## 164 -4.02 -4.539149664
## 165 -4.04 -4.349746312
## 166 -4.12 -3.744941883
## 167 -4.15 -4.624183584
## 168 -4.16 -3.690634905
## 169 -4.17 -4.571181725
## 170 -4.21 -4.870470959
## 171 -4.23 -4.506831404
## 172 -4.25 -3.725010074
## 173 -4.30 -4.102783702
## 174 -4.31 -5.274579908
## 175 -4.35 -4.584738560
## 176 -4.40 -4.053139508
## 177 -4.40 -4.420537050
## 178 -4.43 -4.681799740
## 179 -4.46 -4.501839782
## 180 -4.47 -3.165527176
## 181 -4.51 -4.873423580
## 182 -4.60 -4.150885779
## 183 -4.64 -4.837760315
## 184 -4.69 -4.904755051
## 185 -4.71 -4.224662579
## 186 -4.77 -4.448898367
## 187 -4.95 -4.574815404
## 188 -4.98 -4.482689881
## 189 -5.21 -5.859679214
## 190 -5.22 -5.378301893
## 191 -5.28 -4.337243721
## 192 -5.31 -3.411926686
## 193 -5.35 -4.786053391
## 194 -5.37 -4.648226035
## 195 -5.40 -4.843246873
## 196 -5.43 -4.429248524
## 197 -5.65 -5.553963741
## 198 -5.66 -4.394597838
## 199 -6.70 -4.716531543
## 200 -5.72 -5.406897037
## 201 -6.00 -6.874976891
## 202 -6.25 -6.358653469
## 203 -6.26 -6.085281689
## 204 -6.27 -6.515283584
## 205 -6.35 -5.741983739
## 206 -6.57 -6.037426203
## 207 -6.62 -5.529276627
## 208 -6.96 -5.954719784
## 209 -7.02 -7.487759072
## 210 -7.20 -7.055003277
## 211 -7.28 -7.150113058
## 212 -7.32 -7.588775057
## 213 -7.39 -7.894251264
## 214 -7.82 -8.398237705
## 215 -8.23 -8.616266938
## 216 -8.94 -8.443737673
## 217 1.07 0.053883568
## 218 0.43 0.264237386
## 219 0.32 -0.002712269
## 220 0.00 -0.006013736
## 221 -0.40 -1.156654241
## 222 -0.52 -0.460051871
## 223 -0.55 -0.850018873
## 224 -0.60 -0.862812265
## 225 -0.62 -2.084261147
## 226 -0.85 -1.154072411
## 227 -0.89 -0.792161274
## 228 -0.93 -0.913584147
## 229 -0.96 -0.905908724
## 230 -1.06 -1.500321500
## 231 -1.10 -1.434473463
## 232 -1.12 -1.180135586
## 233 -1.15 -0.733109574
## 234 -1.28 -0.105407792
## 235 -1.30 -1.364999293
## 236 -1.31 -1.330237472
## 237 -1.35 -2.140768996
## 238 -1.39 -2.087499555
## 239 -1.41 -1.560170436
## 240 -1.41 -1.521967785
## 241 -1.42 -0.698962133
## 242 -1.46 -1.702764148
## 243 -1.50 -1.532849230
## 244 -1.50 -2.356175031
## 245 -1.52 -1.630211246
## 246 -1.52 -0.642576847
## 247 -1.59 -1.677398010
## 248 -1.61 -1.105511327
## 249 -1.63 -1.186788530
## 250 -1.71 -2.578267901
## 251 -1.83 -2.004651127
## 252 -2.05 -2.225788939
## 253 -2.06 -2.297176559
## 254 -2.07 -3.660567793
## 255 -2.15 -2.528741305
## 256 -2.16 -1.062871654
## 257 -1.99 -0.823862630
## 258 -2.36 -2.247379943
## 259 -2.38 -3.942178022
## 260 -2.39 -1.701539733
## 261 -2.46 -2.217846322
## 262 -2.49 -2.214046226
## 263 -2.54 -2.727163557
## 264 -2.55 -2.905853862
## 265 -2.63 -2.742788832
## 266 -2.64 -1.815781255
## 267 -2.67 -2.933691070
## 268 -2.68 -1.604860228
## 269 -2.77 -2.641220779
## 270 -2.78 -2.843506111
## 271 -2.82 -2.470045840
## 272 -2.92 -3.671258570
## 273 -3.03 -3.555095739
## 274 -3.12 -3.628877999
## 275 -3.16 -2.994200350
## 276 -3.19 -3.390921223
## 277 -3.54 -3.491297819
## 278 -3.54 -2.736365786
## 279 -3.59 -3.683230330
## 280 -3.66 -3.207414868
## 281 -3.68 -2.530291987
## 282 -3.75 -4.137029675
## 283 -3.76 -3.308608734
## 284 -3.78 -4.023137046
## 285 -3.80 -3.957358679
## 286 -3.80 -4.592065546
## 287 -3.85 -3.430825214
## 288 -3.89 -3.515096354
## 289 -3.95 -4.055391955
## 290 -4.29 -4.823325484
## 291 -4.42 -5.182709772
## 292 -4.48 -4.178595474
## 293 -4.48 -2.954933400
## 294 -4.53 -4.992843898
## 295 -4.63 -4.596153408
## 296 -4.73 -4.238659782
## 297 -4.84 -4.351876673
## 298 -4.89 -4.405234977
## 299 -4.89 -4.893337378
## 300 -5.26 -5.523284033
## 301 -6.09 -5.381937124
## 302 -6.29 -5.914195532
## 303 -6.29 -6.521608229
## 304 -6.89 -5.926882430
## 305 -6.96 -6.929080787
## 306 -7.00 -6.917203762
## 307 -7.05 -7.669159704
## 308 -8.30 -8.893935996
## 309 -8.66 -8.789288118
## 310 -9.03 -9.250005248
## 311 -10.41 -10.092627390
## 312 -7.89 -7.436196419
## 313 -2.32 -1.802976316
## 314 0.39 -2.337179132
## 315 -2.90 -5.065220155
## 316 -2.47 -4.032627321
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(SVM_P_Test_Metrics <- postResample(SVM_P_Test[,2], SVM_P_Test[,1]))
## RMSE Rsquared MAE
## 0.6377764 0.9062256 0.4730166
(SVM_P_Test_RMSE <- SVM_P_Test_Metrics[1])
## RMSE
## 0.6377764
(SVM_P_Test_Rsquared <- SVM_P_Test_Metrics[2])
## Rsquared
## 0.9062256
1.5.11 K-Nearest Neighbors (KNN)
[A] The k-nearest neighbors model was implemented through the caret package. (An illustrative implementation of the prediction rule follows this summary.)
[B] The model contains 1 hyperparameter:
[B.1] k = number of neighbors, made to vary across a range of values equal to 1 to 15
[C] The cross-validated model performance of the final model is summarized as follows:
[C.1] Final model configuration involves k=3
[C.2] Root-Mean-Square Error = 1.06729
[C.3] R-Squared = 0.73260
[D] The model does not allow for ranking of predictors in terms of variable importance.
[E] The independent test model performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 1.12471
[E.2] R-Squared = 0.71373
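The prediction rule itself is simple: on centered-and-scaled predictors, the fitted value for a test point is the mean response of its k nearest training points in Euclidean distance. The sketch below (illustrative only, on simulated data) implements this rule directly.
##################################
# Illustrative sketch only (simulated data): k-nearest neighbors
# regression as the mean response of the k closest training points
##################################
set.seed(12345678)
train_x <- scale(matrix(rnorm(200), ncol = 2))     # 100 scaled training points
train_y <- rowSums(train_x) + rnorm(100, sd = 0.1)
test_x  <- c(0.5, -0.5)
k <- 3
# Euclidean distances from the test point to every training point
d <- sqrt(rowSums(sweep(train_x, 2, test_x)^2))
# Average the responses of the k nearest neighbors
mean(train_y[order(d)[1:k]])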
##################################
# Transforming factor predictors
# as required by the nature of the model
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_KNN <- as.data.frame(lapply(PMA_PreModelling_Train, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Train_KNN)
## [1] 951 221
PMA_PreModelling_Test_KNN <- as.data.frame(lapply(PMA_PreModelling_Test, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Test_KNN)
## [1] 316 221
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_KNN$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
KNN_Grid = data.frame(k = 1:15)
##################################
# Running the k-nearest neighbors model
# by setting the caret method to 'knn'
##################################
set.seed(12345678)
KNN_Tune <- train(x = PMA_PreModelling_Train_KNN[,!names(PMA_PreModelling_Train_KNN) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_KNN$Log_Solubility,
method = "knn",
tuneGrid = KNN_Grid,
trControl = KFold_Control,
preProc = c("center", "scale"))
##################################
# Reporting the cross-validation results
# for the train set
##################################
KNN_Tune
## k-Nearest Neighbors
##
## 951 samples
## 220 predictors
##
## Pre-processing: centered (220), scaled (220)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 1.193390 0.6909384 0.8777626
## 2 1.109959 0.7159649 0.8060134
## 3 1.067291 0.7326032 0.7947394
## 4 1.068531 0.7307165 0.8044122
## 5 1.071815 0.7288216 0.8057926
## 6 1.076846 0.7255801 0.8051050
## 7 1.077346 0.7255634 0.8133125
## 8 1.073705 0.7266408 0.8133501
## 9 1.068856 0.7273001 0.8113416
## 10 1.075687 0.7244911 0.8189207
## 11 1.083810 0.7206933 0.8290562
## 12 1.092858 0.7149748 0.8374816
## 13 1.106658 0.7079159 0.8493934
## 14 1.113444 0.7055549 0.8521571
## 15 1.121625 0.7002523 0.8661098
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 3.
## 3-nearest neighbor regression model
## k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 1.193390 0.6909384 0.8777626 0.10476073 0.04689864 0.05661709
## 2 2 1.109959 0.7159649 0.8060134 0.06560494 0.06008038 0.03160597
## 3 3 1.067291 0.7326032 0.7947394 0.06626369 0.06949454 0.03079834
## 4 4 1.068531 0.7307165 0.8044122 0.06667622 0.06567027 0.03511312
## 5 5 1.071815 0.7288216 0.8057926 0.06567005 0.06155802 0.04694389
## 6 6 1.076846 0.7255801 0.8051050 0.08324859 0.06058341 0.06353121
## 7 7 1.077346 0.7255634 0.8133125 0.07055653 0.06059999 0.06023208
## 8 8 1.073705 0.7266408 0.8133501 0.05678811 0.05955824 0.05362260
## 9 9 1.068856 0.7273001 0.8113416 0.05777880 0.06681028 0.05431120
## 10 10 1.075687 0.7244911 0.8189207 0.05823014 0.06579563 0.05809347
## 11 11 1.083810 0.7206933 0.8290562 0.05604450 0.06217820 0.05685605
## 12 12 1.092858 0.7149748 0.8374816 0.05824283 0.06402713 0.05709366
## 13 13 1.106658 0.7079159 0.8493934 0.05899373 0.06415460 0.05468233
## 14 14 1.113444 0.7055549 0.8521571 0.05716728 0.06139267 0.05081140
## 15 15 1.121625 0.7002523 0.8661098 0.06343865 0.06509901 0.05939330
(KNN_Train_RMSE <- KNN_Tune$results[KNN_Tune$results$k==KNN_Tune$bestTune$k,
c("RMSE")])
## [1] 1.067291
(KNN_Train_Rsquared <- KNN_Tune$results[KNN_Tune$results$k==KNN_Tune$bestTune$k,
c("Rsquared")])
## [1] 0.7326032
##################################
# Identifying and plotting the
# best model predictors
##################################
# model does not support variable importance measurement
##################################
# Independently evaluating the model
# on the test set
##################################
KNN_Test <- data.frame(KNN_Observed = PMA_PreModelling_Test$Log_Solubility,
KNN_Predicted = predict(KNN_Tune,
PMA_PreModelling_Test_KNN[,!names(PMA_PreModelling_Test_KNN) %in% c("Log_Solubility")]))
KNN_Test
## KNN_Observed KNN_Predicted
## 1 0.93 0.196666667
## 2 0.85 0.003333333
## 3 0.81 -1.256666667
## 4 0.74 0.876666667
## 5 0.61 -0.453333333
## 6 0.58 0.150000000
## 7 0.57 -1.213333333
## 8 0.56 -0.170000000
## 9 0.52 -0.303333333
## 10 0.45 0.373333333
## 11 0.40 0.373333333
## 12 0.36 0.250000000
## 13 0.22 -0.890000000
## 14 0.08 -0.873333333
## 15 0.07 -0.810000000
## 16 0.02 -1.030000000
## 17 0.00 -0.646666667
## 18 -0.01 -1.246666667
## 19 -0.07 -0.526666667
## 20 -0.12 -1.460000000
## 21 -0.17 0.610000000
## 22 -0.29 -0.860000000
## 23 -0.38 -0.730000000
## 24 -0.38 -2.070000000
## 25 -0.39 -0.676666667
## 26 -0.42 -1.803333333
## 27 -0.44 -2.093333333
## 28 -0.46 -1.186666667
## 29 -0.48 -2.086666667
## 30 -0.60 -2.520000000
## 31 -0.63 -2.693333333
## 32 -0.66 -0.713333333
## 33 -0.72 -0.853333333
## 34 -0.72 0.013333333
## 35 -0.80 -1.723333333
## 36 -0.80 -0.623333333
## 37 -0.82 -0.573333333
## 38 -0.82 -1.166666667
## 39 -0.84 -1.180000000
## 40 -0.85 -2.766666667
## 41 -0.85 -0.143333333
## 42 -0.87 -1.030000000
## 43 -0.89 -2.060000000
## 44 -0.90 1.223333333
## 45 -0.96 -1.340000000
## 46 -0.96 -1.493333333
## 47 -0.99 -0.966666667
## 48 -1.01 -0.516666667
## 49 -1.09 -1.500000000
## 50 -1.12 -0.743333333
## 51 -1.14 -0.716666667
## 52 -1.17 -1.726666667
## 53 -1.19 -0.916666667
## 54 -1.22 -0.566666667
## 55 -1.27 -0.806666667
## 56 -1.28 -1.523333333
## 57 -1.32 -1.210000000
## 58 -1.38 -1.220000000
## 59 -1.39 -1.973333333
## 60 -1.42 -2.245000000
## 61 -1.47 -1.510000000
## 62 -1.47 -1.910000000
## 63 -1.50 -0.650000000
## 64 -1.52 -1.030000000
## 65 -1.54 -1.440000000
## 66 -1.55 -1.206666667
## 67 -1.56 -1.946666667
## 68 -1.57 -2.023333333
## 69 -1.60 -1.340000000
## 70 -1.60 -2.113333333
## 71 -1.62 -2.616666667
## 72 -1.64 -2.580000000
## 73 -1.67 -1.983333333
## 74 -1.70 -2.010000000
## 75 -1.70 -1.163333333
## 76 -1.71 -2.920000000
## 77 -1.71 -2.383333333
## 78 -1.75 -2.086666667
## 79 -1.78 -1.513333333
## 80 -1.78 -1.706666667
## 81 -1.82 -1.453333333
## 82 -1.87 -3.180000000
## 83 -1.89 -1.073333333
## 84 -1.92 -2.463333333
## 85 -1.92 -2.810000000
## 86 -1.92 -0.600000000
## 87 -1.94 -3.193333333
## 88 -1.99 -0.743333333
## 89 -2.00 -2.780000000
## 90 -2.05 -2.436666667
## 91 -2.06 -2.416666667
## 92 -2.08 -1.440000000
## 93 -2.10 -3.763333333
## 94 -2.11 -1.963333333
## 95 -2.12 -0.256666667
## 96 -2.17 -1.860000000
## 97 -2.21 -2.190000000
## 98 -2.24 -2.853333333
## 99 -2.24 -1.983333333
## 100 -2.29 -1.910000000
## 101 -2.31 -3.276666667
## 102 -2.32 -2.226666667
## 103 -2.35 -1.453333333
## 104 -2.35 -2.006666667
## 105 -2.36 -3.410000000
## 106 -2.36 -1.353333333
## 107 -2.38 -2.376666667
## 108 -2.42 -3.543333333
## 109 -2.43 -3.303333333
## 110 -2.44 -2.457500000
## 111 -2.52 -3.010000000
## 112 -2.53 -2.170000000
## 113 -2.57 -3.010000000
## 114 -2.62 -2.457500000
## 115 -2.62 -3.006666667
## 116 -2.64 -3.296666667
## 117 -2.64 -3.640000000
## 118 -2.70 -1.073333333
## 119 -2.82 -1.526666667
## 120 -2.88 -2.836666667
## 121 -2.89 -3.410000000
## 122 -2.92 -1.193333333
## 123 -2.93 -2.886666667
## 124 -2.96 -2.526666667
## 125 -2.98 -0.703333333
## 126 -3.01 -2.190000000
## 127 -3.01 -3.913333333
## 128 -3.02 -4.086666667
## 129 -3.07 -3.230000000
## 130 -3.09 -3.986666667
## 131 -3.11 -3.126666667
## 132 -3.13 -3.336666667
## 133 -3.14 -2.836666667
## 134 -3.15 -1.203333333
## 135 -3.22 -1.546666667
## 136 -3.26 -3.626666667
## 137 -3.27 -2.403333333
## 138 -3.27 -1.936666667
## 139 -3.30 -4.446666667
## 140 -3.31 -2.403333333
## 141 -3.33 -3.096666667
## 142 -3.37 -2.870000000
## 143 -3.43 -3.700000000
## 144 -3.43 -2.763333333
## 145 -3.48 -1.613333333
## 146 -3.51 -0.790000000
## 147 -3.59 -2.457500000
## 148 -3.61 -3.096666667
## 149 -3.63 -3.356666667
## 150 -3.63 -4.473333333
## 151 -3.68 -0.993333333
## 152 -3.71 -3.150000000
## 153 -3.74 -3.260000000
## 154 -3.75 -3.126666667
## 155 -3.75 -3.813333333
## 156 -3.77 -4.416666667
## 157 -3.77 -4.773333333
## 158 -3.78 -3.006666667
## 159 -3.81 -3.426666667
## 160 -3.95 -2.236666667
## 161 -3.96 -6.576666667
## 162 -3.96 -4.566666667
## 163 -4.00 -3.430000000
## 164 -4.02 -2.816666667
## 165 -4.04 -2.823333333
## 166 -4.12 -3.960000000
## 167 -4.15 -4.220000000
## 168 -4.16 -2.290000000
## 169 -4.17 -2.670000000
## 170 -4.21 -1.136666667
## 171 -4.23 -4.930000000
## 172 -4.25 -4.446666667
## 173 -4.30 -4.166666667
## 174 -4.31 -5.940000000
## 175 -4.35 -3.976666667
## 176 -4.40 -4.116666667
## 177 -4.40 -3.823333333
## 178 -4.43 -4.146666667
## 179 -4.46 -4.513333333
## 180 -4.47 -4.446666667
## 181 -4.51 -3.336666667
## 182 -4.60 -4.316666667
## 183 -4.64 -4.213333333
## 184 -4.69 -3.340000000
## 185 -4.71 -4.210000000
## 186 -4.77 -3.726666667
## 187 -4.95 -3.866666667
## 188 -4.98 -2.936666667
## 189 -5.21 -6.106666667
## 190 -5.22 -6.543333333
## 191 -5.28 -4.703333333
## 192 -5.31 -3.150000000
## 193 -5.35 -5.003333333
## 194 -5.37 -3.986666667
## 195 -5.40 -3.626666667
## 196 -5.43 -1.030000000
## 197 -5.65 -4.083333333
## 198 -5.66 -4.210000000
## 199 -6.70 -2.666666667
## 200 -5.72 -5.020000000
## 201 -6.00 -7.723333333
## 202 -6.25 -7.086666667
## 203 -6.26 -6.426666667
## 204 -6.27 -6.030000000
## 205 -6.35 -6.750000000
## 206 -6.57 -4.773333333
## 207 -6.62 -5.020000000
## 208 -6.96 -4.773333333
## 209 -7.02 -4.773333333
## 210 -7.20 -7.163333333
## 211 -7.28 -8.136666667
## 212 -7.32 -6.615000000
## 213 -7.39 -8.296666667
## 214 -7.82 -8.296666667
## 215 -8.23 -8.226666667
## 216 -8.94 -7.670000000
## 217 1.07 -0.170000000
## 218 0.43 -0.580000000
## 219 0.32 0.593333333
## 220 0.00 -1.156666667
## 221 -0.40 -1.913333333
## 222 -0.52 -0.676666667
## 223 -0.55 -0.956666667
## 224 -0.60 -1.813333333
## 225 -0.62 -3.086666667
## 226 -0.85 -1.453333333
## 227 -0.89 -1.580000000
## 228 -0.93 -1.723333333
## 229 -0.96 0.410000000
## 230 -1.06 -2.580000000
## 231 -1.10 -1.946666667
## 232 -1.12 -0.923333333
## 233 -1.15 -0.730000000
## 234 -1.28 -0.560000000
## 235 -1.30 -1.546666667
## 236 -1.31 -1.170000000
## 237 -1.35 -0.790000000
## 238 -1.39 -3.230000000
## 239 -1.41 -0.916666667
## 240 -1.41 -0.763333333
## 241 -1.42 -2.396666667
## 242 -1.46 -0.590000000
## 243 -1.50 -1.590000000
## 244 -1.50 -2.906666667
## 245 -1.52 -2.090000000
## 246 -1.52 -1.270000000
## 247 -1.59 -2.066666667
## 248 -1.61 -1.706666667
## 249 -1.63 -1.780000000
## 250 -1.71 -2.236666667
## 251 -1.83 -1.310000000
## 252 -2.05 -1.846666667
## 253 -2.06 -1.956666667
## 254 -2.07 -0.787500000
## 255 -2.15 -2.716666667
## 256 -2.16 -2.423333333
## 257 -1.99 -1.083333333
## 258 -2.36 -2.793333333
## 259 -2.38 -3.436666667
## 260 -2.39 -1.226666667
## 261 -2.46 -2.303333333
## 262 -2.49 -1.820000000
## 263 -2.54 -2.836666667
## 264 -2.55 -1.790000000
## 265 -2.63 -3.072500000
## 266 -2.64 -2.286666667
## 267 -2.67 -2.770000000
## 268 -2.68 0.730000000
## 269 -2.77 -3.960000000
## 270 -2.78 -1.810000000
## 271 -2.82 -1.526666667
## 272 -2.92 -2.156666667
## 273 -3.03 -2.873333333
## 274 -3.12 -1.956666667
## 275 -3.16 -3.240000000
## 276 -3.19 -3.873333333
## 277 -3.54 -3.416666667
## 278 -3.54 -3.086666667
## 279 -3.59 -4.116666667
## 280 -3.66 -3.013333333
## 281 -3.68 -3.036666667
## 282 -3.75 -3.096666667
## 283 -3.76 -3.570000000
## 284 -3.78 -2.920000000
## 285 -3.80 -3.400000000
## 286 -3.80 -2.273333333
## 287 -3.85 -4.446666667
## 288 -3.89 -2.386666667
## 289 -3.95 -3.916666667
## 290 -4.29 -4.886666667
## 291 -4.42 -2.866666667
## 292 -4.48 -3.626666667
## 293 -4.48 -2.053333333
## 294 -4.53 -5.730000000
## 295 -4.63 -4.386666667
## 296 -4.73 -2.473333333
## 297 -4.84 -4.566666667
## 298 -4.89 -3.410000000
## 299 -4.89 -4.383333333
## 300 -5.26 -6.133333333
## 301 -6.09 -5.020000000
## 302 -6.29 -6.443333333
## 303 -6.29 -6.283333333
## 304 -6.89 -2.343333333
## 305 -6.96 -4.216666667
## 306 -7.00 -6.996666667
## 307 -7.05 -8.136666667
## 308 -8.30 -8.083333333
## 309 -8.66 -7.800000000
## 310 -9.03 -7.596666667
## 311 -10.41 -9.523333333
## 312 -7.89 -6.996666667
## 313 -2.32 -2.183333333
## 314 0.39 -3.086666667
## 315 -2.90 -1.890000000
## 316 -2.47 -5.940000000
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(KNN_Test_Metrics <- postResample(KNN_Test[,2], KNN_Test[,1]))
## RMSE Rsquared MAE
## 1.1247103 0.7137298 0.8436946
(KNN_Test_RMSE <- KNN_Test_Metrics[1])
## RMSE
## 1.12471
(KNN_Test_Rsquared <- KNN_Test_Metrics[2])
## Rsquared
## 0.7137298
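As a sanity check, the postResample() metrics can be reproduced manually from the observed and predicted columns of the KNN_Test object; the sketch below is a minimal verification using base R only (caret's Rsquared is the squared correlation between observed and predicted values).
##################################
# Manually verifying the test set metrics
# (a minimal sketch using the KNN_Test
# object created above)
##################################
KNN_Observed  <- KNN_Test[,1]
KNN_Predicted <- KNN_Test[,2]
(Manual_RMSE     <- sqrt(mean((KNN_Observed - KNN_Predicted)^2)))
(Manual_Rsquared <- cor(KNN_Observed, KNN_Predicted)^2)
(Manual_MAE      <- mean(abs(KNN_Observed - KNN_Predicted)))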
1.5.12 Classification and Regression Trees (CART)
[A] The classification and regression trees model from the rpart package was implemented through the caret package.
[B] The model contains 1 hyperparameter:
[B.1] cp = complexity parameter, varied across a range of values from 0.001 to 0.020
[C] The cross-validated model performance of the final model is summarized as follows:
[C.1] Final model configuration involves cp=0.001
[C.2] Root-Mean-Square Error = 0.94904
[C.3] R-Squared = 0.78141
[D] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows:
[D.1] MolWeight variable (numeric)
[D.2] NumCarbon variable (numeric)
[D.3] HydrophilicFactor variable (numeric)
[D.4] NumBonds variable (numeric)
[D.5] SurfaceArea1 variable (numeric)
[E] The independent test model performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.91946
[E.2] R-Squared = 0.80707
##################################
# Creating a local object
# for the train set
##################################
PMA_PreModelling_Train_CART <- PMA_PreModelling_Train
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_CART$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
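The per-fold training set sizes implied by these indices can be listed directly and should match the sample sizes reported in the cross-validation summary below; a minimal check:
##################################
# Inspecting the training set size
# retained within each fold
##################################
sapply(KFold_Indices, length)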
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
CART_Grid = data.frame(cp = c(0.001, 0.005, 0.010, 0.015, 0.020))
##################################
# Running the classification and regression trees model
# by setting the caret method to 'rpart'
##################################
set.seed(12345678)
CART_Tune <- train(x = PMA_PreModelling_Train_CART[,!names(PMA_PreModelling_Train_CART) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_CART$Log_Solubility,
method = "rpart",
tuneGrid = CART_Grid,
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
CART_Tune
## CART
##
## 951 samples
## 220 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.001 0.949044 0.7814193 0.7165704
## 0.005 1.062111 0.7304621 0.8158512
## 0.010 1.094310 0.7129754 0.8405203
## 0.015 1.141204 0.6879057 0.8863220
## 0.020 1.159802 0.6767517 0.9117055
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.001.
## n= 951
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 951 3979.3030000 -2.71857000
## 2) NumCarbon>=0.4506182 317 1114.2000000 -4.48536300
## 4) SurfaceArea2< -1.012864 74 173.0767000 -6.86783800
## 8) MolWeight>=0.4710593 48 58.8117900 -7.68208300
## 16) HydrophilicFactor>=-0.5793982 16 18.1638400 -8.59187500 *
## 17) HydrophilicFactor< -0.5793982 32 20.7826500 -7.22718800
## 34) FP184=0 13 4.7800770 -7.89307700 *
## 35) FP184=1 19 6.2942530 -6.77157900 *
## 9) MolWeight< 0.4710593 26 23.6894500 -5.36461500
## 18) MolWeight>=0.1178222 12 7.1820670 -6.18333300 *
## 19) MolWeight< 0.1178222 14 1.5692860 -4.66285700 *
## 5) SurfaceArea2>=-1.012864 243 393.1730000 -3.75983500
## 10) HydrophilicFactor< 0.2078952 159 237.7748000 -4.07528300
## 20) MolWeight>=1.101123 60 65.5968900 -4.64950000
## 40) FP077=0 17 18.8918500 -5.39176500 *
## 41) FP077=1 43 33.6358300 -4.35604700
## 82) FP119=0 35 20.8480300 -4.57857100 *
## 83) FP119=1 8 3.4723500 -3.38250000 *
## 21) MolWeight< 1.101123 99 140.4044000 -3.72727300
## 42) FP075=0 55 69.1337300 -4.25290900
## 84) NumRotBonds>=2.179213 8 8.5670000 -5.56500000 *
## 85) NumRotBonds< 2.179213 47 44.4497900 -4.02957400
## 170) MolWeight>=0.4202744 26 20.8555500 -4.41692300
## 340) FP017=0 15 13.0163300 -4.79666700 *
## 341) FP017=1 11 2.7264910 -3.89909100 *
## 171) MolWeight< 0.4202744 21 14.8634000 -3.55000000 *
## 43) FP075=1 44 37.0793000 -3.07022700
## 86) HydrophilicFactor< -0.6757562 14 12.6063200 -3.73357100 *
## 87) HydrophilicFactor>=-0.6757562 30 15.4377900 -2.76066700 *
## 11) HydrophilicFactor>=0.2078952 84 109.6285000 -3.16273800
## 22) NumMultBonds>=1.228149 28 34.5152700 -3.77892900
## 44) FP131=0 19 23.0277700 -4.12263200 *
## 45) FP131=1 9 4.5046000 -3.05333300 *
## 23) NumMultBonds< 1.228149 56 59.1661900 -2.85464300
## 46) FP081=1 24 15.5219800 -3.45916700
## 92) FP125=0 12 4.3290920 -3.90583300 *
## 93) FP125=1 12 6.4046250 -3.01250000 *
## 47) FP081=0 32 28.2953500 -2.40125000
## 94) FP088=0 24 13.3008000 -2.74791700 *
## 95) FP088=1 8 3.4574880 -1.36125000 *
## 3) NumCarbon< 0.4506182 634 1380.8030000 -1.83517400
## 6) MolWeight>=0.364972 104 173.9672000 -3.32846200
## 12) SurfaceArea1< 0.1308092 43 51.7177400 -4.24674400
## 24) MolWeight>=0.8557128 18 13.6410000 -5.09333300 *
## 25) MolWeight< 0.8557128 25 15.8873000 -3.63720000 *
## 13) SurfaceArea1>=0.1308092 61 60.4300200 -2.68114800
## 26) NumOxygen< 1.113753 47 25.8753700 -2.99914900
## 52) FP001=0 7 4.7345710 -3.99571400 *
## 53) FP001=1 40 12.9722000 -2.82475000 *
## 27) NumOxygen>=1.113753 14 13.8457200 -1.61357100 *
## 7) MolWeight< 0.364972 530 929.4181000 -1.54215100
## 14) SurfaceArea2< -1.012864 118 121.2846000 -2.81500000
## 28) NumBonds>=-0.4530519 37 11.1113100 -3.96432400
## 56) NumBonds>=-0.1671013 20 3.4969000 -4.31500000 *
## 57) NumBonds< -0.1671013 17 2.2614470 -3.55176500 *
## 29) NumBonds< -0.4530519 81 38.9726000 -2.29000000
## 58) NumBonds>=-1.383145 51 14.1965000 -2.62686300
## 116) FP172=1 8 0.7938000 -3.34500000 *
## 117) FP172=0 43 8.5093440 -2.49325600 *
## 59) NumBonds< -1.383145 30 9.1503870 -1.71733300 *
## 15) SurfaceArea2>=-1.012864 412 562.2019000 -1.17759700
## 30) MolWeight>=-0.8064806 241 241.0830000 -1.74688800
## 60) NumRotBonds>=1.764021 14 3.0914860 -3.28714300 *
## 61) NumRotBonds< 1.764021 227 202.7297000 -1.65189400
## 122) NumOxygen< 1.113753 205 153.1710000 -1.76263400
## 244) HydrophilicFactor< -0.7050632 24 12.3292600 -2.41125000 *
## 245) HydrophilicFactor>=-0.7050632 181 129.4060000 -1.67663000
## 490) FP065=1 121 70.4388200 -1.86504100
## 980) MolWeight>=-0.6589892 107 57.1190600 -1.96775700
## 1960) NumChlorine>=-0.04017528 20 11.5382200 -2.61700000 *
## 1961) NumChlorine< -0.04017528 87 35.2125100 -1.81850600
## 3922) HydrophilicFactor>=-0.4643903 72 23.8945900 -1.92791700 *
## 3923) HydrophilicFactor< -0.4643903 15 6.3189330 -1.29333300 *
## 981) MolWeight< -0.6589892 14 3.5628000 -1.08000000 *
## 491) FP065=0 60 46.0095300 -1.29666700
## 982) FP102=0 53 30.9962800 -1.45717000
## 1964) FP145=0 44 20.9088800 -1.63431800 *
## 1965) FP145=1 9 1.9560890 -0.59111110 *
## 983) FP102=1 7 3.3102860 -0.08142857 *
## 123) NumOxygen>=1.113753 22 23.6190000 -0.62000000
## 246) FP168=1 9 0.7426222 -1.58444400 *
## 247) FP168=0 13 8.7094310 0.04769231 *
## 31) MolWeight< -0.8064806 171 132.9333000 -0.37526320
## 62) NumCarbon>=-0.8205617 86 36.7958900 -0.89581400
## 124) FP116=0 78 18.6207500 -1.04076900 *
## 125) FP116=1 8 0.5565500 0.51750000 *
## 63) NumCarbon< -0.8205617 85 49.2558300 0.15141180
## 126) MolWeight>=-1.497949 47 24.5774900 -0.10638300 *
## 127) MolWeight< -1.497949 38 17.6915000 0.47026320
## 254) FP063=0 18 5.5220940 -0.01055556 *
## 255) FP063=1 20 4.2628200 0.90300000 *
## cp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.001 0.949044 0.7814193 0.7165704 0.06100448 0.05338094 0.04092924
## 2 0.005 1.062111 0.7304621 0.8158512 0.04537841 0.04743129 0.03111770
## 3 0.010 1.094310 0.7129754 0.8405203 0.04536525 0.05532136 0.04079579
## 4 0.015 1.141204 0.6879057 0.8863220 0.07570284 0.04717980 0.07258210
## 5 0.020 1.159802 0.6767517 0.9117055 0.07943956 0.04864369 0.08872587
(CART_Train_RMSE <- CART_Tune$results[CART_Tune$results$cp==CART_Tune$bestTune$cp,
c("RMSE")])
## [1] 0.949044
(CART_Train_Rsquared <- CART_Tune$results[CART_Tune$results$cp==CART_Tune$bestTune$cp,
c("Rsquared")])
## [1] 0.7814193
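The final tree printed above can also be rendered graphically; this is a minimal sketch that assumes the rpart.plot package is available (it is not used elsewhere in this analysis), applied to the rpart object stored inside the tuned caret model.
##################################
# Visualizing the final CART model
# (assumes the rpart.plot package
# is installed)
##################################
library(rpart.plot)
rpart.plot(CART_Tune$finalModel,
           main="Final CART Model : cp=0.001")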
##################################
# Identifying and plotting the
# best model predictors
##################################
CART_VarImp <- varImp(CART_Tune, scale = TRUE)
plot(CART_VarImp,
top=25,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Classification and Regression Trees",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)
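The scaled importance values behind the plot can also be listed directly; the sketch below extracts the five highest-ranked predictors from the importance data frame stored in the varImp object.
##################################
# Listing the top-ranked predictors
# from the variable importance object
##################################
CART_VarImp_DF <- CART_VarImp$importance
head(CART_VarImp_DF[order(-CART_VarImp_DF$Overall), , drop=FALSE], 5)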

##################################
# Independently evaluating the model
# on the test set
##################################
CART_Test <- data.frame(CART_Observed = PMA_PreModelling_Test$Log_Solubility,
CART_Predicted = predict(CART_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Log_Solubility")]))
CART_Test
## CART_Observed CART_Predicted
## 20 0.93 -0.10638298
## 21 0.85 -0.10638298
## 23 0.81 -1.04076923
## 25 0.74 -0.10638298
## 28 0.61 -1.63431818
## 31 0.58 -0.01055556
## 32 0.57 -0.01055556
## 33 0.56 -0.01055556
## 34 0.52 -0.01055556
## 37 0.45 0.51750000
## 38 0.40 0.51750000
## 42 0.36 -1.92791667
## 49 0.22 -0.10638298
## 54 0.08 -0.10638298
## 55 0.07 -1.04076923
## 58 0.02 -1.29333333
## 60 0.00 -1.04076923
## 61 -0.01 -0.01055556
## 65 -0.07 -0.10638298
## 69 -0.12 -1.29333333
## 73 -0.17 0.04769231
## 86 -0.29 -0.10638298
## 90 -0.38 -1.04076923
## 91 -0.38 -1.04076923
## 93 -0.39 -1.04076923
## 96 -0.42 -1.04076923
## 98 -0.44 -1.04076923
## 100 -0.46 -0.10638298
## 104 -0.48 -5.39176471
## 112 -0.60 -1.04076923
## 115 -0.63 -2.76066667
## 119 -0.66 -0.10638298
## 128 -0.72 -0.10638298
## 130 -0.72 -1.04076923
## 139 -0.80 -1.04076923
## 143 -0.80 -1.63431818
## 145 -0.82 -0.01055556
## 146 -0.82 -0.10638298
## 149 -0.84 -1.63431818
## 150 -0.85 -0.01055556
## 152 -0.85 -0.01055556
## 157 -0.87 -2.61700000
## 161 -0.89 -1.63431818
## 162 -0.90 -0.10638298
## 166 -0.96 -3.28714286
## 167 -0.96 0.90300000
## 173 -0.99 -0.10638298
## 176 -1.01 -0.10638298
## 182 -1.09 -1.71733333
## 187 -1.12 -1.92791667
## 190 -1.14 -0.10638298
## 194 -1.17 -1.71733333
## 195 -1.19 -1.08000000
## 201 -1.22 -1.04076923
## 207 -1.27 -1.63431818
## 208 -1.28 -1.04076923
## 215 -1.32 -1.71733333
## 222 -1.38 -1.63431818
## 224 -1.39 -1.92791667
## 231 -1.42 -2.41125000
## 236 -1.47 -1.63431818
## 237 -1.47 -1.71733333
## 240 -1.50 -0.08142857
## 243 -1.52 -1.04076923
## 248 -1.54 -1.08000000
## 251 -1.55 -2.76066667
## 256 -1.56 -1.92791667
## 258 -1.57 -2.61700000
## 262 -1.60 -1.04076923
## 266 -1.60 -1.63431818
## 272 -1.62 -3.63720000
## 280 -1.64 -2.82475000
## 283 -1.67 -1.63431818
## 286 -1.70 -2.82475000
## 287 -1.70 -1.08000000
## 289 -1.71 -1.71733333
## 290 -1.71 -2.82475000
## 298 -1.75 -1.63431818
## 305 -1.78 -1.92791667
## 306 -1.78 -1.61357143
## 312 -1.82 -1.92791667
## 320 -1.87 -1.04076923
## 325 -1.89 -3.28714286
## 332 -1.92 -1.63431818
## 333 -1.92 -2.41125000
## 335 -1.92 -1.63431818
## 339 -1.94 -2.41125000
## 346 -1.99 -1.92791667
## 347 -2.00 -1.71733333
## 350 -2.05 -2.41125000
## 353 -2.06 -1.63431818
## 358 -2.08 -1.92791667
## 365 -2.10 -2.61700000
## 367 -2.11 -1.58444444
## 370 -2.12 -1.92791667
## 379 -2.17 -1.92791667
## 386 -2.21 -2.49325581
## 394 -2.24 -3.99571429
## 396 -2.24 -1.92791667
## 400 -2.29 -1.71733333
## 404 -2.31 -1.71733333
## 405 -2.32 -1.63431818
## 413 -2.35 -1.92791667
## 415 -2.35 -1.92791667
## 417 -2.36 -2.49325581
## 418 -2.36 -0.08142857
## 423 -2.38 -3.34500000
## 434 -2.42 -1.92791667
## 437 -2.43 -2.82475000
## 440 -2.44 -2.82475000
## 449 -2.52 -2.82475000
## 450 -2.53 -2.61700000
## 457 -2.57 -2.82475000
## 467 -2.62 -2.82475000
## 469 -2.62 -1.61357143
## 474 -2.64 -3.73357143
## 475 -2.64 -2.76066667
## 485 -2.70 -3.28714286
## 504 -2.82 -4.12263158
## 511 -2.88 -2.74791667
## 512 -2.89 -2.61700000
## 517 -2.92 -1.63431818
## 519 -2.93 -4.12263158
## 520 -2.96 -2.49325581
## 522 -2.98 -3.73357143
## 527 -3.01 -2.76066667
## 528 -3.01 -3.55176471
## 529 -3.02 -3.99571429
## 537 -3.07 -2.49325581
## 540 -3.09 -3.89909091
## 541 -3.11 -4.31500000
## 547 -3.13 -2.76066667
## 550 -3.14 -2.74791667
## 555 -3.15 -4.12263158
## 564 -3.22 -5.09333333
## 570 -3.26 -3.90583333
## 573 -3.27 -2.82475000
## 575 -3.27 -4.31500000
## 578 -3.30 -3.55176471
## 581 -3.31 -2.61700000
## 585 -3.33 -3.99571429
## 590 -3.37 -4.79666667
## 601 -3.43 -3.01250000
## 602 -3.43 -3.63720000
## 607 -3.48 -3.55000000
## 610 -3.51 -3.55000000
## 618 -3.59 -3.05333333
## 624 -3.61 -3.99571429
## 626 -3.63 -3.28714286
## 627 -3.63 -3.34500000
## 634 -3.68 -1.36125000
## 640 -3.71 -4.57857143
## 642 -3.74 -3.55176471
## 643 -3.75 -4.31500000
## 644 -3.75 -3.55000000
## 645 -3.77 -3.63720000
## 646 -3.77 -4.31500000
## 647 -3.78 -3.05333333
## 652 -3.81 -3.55176471
## 658 -3.95 -2.74791667
## 659 -3.96 -4.66285714
## 660 -3.96 -3.89909091
## 664 -4.00 -3.99571429
## 666 -4.02 -5.09333333
## 667 -4.04 -5.39176471
## 675 -4.12 -4.31500000
## 680 -4.15 -4.57857143
## 681 -4.16 -2.76066667
## 687 -4.17 -4.66285714
## 694 -4.21 -5.39176471
## 697 -4.23 -5.09333333
## 701 -4.25 -4.31500000
## 705 -4.30 -3.01250000
## 707 -4.31 -5.56500000
## 710 -4.35 -3.63720000
## 716 -4.40 -3.89909091
## 719 -4.40 -3.73357143
## 720 -4.43 -4.57857143
## 725 -4.46 -4.57857143
## 727 -4.47 -4.31500000
## 730 -4.51 -5.09333333
## 738 -4.60 -3.55000000
## 745 -4.64 -4.31500000
## 748 -4.69 -5.56500000
## 751 -4.71 -4.57857143
## 756 -4.77 -2.76066667
## 766 -4.95 -4.12263158
## 769 -4.98 -3.63720000
## 783 -5.21 -6.77157895
## 785 -5.22 -4.66285714
## 790 -5.28 -3.89909091
## 793 -5.31 -3.55000000
## 795 -5.35 -4.57857143
## 796 -5.37 -4.57857143
## 797 -5.40 -3.90583333
## 801 -5.43 -5.09333333
## 811 -5.65 -3.63720000
## 812 -5.66 -4.57857143
## 815 -6.70 -3.89909091
## 816 -5.72 -5.09333333
## 817 -6.00 -6.18333333
## 824 -6.25 -6.77157895
## 825 -6.26 -6.77157895
## 826 -6.27 -6.77157895
## 830 -6.35 -4.66285714
## 837 -6.57 -6.18333333
## 838 -6.62 -5.09333333
## 844 -6.96 -6.18333333
## 845 -7.02 -7.89307692
## 847 -7.20 -7.89307692
## 850 -7.28 -6.77157895
## 852 -7.32 -6.77157895
## 853 -7.39 -6.77157895
## 861 -7.82 -8.59187500
## 868 -8.23 -7.89307692
## 874 -8.94 -8.59187500
## 879 1.07 -0.01055556
## 895 0.43 -0.01055556
## 899 0.32 0.90300000
## 903 0.00 -0.01055556
## 917 -0.40 -1.92791667
## 927 -0.52 -0.10638298
## 929 -0.55 -0.10638298
## 931 -0.60 -0.10638298
## 933 -0.62 -1.92791667
## 944 -0.85 -1.04076923
## 947 -0.89 -1.04076923
## 949 -0.93 -1.63431818
## 953 -0.96 -0.10638298
## 958 -1.06 -2.41125000
## 961 -1.10 -1.04076923
## 963 -1.12 -1.63431818
## 964 -1.15 -1.04076923
## 973 -1.28 -1.08000000
## 976 -1.30 -1.71733333
## 977 -1.31 -1.08000000
## 980 -1.35 -3.99571429
## 983 -1.39 -1.29333333
## 984 -1.41 -1.08000000
## 986 -1.41 -1.92791667
## 989 -1.42 -1.04076923
## 991 -1.46 -1.63431818
## 996 -1.50 -1.71733333
## 997 -1.50 -1.29333333
## 999 -1.52 -1.08000000
## 1000 -1.52 -1.08000000
## 1003 -1.59 -1.71733333
## 1008 -1.61 -1.92791667
## 1009 -1.63 -1.63431818
## 1014 -1.71 -1.63431818
## 1015 -1.83 -2.74791667
## 1040 -2.05 -1.58444444
## 1042 -2.06 -2.82475000
## 1043 -2.07 -1.29333333
## 1050 -2.15 -1.63431818
## 1052 -2.16 -1.04076923
## 1056 -1.99 -1.61357143
## 1070 -2.36 -2.82475000
## 1073 -2.38 -2.49325581
## 1074 -2.39 -0.10638298
## 1079 -2.46 -1.63431818
## 1080 -2.49 -2.41125000
## 1085 -2.54 -1.71733333
## 1087 -2.55 -3.55000000
## 1096 -2.63 -4.57857143
## 1099 -2.64 -2.74791667
## 1100 -2.67 -2.41125000
## 1102 -2.68 -2.82475000
## 1107 -2.77 -3.55176471
## 1109 -2.78 -4.12263158
## 1114 -2.82 -2.49325581
## 1118 -2.92 -2.74791667
## 1123 -3.03 -2.82475000
## 1132 -3.12 -3.55000000
## 1134 -3.16 -3.55000000
## 1137 -3.19 -3.34500000
## 1154 -3.54 -3.63720000
## 1155 -3.54 -2.76066667
## 1157 -3.59 -3.90583333
## 1162 -3.66 -4.31500000
## 1164 -3.68 -3.55176471
## 1171 -3.75 -4.57857143
## 1172 -3.76 -3.63720000
## 1175 -3.78 -4.12263158
## 1177 -3.80 -3.28714286
## 1179 -3.80 -5.39176471
## 1183 -3.85 -4.31500000
## 1185 -3.89 -5.39176471
## 1189 -3.95 -3.90583333
## 1211 -4.29 -4.66285714
## 1218 -4.42 -5.39176471
## 1224 -4.48 -3.34500000
## 1225 -4.48 -4.31500000
## 1227 -4.53 -5.39176471
## 1232 -4.63 -4.57857143
## 1235 -4.73 -3.55000000
## 1238 -4.84 -3.90583333
## 1240 -4.89 -5.39176471
## 1241 -4.89 -4.66285714
## 1248 -5.26 -4.66285714
## 1258 -6.09 -5.09333333
## 1261 -6.29 -5.09333333
## 1263 -6.29 -6.77157895
## 1269 -6.89 -5.39176471
## 1270 -6.96 -4.57857143
## 1271 -7.00 -6.77157895
## 1272 -7.05 -6.77157895
## 1280 -8.30 -8.59187500
## 1286 -8.66 -7.89307692
## 1287 -9.03 -7.89307692
## 1289 -10.41 -8.59187500
## 1290 -7.89 -6.77157895
## 1291 -2.32 -1.92791667
## 1294 0.39 -1.29333333
## 1305 -2.90 -4.12263158
## 1308 -2.47 -3.01250000
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(CART_Test_Metrics <- postResample(CART_Test[,2], CART_Test[,1]))
## RMSE Rsquared MAE
## 0.9194665 0.8070706 0.6980661
(CART_Test_RMSE <- CART_Test_Metrics[1])
## RMSE
## 0.9194665
(CART_Test_Rsquared <- CART_Test_Metrics[2])
## Rsquared
## 0.8070706
1.5.13 Conditional Inference Trees (CTREE)
[A] The conditional inference trees model from the party package was implemented through the caret package.
[B] The model contains 1 hyperparameter:
[B.1] mincriterion = 1 minus the p-value that must be exceeded in order to implement a split, varied across a range of values from 0.75 to 0.99
[C] The cross-validated model performance of the final model is summarized as follows:
[C.1] Final model configuration involves mincriterion=0.75
[C.2] Root-Mean-Square Error = 0.95704
[C.3] R-Squared = 0.77940
[D] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows:
[D.1] MolWeight variable (numeric)
[D.2] NumCarbon variable (numeric)
[D.3] FP076 variable (factor)
[D.4] NumMultBonds variable (numeric)
[D.5] NumHalogen variable (numeric)
[E] The independent test model performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 1.11601
[E.2] R-Squared = 0.73109
##################################
# Creating a local object
# for the train set
##################################
PMA_PreModelling_Train_CTREE <- PMA_PreModelling_Train
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_CTREE$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
CTREE_Grid = data.frame(mincriterion = sort(c(0.95, seq(0.75, 0.99, length = 2))))
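# (the grid above evaluates to mincriterion values of 0.75, 0.95 and 0.99)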
##################################
# Running the conditional inference trees model
# by setting the caret method to 'ctree'
##################################
set.seed(12345678)
CTREE_Tune <- train(x = PMA_PreModelling_Train_CTREE[,!names(PMA_PreModelling_Train_CTREE) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_CTREE$Log_Solubility,
method = "ctree",
tuneGrid = CTREE_Grid,
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
CTREE_Tune
## Conditional Inference Tree
##
## 951 samples
## 220 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## mincriterion RMSE Rsquared MAE
## 0.75 0.9570454 0.7794050 0.7258222
## 0.95 0.9994702 0.7596920 0.7568657
## 0.99 1.0355167 0.7433903 0.7830925
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.75.
##
## Conditional inference tree with 43 terminal nodes
##
## Response: .outcome
## Inputs: FP001, FP002, FP003, FP004, FP005, FP006, FP007, FP008, FP009, FP010, FP011, FP012, FP013, FP014, FP015, FP016, FP017, FP018, FP019, FP020, FP021, FP022, FP023, FP024, FP025, FP026, FP027, FP028, FP029, FP030, FP031, FP032, FP033, FP034, FP035, FP036, FP037, FP038, FP039, FP040, FP041, FP042, FP043, FP044, FP045, FP046, FP047, FP048, FP049, FP050, FP051, FP052, FP053, FP054, FP055, FP056, FP057, FP058, FP059, FP060, FP061, FP062, FP063, FP064, FP065, FP066, FP067, FP068, FP069, FP070, FP071, FP072, FP073, FP074, FP075, FP076, FP077, FP078, FP079, FP080, FP081, FP082, FP083, FP084, FP085, FP086, FP087, FP088, FP089, FP090, FP091, FP092, FP093, FP094, FP095, FP096, FP097, FP098, FP099, FP100, FP101, FP102, FP103, FP104, FP105, FP106, FP107, FP108, FP109, FP110, FP111, FP112, FP113, FP114, FP115, FP116, FP117, FP118, FP119, FP120, FP121, FP122, FP123, FP124, FP125, FP126, FP127, FP128, FP129, FP130, FP131, FP132, FP133, FP134, FP135, FP136, FP137, FP138, FP139, FP140, FP141, FP142, FP143, FP144, FP145, FP146, FP147, FP148, FP149, FP150, FP151, FP152, FP153, FP155, FP156, FP157, FP158, FP159, FP160, FP161, FP162, FP163, FP164, FP165, FP166, FP167, FP168, FP169, FP170, FP171, FP172, FP173, FP174, FP175, FP176, FP177, FP178, FP179, FP180, FP181, FP182, FP183, FP184, FP185, FP186, FP187, FP188, FP189, FP190, FP191, FP192, FP193, FP194, FP195, FP196, FP197, FP198, FP201, FP202, FP203, FP204, FP205, FP206, FP207, FP208, MolWeight, NumBonds, NumMultBonds, NumRotBonds, NumDblBonds, NumCarbon, NumNitrogen, NumOxygen, NumSulfer, NumChlorine, NumHalogen, NumRings, HydrophilicFactor, SurfaceArea1, SurfaceArea2
## Number of observations: 951
##
## 1) MolWeight <= 0.07100313; criterion = 1, statistic = 411.936
## 2) NumCarbon <= -0.4425175; criterion = 1, statistic = 155.007
## 3) FP072 == {1}; criterion = 1, statistic = 76.589
## 4) NumCarbon <= -0.9551655; criterion = 1, statistic = 60.157
## 5) FP131 == {0}; criterion = 1, statistic = 36.759
## 6) MolWeight <= -1.24085; criterion = 0.889, statistic = 11.986
## 7) FP063 == {1}; criterion = 0.999, statistic = 21.467
## 8)* weights = 14
## 7) FP063 == {0}
## 9) NumCarbon <= -1.612785; criterion = 0.901, statistic = 12.217
## 10)* weights = 8
## 9) NumCarbon > -1.612785
## 11)* weights = 24
## 6) MolWeight > -1.24085
## 12)* weights = 31
## 5) FP131 == {1}
## 13)* weights = 15
## 4) NumCarbon > -0.9551655
## 14) NumCarbon <= -0.685958; criterion = 0.997, statistic = 18.77
## 15) FP147 == {1}; criterion = 0.986, statistic = 15.937
## 16)* weights = 7
## 15) FP147 == {0}
## 17)* weights = 47
## 14) NumCarbon > -0.685958
## 18)* weights = 54
## 3) FP072 == {0}
## 19) FP063 == {1}; criterion = 1, statistic = 46.785
## 20) MolWeight <= -0.4330091; criterion = 1, statistic = 27.515
## 21) MolWeight <= -1.533127; criterion = 0.808, statistic = 10.891
## 22)* weights = 11
## 21) MolWeight > -1.533127
## 23) FP116 == {1}; criterion = 0.941, statistic = 13.221
## 24)* weights = 8
## 23) FP116 == {0}
## 25)* weights = 14
## 20) MolWeight > -0.4330091
## 26)* weights = 12
## 19) FP063 == {0}
## 27) NumBonds <= -1.536946; criterion = 1, statistic = 51.593
## 28)* weights = 26
## 27) NumBonds > -1.536946
## 29) NumBonds <= -0.5036947; criterion = 1, statistic = 27.6
## 30) FP172 == {0}; criterion = 0.994, statistic = 17.469
## 31) NumBonds <= -0.9603748; criterion = 0.999, statistic = 20.422
## 32)* weights = 22
## 31) NumBonds > -0.9603748
## 33)* weights = 21
## 30) FP172 == {1}
## 34)* weights = 7
## 29) NumBonds > -0.5036947
## 35)* weights = 10
## 2) NumCarbon > -0.4425175
## 36) SurfaceArea1 <= -1.033167; criterion = 1, statistic = 59.056
## 37) NumBonds <= -0.1221974; criterion = 1, statistic = 24.262
## 38)* weights = 14
## 37) NumBonds > -0.1221974
## 39)* weights = 26
## 36) SurfaceArea1 > -1.033167
## 40) NumCarbon <= 0.1817764; criterion = 1, statistic = 38.11
## 41) FP059 == {0}; criterion = 1, statistic = 26.239
## 42)* weights = 103
## 41) FP059 == {1}
## 43)* weights = 16
## 40) NumCarbon > 0.1817764
## 44)* weights = 17
## 1) MolWeight > 0.07100313
## 45) FP015 == {1}; criterion = 1, statistic = 143.122
## 46) SurfaceArea1 <= 0.116782; criterion = 1, statistic = 59.802
## 47) MolWeight <= 1.080767; criterion = 1, statistic = 31.733
## 48) NumCarbon <= 1.00853; criterion = 0.988, statistic = 16.207
## 49) FP059 == {0}; criterion = 0.884, statistic = 11.904
## 50) FP077 == {1}; criterion = 0.829, statistic = 11.129
## 51)* weights = 10
## 50) FP077 == {0}
## 52) FP070 == {0}; criterion = 0.914, statistic = 12.499
## 53)* weights = 34
## 52) FP070 == {1}
## 54)* weights = 13
## 49) FP059 == {1}
## 55)* weights = 10
## 48) NumCarbon > 1.00853
## 56) MolWeight <= 0.7928528; criterion = 0.98, statistic = 15.306
## 57)* weights = 7
## 56) MolWeight > 0.7928528
## 58)* weights = 27
## 47) MolWeight > 1.080767
## 59) FP137 == {1}; criterion = 0.966, statistic = 14.274
## 60)* weights = 13
## 59) FP137 == {0}
## 61)* weights = 27
## 46) SurfaceArea1 > 0.116782
## 62) MolWeight <= 1.134223; criterion = 1, statistic = 46.235
## 63) NumOxygen <= 0.824597; criterion = 1, statistic = 32.067
## 64) NumOxygen <= -0.332028; criterion = 0.799, statistic = 10.794
## 65)* weights = 14
## 64) NumOxygen > -0.332028
## 66) NumMultBonds <= 0.9381143; criterion = 0.967, statistic = 14.326
## 67) FP105 == {0}; criterion = 0.829, statistic = 11.124
## 68)* weights = 54
## 67) FP105 == {1}
## 69)* weights = 15
## 66) NumMultBonds > 0.9381143
## 70) FP131 == {1}; criterion = 0.859, statistic = 11.513
## 71)* weights = 15
## 70) FP131 == {0}
## 72)* weights = 19
## 63) NumOxygen > 0.824597
## 73) FP002 == {0}; criterion = 0.985, statistic = 15.814
## 74)* weights = 11
## 73) FP002 == {1}
## 75)* weights = 26
## 62) MolWeight > 1.134223
## 76) HydrophilicFactor <= 2.044025; criterion = 0.999, statistic = 21.327
## 77)* weights = 69
## 76) HydrophilicFactor > 2.044025
## 78)* weights = 8
## 45) FP015 == {0}
## 79) FP070 == {0}; criterion = 1, statistic = 38.978
## 80) FP149 == {0}; criterion = 0.997, statistic = 18.853
## 81)* weights = 16
## 80) FP149 == {1}
## 82)* weights = 18
## 79) FP070 == {1}
## 83) MolWeight <= 1.239164; criterion = 0.954, statistic = 13.687
## 84)* weights = 23
## 83) MolWeight > 1.239164
## 85)* weights = 15
## mincriterion RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.75 0.9570454 0.7794050 0.7258222 0.09347278 0.05478381 0.06000684
## 2 0.95 0.9994702 0.7596920 0.7568657 0.10135350 0.04780590 0.07516456
## 3 0.99 1.0355167 0.7433903 0.7830925 0.10465367 0.04950312 0.07684305
(CTREE_Train_RMSE <- CTREE_Tune$results[CTREE_Tune$results$mincriterion==CTREE_Tune$bestTune$mincriterion,
c("RMSE")])
## [1] 0.9570454
(CTREE_Train_Rsquared <- CTREE_Tune$results[CTREE_Tune$results$mincriterion==CTREE_Tune$bestTune$mincriterion,
c("Rsquared")])
## [1] 0.779405
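The final conditional inference tree can likewise be plotted; with 43 terminal nodes the default display is dense, so the sketch below uses the compact node representation provided by the party package.
##################################
# Visualizing the final CTREE model
# using the compact node display
##################################
plot(CTREE_Tune$finalModel, type = "simple")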
##################################
# Identifying and plotting the
# best model predictors
##################################
CTREE_VarImp <- varImp(CTREE_Tune, scale = TRUE)
plot(CTREE_VarImp,
top=25,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Conditional Inference Trees",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Independently evaluating the model
# on the test set
##################################
CTREE_Test <- data.frame(CTREE_Observed = PMA_PreModelling_Test$Log_Solubility,
CTREE_Predicted = predict(CTREE_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Log_Solubility")]))
CTREE_Test
## CTREE_Observed CTREE_Predicted
## 1 0.93 0.4825000
## 2 0.85 0.9485714
## 3 0.81 -0.8640426
## 4 0.74 0.2262500
## 5 0.61 -0.8640426
## 6 0.58 0.4825000
## 7 0.57 0.4825000
## 8 0.56 -0.1095833
## 9 0.52 -0.1095833
## 10 0.45 0.2262500
## 11 0.40 0.2262500
## 12 0.36 -4.3207692
## 13 0.22 -0.2038710
## 14 0.08 -0.1095833
## 15 0.07 -0.9721429
## 16 0.02 0.2262500
## 17 0.00 -0.8640426
## 18 -0.01 -0.1095833
## 19 -0.07 -0.2038710
## 20 -0.12 -2.4616667
## 21 -0.17 0.3214286
## 22 -0.29 -0.9721429
## 23 -0.38 -0.8640426
## 24 -0.38 -0.9721429
## 25 -0.39 -0.8640426
## 26 -0.42 -0.9721429
## 27 -0.44 -0.8640426
## 28 -0.46 0.9485714
## 29 -0.48 -4.0410145
## 30 -0.60 -0.8640426
## 31 -0.63 -4.3207692
## 32 -0.66 -0.2038710
## 33 -0.72 -0.2038710
## 34 -0.72 -0.8640426
## 35 -0.80 -0.8640426
## 36 -0.80 -0.8640426
## 37 -0.82 -0.1095833
## 38 -0.82 -0.2038710
## 39 -0.84 -0.8640426
## 40 -0.85 -0.1095833
## 41 -0.85 -0.1095833
## 42 -0.87 -2.4616667
## 43 -0.89 -1.7153398
## 44 -0.90 -0.2038710
## 45 -0.96 -1.7153398
## 46 -0.96 0.7363636
## 47 -0.99 -0.1095833
## 48 -1.01 -0.2038710
## 49 -1.09 -1.5430769
## 50 -1.12 -1.7153398
## 51 -1.14 -1.4840000
## 52 -1.17 -1.5430769
## 53 -1.19 -1.7153398
## 54 -1.22 -1.2851852
## 55 -1.27 -1.7153398
## 56 -1.28 -1.7153398
## 57 -1.32 -1.5430769
## 58 -1.38 -1.7153398
## 59 -1.39 -0.8418182
## 60 -1.42 -3.3047059
## 61 -1.47 -0.2038710
## 62 -1.47 -2.1595455
## 63 -1.50 -2.2320000
## 64 -1.52 -1.2851852
## 65 -1.54 -1.7153398
## 66 -1.55 -6.5542857
## 67 -1.56 -3.3047059
## 68 -1.57 -1.2851852
## 69 -1.60 -1.7153398
## 70 -1.60 -2.3885185
## 71 -1.62 -4.3207692
## 72 -1.64 -0.8418182
## 73 -1.67 -2.8475000
## 74 -1.70 -2.4050000
## 75 -1.70 -1.7153398
## 76 -1.71 -3.1679412
## 77 -1.71 -0.8418182
## 78 -1.75 -1.2851852
## 79 -1.78 -1.7153398
## 80 -1.78 -2.4050000
## 81 -1.82 -1.2851852
## 82 -1.87 -1.2851852
## 83 -1.89 -0.8418182
## 84 -1.92 -1.2851852
## 85 -1.92 -1.7153398
## 86 -1.92 -1.2851852
## 87 -1.94 -3.3047059
## 88 -1.99 -3.3047059
## 89 -2.00 -1.5430769
## 90 -2.05 -2.8475000
## 91 -2.06 -1.7153398
## 92 -2.08 -3.3047059
## 93 -2.10 -1.2851852
## 94 -2.11 -1.7153398
## 95 -2.12 0.2262500
## 96 -2.17 -1.7153398
## 97 -2.21 -2.7528571
## 98 -2.24 -3.1520000
## 99 -2.24 -1.7153398
## 100 -2.29 -2.1595455
## 101 -2.31 -1.5430769
## 102 -2.32 -1.7153398
## 103 -2.35 -3.3047059
## 104 -2.35 -1.7153398
## 105 -2.36 -2.7528571
## 106 -2.36 -4.6220000
## 107 -2.38 -3.3642857
## 108 -2.42 -1.7153398
## 109 -2.43 -3.6873684
## 110 -2.44 -2.4050000
## 111 -2.52 -0.8418182
## 112 -2.53 -3.1679412
## 113 -2.57 -0.8418182
## 114 -2.62 -2.4050000
## 115 -2.62 -2.1587500
## 116 -2.64 -4.2018519
## 117 -2.64 -2.2320000
## 118 -2.70 -3.1520000
## 119 -2.82 -3.6873684
## 120 -2.88 -2.1587500
## 121 -2.89 -3.1679412
## 122 -2.92 -1.7153398
## 123 -2.93 -2.4050000
## 124 -2.96 -2.2320000
## 125 -2.98 -6.5542857
## 126 -3.01 -2.2320000
## 127 -3.01 -3.8330000
## 128 -3.02 -3.1520000
## 129 -3.07 -2.7528571
## 130 -3.09 -0.8418182
## 131 -3.11 -3.3535714
## 132 -3.13 -2.4050000
## 133 -3.14 -2.1587500
## 134 -3.15 -3.6873684
## 135 -3.22 -3.1679412
## 136 -3.26 -3.1520000
## 137 -3.27 -2.3885185
## 138 -3.27 -3.3535714
## 139 -3.30 -3.8330000
## 140 -3.31 -1.2851852
## 141 -3.33 -2.3885185
## 142 -3.37 -2.4050000
## 143 -3.43 -4.0410145
## 144 -3.43 -3.1679412
## 145 -3.48 -3.3047059
## 146 -3.51 -3.3047059
## 147 -3.59 -4.0410145
## 148 -3.61 -4.0410145
## 149 -3.63 -3.3047059
## 150 -3.63 -3.8556250
## 151 -3.68 -2.1587500
## 152 -3.71 -4.4853846
## 153 -3.74 -3.8330000
## 154 -3.75 -4.4373077
## 155 -3.75 -3.8556250
## 156 -3.77 -3.1679412
## 157 -3.77 -4.4373077
## 158 -3.78 -4.0410145
## 159 -3.81 -3.8330000
## 160 -3.95 -3.6873684
## 161 -3.96 -4.4373077
## 162 -3.96 -3.1520000
## 163 -4.00 -3.6328571
## 164 -4.02 -4.3207692
## 165 -4.04 -4.0410145
## 166 -4.12 -4.4373077
## 167 -4.15 -4.4853846
## 168 -4.16 -2.4050000
## 169 -4.17 -4.4373077
## 170 -4.21 -5.9914815
## 171 -4.23 -2.2320000
## 172 -4.25 -4.4373077
## 173 -4.30 -4.0410145
## 174 -4.31 -3.1520000
## 175 -4.35 -3.8556250
## 176 -4.40 -3.1520000
## 177 -4.40 -2.4050000
## 178 -4.43 -5.9914815
## 179 -4.46 -4.0410145
## 180 -4.47 -4.4373077
## 181 -4.51 -5.9914815
## 182 -4.60 -3.3047059
## 183 -4.64 -4.4373077
## 184 -4.69 -4.6220000
## 185 -4.71 -4.0410145
## 186 -4.77 -2.2320000
## 187 -4.95 -3.6873684
## 188 -4.98 -7.2295652
## 189 -5.21 -5.8988889
## 190 -5.22 -4.4373077
## 191 -5.28 -3.1520000
## 192 -5.31 -3.8556250
## 193 -5.35 -4.0410145
## 194 -5.37 -4.0410145
## 195 -5.40 -3.1520000
## 196 -5.43 -7.2295652
## 197 -5.65 -7.2295652
## 198 -5.66 -4.0410145
## 199 -6.70 -4.2018519
## 200 -5.72 -5.9914815
## 201 -6.00 -7.2295652
## 202 -6.25 -5.8988889
## 203 -6.26 -5.8988889
## 204 -6.27 -5.8988889
## 205 -6.35 -4.4373077
## 206 -6.57 -6.5542857
## 207 -6.62 -5.9914815
## 208 -6.96 -6.5542857
## 209 -7.02 -4.2018519
## 210 -7.20 -5.9914815
## 211 -7.28 -7.2295652
## 212 -7.32 -8.6760000
## 213 -7.39 -8.6760000
## 214 -7.82 -8.6760000
## 215 -8.23 -7.2295652
## 216 -8.94 -8.6760000
## 217 1.07 -0.1095833
## 218 0.43 -0.1095833
## 219 0.32 0.7363636
## 220 0.00 -0.1095833
## 221 -0.40 -1.2851852
## 222 -0.52 -0.1095833
## 223 -0.55 -0.1095833
## 224 -0.60 -0.1095833
## 225 -0.62 -2.3885185
## 226 -0.85 -0.9721429
## 227 -0.89 -0.8640426
## 228 -0.93 -1.7153398
## 229 -0.96 -0.2038710
## 230 -1.06 -1.7153398
## 231 -1.10 -0.9721429
## 232 -1.12 -0.2038710
## 233 -1.15 -1.2851852
## 234 -1.28 0.2262500
## 235 -1.30 -1.5430769
## 236 -1.31 -1.7153398
## 237 -1.35 -3.6328571
## 238 -1.39 -1.7153398
## 239 -1.41 -1.7153398
## 240 -1.41 -1.2851852
## 241 -1.42 -0.8640426
## 242 -1.46 -0.9721429
## 243 -1.50 -1.5430769
## 244 -1.50 -0.9721429
## 245 -1.52 -0.9721429
## 246 -1.52 -1.4840000
## 247 -1.59 -2.1595455
## 248 -1.61 -1.7153398
## 249 -1.63 -1.7153398
## 250 -1.71 -1.7153398
## 251 -1.83 -2.4050000
## 252 -2.05 -1.7153398
## 253 -2.06 -0.8418182
## 254 -2.07 -4.3207692
## 255 -2.15 -1.2851852
## 256 -2.16 -0.9721429
## 257 -1.99 -0.8418182
## 258 -2.36 -2.3885185
## 259 -2.38 -2.1595455
## 260 -2.39 -2.1595455
## 261 -2.46 -1.7153398
## 262 -2.49 -3.3047059
## 263 -2.54 -1.5430769
## 264 -2.55 -3.1679412
## 265 -2.63 -4.4853846
## 266 -2.64 -2.3885185
## 267 -2.67 -3.3047059
## 268 -2.68 -2.3885185
## 269 -2.77 -3.3535714
## 270 -2.78 -2.4050000
## 271 -2.82 -3.3535714
## 272 -2.92 -3.8556250
## 273 -3.03 -2.4050000
## 274 -3.12 -3.1679412
## 275 -3.16 -2.4050000
## 276 -3.19 -3.8556250
## 277 -3.54 -3.8556250
## 278 -3.54 -3.1679412
## 279 -3.59 -4.0410145
## 280 -3.66 -4.4373077
## 281 -3.68 -3.8330000
## 282 -3.75 -4.0410145
## 283 -3.76 -3.1679412
## 284 -3.78 -3.6873684
## 285 -3.80 -4.6220000
## 286 -3.80 -4.0410145
## 287 -3.85 -3.8330000
## 288 -3.89 -4.0410145
## 289 -3.95 -4.0410145
## 290 -4.29 -4.4373077
## 291 -4.42 -4.0410145
## 292 -4.48 -3.8556250
## 293 -4.48 -4.4373077
## 294 -4.53 -4.0410145
## 295 -4.63 -4.0410145
## 296 -4.73 -7.2295652
## 297 -4.84 -3.1520000
## 298 -4.89 -3.6873684
## 299 -4.89 -4.4373077
## 300 -5.26 -4.4373077
## 301 -6.09 -5.9914815
## 302 -6.29 -5.9914815
## 303 -6.29 -7.2295652
## 304 -6.89 -5.9914815
## 305 -6.96 -4.0410145
## 306 -7.00 -7.2295652
## 307 -7.05 -8.6760000
## 308 -8.30 -8.6760000
## 309 -8.66 -7.2295652
## 310 -9.03 -7.2295652
## 311 -10.41 -8.6760000
## 312 -7.89 -8.6760000
## 313 -2.32 -1.7153398
## 314 0.39 -4.3207692
## 315 -2.90 -4.0410145
## 316 -2.47 -4.0410145
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(CTREE_Test_Metrics <- postResample(CTREE_Test[,2], CTREE_Test[,1]))
## RMSE Rsquared MAE
## 1.1160100 0.7310919 0.8027123
(CTREE_Test_RMSE <- CTREE_Test_Metrics[1])
## RMSE
## 1.11601
(CTREE_Test_Rsquared <- CTREE_Test_Metrics[2])
## Rsquared
## 0.7310919
1.5.14 Random Forest (RF)
[A] The random forest model from the randomForest package was implemented through the caret package.
[B] The model contains 1 hyperparameter:
[B.1] mtry = number of randomly selected predictors per split, varied across the values 25, 75 and 125
[C] The cross-validated model performance of the final model is summarized as follows:
[C.1] Final model configuration involves mtry=75
[C.2] Root-Mean-Square Error = 0.65419
[C.3] R-Squared = 0.90098
[D] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows:
[D.1] MolWeight variable (numeric)
[D.2] NumCarbon variable (numeric)
[D.3] HydrophilicFactor variable (numeric)
[D.4] SurfaceArea1 variable (numeric)
[D.5] NumRotBonds variable (numeric)
[E] The independent test model performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.65571
[E.2] R-Squared = 0.90057
##################################
# Creating a local object
# for the train set
##################################
PMA_PreModelling_Train_RF <- PMA_PreModelling_Train
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_RF$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
RF_Grid = data.frame(mtry = c(25,75,125))
##################################
# Running the random forest model
# by setting the caret method to 'rf'
##################################
set.seed(12345678)
RF_Tune <- train(x = PMA_PreModelling_Train_RF[,!names(PMA_PreModelling_Train_RF) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_RF$Log_Solubility,
method = "rf",
tuneGrid = RF_Grid,
ntree = 100,
importance = TRUE,
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
RF_Tune
## Random Forest
##
## 951 samples
## 220 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 25 0.6714148 0.8976040 0.4955195
## 75 0.6541868 0.9009781 0.4772816
## 125 0.6589382 0.8983983 0.4776712
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 75.
##
## Call:
## randomForest(x = x, y = y, ntree = 100, mtry = param$mtry, importance = TRUE)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 75
##
## Mean of squared residuals: 0.4364622
## % Var explained: 89.57
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 25 0.6714148 0.8976040 0.4955195 0.06557018 0.01260342 0.04375357
## 2 75 0.6541868 0.9009781 0.4772816 0.06595932 0.01310389 0.04172285
## 3 125 0.6589382 0.8983983 0.4776712 0.05581432 0.01080426 0.04016860
(RF_Train_RMSE <- RF_Tune$results[RF_Tune$results$mtry==RF_Tune$bestTune$mtry,
c("RMSE")])
## [1] 0.6541868
(RF_Train_Rsquared <- RF_Tune$results[RF_Tune$results$mtry==RF_Tune$bestTune$mtry,
c("Rsquared")])
## [1] 0.9009781
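Since random forests track out-of-bag error as trees are added, the adequacy of ntree=100 can be assessed from the final model object; a minimal sketch using the randomForest plot method:
##################################
# Inspecting out-of-bag MSE as a
# function of the number of trees
##################################
plot(RF_Tune$finalModel,
     main="Out-of-Bag Error : Random Forest")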
##################################
# Identifying and plotting the
# best model predictors
##################################
RF_VarImp <- varImp(RF_Tune, scale = TRUE)
plot(RF_VarImp,
top=25,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Random Forest",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Independently evaluating the model
# on the test set
##################################
RF_Test <- data.frame(RF_Observed = PMA_PreModelling_Test$Log_Solubility,
RF_Predicted = predict(RF_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Log_Solubility")]))
RF_Test
## RF_Observed RF_Predicted
## 20 0.93 0.20989500
## 21 0.85 0.46575500
## 23 0.81 -0.33998667
## 25 0.74 0.37796167
## 28 0.61 -0.57687667
## 31 0.58 0.42797333
## 32 0.57 0.29741167
## 33 0.56 0.15913667
## 34 0.52 0.05253667
## 37 0.45 -0.16529833
## 38 0.40 0.16764500
## 42 0.36 -2.35196500
## 49 0.22 -0.35109000
## 54 0.08 -0.17439500
## 55 0.07 -0.54914667
## 58 0.02 -1.13201333
## 60 0.00 -0.25148833
## 61 -0.01 -0.01665500
## 65 -0.07 0.05839000
## 69 -0.12 -0.60179333
## 73 -0.17 0.34814667
## 86 -0.29 -0.02392167
## 90 -0.38 -0.63526333
## 91 -0.38 -0.53145667
## 93 -0.39 -0.76127833
## 96 -0.42 -0.43075667
## 98 -0.44 -0.43162333
## 100 -0.46 0.74329833
## 104 -0.48 -2.44920167
## 112 -0.60 -0.51055500
## 115 -0.63 -2.98466167
## 119 -0.66 -0.57326000
## 128 -0.72 -0.67806833
## 130 -0.72 -0.31738000
## 139 -0.80 -0.42072167
## 143 -0.80 -0.70985000
## 145 -0.82 0.11085167
## 146 -0.82 -0.66479333
## 149 -0.84 -0.55452167
## 150 -0.85 -0.44121833
## 152 -0.85 -0.04707333
## 157 -0.87 -2.50876000
## 161 -0.89 -1.56917167
## 162 -0.90 -0.22690000
## 166 -0.96 -1.27440833
## 167 -0.96 0.22710500
## 173 -0.99 -0.61188333
## 176 -1.01 -0.52732381
## 182 -1.09 -1.47098833
## 187 -1.12 -1.00478333
## 190 -1.14 -1.11679333
## 194 -1.17 -1.63073667
## 195 -1.19 -1.36738833
## 201 -1.22 -0.98321500
## 207 -1.27 -1.39956333
## 208 -1.28 -1.53184000
## 215 -1.32 -1.41741333
## 222 -1.38 -1.66779167
## 224 -1.39 -1.70169167
## 231 -1.42 -1.89129333
## 236 -1.47 -0.87659000
## 237 -1.47 -1.57729833
## 240 -1.50 -1.35113500
## 243 -1.52 -1.20394000
## 248 -1.54 -1.40041333
## 251 -1.55 -3.08410500
## 256 -1.56 -2.41555167
## 258 -1.57 -1.68766000
## 262 -1.60 -1.45603000
## 266 -1.60 -2.13415833
## 272 -1.62 -3.11365667
## 280 -1.64 -2.47095000
## 283 -1.67 -1.66484667
## 286 -1.70 -2.71505333
## 287 -1.70 -1.29740333
## 289 -1.71 -2.63375333
## 290 -1.71 -2.15921333
## 298 -1.75 -1.81461667
## 305 -1.78 -1.76615833
## 306 -1.78 -1.65183167
## 312 -1.82 -1.64487167
## 320 -1.87 -1.18677667
## 325 -1.89 -2.12132000
## 332 -1.92 -1.90805667
## 333 -1.92 -2.03452833
## 335 -1.92 -1.50591000
## 339 -1.94 -2.17312643
## 346 -1.99 -2.23364667
## 347 -2.00 -2.15230833
## 350 -2.05 -2.24742333
## 353 -2.06 -1.91868833
## 358 -2.08 -2.29837500
## 365 -2.10 -2.35561667
## 367 -2.11 -1.62317000
## 370 -2.12 -1.71070167
## 379 -2.17 -1.93786000
## 386 -2.21 -2.65126833
## 394 -2.24 -2.84050000
## 396 -2.24 -1.51852333
## 400 -2.29 -1.84330000
## 404 -2.31 -2.34471833
## 405 -2.32 -2.07237000
## 413 -2.35 -2.15067833
## 415 -2.35 -2.11579000
## 417 -2.36 -3.07110500
## 418 -2.36 -2.73504167
## 423 -2.38 -2.43465667
## 434 -2.42 -1.73134000
## 437 -2.43 -2.87439833
## 440 -2.44 -2.69540000
## 449 -2.52 -2.64786000
## 450 -2.53 -2.46411667
## 457 -2.57 -2.59764333
## 467 -2.62 -2.75441667
## 469 -2.62 -2.95589167
## 474 -2.64 -3.66249833
## 475 -2.64 -3.49781833
## 485 -2.70 -3.08082500
## 504 -2.82 -2.92268333
## 511 -2.88 -2.88706333
## 512 -2.89 -3.01256000
## 517 -2.92 -1.66833667
## 519 -2.93 -3.24460667
## 520 -2.96 -2.44246167
## 522 -2.98 -3.68176333
## 527 -3.01 -3.24534000
## 528 -3.01 -3.75289333
## 529 -3.02 -2.94384500
## 537 -3.07 -2.91294833
## 540 -3.09 -3.58601833
## 541 -3.11 -3.65920500
## 547 -3.13 -3.15069667
## 550 -3.14 -2.60735143
## 555 -3.15 -3.09614167
## 564 -3.22 -2.50023333
## 570 -3.26 -3.32189333
## 573 -3.27 -2.95375500
## 575 -3.27 -3.67831500
## 578 -3.30 -3.50198667
## 581 -3.31 -2.33753667
## 585 -3.33 -3.22839167
## 590 -3.37 -3.17885833
## 601 -3.43 -3.73816667
## 602 -3.43 -3.27175667
## 607 -3.48 -2.98412500
## 610 -3.51 -3.41157333
## 618 -3.59 -3.21784000
## 624 -3.61 -3.38225333
## 626 -3.63 -3.20796667
## 627 -3.63 -3.36039167
## 634 -3.68 -1.98414833
## 640 -3.71 -3.82553667
## 642 -3.74 -3.51340667
## 643 -3.75 -4.12928333
## 644 -3.75 -3.94622000
## 645 -3.77 -3.67290833
## 646 -3.77 -4.27068500
## 647 -3.78 -3.94457833
## 652 -3.81 -3.41955333
## 658 -3.95 -4.26720000
## 659 -3.96 -5.03294500
## 660 -3.96 -4.21195667
## 664 -4.00 -3.48207667
## 666 -4.02 -3.81235167
## 667 -4.04 -4.42665167
## 675 -4.12 -4.10467000
## 680 -4.15 -4.31214667
## 681 -4.16 -3.47621500
## 687 -4.17 -5.03863667
## 694 -4.21 -4.49756000
## 697 -4.23 -4.04320500
## 701 -4.25 -4.46614167
## 705 -4.30 -3.93528000
## 707 -4.31 -5.35805667
## 710 -4.35 -4.24843000
## 716 -4.40 -4.29815000
## 719 -4.40 -4.16561500
## 720 -4.43 -4.59730667
## 725 -4.46 -4.48651000
## 727 -4.47 -4.25193000
## 730 -4.51 -4.57426333
## 738 -4.60 -3.95472143
## 745 -4.64 -4.53408167
## 748 -4.69 -4.88521500
## 751 -4.71 -4.09683000
## 756 -4.77 -3.86949333
## 766 -4.95 -3.67954000
## 769 -4.98 -3.78668333
## 783 -5.21 -5.94445333
## 785 -5.22 -5.61153000
## 790 -5.28 -4.17007000
## 793 -5.31 -3.90614000
## 795 -5.35 -4.43664833
## 796 -5.37 -4.43663833
## 797 -5.40 -4.02382333
## 801 -5.43 -5.04274667
## 811 -5.65 -4.67941500
## 812 -5.66 -4.30371167
## 815 -6.70 -4.70443500
## 816 -5.72 -4.60836500
## 817 -6.00 -6.70994000
## 824 -6.25 -6.23801464
## 825 -6.26 -6.03402548
## 826 -6.27 -6.29619048
## 830 -6.35 -5.28174333
## 837 -6.57 -6.50772143
## 838 -6.62 -4.60072667
## 844 -6.96 -6.00886333
## 845 -7.02 -7.52277500
## 847 -7.20 -6.91610464
## 850 -7.28 -7.15971667
## 852 -7.32 -7.70975167
## 853 -7.39 -7.82604106
## 861 -7.82 -8.09600380
## 868 -8.23 -8.19397833
## 874 -8.94 -7.99572143
## 879 1.07 0.31703000
## 895 0.43 0.02934833
## 899 0.32 0.12630833
## 903 0.00 -0.24217000
## 917 -0.40 -1.50730000
## 927 -0.52 -0.22160333
## 929 -0.55 -0.30704333
## 931 -0.60 -0.49359000
## 933 -0.62 -1.97804333
## 944 -0.85 -0.93113500
## 947 -0.89 -0.93667333
## 949 -0.93 -1.50149167
## 953 -0.96 -0.66297833
## 958 -1.06 -1.89618500
## 961 -1.10 -0.97368000
## 963 -1.12 -0.99423833
## 964 -1.15 -0.89360500
## 973 -1.28 -0.82012833
## 976 -1.30 -1.30890167
## 977 -1.31 -1.39726333
## 980 -1.35 -2.89281667
## 983 -1.39 -1.91987167
## 984 -1.41 -1.36738833
## 986 -1.41 -1.57123500
## 989 -1.42 -0.88137548
## 991 -1.46 -1.09400000
## 996 -1.50 -1.54966833
## 997 -1.50 -1.87420833
## 999 -1.52 -1.80561833
## 1000 -1.52 -1.33649167
## 1003 -1.59 -1.86752167
## 1008 -1.61 -1.41786500
## 1009 -1.63 -1.39789500
## 1014 -1.71 -1.78755333
## 1015 -1.83 -2.59385500
## 1040 -2.05 -1.55244833
## 1042 -2.06 -1.99637167
## 1043 -2.07 -3.00424833
## 1050 -2.15 -1.53587000
## 1052 -2.16 -1.09019833
## 1056 -1.99 -1.83613167
## 1070 -2.36 -2.23690500
## 1073 -2.38 -2.88251667
## 1074 -2.39 -1.62589667
## 1079 -2.46 -2.27056667
## 1080 -2.49 -2.28973667
## 1085 -2.54 -2.04383667
## 1087 -2.55 -2.79085167
## 1096 -2.63 -3.20284500
## 1099 -2.64 -2.37778833
## 1100 -2.67 -2.55542500
## 1102 -2.68 -2.52760833
## 1107 -2.77 -3.16981000
## 1109 -2.78 -3.16385000
## 1114 -2.82 -2.72513167
## 1118 -2.92 -3.38231667
## 1123 -3.03 -3.44474500
## 1132 -3.12 -3.94192167
## 1134 -3.16 -3.09815667
## 1137 -3.19 -3.46202833
## 1154 -3.54 -3.75560667
## 1155 -3.54 -3.08713833
## 1157 -3.59 -3.78611500
## 1162 -3.66 -4.07704500
## 1164 -3.68 -3.48136667
## 1171 -3.75 -3.99305167
## 1172 -3.76 -3.94040167
## 1175 -3.78 -3.48154333
## 1177 -3.80 -3.90414000
## 1179 -3.80 -4.31383833
## 1183 -3.85 -4.09734714
## 1185 -3.89 -3.81921500
## 1189 -3.95 -3.98643000
## 1211 -4.29 -4.96257333
## 1218 -4.42 -3.94297667
## 1224 -4.48 -3.53352167
## 1225 -4.48 -4.32658000
## 1227 -4.53 -4.81960000
## 1232 -4.63 -4.45693667
## 1235 -4.73 -4.20299000
## 1238 -4.84 -3.77068333
## 1240 -4.89 -4.36926000
## 1241 -4.89 -5.04212429
## 1248 -5.26 -5.19248000
## 1258 -6.09 -4.59498667
## 1261 -6.29 -5.81051333
## 1263 -6.29 -6.38106667
## 1269 -6.89 -4.82948167
## 1270 -6.96 -5.02815167
## 1271 -7.00 -6.85987806
## 1272 -7.05 -7.89716773
## 1280 -8.30 -8.33209427
## 1286 -8.66 -8.14722381
## 1287 -9.03 -8.23099381
## 1289 -10.41 -9.87519048
## 1290 -7.89 -7.61023417
## 1291 -2.32 -2.04724500
## 1294 0.39 -2.11422667
## 1305 -2.90 -4.30074333
## 1308 -2.47 -3.94169167
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(RF_Test_Metrics <- postResample(RF_Test[,2], RF_Test[,1]))
## RMSE Rsquared MAE
## 0.6557191 0.9005713 0.4681318
(RF_Test_RMSE <- RF_Test_Metrics[1])
## RMSE
## 0.6557191
(RF_Test_Rsquared <- RF_Test_Metrics[2])
## Rsquared
## 0.9005713
1.5.15 Cubist (CUB)
[A] The Cubist model from the Cubist package was implemented through the caret package.
[B] The model contains 2 hyperparameters:
[B.1] committees = number of committees, varied across a range of values from 1 to 100
[B.2] neighbors = number of nearest neighbors used to adjust the rule-based predictions, varied across a range of values from 0 to 9
[C] The cross-validated model performance of the final model is summarized as follows:
[C.1] Final model configuration involves committees=100 and neighbors=9
[C.2] Root-Mean-Square Error = 0.55645
[C.3] R-Squared = 0.92538
[D] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows:
[D.1] NumCarbon variable (numeric)
[D.2] SurfaceArea1 variable (numeric)
[D.3] MolWeight variable (numeric)
[D.4] SurfaceArea2 variable (numeric)
[D.5] NumMultBonds variable (numeric)
[E] The independent test model performance of the final model is summarized as follows:
[E.1] Root-Mean-Square Error = 0.63809
[E.2] R-Squared = 0.90717
##################################
# Transforming factor predictors to numeric
# as required by the model
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_CUB <- as.data.frame(lapply(PMA_PreModelling_Train, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Train_CUB)
## [1] 951 221
PMA_PreModelling_Test_CUB <- as.data.frame(lapply(PMA_PreModelling_Test, function(x) as.numeric(as.character(x))))
dim(PMA_PreModelling_Test_CUB)
## [1] 316 221
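Since the factor predictors were recast as numeric for this model, it is worth confirming that the conversion produced all-numeric columns; a minimal base R check:
##################################
# Confirming that all columns were
# converted to numeric
##################################
all(sapply(PMA_PreModelling_Train_CUB, is.numeric))
all(sapply(PMA_PreModelling_Test_CUB, is.numeric))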
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_CUB$Log_Solubility,
k = 10,
returnTrain=TRUE)
KFold_Control <- trainControl(method="cv",
index=KFold_Indices)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
CUB_Grid = expand.grid(committees = c(1:10, 20, 50, 75, 100),
neighbors = c(0, 1, 5, 9))
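# (14 committee settings x 4 neighbor settings = 56 candidate configurations)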
##################################
# Running the cubist model
# by setting the caret method to 'cubist'
##################################
set.seed(12345678)
CUB_Tune <- train(x = PMA_PreModelling_Train_CUB[,!names(PMA_PreModelling_Train_CUB) %in% c("Log_Solubility")],
y = PMA_PreModelling_Train_CUB$Log_Solubility,
method = "cubist",
tuneGrid = CUB_Grid,
trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
CUB_Tune
## Cubist
##
## 951 samples
## 220 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 855, 856, 857, 856, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 0.6752542 0.8901629 0.5189171
## 1 1 0.7008903 0.8837194 0.5159612
## 1 5 0.6188772 0.9080481 0.4604200
## 1 9 0.6135842 0.9097579 0.4582143
## 2 0 0.6328227 0.9039087 0.4764290
## 2 1 0.6700298 0.8931433 0.4966519
## 2 5 0.5930006 0.9154193 0.4397414
## 2 9 0.5903048 0.9161890 0.4379268
## 3 0 0.6192273 0.9079020 0.4696538
## 3 1 0.6554171 0.8972964 0.4841175
## 3 5 0.5774071 0.9196275 0.4266022
## 3 9 0.5727396 0.9209246 0.4243919
## 4 0 0.6135739 0.9100651 0.4631492
## 4 1 0.6604880 0.8963709 0.4880789
## 4 5 0.5764280 0.9205037 0.4269968
## 4 9 0.5733087 0.9212829 0.4258535
## 5 0 0.6094395 0.9106368 0.4583396
## 5 1 0.6618431 0.8951874 0.4868163
## 5 5 0.5752840 0.9202914 0.4236369
## 5 9 0.5691704 0.9218718 0.4200879
## 6 0 0.6063451 0.9122943 0.4539687
## 6 1 0.6599286 0.8961854 0.4851999
## 6 5 0.5718522 0.9216146 0.4218841
## 6 9 0.5676389 0.9226830 0.4200237
## 7 0 0.6067740 0.9116071 0.4552029
## 7 1 0.6613310 0.8955053 0.4858224
## 7 5 0.5740189 0.9205727 0.4224482
## 7 9 0.5683498 0.9220973 0.4190641
## 8 0 0.6054532 0.9122002 0.4543680
## 8 1 0.6629762 0.8952615 0.4880736
## 8 5 0.5738019 0.9208570 0.4240097
## 8 9 0.5693537 0.9219893 0.4215548
## 9 0 0.6019943 0.9130469 0.4511290
## 9 1 0.6566265 0.8967188 0.4826374
## 9 5 0.5703953 0.9215861 0.4218037
## 9 9 0.5656251 0.9228227 0.4182451
## 10 0 0.6028672 0.9128099 0.4539634
## 10 1 0.6536486 0.8978655 0.4816393
## 10 5 0.5704452 0.9216393 0.4214629
## 10 9 0.5669797 0.9225541 0.4190950
## 20 0 0.5993153 0.9139238 0.4516966
## 20 1 0.6471885 0.8998615 0.4774183
## 20 5 0.5683336 0.9224196 0.4208599
## 20 9 0.5639998 0.9235150 0.4172223
## 50 0 0.5926935 0.9157234 0.4458315
## 50 1 0.6388245 0.9022758 0.4700059
## 50 5 0.5625895 0.9239751 0.4162366
## 50 9 0.5581390 0.9250679 0.4125082
## 75 0 0.5907510 0.9160536 0.4435589
## 75 1 0.6381473 0.9023978 0.4693750
## 75 5 0.5628524 0.9237714 0.4158270
## 75 9 0.5578756 0.9250054 0.4117147
## 100 0 0.5904289 0.9161594 0.4436908
## 100 1 0.6384052 0.9022657 0.4692430
## 100 5 0.5614157 0.9241375 0.4152984
## 100 9 0.5564468 0.9253788 0.4115436
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 100 and neighbors = 9.
##
## Call:
## cubist.default(x = x, y = y, committees = param$committees)
##
## Number of samples: 951
## Number of predictors: 220
##
## Number of committees: 100
## Number of rules per committee: 3, 4, 5, 3, 5, 5, 3, 9, 4, 4, 3, 4, 3, 10, 3, 6, 3, 5, 3, 8 ...
## committees neighbors RMSE Rsquared MAE RMSESD RsquaredSD
## 1 1 0 0.6752542 0.8901629 0.5189171 0.07075110 0.02148323
## 2 1 1 0.7008903 0.8837194 0.5159612 0.06509564 0.02451798
## 3 1 5 0.6188772 0.9080481 0.4604200 0.06922244 0.01920558
## 4 1 9 0.6135842 0.9097579 0.4582143 0.07173506 0.01815343
## 5 2 0 0.6328227 0.9039087 0.4764290 0.06424339 0.01992966
## 6 2 1 0.6700298 0.8931433 0.4966519 0.07053455 0.02567565
## 7 2 5 0.5930006 0.9154193 0.4397414 0.06449195 0.01859323
## 8 2 9 0.5903048 0.9161890 0.4379268 0.07015247 0.01834661
## 9 3 0 0.6192273 0.9079020 0.4696538 0.06721987 0.02038225
## 10 3 1 0.6554171 0.8972964 0.4841175 0.06030129 0.02312476
## 11 3 5 0.5774071 0.9196275 0.4266022 0.05989280 0.01780228
## 12 3 9 0.5727396 0.9209246 0.4243919 0.06193643 0.01679528
## 13 4 0 0.6135739 0.9100651 0.4631492 0.06840373 0.01813722
## 14 4 1 0.6604880 0.8963709 0.4880789 0.06650423 0.02100972
## 15 4 5 0.5764280 0.9205037 0.4269968 0.06700236 0.01651648
## 16 4 9 0.5733087 0.9212829 0.4258535 0.07068752 0.01589680
## 17 5 0 0.6094395 0.9106368 0.4583396 0.06873004 0.02063372
## 18 5 1 0.6618431 0.8951874 0.4868163 0.06461635 0.02501600
## 19 5 5 0.5752840 0.9202914 0.4236369 0.05596206 0.01670758
## 20 5 9 0.5691704 0.9218718 0.4200879 0.06117837 0.01667682
## 21 6 0 0.6063451 0.9122943 0.4539687 0.06820048 0.01781319
## 22 6 1 0.6599286 0.8961854 0.4851999 0.06298180 0.02238397
## 23 6 5 0.5718522 0.9216146 0.4218841 0.05815585 0.01479684
## 24 6 9 0.5676389 0.9226830 0.4200237 0.06183992 0.01446132
## 25 7 0 0.6067740 0.9116071 0.4552029 0.06873557 0.01943849
## 26 7 1 0.6613310 0.8955053 0.4858224 0.06397915 0.02393074
## 27 7 5 0.5740189 0.9205727 0.4224482 0.05798751 0.01671333
## 28 7 9 0.5683498 0.9220973 0.4190641 0.06196759 0.01617214
## 29 8 0 0.6054532 0.9122002 0.4543680 0.06960940 0.01908694
## 30 8 1 0.6629762 0.8952615 0.4880736 0.06009633 0.02290597
## 31 8 5 0.5738019 0.9208570 0.4240097 0.05749275 0.01559596
## 32 8 9 0.5693537 0.9219893 0.4215548 0.06244961 0.01532364
## 33 9 0 0.6019943 0.9130469 0.4511290 0.07313827 0.02063898
## 34 9 1 0.6566265 0.8967188 0.4826374 0.05992369 0.02372793
## 35 9 5 0.5703953 0.9215861 0.4218037 0.05826366 0.01673550
## 36 9 9 0.5656251 0.9228227 0.4182451 0.06368779 0.01669090
## 37 10 0 0.6028672 0.9128099 0.4539634 0.07098043 0.01989151
## 38 10 1 0.6536486 0.8978655 0.4816393 0.05809324 0.02229468
## 39 10 5 0.5704452 0.9216393 0.4214629 0.05716485 0.01614830
## 40 10 9 0.5669797 0.9225541 0.4190950 0.06262368 0.01607240
## 41 20 0 0.5993153 0.9139238 0.4516966 0.06445217 0.01863460
## 42 20 1 0.6471885 0.8998615 0.4774183 0.04788924 0.02090719
## 43 20 5 0.5683336 0.9224196 0.4208599 0.05092834 0.01534381
## 44 20 9 0.5639998 0.9235150 0.4172223 0.05603083 0.01519094
## 45 50 0 0.5926935 0.9157234 0.4458315 0.06214991 0.01847751
## 46 50 1 0.6388245 0.9022758 0.4700059 0.05140761 0.02147608
## 47 50 5 0.5625895 0.9239751 0.4162366 0.05230705 0.01561716
## 48 50 9 0.5581390 0.9250679 0.4125082 0.05656249 0.01544108
## 49 75 0 0.5907510 0.9160536 0.4435589 0.06153736 0.01910950
## 50 75 1 0.6381473 0.9023978 0.4693750 0.05024654 0.02189054
## 51 75 5 0.5628524 0.9237714 0.4158270 0.05055428 0.01598086
## 52 75 9 0.5578756 0.9250054 0.4117147 0.05525199 0.01587909
## 53 100 0 0.5904289 0.9161594 0.4436908 0.06106853 0.01932156
## 54 100 1 0.6384052 0.9022657 0.4692430 0.04950692 0.02206031
## 55 100 5 0.5614157 0.9241375 0.4152984 0.05035178 0.01601373
## 56 100 9 0.5564468 0.9253788 0.4115436 0.05497151 0.01586072
## MAESD
## 1 0.06061840
## 2 0.04687813
## 3 0.05380609
## 4 0.05544167
## 5 0.05773281
## 6 0.04973829
## 7 0.04967622
## 8 0.05624501
## 9 0.05649767
## 10 0.04504088
## 11 0.04608716
## 12 0.04967174
## 13 0.06164008
## 14 0.05038505
## 15 0.05332448
## 16 0.05819382
## 17 0.05731339
## 18 0.04936554
## 19 0.04627150
## 20 0.05052480
## 21 0.05629940
## 22 0.04890753
## 23 0.04742342
## 24 0.05190857
## 25 0.05724735
## 26 0.04786595
## 27 0.04557568
## 28 0.05153380
## 29 0.05796442
## 30 0.04676440
## 31 0.04675714
## 32 0.05291233
## 33 0.05949880
## 34 0.04730699
## 35 0.04655640
## 36 0.05408456
## 37 0.05908847
## 38 0.04622777
## 39 0.04741228
## 40 0.05429759
## 41 0.05441810
## 42 0.03822151
## 43 0.04281448
## 44 0.05116607
## 45 0.05136633
## 46 0.03915659
## 47 0.04229044
## 48 0.05012352
## 49 0.05130847
## 50 0.03903998
## 51 0.04112365
## 52 0.04913291
## 53 0.05072458
## 54 0.03892660
## 55 0.04077691
## 56 0.04863760
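The tuning profile can also be examined graphically; caret's plot method for train objects draws the resampled RMSE against the number of committees, with one curve per neighbors setting (an optional step sketched here, output omitted):
##################################
# Optionally visualizing the tuning profile
# (resampled RMSE versus committees,
# one curve per neighbors setting)
##################################
plot(CUB_Tune, metric = "RMSE")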
(CUB_Train_RMSE <- CUB_Tune$results[CUB_Tune$results$committees==CUB_Tune$bestTune$committees &
CUB_Tune$results$neighbors==CUB_Tune$bestTune$neighbors,
c("RMSE")])
## [1] 0.5564468
(CUB_Train_Rsquared <- CUB_Tune$results[CUB_Tune$results$committees==CUB_Tune$bestTune$committees &
CUB_Tune$results$neighbors==CUB_Tune$bestTune$neighbors,
c("Rsquared")])
## [1] 0.9253788
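The same best-tune metrics can be retrieved in one step with caret's getTrainPerf(), which reports the resampled performance for the selected committees/neighbors pair; its values should match the RMSE and R-squared extracted above (an equivalent alternative to the manual subsetting):
##################################
# Equivalent one-step extraction of the
# best model's resampled performance
##################################
getTrainPerf(CUB_Tune)
## TrainRMSE TrainRsquared TrainMAE method
## 1 0.5564468 0.9253788 0.4115436 cubist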
##################################
# Identifying and plotting the
# best model predictors
##################################
CUB_VarImp <- varImp(CUB_Tune, scale = TRUE)
plot(CUB_VarImp,
top=25,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Cubist",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)
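Beyond the plot, the underlying importance scores can be pulled out of the varImp object as a data frame, which is convenient for tabulating the top-ranked predictors (a minimal sketch; CUB_VarImp_Scores is a helper name introduced here):
##################################
# Tabulating the top-ranked predictors
# from the variable importance object
##################################
CUB_VarImp_Scores <- CUB_VarImp$importance
head(CUB_VarImp_Scores[order(-CUB_VarImp_Scores$Overall), , drop = FALSE], 10)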

##################################
# Independently evaluating the model
# on the test set
##################################
CUB_Test <- data.frame(CUB_Observed = PMA_PreModelling_Test$Log_Solubility,
CUB_Predicted = predict(CUB_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Log_Solubility")]))
CUB_Test
## CUB_Observed CUB_Predicted
## 1 0.93 5.172006e-01
## 2 0.85 4.744695e-01
## 3 0.81 -3.106867e-01
## 4 0.74 5.642811e-01
## 5 0.61 -2.795888e-01
## 6 0.58 1.220973e+00
## 7 0.57 5.271021e-01
## 8 0.56 4.673800e-01
## 9 0.52 3.596828e-01
## 10 0.45 -1.953650e-01
## 11 0.40 9.581463e-02
## 12 0.36 -1.770827e+00
## 13 0.22 6.735242e-02
## 14 0.08 -1.327409e-02
## 15 0.07 -9.621343e-01
## 16 0.02 -3.472218e-01
## 17 0.00 -1.880903e-02
## 18 -0.01 3.657053e-02
## 19 -0.07 7.146210e-01
## 20 -0.12 -4.522400e-01
## 21 -0.17 4.591612e-01
## 22 -0.29 -7.476294e-02
## 23 -0.38 -4.240656e-01
## 24 -0.38 -8.791980e-01
## 25 -0.39 -8.478543e-01
## 26 -0.42 -9.527521e-01
## 27 -0.44 -5.708790e-01
## 28 -0.46 5.170208e-01
## 29 -0.48 -2.082416e+00
## 30 -0.60 -7.150730e-01
## 31 -0.63 -1.928441e+00
## 32 -0.66 -4.997951e-01
## 33 -0.72 -5.738401e-01
## 34 -0.72 -2.128842e-01
## 35 -0.80 -6.117046e-02
## 36 -0.80 -5.603025e-01
## 37 -0.82 5.831223e-01
## 38 -0.82 -7.053283e-01
## 39 -0.84 -4.939408e-01
## 40 -0.85 -7.390255e-01
## 41 -0.85 -2.165750e-01
## 42 -0.87 -1.334413e+00
## 43 -0.89 -1.491920e+00
## 44 -0.90 1.310215e-01
## 45 -0.96 -1.228196e+00
## 46 -0.96 -3.503021e-01
## 47 -0.99 -5.740114e-01
## 48 -1.01 -4.228113e-01
## 49 -1.09 -1.122615e+00
## 50 -1.12 -1.104403e+00
## 51 -1.14 -1.202845e+00
## 52 -1.17 -1.178322e+00
## 53 -1.19 -1.299870e+00
## 54 -1.22 -9.899879e-01
## 55 -1.27 -1.195272e+00
## 56 -1.28 -1.340667e+00
## 57 -1.32 -1.489669e+00
## 58 -1.38 -1.269200e+00
## 59 -1.39 -1.918472e+00
## 60 -1.42 -1.619951e+00
## 61 -1.47 -1.468703e+00
## 62 -1.47 -1.210847e+00
## 63 -1.50 -8.650517e-01
## 64 -1.52 -1.272961e+00
## 65 -1.54 -1.350985e+00
## 66 -1.55 -2.278194e+00
## 67 -1.56 -2.410002e+00
## 68 -1.57 -1.594703e+00
## 69 -1.60 -1.049522e+00
## 70 -1.60 -2.410012e+00
## 71 -1.62 -2.412026e+00
## 72 -1.64 -2.566043e+00
## 73 -1.67 -1.478632e+00
## 74 -1.70 -2.595009e+00
## 75 -1.70 -1.743490e+00
## 76 -1.71 -2.534111e+00
## 77 -1.71 -2.347152e+00
## 78 -1.75 -1.684114e+00
## 79 -1.78 -1.550764e+00
## 80 -1.78 -1.847633e+00
## 81 -1.82 -1.274292e+00
## 82 -1.87 -1.074537e+00
## 83 -1.89 -1.875680e+00
## 84 -1.92 -1.824179e+00
## 85 -1.92 -1.844802e+00
## 86 -1.92 -1.256984e+00
## 87 -1.94 -2.816156e+00
## 88 -1.99 -1.926969e+00
## 89 -2.00 -2.193821e+00
## 90 -2.05 -2.079077e+00
## 91 -2.06 -2.110536e+00
## 92 -2.08 -2.154182e+00
## 93 -2.10 -2.901855e+00
## 94 -2.11 -1.330634e+00
## 95 -2.12 -1.239691e+00
## 96 -2.17 -1.880477e+00
## 97 -2.21 -2.308400e+00
## 98 -2.24 -2.739114e+00
## 99 -2.24 -1.113671e+00
## 100 -2.29 -2.161248e+00
## 101 -2.31 -2.032299e+00
## 102 -2.32 -1.975597e+00
## 103 -2.35 -2.346902e+00
## 104 -2.35 -1.761070e+00
## 105 -2.36 -3.044691e+00
## 106 -2.36 -2.079780e+00
## 107 -2.38 -2.408683e+00
## 108 -2.42 -2.339951e+00
## 109 -2.43 -3.300439e+00
## 110 -2.44 -2.431149e+00
## 111 -2.52 -2.319848e+00
## 112 -2.53 -2.331616e+00
## 113 -2.57 -2.479958e+00
## 114 -2.62 -2.187034e+00
## 115 -2.62 -2.345561e+00
## 116 -2.64 -2.720923e+00
## 117 -2.64 -3.278156e+00
## 118 -2.70 -2.948889e+00
## 119 -2.82 -3.071409e+00
## 120 -2.88 -2.688234e+00
## 121 -2.89 -2.659425e+00
## 122 -2.92 -1.547081e+00
## 123 -2.93 -3.379261e+00
## 124 -2.96 -2.848138e+00
## 125 -2.98 -3.233932e+00
## 126 -3.01 -2.541941e+00
## 127 -3.01 -3.778557e+00
## 128 -3.02 -3.653405e+00
## 129 -3.07 -3.189232e+00
## 130 -3.09 -2.888317e+00
## 131 -3.11 -3.528169e+00
## 132 -3.13 -3.667722e+00
## 133 -3.14 -2.110050e+00
## 134 -3.15 -3.409269e+00
## 135 -3.22 -2.452943e+00
## 136 -3.26 -3.587133e+00
## 137 -3.27 -2.928179e+00
## 138 -3.27 -3.385448e+00
## 139 -3.30 -3.630623e+00
## 140 -3.31 -2.285615e+00
## 141 -3.33 -2.768307e+00
## 142 -3.37 -2.404949e+00
## 143 -3.43 -3.460907e+00
## 144 -3.43 -2.520015e+00
## 145 -3.48 -2.840985e+00
## 146 -3.51 -3.623060e+00
## 147 -3.59 -2.456094e+00
## 148 -3.61 -3.088016e+00
## 149 -3.63 -3.684434e+00
## 150 -3.63 -3.515325e+00
## 151 -3.68 -2.135059e+00
## 152 -3.71 -4.457196e+00
## 153 -3.74 -3.582433e+00
## 154 -3.75 -4.037922e+00
## 155 -3.75 -3.477290e+00
## 156 -3.77 -3.524473e+00
## 157 -3.77 -4.344183e+00
## 158 -3.78 -4.136355e+00
## 159 -3.81 -3.812230e+00
## 160 -3.95 -4.050872e+00
## 161 -3.96 -4.931999e+00
## 162 -3.96 -4.342718e+00
## 163 -4.00 -3.883743e+00
## 164 -4.02 -3.727985e+00
## 165 -4.04 -4.366707e+00
## 166 -4.12 -3.948336e+00
## 167 -4.15 -4.901789e+00
## 168 -4.16 -3.495876e+00
## 169 -4.17 -4.702560e+00
## 170 -4.21 -4.632524e+00
## 171 -4.23 -4.245769e+00
## 172 -4.25 -4.345779e+00
## 173 -4.30 -3.930046e+00
## 174 -4.31 -5.519844e+00
## 175 -4.35 -4.435305e+00
## 176 -4.40 -4.087770e+00
## 177 -4.40 -4.437664e+00
## 178 -4.43 -4.511951e+00
## 179 -4.46 -4.352980e+00
## 180 -4.47 -4.155533e+00
## 181 -4.51 -4.805473e+00
## 182 -4.60 -3.794261e+00
## 183 -4.64 -4.800642e+00
## 184 -4.69 -4.989451e+00
## 185 -4.71 -4.137802e+00
## 186 -4.77 -4.017933e+00
## 187 -4.95 -4.560051e+00
## 188 -4.98 -4.267509e+00
## 189 -5.21 -6.012078e+00
## 190 -5.22 -5.521434e+00
## 191 -5.28 -4.443992e+00
## 192 -5.31 -3.563909e+00
## 193 -5.35 -4.531025e+00
## 194 -5.37 -4.271461e+00
## 195 -5.40 -4.877545e+00
## 196 -5.43 -4.166358e+00
## 197 -5.65 -4.984075e+00
## 198 -5.66 -4.356925e+00
## 199 -6.70 -4.587479e+00
## 200 -5.72 -4.561443e+00
## 201 -6.00 -7.018298e+00
## 202 -6.25 -6.643809e+00
## 203 -6.26 -6.376649e+00
## 204 -6.27 -6.654786e+00
## 205 -6.35 -6.170313e+00
## 206 -6.57 -6.342891e+00
## 207 -6.62 -4.842244e+00
## 208 -6.96 -6.280235e+00
## 209 -7.02 -7.803179e+00
## 210 -7.20 -7.195404e+00
## 211 -7.28 -7.344370e+00
## 212 -7.32 -7.612546e+00
## 213 -7.39 -7.837104e+00
## 214 -7.82 -8.262712e+00
## 215 -8.23 -8.822540e+00
## 216 -8.94 -8.526814e+00
## 217 1.07 1.221488e-01
## 218 0.43 3.174650e-01
## 219 0.32 -1.087638e-01
## 220 0.00 1.497244e-04
## 221 -0.40 -1.316449e+00
## 222 -0.52 -3.207503e-01
## 223 -0.55 -4.866762e-01
## 224 -0.60 -8.038339e-01
## 225 -0.62 -2.058377e+00
## 226 -0.85 -9.838387e-01
## 227 -0.89 -8.581456e-01
## 228 -0.93 -1.231096e+00
## 229 -0.96 -1.979357e-01
## 230 -1.06 -1.416757e+00
## 231 -1.10 -1.161621e+00
## 232 -1.12 -1.132983e+00
## 233 -1.15 -6.601524e-01
## 234 -1.28 -4.816613e-01
## 235 -1.30 -1.264985e+00
## 236 -1.31 -1.320430e+00
## 237 -1.35 -3.245388e+00
## 238 -1.39 -2.065192e+00
## 239 -1.41 -1.299870e+00
## 240 -1.41 -1.277439e+00
## 241 -1.42 -6.929296e-01
## 242 -1.46 -1.827158e+00
## 243 -1.50 -1.628045e+00
## 244 -1.50 -1.952074e+00
## 245 -1.52 -1.947410e+00
## 246 -1.52 -1.354323e+00
## 247 -1.59 -1.757442e+00
## 248 -1.61 -1.333914e+00
## 249 -1.63 -1.059824e+00
## 250 -1.71 -2.249090e+00
## 251 -1.83 -2.417639e+00
## 252 -2.05 -1.511111e+00
## 253 -2.06 -2.567048e+00
## 254 -2.07 -2.790463e+00
## 255 -2.15 -2.824705e+00
## 256 -2.16 -1.367295e+00
## 257 -1.99 -6.977186e-01
## 258 -2.36 -2.102892e+00
## 259 -2.38 -3.211128e+00
## 260 -2.39 -1.688960e+00
## 261 -2.46 -2.103126e+00
## 262 -2.49 -2.239222e+00
## 263 -2.54 -2.531280e+00
## 264 -2.55 -2.618171e+00
## 265 -2.63 -2.682087e+00
## 266 -2.64 -2.131080e+00
## 267 -2.67 -2.637926e+00
## 268 -2.68 -2.204462e+00
## 269 -2.77 -2.946862e+00
## 270 -2.78 -3.149663e+00
## 271 -2.82 -2.917542e+00
## 272 -2.92 -3.519327e+00
## 273 -3.03 -2.857930e+00
## 274 -3.12 -3.571173e+00
## 275 -3.16 -2.835844e+00
## 276 -3.19 -3.610034e+00
## 277 -3.54 -3.522895e+00
## 278 -3.54 -2.603230e+00
## 279 -3.59 -3.528462e+00
## 280 -3.66 -3.734816e+00
## 281 -3.68 -3.620473e+00
## 282 -3.75 -3.780582e+00
## 283 -3.76 -3.823359e+00
## 284 -3.78 -4.155089e+00
## 285 -3.80 -3.935853e+00
## 286 -3.80 -4.474550e+00
## 287 -3.85 -3.945191e+00
## 288 -3.89 -4.292277e+00
## 289 -3.95 -4.025692e+00
## 290 -4.29 -4.995418e+00
## 291 -4.42 -4.714052e+00
## 292 -4.48 -3.679958e+00
## 293 -4.48 -4.167231e+00
## 294 -4.53 -4.984929e+00
## 295 -4.63 -4.420812e+00
## 296 -4.73 -4.404468e+00
## 297 -4.84 -4.030525e+00
## 298 -4.89 -4.271135e+00
## 299 -4.89 -4.997596e+00
## 300 -5.26 -6.030104e+00
## 301 -6.09 -4.678152e+00
## 302 -6.29 -6.102019e+00
## 303 -6.29 -6.583594e+00
## 304 -6.89 -6.388579e+00
## 305 -6.96 -6.376021e+00
## 306 -7.00 -7.061845e+00
## 307 -7.05 -7.878247e+00
## 308 -8.30 -8.766311e+00
## 309 -8.66 -9.350718e+00
## 310 -9.03 -9.559437e+00
## 311 -10.41 -1.000493e+01
## 312 -7.89 -7.574618e+00
## 313 -2.32 -1.948596e+00
## 314 0.39 -2.261666e+00
## 315 -2.90 -4.946440e+00
## 316 -2.47 -5.025486e+00
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(CUB_Test_Metrics <- postResample(CUB_Test[,2], CUB_Test[,1]))
## RMSE Rsquared MAE
## 0.6380865 0.9071665 0.4665613
(CUB_Test_RMSE <- CUB_Test_Metrics[1])
## RMSE
## 0.6380865
(CUB_Test_Rsquared <- CUB_Test_Metrics[2])
## Rsquared
## 0.9071665
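As a cross-check, the postResample() figures can be recomputed by hand; note that caret's R-squared here is the squared Pearson correlation between the predicted and observed values (a minimal sketch):
##################################
# Manually recomputing the test metrics
# to cross-check postResample()
##################################
with(CUB_Test, sqrt(mean((CUB_Predicted - CUB_Observed)^2))) # RMSE
with(CUB_Test, cor(CUB_Predicted, CUB_Observed)^2)           # R-squared
with(CUB_Test, mean(abs(CUB_Predicted - CUB_Observed)))      # MAE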
1.5.16 Model Evaluation Summary
Model performance comparison:
[A] The models that demonstrated the best and most
consistent R-squared and RMSE metrics are as follows:
[A.1] CUB: Cubist
(Cubist package)
[A.1.1] Cross-Validation R-Squared = 0.92538, Test R-Squared = 0.90717
[A.1.2] Cross-Validation RMSE = 0.55645, Test RMSE = 0.63809
[A.2] SVM_R: Support Vector Machine - Radial Basis Function Kernel
(kernlab package)
[A.2.1] Cross-Validation R-Squared = 0.91551, Test R-Squared = 0.90989
[A.2.2] Cross-Validation RMSE = 0.59505, Test RMSE = 0.62742
[A.3] SVM_P: Support Vector Machine - Polynomial Kernel
(kernlab package)
[A.3.1] Cross-Validation R-Squared = 0.91167, Test R-Squared = 0.90623
[A.3.2] Cross-Validation RMSE = 0.60281, Test RMSE = 0.63778
[A.4] RF: Random Forest
(randomForest package)
[A.4.1] Cross-Validation R-Squared = 0.90098, Test R-Squared = 0.90057
[A.4.2] Cross-Validation RMSE = 0.65419, Test RMSE = 0.65571
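Provided the fitted train objects for these models are still in the workspace and share the same cross-validation folds, caret's resamples() supports a paired, fold-level comparison of the top performers. The object names below (SVM_R_Tune, SVM_P_Tune, RF_Tune) are assumptions based on the naming convention used for CUB_Tune:
##################################
# Optional paired comparison of the top models
# across shared cross-validation folds
# (model object names are assumed from the
# document's naming convention)
##################################
CV_Resamples <- resamples(list(CUB = CUB_Tune,
                               SVM_R = SVM_R_Tune,
                               SVM_P = SVM_P_Tune,
                               RF = RF_Tune))
summary(CV_Resamples)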
##################################
# Consolidating all evaluation results
# for the train and test sets
# using the r-squared metric
##################################
Model <- c('LR','PLR_R','PLR_L','PLR_E','PCR','PLS','AVNN','MARS','SVM_R','SVM_P','KNN','CART','CTREE','RF','CUB',
'LR','PLR_R','PLR_L','PLR_E','PCR','PLS','AVNN','MARS','SVM_R','SVM_P','KNN','CART','CTREE','RF','CUB')
Set <- c(rep('Cross-Validation',15),rep('Test',15))
R_Squared <- c(LR_Train_Rsquared,PLR_R_Train_Rsquared,PLR_L_Train_Rsquared,PLR_E_Train_Rsquared,PCR_Train_Rsquared,
PLS_Train_Rsquared,AVNN_Train_Rsquared,MARS_Train_Rsquared,SVM_R_Train_Rsquared,SVM_P_Train_Rsquared,
KNN_Train_Rsquared,CART_Train_Rsquared,CTREE_Train_Rsquared,RF_Train_Rsquared,CUB_Train_Rsquared,
LR_Test_Rsquared,PLR_R_Test_Rsquared,PLR_L_Test_Rsquared,PLR_E_Test_Rsquared,PCR_Test_Rsquared,
PLS_Test_Rsquared,AVNN_Test_Rsquared,MARS_Test_Rsquared,SVM_R_Test_Rsquared,SVM_P_Test_Rsquared,
KNN_Test_Rsquared,CART_Test_Rsquared,CTREE_Test_Rsquared,RF_Test_Rsquared,CUB_Test_Rsquared)
R_Squared_Summary <- as.data.frame(cbind(Model,Set,R_Squared))
R_Squared_Summary$R_Squared <- as.numeric(as.character(R_Squared_Summary$R_Squared))
R_Squared_Summary$Set <- factor(R_Squared_Summary$Set,
levels = c("Cross-Validation",
"Test"))
R_Squared_Summary$Model <- factor(R_Squared_Summary$Model,
levels = c("LR",
"PLR_R",
"PLR_L",
"PLR_E",
"PCR",
"PLS",
"AVNN",
"MARS",
"SVM_R",
"SVM_P",
"KNN",
"CART",
"CTREE",
"RF",
"CUB"))
print(R_Squared_Summary, row.names=FALSE)
## Model Set R_Squared
## LR Cross-Validation 0.8862948
## PLR_R Cross-Validation 0.8968424
## PLR_L Cross-Validation 0.8976271
## PLR_E Cross-Validation 0.8982164
## PCR Cross-Validation 0.8677154
## PLS Cross-Validation 0.8992123
## AVNN Cross-Validation 0.7480237
## MARS Cross-Validation 0.8823049
## SVM_R Cross-Validation 0.9155098
## SVM_P Cross-Validation 0.9116780
## KNN Cross-Validation 0.7326032
## CART Cross-Validation 0.7814193
## CTREE Cross-Validation 0.7794050
## RF Cross-Validation 0.9009781
## CUB Cross-Validation 0.9253788
## LR Test 0.8643929
## PLR_R Test 0.8751709
## PLR_L Test 0.8746818
## PLR_E Test 0.8761762
## PCR Test 0.8351614
## PLS Test 0.8670618
## AVNN Test 0.7829311
## MARS Test 0.8689100
## SVM_R Test 0.9098906
## SVM_P Test 0.9062256
## KNN Test 0.7137298
## CART Test 0.8070706
## CTREE Test 0.7310919
## RF Test 0.9005713
## CUB Test 0.9071665
(R_Squared_Plot <- dotplot(Model ~ R_Squared,
data = R_Squared_Summary,
groups = Set,
main = "Regression Model Performance Comparison",
ylab = "Model",
xlab = "R-Squared",
auto.key = list(adj = 1),
type = c("p", "h"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 2))
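A side note on the consolidation step above: as.data.frame(cbind(Model, Set, R_Squared)) routes the numeric metrics through a character matrix, which is why the as.numeric(as.character(...)) conversion is needed afterwards. Constructing the data frame directly avoids the coercion entirely; the same applies to the RMSE summary below (an equivalent sketch):
##################################
# Equivalent construction that avoids the
# character coercion introduced by cbind()
##################################
# R_Squared_Summary <- data.frame(Model = Model,
#                                 Set = Set,
#                                 R_Squared = R_Squared)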

##################################
# Consolidating all evaluation results
# for the train and test sets
# using the rmse metric
##################################
Model <- c('LR','PLR_R','PLR_L','PLR_E','PCR','PLS','AVNN','MARS','SVM_R','SVM_P','KNN','CART','CTREE','RF','CUB',
'LR','PLR_R','PLR_L','PLR_E','PCR','PLS','AVNN','MARS','SVM_R','SVM_P','KNN','CART','CTREE','RF','CUB')
Set <- c(rep('Cross-Validation',15),rep('Test',15))
RMSE <- c(LR_Train_RMSE,PLR_R_Train_RMSE,PLR_L_Train_RMSE,PLR_E_Train_RMSE,PCR_Train_RMSE,
PLS_Train_RMSE,AVNN_Train_RMSE,MARS_Train_RMSE,SVM_R_Train_RMSE,SVM_P_Train_RMSE,
KNN_Train_RMSE,CART_Train_RMSE,CTREE_Train_RMSE,RF_Train_RMSE,CUB_Train_RMSE,
LR_Test_RMSE,PLR_R_Test_RMSE,PLR_L_Test_RMSE,PLR_E_Test_RMSE,PCR_Test_RMSE,
PLS_Test_RMSE,AVNN_Test_RMSE,MARS_Test_RMSE,SVM_R_Test_RMSE,SVM_P_Test_RMSE,
KNN_Test_RMSE,CART_Test_RMSE,CTREE_Test_RMSE,RF_Test_RMSE,CUB_Test_RMSE)
RMSE_Summary <- as.data.frame(cbind(Model,Set,RMSE))
RMSE_Summary$RMSE <- as.numeric(as.character(RMSE_Summary$RMSE))
RMSE_Summary$Set <- factor(RMSE_Summary$Set,
levels = c("Cross-Validation",
"Test"))
RMSE_Summary$Model <- factor(RMSE_Summary$Model,
levels = c("LR",
"PLR_R",
"PLR_L",
"PLR_E",
"PCR",
"PLS",
"AVNN",
"MARS",
"SVM_R",
"SVM_P",
"KNN",
"CART",
"CTREE",
"RF",
"CUB"))
print(RMSE_Summary, row.names=FALSE)
## Model Set RMSE
## LR Cross-Validation 0.6871912
## PLR_R Cross-Validation 0.6527539
## PLR_L Cross-Validation 0.6489629
## PLR_E Cross-Validation 0.6471642
## PCR Cross-Validation 0.7426083
## PLS Cross-Validation 0.6440406
## AVNN Cross-Validation 1.0608843
## MARS Cross-Validation 0.7034801
## SVM_R Cross-Validation 0.5950500
## SVM_P Cross-Validation 0.6028074
## KNN Cross-Validation 1.0672913
## CART Cross-Validation 0.9490440
## CTREE Cross-Validation 0.9570454
## RF Cross-Validation 0.6541868
## CUB Cross-Validation 0.5564468
## LR Test 0.7725809
## PLR_R Test 0.7414774
## PLR_L Test 0.7389135
## PLR_E Test 0.7351873
## PCR Test 0.8448324
## PLS Test 0.7647343
## AVNN Test 0.9862466
## MARS Test 0.7580420
## SVM_R Test 0.6274210
## SVM_P Test 0.6377764
## KNN Test 1.1247103
## CART Test 0.9194665
## CTREE Test 1.1160100
## RF Test 0.6557191
## CUB Test 0.6380865
(RMSE_Plot <- dotplot(Model ~ RMSE,
data = RMSE_Summary,
groups = Set,
main = "Regression Model Performance Comparison",
ylab = "Model",
xlab = "Root-Mean-Square Error",
auto.key = list(adj = 1),
type = c("p", "h"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 2))
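For a compact report, the two dotplots can be printed side by side on one device using lattice's print method for trellis objects, which accepts split and more arguments for multi-panel layouts (an optional sketch):
##################################
# Optionally printing both performance plots
# side by side on a single device
##################################
print(R_Squared_Plot, split = c(1, 1, 2, 1), more = TRUE)
print(RMSE_Plot, split = c(2, 1, 2, 1), more = FALSE)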
