1.1 Sample Data
The Solubility dataset from the AppliedPredictiveModeling package was used for this illustrative example. Most of the original predictors were removed from the dataset, leaving only the subset of numeric predictors used during the analysis.
Preliminary dataset assessment:
[A] 951 rows (observations)
[A.1] Train Set = 951 observations
[B] 6 columns (variables)
[B.1] 1/6 response = Log_Solubility variable (numeric)
[B.2] 5/6 predictors = All remaining variables
(0/5 factor + 5/5 numeric)
##################################
# Loading R libraries
##################################
library(AppliedPredictiveModeling)
library(caret)
library(rpart)
library(lattice)
library(dplyr)
library(tidyr)
library(moments)
library(skimr)
library(RANN)
library(pls)
library(corrplot)
library(tidyverse)
library(lares)
library(DMwR)
library(gridExtra)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(stats)
##################################
# Loading source and
# formulating the train set
##################################
data(solubility)
Solubility_Train <- as.data.frame(cbind(solTrainY,solTrainX))
##################################
# Selecting only a subset of
# numeric predictors for the train set
##################################
Solubility_Train <- Solubility_Train[,c("solTrainY",
"MolWeight",
"NumCarbon",
"NumChlorine",
"NumHalogen",
"NumMultBonds")]
##################################
# Performing a general exploration of the train set
##################################
dim(Solubility_Train)
## [1] 951 6
str(Solubility_Train)
## 'data.frame': 951 obs. of 6 variables:
## $ solTrainY : num -3.97 -3.98 -3.99 -4 -4.06 -4.08 -4.08 -4.1 -4.1 -4.11 ...
## $ MolWeight : num 208 366 206 136 230 ...
## $ NumCarbon : int 14 21 13 10 9 10 17 12 22 14 ...
## $ NumChlorine : int 0 0 0 0 1 2 2 0 0 0 ...
## $ NumHalogen : int 0 0 0 0 1 2 2 0 1 0 ...
## $ NumMultBonds: int 16 13 7 2 6 2 18 1 4 7 ...
summary(Solubility_Train)
## solTrainY MolWeight NumCarbon NumChlorine
## Min. :-11.620 Min. : 46.09 Min. : 1.000 Min. : 0.0000
## 1st Qu.: -3.955 1st Qu.:122.61 1st Qu.: 6.000 1st Qu.: 0.0000
## Median : -2.510 Median :179.23 Median : 9.000 Median : 0.0000
## Mean : -2.719 Mean :201.65 Mean : 9.893 Mean : 0.5563
## 3rd Qu.: -1.360 3rd Qu.:264.34 3rd Qu.:12.000 3rd Qu.: 0.0000
## Max. : 1.580 Max. :665.81 Max. :33.000 Max. :10.0000
## NumHalogen NumMultBonds
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 0.0000 Median : 6.000
## Mean : 0.6982 Mean : 6.148
## 3rd Qu.: 1.0000 3rd Qu.:10.000
## Max. :10.0000 Max. :25.000
##################################
# Formulating a data type assessment summary
##################################
PDA <- Solubility_Train
(PDA.Summary <- data.frame(
Column.Index=c(1:length(names(PDA))),
Column.Name= names(PDA),
Column.Type=sapply(PDA, function(x) class(x)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type
## 1 1 solTrainY numeric
## 2 2 MolWeight numeric
## 3 3 NumCarbon integer
## 4 4 NumChlorine integer
## 5 5 NumHalogen integer
## 6 6 NumMultBonds integer
1.2 Data Quality Assessment
Data quality assessment:
[A] No missing observations noted for any
variable.
[B] Low variance observed for 2 variables with
First.Second.Mode.Ratio>5.
[B.1] NumChlorine variable (numeric)
[B.2] NumHalogen variable (numeric)
[C] No low variance observed for any variable with
Unique.Count.Ratio<0.01.
[D] High skewness observed for 1 variable with
Skewness>3 or Skewness<(-3).
[D.1] NumChlorine variable (numeric)
##################################
# Loading dataset
##################################
DQA <- Solubility_Train
##################################
# Formulating an overall data quality assessment summary
##################################
(DQA.Summary <- data.frame(
Column.Index=c(1:length(names(DQA))),
Column.Name= names(DQA),
Column.Type=sapply(DQA, function(x) class(x)),
Row.Count=sapply(DQA, function(x) nrow(DQA)),
NA.Count=sapply(DQA,function(x)sum(is.na(x))),
Fill.Rate=sapply(DQA,function(x)format(round((sum(!is.na(x))/nrow(DQA)),3),nsmall=3)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 solTrainY numeric 951 0 1.000
## 2 2 MolWeight numeric 951 0 1.000
## 3 3 NumCarbon integer 951 0 1.000
## 4 4 NumChlorine integer 951 0 1.000
## 5 5 NumHalogen integer 951 0 1.000
## 6 6 NumMultBonds integer 951 0 1.000
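Fill.Rate in the summary above is simply the proportion of non-missing values in a column; a minimal check for one column is sketched below.
##################################
# Manual Fill.Rate check for a single column
# (a sketch; evaluates to 1 since the column
# has no missing values)
##################################
sum(!is.na(DQA$MolWeight))/nrow(DQA)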
##################################
# Listing all predictors
##################################
DQA.Predictors <- DQA[,!names(DQA) %in% c("solTrainY")]
##################################
# Listing all numeric predictors
##################################
DQA.Predictors.Numeric <- DQA.Predictors[,sapply(DQA.Predictors, is.numeric)]
if (length(names(DQA.Predictors.Numeric))>0) {
print(paste0("There are ",
(length(names(DQA.Predictors.Numeric))),
" numeric predictor variable(s)."))
} else {
print("There are no numeric predictor variables.")
}
## [1] "There are 5 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
DQA.Predictors.Factor <- DQA.Predictors[,sapply(DQA.Predictors, is.factor)]
if (length(names(DQA.Predictors.Factor))>0) {
print(paste0("There are ",
(length(names(DQA.Predictors.Factor))),
" factor predictor variable(s)."))
} else {
print("There are no factor predictor variables.")
}
## [1] "There are no factor predictor variables."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor))>0) {
##################################
# Formulating a function to determine the first mode
##################################
FirstModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
##################################
# Formulating a function to determine the second mode
##################################
SecondModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
fm = ux[tab == max(tab)]
sm = x[!(x %in% fm)]
usm <- unique(sm)
tabsm <- tabulate(match(sm, usm))
ifelse(is.na(usm[tabsm == max(tabsm)])==TRUE,
return("x"),
return(usm[tabsm == max(tabsm)]))
}
(DQA.Predictors.Factor.Summary <- data.frame(
Column.Name= names(DQA.Predictors.Factor),
Column.Type=sapply(DQA.Predictors.Factor, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Factor, function(x) length(unique(x))),
First.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(FirstModes(x)[1])),
Second.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(SecondModes(x)[1])),
First.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == SecondModes(x)[1])),
Unique.Count.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Factor)),3), nsmall=3)),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
row.names=NULL)
)
}
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric))>0) {
##################################
# Formulating a function to determine the first mode
##################################
FirstModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
##################################
# Formulating a function to determine the second mode
##################################
SecondModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
fm = ux[tab == max(tab)]
# Excluding the first-mode values before finding the next most frequent value
sm = na.omit(x)[!(na.omit(x) %in% fm)]
usm <- unique(sm)
tabsm <- tabulate(match(sm, usm))
# Returning a small placeholder value when no distinct second mode exists
ifelse(is.na(usm[tabsm == max(tabsm)])==TRUE,
return(0.00001),
return(usm[tabsm == max(tabsm)]))
}
(DQA.Predictors.Numeric.Summary <- data.frame(
Column.Name= names(DQA.Predictors.Numeric),
Column.Type=sapply(DQA.Predictors.Numeric, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Numeric, function(x) length(unique(x))),
Unique.Count.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Numeric)),3), nsmall=3)),
First.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((FirstModes(x)[1]),3),nsmall=3)),
Second.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((SecondModes(x)[1]),3),nsmall=3)),
First.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == SecondModes(x)[1])),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
Minimum=sapply(DQA.Predictors.Numeric, function(x) format(round(min(x,na.rm = TRUE),3), nsmall=3)),
Mean=sapply(DQA.Predictors.Numeric, function(x) format(round(mean(x,na.rm = TRUE),3), nsmall=3)),
Median=sapply(DQA.Predictors.Numeric, function(x) format(round(median(x,na.rm = TRUE),3), nsmall=3)),
Maximum=sapply(DQA.Predictors.Numeric, function(x) format(round(max(x,na.rm = TRUE),3), nsmall=3)),
Skewness=sapply(DQA.Predictors.Numeric, function(x) format(round(skewness(x,na.rm = TRUE),3), nsmall=3)),
Kurtosis=sapply(DQA.Predictors.Numeric, function(x) format(round(kurtosis(x,na.rm = TRUE),3), nsmall=3)),
Percentile25th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.25,na.rm = TRUE),3), nsmall=3)),
Percentile75th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.75,na.rm = TRUE),3), nsmall=3)),
row.names=NULL)
)
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 MolWeight numeric 646 0.679 102.200
## 2 NumCarbon integer 28 0.029 6.000
## 3 NumChlorine integer 11 0.012 0.000
## 4 NumHalogen integer 11 0.012 0.000
## 5 NumMultBonds integer 25 0.026 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 116.230 16 14 1.143
## 2 7.000 105 97 1.082
## 3 1.000 750 81 9.259
## 4 1.000 685 107 6.402
## 5 7.000 158 122 1.295
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 1 46.090 201.654 179.230 665.810 0.988 3.945 122.605
## 2 1.000 9.893 9.000 33.000 0.927 3.616 6.000
## 3 0.000 0.556 0.000 10.000 3.178 13.780 0.000
## 4 0.000 0.698 0.000 10.000 2.691 10.808 0.000
## 5 0.000 6.148 6.000 25.000 0.670 3.053 1.000
## Percentile75th
## 1 264.340
## 2 12.000
## 3 0.000
## 4 1.000
## 5 10.000
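To make the First.Second.Mode.Ratio concrete, the two helper functions defined above can be applied to a small example vector, as sketched below (an illustrative example, separate from the solubility data).
##################################
# Illustrating the first and second mode helpers
# on a small example vector (an illustrative sketch)
##################################
ExampleVector <- c(0, 0, 0, 0, 1, 1, 2)
FirstModes(ExampleVector)[1]
# returns 0, the most frequent value
SecondModes(ExampleVector)[1]
# returns 1, the next most frequent value
# First.Second.Mode.Ratio = first mode count / second mode count = 4/2 = 2
sum(ExampleVector == FirstModes(ExampleVector)[1])/sum(ExampleVector == SecondModes(ExampleVector)[1])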
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
if ((nrow(DQA.Summary[DQA.Summary$NA.Count>0,]))>0){
print(paste0("Missing observations noted for ",
(nrow(DQA.Summary[DQA.Summary$NA.Count>0,])),
" variable(s) with NA.Count>0 and Fill.Rate<1.0."))
DQA.Summary[DQA.Summary$NA.Count>0,]
} else {
print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
if (length(names(DQA.Predictors.Factor))==0) {
print("No factor predictors noted.")
} else if (nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])),
" factor variable(s) with First.Second.Mode.Ratio>5."))
DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,]
} else {
print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "No factor predictors noted."
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])),
" numeric variable(s) with First.Second.Mode.Ratio>5."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,]
} else {
print("No low variance numeric predictors due to high first-second mode ratio noted.")
}
## [1] "Low variance observed for 2 numeric variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 3 NumChlorine integer 11 0.012 0.000
## 4 NumHalogen integer 11 0.012 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 3 1.000 750 81 9.259
## 4 1.000 685 107 6.402
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 3 0.000 0.556 0.000 10.000 3.178 13.780 0.000 0.000
## 4 0.000 0.698 0.000 10.000 2.691 10.808 0.000 1.000
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])),
" numeric variable(s) with Unique.Count.Ratio<0.01."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,]
} else {
print("No low variance numeric predictors due to low unique count ratio noted.")
}
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])>0){
print(paste0("High skewness observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])),
" numeric variable(s) with Skewness>3 or Skewness<(-3)."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),]
} else {
print("No skewed numeric predictors noted.")
}
## [1] "High skewness observed for 1 numeric variable(s) with Skewness>3 or Skewness<(-3)."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 3 NumChlorine integer 11 0.012 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 3 1.000 750 81 9.259
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 3 0.000 0.556 0.000 10.000 3.178 13.780 0.000 0.000
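For reference, the flagged skewness statistic can be reproduced directly with the moments package, as sketched below.
##################################
# Direct skewness check for the flagged predictor
# (a sketch using the moments package loaded above)
##################################
skewness(Solubility_Train$NumChlorine)
# approximately 3.178, above the +3 threshold applied above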
1.3 Data Preprocessing
1.3.1 Outlier
Outlier data assessment:
[A] Outliers were noted for 5 variables. The numeric data were visualized through boxplots, with observations classified as suspected outliers using the IQR criterion, as illustrated in the sketch after this list. Under the IQR criterion, all observations above the (75th percentile + 1.5 x IQR) or below the (25th percentile - 1.5 x IQR) are suspected outliers, where IQR is the difference between the third quartile (75th percentile) and the first quartile (25th percentile). Outlier treatment for numerical stability remains optional depending on potential model requirements in the subsequent steps.
[A.1] MolWeight variable (8 outliers detected)
[A.2] NumMultBonds variable (6 outliers detected)
[A.3] NumCarbon variable (35 outliers detected)
[A.4] NumChlorine variable (201 outliers detected)
[A.5] NumHalogen variable (99 outliers detected)
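A manual illustration of the IQR criterion for a single predictor is given below (a minimal sketch assuming Solubility_Train has been formed; boxplot.stats, used in the subsequent code, relies on hinges and may count marginally differently).
##################################
# Illustrating the IQR criterion manually
# for a single predictor (MolWeight)
##################################
MolWeight_Values <- Solubility_Train$MolWeight
MolWeight_Q1  <- unname(quantile(MolWeight_Values, probs=0.25))
MolWeight_Q3  <- unname(quantile(MolWeight_Values, probs=0.75))
MolWeight_IQR <- MolWeight_Q3 - MolWeight_Q1
MolWeight_LowerFence <- MolWeight_Q1 - 1.5*MolWeight_IQR
MolWeight_UpperFence <- MolWeight_Q3 + 1.5*MolWeight_IQR
# Observations beyond either fence are the suspected outliers
sum(MolWeight_Values < MolWeight_LowerFence | MolWeight_Values > MolWeight_UpperFence)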
##################################
# Loading dataset
##################################
DPA <- Solubility_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("solTrainY")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Identifying outliers for the numeric predictors
##################################
OutlierCountList <- c()
for (i in 1:ncol(DPA.Predictors.Numeric)) {
Outliers <- boxplot.stats(DPA.Predictors.Numeric[,i])$out
OutlierCount <- length(Outliers)
OutlierCountList <- append(OutlierCountList,OutlierCount)
OutlierIndices <- which(DPA.Predictors.Numeric[,i] %in% c(Outliers))
boxplot(DPA.Predictors.Numeric[,i],
ylab = names(DPA.Predictors.Numeric)[i],
main = names(DPA.Predictors.Numeric)[i],
horizontal=TRUE)
mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}
(Horizontal boxplots of MolWeight, NumCarbon, NumChlorine, NumHalogen, and NumMultBonds, each annotated with its detected outlier count.)
OutlierCountSummary <- as.data.frame(cbind(names(DPA.Predictors.Numeric),(OutlierCountList)))
names(OutlierCountSummary) <- c("NumericPredictors","OutlierCount")
OutlierCountSummary$OutlierCount <- as.numeric(as.character(OutlierCountSummary$OutlierCount))
NumericPredictorWithOutlierCount <- nrow(OutlierCountSummary[OutlierCountSummary$OutlierCount>0,])
print(paste0(NumericPredictorWithOutlierCount, " numeric variable(s) were noted with outlier(s)." ))
## [1] "5 numeric variable(s) were noted with outlier(s)."
##################################
# Gathering descriptive statistics
##################################
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
Data summary

|                        |                        |
|------------------------|------------------------|
| Name                   | DPA.Predictors.Numeric |
| Number of rows         | 951                    |
| Number of columns      | 5                      |
| Column type frequency: |                        |
| numeric                | 5                      |
| Group variables        | None                   |

Variable type: numeric

| skim_variable | n_missing | complete_rate | mean   | sd    | p0    | p25   | p50    | p75    | p100   | hist  |
|---------------|-----------|---------------|--------|-------|-------|-------|--------|--------|--------|-------|
| MolWeight     | 0         | 1             | 201.65 | 97.91 | 46.09 | 122.6 | 179.23 | 264.34 | 665.81 | ▇▆▂▁▁ |
| NumCarbon     | 0         | 1             | 9.89   | 5.29  | 1.00  | 6.0   | 9.00   | 12.00  | 33.00  | ▇▇▃▁▁ |
| NumChlorine   | 0         | 1             | 0.56   | 1.40  | 0.00  | 0.0   | 0.00   | 0.00   | 10.00  | ▇▁▁▁▁ |
| NumHalogen    | 0         | 1             | 0.70   | 1.47  | 0.00  | 0.0   | 0.00   | 1.00   | 10.00  | ▇▁▁▁▁ |
| NumMultBonds  | 0         | 1             | 6.15   | 5.17  | 0.00  | 1.0   | 6.00   | 10.00  | 25.00  | ▇▆▃▁▁ |
###################################
# Verifying the data dimensions
###################################
dim(DPA.Predictors.Numeric)
## [1] 951 5
1.3.2 Zero and Near-Zero Variance
Zero and near-zero variance data assessment:
[A] Low variance was noted for 2 variables (NumChlorine and NumHalogen) in the previous data quality assessment, which used a lower threshold (First.Second.Mode.Ratio>5).
[B] No low variance was noted for any variable using the preprocessing summary from the caret package. The nearZeroVar method, with the freqCut and uniqueCut criteria set at 95/5 and 10, respectively, was applied to the dataset; a manual illustration of these criteria is sketched below.
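The sketch below approximates the nearZeroVar flagging rule by hand for a single predictor (NumChlorine), using the frequency ratio and percent-unique quantities that correspond to the freqCut and uniqueCut criteria (an illustrative sketch; the exact behaviour follows the caret implementation).
##################################
# Approximating the nearZeroVar criteria manually
# for a single predictor (NumChlorine)
##################################
NumChlorine_Frequencies <- sort(table(Solubility_Train$NumChlorine), decreasing=TRUE)
# Frequency ratio: most common value count over second most common value count
(NumChlorine_FreqRatio <- as.numeric(NumChlorine_Frequencies[1]/NumChlorine_Frequencies[2]))
# Percent unique: unique values as a percentage of the number of observations
(NumChlorine_PercentUnique <- 100*length(unique(Solubility_Train$NumChlorine))/nrow(Solubility_Train))
# A predictor is flagged only when both criteria are breached:
# frequency ratio above 95/5 = 19 and percent unique below 10
NumChlorine_FreqRatio > 95/5 & NumChlorine_PercentUnique < 10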
##################################
# Loading dataset
##################################
DPA <- Solubility_Train
##################################
# Gathering descriptive statistics
##################################
(DPA_Skimmed <- skim(DPA))
Data summary

|                        |      |
|------------------------|------|
| Name                   | DPA  |
| Number of rows         | 951  |
| Number of columns      | 6    |
| Column type frequency: |      |
| numeric                | 6    |
| Group variables        | None |

Variable type: numeric

| skim_variable | n_missing | complete_rate | mean   | sd    | p0     | p25    | p50    | p75    | p100   | hist  |
|---------------|-----------|---------------|--------|-------|--------|--------|--------|--------|--------|-------|
| solTrainY     | 0         | 1             | -2.72  | 2.05  | -11.62 | -3.96  | -2.51  | -1.36  | 1.58   | ▁▁▃▇▃ |
| MolWeight     | 0         | 1             | 201.65 | 97.91 | 46.09  | 122.60 | 179.23 | 264.34 | 665.81 | ▇▆▂▁▁ |
| NumCarbon     | 0         | 1             | 9.89   | 5.29  | 1.00   | 6.00   | 9.00   | 12.00  | 33.00  | ▇▇▃▁▁ |
| NumChlorine   | 0         | 1             | 0.56   | 1.40  | 0.00   | 0.00   | 0.00   | 0.00   | 10.00  | ▇▁▁▁▁ |
| NumHalogen    | 0         | 1             | 0.70   | 1.47  | 0.00   | 0.00   | 0.00   | 1.00   | 10.00  | ▇▁▁▁▁ |
| NumMultBonds  | 0         | 1             | 6.15   | 5.17  | 0.00   | 1.00   | 6.00   | 10.00  | 25.00  | ▇▆▃▁▁ |
##################################
# Identifying columns with low variance
###################################
DPA_LowVariance <- nearZeroVar(DPA,
freqCut = 95/5,
uniqueCut = 10,
saveMetrics= TRUE)
(DPA_LowVariance[DPA_LowVariance$nzv,])
## [1] freqRatio percentUnique zeroVar nzv
## <0 rows> (or 0-length row.names)
if ((nrow(DPA_LowVariance[DPA_LowVariance$nzv,]))==0){
print("No low variance predictors noted.")
} else {
print(paste0("Low variance observed for ",
(nrow(DPA_LowVariance[DPA_LowVariance$nzv,])),
" numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."))
DPA_LowVarianceForRemoval <- (nrow(DPA_LowVariance[DPA_LowVariance$nzv,]))
print(paste0("Low variance can be resolved by removing ",
(nrow(DPA_LowVariance[DPA_LowVariance$nzv,])),
" numeric variable(s)."))
for (j in 1:DPA_LowVarianceForRemoval) {
DPA_LowVarianceRemovedVariable <- rownames(DPA_LowVariance[DPA_LowVariance$nzv,])[j]
print(paste0("Variable ",
j,
" for removal: ",
DPA_LowVarianceRemovedVariable))
}
DPA %>%
skim() %>%
dplyr::filter(skim_variable %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv,]))
##################################
# Filtering out columns with low variance
#################################
DPA_ExcludedLowVariance <- DPA[,!names(DPA) %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv,])]
##################################
# Gathering descriptive statistics
##################################
(DPA_ExcludedLowVariance_Skimmed <- skim(DPA_ExcludedLowVariance))
}
## [1] "No low variance predictors noted."
1.3.3 Collinearity
High collinearity data assessment:
[A] No pairwise correlation higher than 0.95 was noted for any variable pair, as confirmed using the preprocessing summaries from the caret and lares packages.
##################################
# Loading dataset
##################################
DPA <- Solubility_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("solTrainY")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Visualizing pairwise correlation between predictors
##################################
DPA_CorrelationTest <- cor.mtest(DPA.Predictors.Numeric,
method = "pearson",
conf.level = .95)
corrplot(cor(DPA.Predictors.Numeric,
method = "pearson",
use="pairwise.complete.obs"),
method = "circle",
type = "upper",
order = "original",
tl.col = "black",
tl.cex = 0.75,
tl.srt = 90,
sig.level = 0.05,
p.mat = DPA_CorrelationTest$p,
insig = "blank")

##################################
# Identifying the highly correlated variables
##################################
DPA_Correlation <- cor(DPA.Predictors.Numeric,
method = "pearson",
use="pairwise.complete.obs")
(DPA_HighlyCorrelatedCount <- sum(abs(DPA_Correlation[upper.tri(DPA_Correlation)]) > 0.95))
## [1] 0
if (DPA_HighlyCorrelatedCount == 0) {
print("No highly correlated predictors noted.")
} else {
print(paste0("High correlation observed for ",
(DPA_HighlyCorrelatedCount),
" pairs of numeric variable(s) with Correlation.Coefficient>0.95."))
(DPA_HighlyCorrelatedPairs <- corr_cross(DPA.Predictors.Numeric,
max_pvalue = 0.05,
top = DPA_HighlyCorrelatedCount,
rm.na = TRUE,
grid = FALSE
))
}
## [1] "No highly correlated predictors noted."
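Since corr_cross is invoked above only inside the conditional branch, the lares ranking of pairwise correlations can also be inspected on its own, as sketched below (assumes DPA.Predictors.Numeric has been formed above).
##################################
# Ranking the strongest pairwise correlations
# with the lares package (an illustrative sketch)
##################################
# Retain significant correlations and display the ten strongest pairs
corr_cross(DPA.Predictors.Numeric,
max_pvalue = 0.05,
top = 10)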
if (DPA_HighlyCorrelatedCount > 0) {
DPA_HighlyCorrelated <- findCorrelation(DPA_Correlation, cutoff = 0.95)
(DPA_HighlyCorrelatedForRemoval <- length(DPA_HighlyCorrelated))
print(paste0("High correlation can be resolved by removing ",
(DPA_HighlyCorrelatedForRemoval),
" numeric variable(s)."))
for (j in 1:DPA_HighlyCorrelatedForRemoval) {
DPA_HighlyCorrelatedRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_HighlyCorrelated[j]]
print(paste0("Variable ",
j,
" for removal: ",
DPA_HighlyCorrelatedRemovedVariable))
}
##################################
# Filtering out columns with high correlation
#################################
DPA_ExcludedHighCorrelation <- DPA[,-DPA_HighlyCorrelated]
##################################
# Gathering descriptive statistics
##################################
(DPA_ExcludedHighCorrelation_Skimmed <- skim(DPA_ExcludedHighCorrelation))
}
1.3.4 Linear Dependencies
Linear dependency data assessment:
[A] No linear dependencies were noted for any subset of variables, based on the preprocessing summary from the caret package using the findLinearCombos method, which applies the QR decomposition of the matrix to enumerate sets of linear combinations (if they exist); a toy illustration is sketched below.
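To make the method concrete, the sketch below applies findLinearCombos to a small toy matrix containing a known linear combination (an illustrative example, separate from the solubility data).
##################################
# Toy illustration of findLinearCombos
# on a matrix with a known linear dependency
##################################
ToyMatrix <- cbind(x1 = c(1, 2, 3, 4),
x2 = c(2, 1, 0, 1),
x3 = c(3, 3, 3, 5))
# x3 = x1 + x2 by construction
findLinearCombos(ToyMatrix)
# linearCombos lists the dependent column subset and
# remove suggests which column(s) to drop (here, x3)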
##################################
# Loading dataset
##################################
DPA <- Solubility_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("solTrainY")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Identifying the linearly dependent variables
##################################
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)
(DPA_LinearlyDependentCount <- length(DPA_LinearlyDependent$linearCombos))
## [1] 0
if (DPA_LinearlyDependentCount == 0) {
print("No linearly dependent predictors noted.")
} else {
print(paste0("Linear dependency observed for ",
(DPA_LinearlyDependentCount),
" subset(s) of numeric variable(s)."))
for (i in 1:DPA_LinearlyDependentCount) {
DPA_LinearlyDependentSubset <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$linearCombos[[i]]]
print(paste0("Linear dependent variable(s) for subset ",
i,
" include: ",
DPA_LinearlyDependentSubset))
}
}
## [1] "No linearly dependent predictors noted."
##################################
# Identifying the linearly dependent variables for removal
##################################
if (DPA_LinearlyDependentCount > 0) {
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)
DPA_LinearlyDependentForRemoval <- length(DPA_LinearlyDependent$remove)
print(paste0("Linear dependency can be resolved by removing ",
(DPA_LinearlyDependentForRemoval),
" numeric variable(s)."))
for (j in 1:DPA_LinearlyDependentForRemoval) {
DPA_LinearlyDependentRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove[j]]
print(paste0("Variable ",
j,
" for removal: ",
DPA_LinearlyDependentRemovedVariable))
}
##################################
# Filtering out columns with linear dependency
#################################
DPA_ExcludedLinearlyDependent <- DPA[,-DPA_LinearlyDependent$remove]
##################################
# Gathering descriptive statistics
##################################
(DPA_ExcludedLinearlyDependent_Skimmed <- skim(DPA_ExcludedLinearlyDependent))
}
1.3.6 Centering and Scaling
Centering and scaling data assessment:
[A] To maintain numerical stability during modelling, centering and scaling transformations were applied to the transformed numeric variables. The center method from the caret package subtracts the mean of a numeric variable from each of its values, so the centered variables have zero mean. The scale method, also from the caret package, divides each value of a centered variable by its standard deviation, so the scaled variables share a common standard deviation of one. A manual equivalent for a single predictor is sketched below.
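The same transformations applied by preProcess further below can be reproduced manually; the sketch applies them to one raw predictor for illustration (the actual pipeline applies them to the Box-Cox-transformed variables).
##################################
# Manual equivalent of the center and scale
# transformations for a single raw predictor
# (an illustrative sketch)
##################################
MolWeight_Raw <- Solubility_Train$MolWeight
MolWeight_CenteredScaled <- (MolWeight_Raw - mean(MolWeight_Raw))/sd(MolWeight_Raw)
# After centering and scaling, the variable has mean 0 and standard deviation 1
c(Mean = mean(MolWeight_CenteredScaled), SD = sd(MolWeight_CenteredScaled))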
##################################
# Loading dataset
##################################
DPA <- Solubility_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("solTrainY")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Applying a Box-Cox transformation
##################################
DPA_BoxCox <- preProcess(DPA.Predictors.Numeric, method = c("BoxCox"))
DPA_BoxCoxTransformed <- predict(DPA_BoxCox, DPA.Predictors.Numeric)
##################################
# Applying a center and scale data transformation
##################################
DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled <- preProcess(DPA_BoxCoxTransformed, method = c("center","scale"))
DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed <- predict(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled, DPA_BoxCoxTransformed)
##################################
# Gathering descriptive statistics
##################################
(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformedSkimmed <- skim(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed))
Data summary

|                        |                            |
|------------------------|----------------------------|
| Name                   | DPA.Predictors.Numeric_Bo… |
| Number of rows         | 951                        |
| Number of columns      | 5                          |
| Column type frequency: |                            |
| numeric                | 5                          |
| Group variables        | None                       |

Variable type: numeric

| skim_variable | n_missing | complete_rate | mean | sd | p0    | p25   | p50   | p75   | p100 | hist  |
|---------------|-----------|---------------|------|----|-------|-------|-------|-------|------|-------|
| MolWeight     | 0         | 1             | 0    | 1  | -2.84 | -0.80 | -0.01 | 0.80  | 2.72 | ▁▆▇▆▁ |
| NumCarbon     | 0         | 1             | 0    | 1  | -2.64 | -0.69 | -0.01 | 0.54  | 3.06 | ▂▇▇▃▁ |
| NumChlorine   | 0         | 1             | 0    | 1  | -0.40 | -0.40 | -0.40 | -0.40 | 6.74 | ▇▁▁▁▁ |
| NumHalogen    | 0         | 1             | 0    | 1  | -0.47 | -0.47 | -0.47 | 0.20  | 6.32 | ▇▁▁▁▁ |
| NumMultBonds  | 0         | 1             | 0    | 1  | -1.19 | -1.00 | -0.03 | 0.74  | 3.65 | ▇▇▃▁▁ |
###################################
# Verifying the data dimensions
###################################
dim(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed)
## [1] 951 5
1.3.7 Pre-Processed Dataset
Preliminary dataset assessment:
[A] 951 rows (observations)
[A.1] Train Set = 951 observations
[B] 6 columns (variables)
[B.1] 1/6 response = Log_Solubility variable (numeric)
[B.2] 5/6 predictors = All remaining variables
(0/5 factor + 5/5 numeric)
[C] Pre-processing actions applied:
[C.1] Centering, scaling and shape transformation
applied to improve data quality
[C.2] No outlier treatment applied since the high
values noted were contextually valid and sensible
[C.3] No predictors removed due to zero or
near-zero variance
[C.4] No predictors removed due to high
correlation
[C.5] No predictors removed due to linear
dependencies
##################################
# Creating the pre-modelling
# train set
##################################
Log_Solubility <- DPA$solTrainY
PMA.Predictors.Numeric <- DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed
PMA_BoxCoxTransformed_CenteredScaledTransformed <- cbind(Log_Solubility,PMA.Predictors.Numeric)
PMA_PreModelling_Train <- PMA_BoxCoxTransformed_CenteredScaledTransformed
##################################
# Gathering descriptive statistics
##################################
(PMA_PreModelling_Train_Skimmed <- skim(PMA_PreModelling_Train))
Data summary

|                        |                        |
|------------------------|------------------------|
| Name                   | PMA_PreModelling_Train |
| Number of rows         | 951                    |
| Number of columns      | 6                      |
| Column type frequency: |                        |
| numeric                | 6                      |
| Group variables        | None                   |

Variable type: numeric

| skim_variable  | n_missing | complete_rate | mean  | sd   | p0     | p25   | p50   | p75   | p100 | hist  |
|----------------|-----------|---------------|-------|------|--------|-------|-------|-------|------|-------|
| Log_Solubility | 0         | 1             | -2.72 | 2.05 | -11.62 | -3.96 | -2.51 | -1.36 | 1.58 | ▁▁▃▇▃ |
| MolWeight      | 0         | 1             | 0.00  | 1.00 | -2.84  | -0.80 | -0.01 | 0.80  | 2.72 | ▁▆▇▆▁ |
| NumCarbon      | 0         | 1             | 0.00  | 1.00 | -2.64  | -0.69 | -0.01 | 0.54  | 3.06 | ▂▇▇▃▁ |
| NumChlorine    | 0         | 1             | 0.00  | 1.00 | -0.40  | -0.40 | -0.40 | -0.40 | 6.74 | ▇▁▁▁▁ |
| NumHalogen     | 0         | 1             | 0.00  | 1.00 | -0.47  | -0.47 | -0.47 | 0.20  | 6.32 | ▇▁▁▁▁ |
| NumMultBonds   | 0         | 1             | 0.00  | 1.00 | -1.19  | -1.00 | -0.03 | 0.74  | 3.65 | ▇▇▃▁▁ |
###################################
# Verifying the data dimensions
# for the train set
###################################
dim(PMA_PreModelling_Train)
## [1] 951 6
1.5 Linear Regression Model Coefficient Estimation
1.5.1 Linear Regression - Normal Equations (LR_NE)
Normal Equations are a system of equations whose solution is the Ordinary Least Squares (OLS) estimator of the regression coefficients; they are derived from the first-order conditions of the least squares minimization problem and are obtained by setting the partial derivatives of the sum of squared errors to zero. This approach is a closed-form, one-step algorithm used to analytically find the coefficients that minimize the loss function.
[A] Applying normal equations, the estimated linear
regression coefficients for the given data are as follows:
[A.1] Intercept = -2.71856
[A.2] MolWeight = +0.20493
[A.3] NumCarbon = -1.25425
[A.4] NumChlorine = -0.14419
[A.5] NumHalogen = -1.01350
[A.6] NumMultBonds = -0.33048
[B] These estimated coefficients will be the baseline
values from which all gradient descent algorithm-derived coefficients
will be compared with.
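As an independent check on the closed-form solution, the same coefficients can also be obtained from R's built-in lm function; a minimal sketch is given below (lm is used purely as a cross-check and is not part of the estimation workflow).
##################################
# Cross-checking the normal equations against
# R's built-in least squares fit
# (a sketch; assumes PMA_PreModelling_Train
# is already in memory)
##################################
LR_LM_CrossCheck <- lm(Log_Solubility ~ MolWeight + NumCarbon + NumChlorine + NumHalogen + NumMultBonds,
data = PMA_PreModelling_Train)
coef(LR_LM_CrossCheck)
# The coefficients are expected to match the normal-equations estimates below
# up to numerical precision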
##################################
# Defining a function to implement
# normal equations for estimating
# linear regression coefficients
##################################
NormalEquations_LREstimation <- function(y, X){
# Augmenting the design matrix with an intercept column of ones
X = data.frame(rep(1,length(y)),X)
X = as.matrix(X)
# Closed-form OLS solution: inverse(X'X) %*% X'y
LRcoefficients = solve(t(X)%*%X)%*%t(X)%*%y
return(LRcoefficients)
}
##################################
# Loading dataset
# and restructuring to the
# y and X components
##################################
y <- PMA_PreModelling_Train$Log_Solubility
x1_MolWeight <- PMA_PreModelling_Train$MolWeight
x2_NumCarbon <- PMA_PreModelling_Train$NumCarbon
x3_NumChlorine <- PMA_PreModelling_Train$NumChlorine
x4_NumHalogen <- PMA_PreModelling_Train$NumHalogen
x5_NumMultBonds <- PMA_PreModelling_Train$NumMultBonds
X = data.frame(x1_MolWeight,
x2_NumCarbon,
x3_NumChlorine,
x4_NumHalogen,
x5_NumMultBonds)
##################################
# Estimating the linear regression coefficients
# using the normal equations algorithm
##################################
LR_NE <- NormalEquations_LREstimation(y = y,
X = X)
##################################
# Consolidating all estimated
# linear regression coefficients
# using the normal equations algorithm
##################################
LR_NE <- as.data.frame(LR_NE)
rownames(LR_NE) <- NULL
colnames(LR_NE) <- c("LRCoefficients")
LR_NE$LRCoefficientNames <- c("Intercept",
"MolWeight",
"NumCarbon",
"NumChlorine",
"NumHalogen",
"NumMultBonds")
LR_NE$EstimationMethod <- rep("LR_NE",nrow(LR_NE))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the normal equations algorithm
##################################
print(LR_NE)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7185699 Intercept LR_NE
## 2 0.2049318 MolWeight LR_NE
## 3 -1.2542520 NumCarbon LR_NE
## 4 -0.1441934 NumChlorine LR_NE
## 5 -1.0135099 NumHalogen LR_NE
## 6 -0.3304828 NumMultBonds LR_NE
1.5.2 Linear Regression - Gradient Descent Algorithm with Very High
Learning Rate and Low Epoch Count (LR_GDA_VHLR_LEC)
Gradient Descent minimizes the loss function parameterized by the model’s coefficients, with the gradient direction and the learning rate governing the updates computed from the partial derivatives at each iteration and allowing the algorithm to gradually arrive at the local or global minimum considered the point of convergence. This particular implementation used Batch Gradient Descent, which computes the gradient of the loss function with respect to the parameters over the entire data set. A very high learning rate (also referred to as the step size or alpha) and a low epoch count were applied, resulting in larger steps whose risk of overshooting the minimum was limited by the small number of iterations.
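Conceptually, with N observations and a given learning rate, each batch update moves the coefficient vector one step in the direction opposite to the gradient of the sum-of-squared-errors loss. The toy sketch below illustrates a single such update on made-up data (an illustrative example, separate from the solubility analysis; the implementation that follows uses the same gradient expression but folds additional 1/N and 2/N factors into its effective step size).
##################################
# Minimal illustration of one batch gradient
# descent update on toy data
##################################
set.seed(12345678)
Toy_N     <- 5
Toy_X     <- cbind(1, rnorm(Toy_N))
# intercept column plus one predictor
Toy_y     <- c(1.2, 0.8, 1.5, 0.9, 1.1)
Toy_Theta <- c(0, 0)
Toy_Alpha <- 0.1
# Gradient of the sum-of-squared-errors loss with respect to the coefficients
Toy_Gradient <- -(2/Toy_N) * t(Toy_X) %*% (Toy_y - Toy_X %*% Toy_Theta)
# One update step in the direction opposite to the gradient
(Toy_Theta <- Toy_Theta - Toy_Alpha * as.vector(Toy_Gradient))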
[A] The gradient descent algorithm was implemented with
parameter settings described as follows:
[A.1] Learning
Rate = 200 (Very High)
[A.2] Epochs =
10 (Low)
[B] The final gradient norm was determined as 0.42670
at the 10th epoch indicating that the minimum threshold of 0.00010 was
not achieved up until the last epoch.
[C] Applying the gradient descent algorithm with a very high learning rate and low epoch count, the estimated linear regression coefficients for the given data are as follows:
[C.1] Intercept = -2.71856 (Baseline =
-2.71856)
[C.2] MolWeight = +0.26840 (Baseline =
+0.20493)
[C.3] NumCarbon = -1.41262 (Baseline =
-1.25425)
[C.4] NumChlorine = -0.60097 (Baseline =
-0.14419)
[C.5] NumHalogen = -0.68157 (Baseline =
-1.01350)
[C.6] NumMultBonds = -0.34150 (Baseline =
-0.33048)
[D] The estimated coefficients using the gradient descent algorithm with a very high learning rate and low epoch count, while not fully optimized, were sufficiently comparable with the baseline coefficients using normal equations.
##################################
# Defining a function to implement
# gradient descent algorithm for estimating
# linear regression coefficients
##################################
GradientDescent_LREstimation <- function(y, X, GradientNormMinimumThreshold = 0.0001, LearningRate, Epochs){
# Augmenting the design matrix with an intercept column of ones
X = as.matrix(data.frame(rep(1,length(y)),X))
N = dim(X)[1]
print("Initializing Gradient Descent Algorithm Parameters.")
# Initializing the coefficients randomly from a standard normal distribution
set.seed(12345678)
Theta.InitialValue = as.matrix(rnorm(n=dim(X)[2], mean=0,sd = 1))
Theta.InitialValue = t(Theta.InitialValue)
# Taking one preliminary step from the random starting point
e = t(y) - Theta.InitialValue%*%t(X)
Gradient.InitialValue = -(2/N)%*%(e)%*%X
Theta = Theta.InitialValue - LearningRate *(1/N)*Gradient.InitialValue
L2Loss = c()
for(i in 1:Epochs){
# Tracking the L2 loss (root of the sum of squared residuals) at each epoch
L2Loss = c(L2Loss,sqrt(sum((t(y) - Theta%*%t(X))^2)))
# Computing the batch gradient over the entire training set
e = t(y) - Theta%*%t(X)
grad = -(2/N)%*%e%*%X
Theta = Theta - LearningRate*(2/N)*grad
# Stopping early once the gradient norm falls below the minimum threshold
if(sqrt(sum(grad^2)) <= GradientNormMinimumThreshold){
break
}
}
if (i < Epochs) {
print("Gradient Descent Algorithm Converged.")
}
if (i == Epochs) {
print("Gradient Descent Algorithm Reached Last Epoch Without Convergence.")
print("Minimum Threshold for Gradient Norm = 0.0001 Not Achieved.")
}
print(paste("Final Gradient Norm Determined as ",sqrt(sum(grad^2)),"at Epoch",i))
# Returning both the final coefficients and the per-epoch loss values
GradientDescentAlgorithmValues <- list("LRCoefficients" = t(Theta), "L2Loss" = L2Loss)
return(GradientDescentAlgorithmValues)
}
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and low epoch count
##################################
LR_GDA_VHLR_LEC = GradientDescent_LREstimation(y = y,
X = X,
LearningRate = 200,
Epochs = 10)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 0.42670603153189 at Epoch 10"
LR_GDA_VHLR_LEC_Summary <- LR_GDA_VHLR_LEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and low epoch count
##################################
LR_GDA_VHLR_LEC <- as.data.frame(LR_GDA_VHLR_LEC_Summary$LRCoefficients)
rownames(LR_GDA_VHLR_LEC) <- NULL
colnames(LR_GDA_VHLR_LEC) <- c("LRCoefficients")
LR_GDA_VHLR_LEC$LRCoefficientNames <- c("Intercept",
"MolWeight",
"NumCarbon",
"NumChlorine",
"NumHalogen",
"NumMultBonds")
LR_GDA_VHLR_LEC$EstimationMethod <- rep("LR_GDA_VHLR_LEC",nrow(LR_GDA_VHLR_LEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and low epoch count
##################################
print(LR_GDA_VHLR_LEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7185699 Intercept LR_GDA_VHLR_LEC
## 2 0.2684003 MolWeight LR_GDA_VHLR_LEC
## 3 -1.4126207 NumCarbon LR_GDA_VHLR_LEC
## 4 -0.6009731 NumChlorine LR_GDA_VHLR_LEC
## 5 -0.6815776 NumHalogen LR_GDA_VHLR_LEC
## 6 -0.3415085 NumMultBonds LR_GDA_VHLR_LEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with very high learning rate and low epoch count
##################################
LR_GDA_VHLR_LEC_Summary$Epoch <- 1:length(LR_GDA_VHLR_LEC_Summary$L2Loss)
LR_GDA_VHLR_LEC_Summary$Method <- rep("LR_GDA_VHLR_LEC",length(LR_GDA_VHLR_LEC_Summary$L2Loss))
L2Loss <- LR_GDA_VHLR_LEC_Summary$L2Loss
Epoch <- LR_GDA_VHLR_LEC_Summary$Epoch
Method <- LR_GDA_VHLR_LEC_Summary$Method
(LR_GDA_VHLR_LEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "82.9563953049771" "1" "LR_GDA_VHLR_LEC"
## [2,] "43.3675127878632" "2" "LR_GDA_VHLR_LEC"
## [3,] "40.1055331895743" "3" "LR_GDA_VHLR_LEC"
## [4,] "38.9740740659105" "4" "LR_GDA_VHLR_LEC"
## [5,] "38.2468234561932" "5" "LR_GDA_VHLR_LEC"
## [6,] "37.7326021491187" "6" "LR_GDA_VHLR_LEC"
## [7,] "37.3604396276862" "7" "LR_GDA_VHLR_LEC"
## [8,] "37.0910709007472" "8" "LR_GDA_VHLR_LEC"
## [9,] "36.8989019283906" "9" "LR_GDA_VHLR_LEC"
## [10,] "36.7660015463775" "10" "LR_GDA_VHLR_LEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with very high learning rate and low epoch count
##################################
xyplot(L2Loss ~ Epoch,
data = LR_GDA_VHLR_LEC_Summary,
main = "loss function Optimization Profile : LR_GDA_VHLR_LEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))

1.5.3 Linear Regression - Gradient Descent Algorithm with Very High
Learning Rate and High Epoch Count (LR_GDA_VHLR_HEC)
Gradient Descent minimizes the loss function parameterized by the model’s coefficients, with the gradient direction and the learning rate governing the updates computed from the partial derivatives at each iteration and allowing the algorithm to gradually arrive at the local or global minimum considered the point of convergence. This particular implementation used Batch Gradient Descent, which computes the gradient of the loss function with respect to the parameters over the entire data set. A very high learning rate (also referred to as the step size or alpha) and a high epoch count were applied, resulting in larger steps with a greater risk of overshooting the minimum over the larger number of iterations.
[A] The gradient descent algorithm was implemented with
parameter settings described as follows:
[A.1] Learning
Rate = 200 (Very High)
[A.2] Epochs =
50 (High)
[B] The final gradient norm was determined as 11.84773
at the 50th epoch indicating that the minimum threshold of 0.00010 was
not achieved prior to the last epoch.
[C] Applying the gradient descent algorithm with a very high learning rate and high epoch count, the estimated linear regression coefficients for the given data are as follows:
[C.1] Intercept = -2.71857 (Baseline =
-2.71856)
[C.2] MolWeight = -1.26942 (Baseline =
+0.20493)
[C.3] NumCarbon = -2.36919 (Baseline =
-1.25425)
[C.4] NumChlorine = -1.21815 (Baseline =
-0.14419)
[C.5] NumHalogen = -2.00242 (Baseline =
-1.01350)
[C.6] NumMultBonds = -1.42725 (Baseline =
-0.33048)
[D] The estimated coefficients using the gradient
descent algorithm with a very high learning rate and high epoch count
were not fully optimized and were not comparable with the baseline
coefficients using normal equations.
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and high epoch count
##################################
LR_GDA_VHLR_HEC = GradientDescent_LREstimation(y = y,
X = X,
LearningRate = 200,
Epochs = 50)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 11.8477388521046 at Epoch 50"
LR_GDA_VHLR_HEC_Summary <- LR_GDA_VHLR_HEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and high epoch count
##################################
LR_GDA_VHLR_HEC <- as.data.frame(LR_GDA_VHLR_HEC_Summary$LRCoefficients)
rownames(LR_GDA_VHLR_HEC) <- NULL
colnames(LR_GDA_VHLR_HEC) <- c("LRCoefficients")
LR_GDA_VHLR_HEC$LRCoefficientNames <- c("Intercept",
"MolWeight",
"NumCarbon",
"NumChlorine",
"NumHalogen",
"NumMultBonds")
LR_GDA_VHLR_HEC$EstimationMethod <- rep("LR_GDA_VHLR_HEC",nrow(LR_GDA_VHLR_HEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and high epoch count
##################################
print(LR_GDA_VHLR_HEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.718570 Intercept LR_GDA_VHLR_HEC
## 2 -1.269429 MolWeight LR_GDA_VHLR_HEC
## 3 -2.369198 NumCarbon LR_GDA_VHLR_HEC
## 4 -1.218151 NumChlorine LR_GDA_VHLR_HEC
## 5 -2.002425 NumHalogen LR_GDA_VHLR_HEC
## 6 -1.427259 NumMultBonds LR_GDA_VHLR_HEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with very high learning rate and high epoch count
##################################
LR_GDA_VHLR_HEC_Summary$Epoch <- 1:length(LR_GDA_VHLR_HEC_Summary$L2Loss)
LR_GDA_VHLR_HEC_Summary$Method <- rep("LR_GDA_VHLR_HEC",length(LR_GDA_VHLR_HEC_Summary$L2Loss))
L2Loss <- LR_GDA_VHLR_HEC_Summary$L2Loss
Epoch <- LR_GDA_VHLR_HEC_Summary$Epoch
Method <- LR_GDA_VHLR_HEC_Summary$Method
(LR_GDA_VHLR_HEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "82.9563953049771" "1" "LR_GDA_VHLR_HEC"
## [2,] "43.3675127878632" "2" "LR_GDA_VHLR_HEC"
## [3,] "40.1055331895743" "3" "LR_GDA_VHLR_HEC"
## [4,] "38.9740740659105" "4" "LR_GDA_VHLR_HEC"
## [5,] "38.2468234561932" "5" "LR_GDA_VHLR_HEC"
## [6,] "37.7326021491187" "6" "LR_GDA_VHLR_HEC"
## [7,] "37.3604396276862" "7" "LR_GDA_VHLR_HEC"
## [8,] "37.0910709007472" "8" "LR_GDA_VHLR_HEC"
## [9,] "36.8989019283906" "9" "LR_GDA_VHLR_HEC"
## [10,] "36.7660015463775" "10" "LR_GDA_VHLR_HEC"
## [11,] "36.6795350594859" "11" "LR_GDA_VHLR_HEC"
## [12,] "36.6303695815112" "12" "LR_GDA_VHLR_HEC"
## [13,] "36.612173832238" "13" "LR_GDA_VHLR_HEC"
## [14,] "36.620787212362" "14" "LR_GDA_VHLR_HEC"
## [15,] "36.6537669251668" "15" "LR_GDA_VHLR_HEC"
## [16,] "36.7100652578966" "16" "LR_GDA_VHLR_HEC"
## [17,] "36.789806072341" "17" "LR_GDA_VHLR_HEC"
## [18,] "36.8941382329563" "18" "LR_GDA_VHLR_HEC"
## [19,] "37.0251492823555" "19" "LR_GDA_VHLR_HEC"
## [20,] "37.1858266763374" "20" "LR_GDA_VHLR_HEC"
## [21,] "37.380056820877" "21" "LR_GDA_VHLR_HEC"
## [22,] "37.6126542379826" "22" "LR_GDA_VHLR_HEC"
## [23,] "37.8894145699319" "23" "LR_GDA_VHLR_HEC"
## [24,] "38.2171859285369" "24" "LR_GDA_VHLR_HEC"
## [25,] "38.60395341702" "25" "LR_GDA_VHLR_HEC"
## [26,] "39.0589316131335" "26" "LR_GDA_VHLR_HEC"
## [27,] "39.5926595405127" "27" "LR_GDA_VHLR_HEC"
## [28,] "40.2170923405873" "28" "LR_GDA_VHLR_HEC"
## [29,] "40.9456836962362" "29" "LR_GDA_VHLR_HEC"
## [30,] "41.7934532850637" "30" "LR_GDA_VHLR_HEC"
## [31,] "42.7770343876188" "31" "LR_GDA_VHLR_HEC"
## [32,] "43.914698424415" "32" "LR_GDA_VHLR_HEC"
## [33,] "45.2263557053649" "33" "LR_GDA_VHLR_HEC"
## [34,] "46.7335349200765" "34" "LR_GDA_VHLR_HEC"
## [35,] "48.4593475286404" "35" "LR_GDA_VHLR_HEC"
## [36,] "50.4284466828075" "36" "LR_GDA_VHLR_HEC"
## [37,] "52.6669929716488" "37" "LR_GDA_VHLR_HEC"
## [38,] "55.2026405645276" "38" "LR_GDA_VHLR_HEC"
## [39,] "58.0645568817383" "39" "LR_GDA_VHLR_HEC"
## [40,] "61.2834867907404" "40" "LR_GDA_VHLR_HEC"
## [41,] "64.8918689145464" "41" "LR_GDA_VHLR_HEC"
## [42,] "68.9240076285283" "42" "LR_GDA_VHLR_HEC"
## [43,] "73.4163004691077" "43" "LR_GDA_VHLR_HEC"
## [44,] "78.4075176156333" "44" "LR_GDA_VHLR_HEC"
## [45,] "83.9391282093983" "45" "LR_GDA_VHLR_HEC"
## [46,] "90.0556676194485" "46" "LR_GDA_VHLR_HEC"
## [47,] "96.8051401880153" "47" "LR_GDA_VHLR_HEC"
## [48,] "104.239453181256" "48" "LR_GDA_VHLR_HEC"
## [49,] "112.414879289677" "49" "LR_GDA_VHLR_HEC"
## [50,] "121.392546766804" "50" "LR_GDA_VHLR_HEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with very high learning rate and high epoch count
##################################
xyplot(L2Loss ~ Epoch,
data = LR_GDA_VHLR_HEC_Summary,
main = "loss function Optimization Profile : LR_GDA_VHLR_HEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))

1.5.4 Linear Regression - Gradient Descent Algorithm with High
Learning Rate and Low Epoch Count (LR_GDA_HLR_LEC)
Gradient Descent minimizes the loss function parameterized by the model’s coefficients, with the gradient direction and the learning rate governing the updates computed from the partial derivatives at each iteration and allowing the algorithm to gradually arrive at the local or global minimum considered the point of convergence. This particular implementation used Batch Gradient Descent, which computes the gradient of the loss function with respect to the parameters over the entire data set. A sufficiently high learning rate (also referred to as the step size or alpha) and a low epoch count were applied, resulting in moderately sized steps with a greater risk of not reaching the minimum within the small number of iterations.
[A] The gradient descent algorithm was implemented with
parameter settings described as follows:
[A.1] Learning
Rate = 100 (High)
[A.2] Epochs =
10 (Low)
[B] The final gradient norm was determined as 0.32722
at the 10th epoch indicating that the minimum threshold of 0.00010 was
not achieved up until the last epoch.
[C] Applying the gradient descent algorithm with a high
learning rate and low epoch count, the estimated linear regression
coefficients for the given data are as follows:
[C.1] Intercept = -2.70555 (Baseline =
-2.71856)
[C.2] MolWeight = +0.60475 (Baseline =
+0.20493)
[C.3] NumCarbon = -1.63634 (Baseline =
-1.25425)
[C.4] NumChlorine = -0.84899 (Baseline =
-0.14419)
[C.5] NumHalogen = -0.49265 (Baseline =
-1.01350)
[C.6] NumMultBonds = -0.29582 (Baseline =
-0.33048)
[D] The estimated coefficients using the gradient
descent algorithm with a high learning rate and low epoch count were not
fully optimized and were not comparable with the baseline coefficients
using normal equations.
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and low epoch count
##################################
LR_GDA_HLR_LEC = GradientDescent_LREstimation(y = y,
X = X,
LearningRate = 100,
Epochs = 10)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 0.327225250712014 at Epoch 10"
LR_GDA_HLR_LEC_Summary <- LR_GDA_HLR_LEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and low epoch count
##################################
LR_GDA_HLR_LEC <- as.data.frame(LR_GDA_HLR_LEC_Summary$LRCoefficients)
rownames(LR_GDA_HLR_LEC) <- NULL
colnames(LR_GDA_HLR_LEC) <- c("LRCoefficients")
LR_GDA_HLR_LEC$LRCoefficientNames <- c("Intercept",
"MolWeight",
"NumCarbon",
"NumChlorine",
"NumHalogen",
"NumMultBonds")
LR_GDA_HLR_LEC$EstimationMethod <- rep("LR_GDA_HLR_LEC",nrow(LR_GDA_HLR_LEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and low epoch count
##################################
print(LR_GDA_HLR_LEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7055537 Intercept LR_GDA_HLR_LEC
## 2 0.6047579 MolWeight LR_GDA_HLR_LEC
## 3 -1.6363423 NumCarbon LR_GDA_HLR_LEC
## 4 -0.8489932 NumChlorine LR_GDA_HLR_LEC
## 5 -0.4926584 NumHalogen LR_GDA_HLR_LEC
## 6 -0.2958283 NumMultBonds LR_GDA_HLR_LEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with high learning rate and low epoch count
##################################
LR_GDA_HLR_LEC_Summary$Epoch <- 1:length(LR_GDA_HLR_LEC_Summary$L2Loss)
LR_GDA_HLR_LEC_Summary$Method <- rep("LR_GDA_HLR_LEC",length(LR_GDA_HLR_LEC_Summary$L2Loss))
L2Loss <- LR_GDA_HLR_LEC_Summary$L2Loss
Epoch <- LR_GDA_HLR_LEC_Summary$Epoch
Method <- LR_GDA_HLR_LEC_Summary$Method
(LR_GDA_HLR_LEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "112.129202156293" "1" "LR_GDA_HLR_LEC"
## [2,] "69.9906053294776" "2" "LR_GDA_HLR_LEC"
## [3,] "52.4658187984899" "3" "LR_GDA_HLR_LEC"
## [4,] "44.8755235067431" "4" "LR_GDA_HLR_LEC"
## [5,] "41.6408804372354" "5" "LR_GDA_HLR_LEC"
## [6,] "40.1564841713508" "6" "LR_GDA_HLR_LEC"
## [7,] "39.3527407791219" "7" "LR_GDA_HLR_LEC"
## [8,] "38.8275066395866" "8" "LR_GDA_HLR_LEC"
## [9,] "38.4326912660602" "9" "LR_GDA_HLR_LEC"
## [10,] "38.1117947361666" "10" "LR_GDA_HLR_LEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with high learning rate and low epoch count
##################################
xyplot(L2Loss ~ Epoch,
data = LR_GDA_HLR_LEC_Summary,
main = "loss function Optimization Profile : LR_GDA_HLR_LEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))

1.5.5 Linear Regression - Gradient Descent Algorithm with High
Learning Rate and High Epoch Count (LR_GDA_HLR_HEC)
Gradient Descent minimizes the loss function parameterized by the model’s coefficients, with the gradient direction and the learning rate governing the updates computed from the partial derivatives at each iteration and allowing the algorithm to gradually arrive at the local or global minimum considered the point of convergence. This particular implementation used Batch Gradient Descent, which computes the gradient of the loss function with respect to the parameters over the entire data set. A sufficiently high learning rate (also referred to as the step size or alpha) and a high epoch count were applied, resulting in moderately sized steps whose risk of not reaching the minimum was offset by the larger number of iterations.
[A] The gradient descent algorithm was implemented with
parameter settings described as follows:
[A.1] Learning
Rate = 100 (High)
[A.2] Epochs =
50 (High)
[B] The final gradient norm was determined as 0.03004
at the 50th epoch indicating that the minimum threshold of 0.00010 was
not achieved prior to the last epoch.
[C] Applying the gradient descent algorithm with a high
learning rate and high epoch count, the estimated linear regression
coefficients for the given data are as follows:
[C.1] Intercept = -2.71856 (Baseline =
-2.71856)
[C.2] MolWeight = +0.16985 (Baseline =
+0.20493)
[C.3] NumCarbon = -1.22196 (Baseline =
-1.25425)
[C.4] NumChlorine = -0.29779 (Baseline =
-0.14419)
[C.5] NumHalogen = -0.84627 (Baseline =
-1.01350)
[C.6] NumMultBonds = -0.33109 (Baseline =
-0.33048)
[D] The estimated coefficients using the gradient
descent algorithm with a high learning rate and high epoch count, while
not fully optimized, were sufficiently comparable with the baseline
coefficients using normal equations.
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and high epoch count
##################################
LR_GDA_HLR_HEC = GradientDescent_LREstimation(y = y,
X = X,
LearningRate = 100,
Epochs = 50)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 0.0300496286624375 at Epoch 50"
LR_GDA_HLR_HEC_Summary <- LR_GDA_HLR_HEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and high epoch count
##################################
LR_GDA_HLR_HEC <- as.data.frame(LR_GDA_HLR_HEC_Summary$LRCoefficients)
rownames(LR_GDA_HLR_HEC) <- NULL
colnames(LR_GDA_HLR_HEC) <- c("LRCoefficients")
LR_GDA_HLR_HEC$LRCoefficientNames <- c("Intercept",
"MolWeight",
"NumCarbon",
"NumChlorine",
"NumHalogen",
"NumMultBonds")
LR_GDA_HLR_HEC$EstimationMethod <- rep("LR_GDA_HLR_HEC",nrow(LR_GDA_HLR_HEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and high epoch count
##################################
print(LR_GDA_HLR_HEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7185699 Intercept LR_GDA_HLR_HEC
## 2 0.1698594 MolWeight LR_GDA_HLR_HEC
## 3 -1.2219687 NumCarbon LR_GDA_HLR_HEC
## 4 -0.2977972 NumChlorine LR_GDA_HLR_HEC
## 5 -0.8462708 NumHalogen LR_GDA_HLR_HEC
## 6 -0.3310979 NumMultBonds LR_GDA_HLR_HEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with high learning rate and high epoch count
##################################
LR_GDA_HLR_HEC_Summary$Epoch <- 1:length(LR_GDA_HLR_HEC_Summary$L2Loss)
LR_GDA_HLR_HEC_Summary$Method <- rep("LR_GDA_HLR_HEC",length(LR_GDA_HLR_HEC_Summary$L2Loss))
L2Loss <- LR_GDA_HLR_HEC_Summary$L2Loss
Epoch <- LR_GDA_HLR_HEC_Summary$Epoch
Method <- LR_GDA_HLR_HEC_Summary$Method
(LR_GDA_HLR_HEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "112.129202156293" "1" "LR_GDA_HLR_HEC"
## [2,] "69.9906053294776" "2" "LR_GDA_HLR_HEC"
## [3,] "52.4658187984899" "3" "LR_GDA_HLR_HEC"
## [4,] "44.8755235067431" "4" "LR_GDA_HLR_HEC"
## [5,] "41.6408804372354" "5" "LR_GDA_HLR_HEC"
## [6,] "40.1564841713508" "6" "LR_GDA_HLR_HEC"
## [7,] "39.3527407791219" "7" "LR_GDA_HLR_HEC"
## [8,] "38.8275066395866" "8" "LR_GDA_HLR_HEC"
## [9,] "38.4326912660602" "9" "LR_GDA_HLR_HEC"
## [10,] "38.1117947361666" "10" "LR_GDA_HLR_HEC"
## [11,] "37.8410814575013" "11" "LR_GDA_HLR_HEC"
## [12,] "37.6088433323443" "12" "LR_GDA_HLR_HEC"
## [13,] "37.4080884025972" "13" "LR_GDA_HLR_HEC"
## [14,] "37.2339090197462" "14" "LR_GDA_HLR_HEC"
## [15,] "37.0824885446874" "15" "LR_GDA_HLR_HEC"
## [16,] "36.9506925242453" "16" "LR_GDA_HLR_HEC"
## [17,] "36.8358764051491" "17" "LR_GDA_HLR_HEC"
## [18,] "36.7357790012313" "18" "LR_GDA_HLR_HEC"
## [19,] "36.6484536165958" "19" "LR_GDA_HLR_HEC"
## [20,] "36.5722180985614" "20" "LR_GDA_HLR_HEC"
## [21,] "36.5056158801148" "21" "LR_GDA_HLR_HEC"
## [22,] "36.447384251053" "22" "LR_GDA_HLR_HEC"
## [23,] "36.3964278466087" "23" "LR_GDA_HLR_HEC"
## [24,] "36.3517961489294" "24" "LR_GDA_HLR_HEC"
## [25,] "36.3126642087895" "25" "LR_GDA_HLR_HEC"
## [26,] "36.2783160262168" "26" "LR_GDA_HLR_HEC"
## [27,] "36.2481301692672" "27" "LR_GDA_HLR_HEC"
## [28,] "36.2215673013761" "28" "LR_GDA_HLR_HEC"
## [29,] "36.1981593502996" "29" "LR_GDA_HLR_HEC"
## [30,] "36.177500096855" "30" "LR_GDA_HLR_HEC"
## [31,] "36.1592369958326" "31" "LR_GDA_HLR_HEC"
## [32,] "36.1430640683384" "32" "LR_GDA_HLR_HEC"
## [33,] "36.1287157267037" "33" "LR_GDA_HLR_HEC"
## [34,] "36.1159614113633" "34" "LR_GDA_HLR_HEC"
## [35,] "36.1046009346445" "35" "LR_GDA_HLR_HEC"
## [36,] "36.0944604398" "36" "LR_GDA_HLR_HEC"
## [37,] "36.0853888952591" "37" "LR_GDA_HLR_HEC"
## [38,] "36.0772550542315" "38" "LR_GDA_HLR_HEC"
## [39,] "36.0699448186983" "39" "LR_GDA_HLR_HEC"
## [40,] "36.0633589546223" "40" "LR_GDA_HLR_HEC"
## [41,] "36.057411112046" "41" "LR_GDA_HLR_HEC"
## [42,] "36.0520261097326" "42" "LR_GDA_HLR_HEC"
## [43,] "36.0471384492471" "43" "LR_GDA_HLR_HEC"
## [44,] "36.0426910279566" "44" "LR_GDA_HLR_HEC"
## [45,] "36.0386340244292" "45" "LR_GDA_HLR_HEC"
## [46,] "36.0349239332061" "46" "LR_GDA_HLR_HEC"
## [47,] "36.0315227289593" "47" "LR_GDA_HLR_HEC"
## [48,] "36.028397142702" "48" "LR_GDA_HLR_HEC"
## [49,] "36.0255180350174" "49" "LR_GDA_HLR_HEC"
## [50,] "36.022859853282" "50" "LR_GDA_HLR_HEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with high learning rate and high epoch count
##################################
xyplot(L2Loss ~ Epoch,
data = LR_GDA_HLR_HEC_Summary,
main = "loss function Optimization Profile : LR_GDA_HLR_HEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))

1.5.6 Linear Regression - Gradient Descent Algorithm with Low Learning Rate and Low Epoch Count (LR_GDA_LLR_LEC)
Gradient Descent minimizes the loss function parameterized by the model’s coefficients based on the direction and learning rate factors which determine the partial derivative calculations of future iterations, allowing the algorithm to gradually arrive at the local or global minimum considered the point of convergence. This particular implementation used Batch Gradient Descent, which computes the gradient of the loss function with respect to the parameters for the entire data set. A low learning rate (also referred to as the step size or alpha) and a low epoch count were applied, resulting in smaller steps with a higher risk of not reaching the minimum due to the smaller number of iterations.
[A] The gradient descent algorithm was implemented with parameter settings described as follows:
[A.1] Learning Rate = 50 (Low)
[A.2] Epochs = 10 (Low)
[B] The final gradient norm was determined as 0.96221 at the 10th epoch indicating that the minimum threshold of 0.00010 was not achieved prior to the last epoch.
[C] Applying the gradient descent algorithm with a low learning rate and low epoch count, the estimated linear regression coefficients for the given data are as follows:
[C.1] Intercept = -2.39224 (Baseline = -2.71856)
[C.2] MolWeight = +0.89396 (Baseline = +0.20493)
[C.3] NumCarbon = -1.84352 (Baseline = -1.25425)
[C.4] NumChlorine = -1.06912 (Baseline = -0.14419)
[C.5] NumHalogen = -0.38068 (Baseline = -1.01350)
[C.6] NumMultBonds = -0.36341 (Baseline = -0.33048)
[D] The estimated coefficients using the gradient descent algorithm with a low learning rate and low epoch count were not fully optimized and were not comparable with the baseline coefficients using normal equations.
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and low epoch count
##################################
LR_GDA_LLR_LEC = GradientDescent_LREstimation(y = y,
X = X,
LearningRate = 50,
Epochs = 10)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 0.962210610545313 at Epoch 10"
LR_GDA_LLR_LEC_Summary <- LR_GDA_LLR_LEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and low epoch count
##################################
LR_GDA_LLR_LEC <- as.data.frame(LR_GDA_LLR_LEC_Summary$LRCoefficients)
rownames(LR_GDA_LLR_LEC) <- NULL
colnames(LR_GDA_LLR_LEC) <- c("LRCoefficients")
LR_GDA_LLR_LEC$LRCoefficientNames <- c("Intercept",
"MolWeight",
"NumCarbon",
"NumChlorine",
"NumHalogen",
"NumMultBonds")
LR_GDA_LLR_LEC$EstimationMethod <- rep("LR_GDA_LLR_LEC",nrow(LR_GDA_LLR_LEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and low epoch count
##################################
print(LR_GDA_LLR_LEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.3922400 Intercept LR_GDA_LLR_LEC
## 2 0.8939686 MolWeight LR_GDA_LLR_LEC
## 3 -1.8435282 NumCarbon LR_GDA_LLR_LEC
## 4 -1.0691289 NumChlorine LR_GDA_LLR_LEC
## 5 -0.3806898 NumHalogen LR_GDA_LLR_LEC
## 6 -0.3634163 NumMultBonds LR_GDA_LLR_LEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with low learning rate and low epoch count
##################################
LR_GDA_LLR_LEC_Summary$Epoch <- 1:length(LR_GDA_LLR_LEC_Summary$L2Loss)
LR_GDA_LLR_LEC_Summary$Method <- rep("LR_GDA_LLR_LEC",length(LR_GDA_LLR_LEC_Summary$L2Loss))
L2Loss <- LR_GDA_LLR_LEC_Summary$L2Loss
Epoch <- LR_GDA_LLR_LEC_Summary$Epoch
Method <- LR_GDA_LLR_LEC_Summary$Method
(LR_GDA_LLR_LEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "128.868015504947" "1" "LR_GDA_LLR_LEC"
## [2,] "100.674236221077" "2" "LR_GDA_LLR_LEC"
## [3,] "81.8154777197929" "3" "LR_GDA_LLR_LEC"
## [4,] "68.7468688791543" "4" "LR_GDA_LLR_LEC"
## [5,] "59.590708566118" "5" "LR_GDA_LLR_LEC"
## [6,] "53.1931626792032" "6" "LR_GDA_LLR_LEC"
## [7,] "48.758193032101" "7" "LR_GDA_LLR_LEC"
## [8,] "45.7046714572526" "8" "LR_GDA_LLR_LEC"
## [9,] "43.605750421321" "9" "LR_GDA_LLR_LEC"
## [10,] "42.1544665405458" "10" "LR_GDA_LLR_LEC"
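One quick way to see that ten epochs are not enough at this learning rate is to look at how much the L2 loss still changes from one epoch to the next at the end of the run. A small check using the loss values gathered above (this diff-based summary is an illustrative addition, not part of the original analysis):
##################################
# Inspecting the epoch-over-epoch change in the L2 loss
# for the low learning rate and low epoch count run
##################################
L2LossChange <- diff(LR_GDA_LLR_LEC_Summary$L2Loss)
tail(L2LossChange, 3)
# The loss is still dropping by roughly 1.5 to 3 units per epoch at epoch 10,
# whereas the high epoch count runs change by well under 0.1 unit near epoch 50.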
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with low learning rate and low epoch count
##################################
xyplot(L2Loss ~ Epoch,
data = LR_GDA_LLR_LEC_Summary,
main = "loss function Optimization Profile : LR_GDA_LLR_LEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))

1.5.7 Linear Regression - Gradient Descent Algorithm with Low Learning Rate and High Epoch Count (LR_GDA_LLR_HEC)
Gradient Descent minimizes the loss function parameterized by the model’s coefficients based on the direction and learning rate factors which determine the partial derivative calculations of future iterations, allowing the algorithm to gradually arrive at the local or global minimum considered the point of convergence. This particular implementation used Batch Gradient Descent, which computes the gradient of the loss function with respect to the parameters for the entire data set. A low learning rate (also referred to as the step size or alpha) and a high epoch count were applied, resulting in smaller steps with a lower risk of not reaching the minimum, since the higher number of iterations compensates for the smaller step size.
[A] The gradient descent algorithm was implemented with parameter settings described as follows:
[A.1] Learning Rate = 50 (Low)
[A.2] Epochs = 50 (High)
[B] The final gradient norm was determined as 0.11481 at the 50th epoch indicating that the minimum threshold of 0.00010 was not achieved prior to the last epoch.
[C] Applying the gradient descent algorithm with a low learning rate and high epoch count, the estimated linear regression coefficients for the given data are as follows:
[C.1] Intercept = -2.71854 (Baseline = -2.71856)
[C.2] MolWeight = +0.28134 (Baseline = +0.20493)
[C.3] NumCarbon = -1.33710 (Baseline = -1.25425)
[C.4] NumChlorine = -0.51277 (Baseline = -0.14419)
[C.5] NumHalogen = -0.68397 (Baseline = -1.01350)
[C.6] NumMultBonds = -0.31132 (Baseline = -0.33048)
[D] The estimated coefficients using the gradient descent algorithm with a low learning rate and high epoch count, while not fully optimized, were sufficiently comparable with the baseline coefficients using normal equations.
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and high epoch count
##################################
LR_GDA_LLR_HEC = GradientDescent_LREstimation(y = y,
X = X,
LearningRate = 50,
Epochs = 50)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 0.114815262946592 at Epoch 50"
LR_GDA_LLR_HEC_Summary <- LR_GDA_LLR_HEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and high epoch count
##################################
LR_GDA_LLR_HEC <- as.data.frame(LR_GDA_LLR_HEC_Summary$LRCoefficients)
rownames(LR_GDA_LLR_HEC) <- NULL
colnames(LR_GDA_LLR_HEC) <- c("LRCoefficients")
LR_GDA_LLR_HEC$LRCoefficientNames <- c("Intercept",
"MolWeight",
"NumCarbon",
"NumChlorine",
"NumHalogen",
"NumMultBonds")
LR_GDA_LLR_HEC$EstimationMethod <- rep("LR_GDA_LLR_HEC",nrow(LR_GDA_LLR_HEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and high epoch count
##################################
print(LR_GDA_LLR_HEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7185441 Intercept LR_GDA_LLR_HEC
## 2 0.2813439 MolWeight LR_GDA_LLR_HEC
## 3 -1.3371016 NumCarbon LR_GDA_LLR_HEC
## 4 -0.5127712 NumChlorine LR_GDA_LLR_HEC
## 5 -0.6839773 NumHalogen LR_GDA_LLR_HEC
## 6 -0.3113283 NumMultBonds LR_GDA_LLR_HEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with low learning rate and high epoch count
##################################
LR_GDA_LLR_HEC_Summary$Epoch <- 1:length(LR_GDA_LLR_HEC_Summary$L2Loss)
LR_GDA_LLR_HEC_Summary$Method <- rep("LR_GDA_LLR_HEC",length(LR_GDA_LLR_HEC_Summary$L2Loss))
L2Loss <- LR_GDA_LLR_HEC_Summary$L2Loss
Epoch <- LR_GDA_LLR_HEC_Summary$Epoch
Method <- LR_GDA_LLR_HEC_Summary$Method
(LR_GDA_LLR_HEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "128.868015504947" "1" "LR_GDA_LLR_HEC"
## [2,] "100.674236221077" "2" "LR_GDA_LLR_HEC"
## [3,] "81.8154777197929" "3" "LR_GDA_LLR_HEC"
## [4,] "68.7468688791543" "4" "LR_GDA_LLR_HEC"
## [5,] "59.590708566118" "5" "LR_GDA_LLR_HEC"
## [6,] "53.1931626792032" "6" "LR_GDA_LLR_HEC"
## [7,] "48.758193032101" "7" "LR_GDA_LLR_HEC"
## [8,] "45.7046714572526" "8" "LR_GDA_LLR_HEC"
## [9,] "43.605750421321" "9" "LR_GDA_LLR_HEC"
## [10,] "42.1544665405458" "10" "LR_GDA_LLR_HEC"
## [11,] "41.1364565433462" "11" "LR_GDA_LLR_HEC"
## [12,] "40.406033177599" "12" "LR_GDA_LLR_HEC"
## [13,] "39.8662448387947" "13" "LR_GDA_LLR_HEC"
## [14,] "39.4534953274598" "14" "LR_GDA_LLR_HEC"
## [15,] "39.126421847265" "15" "LR_GDA_LLR_HEC"
## [16,] "38.8582166343799" "16" "LR_GDA_LLR_HEC"
## [17,] "38.631490240265" "17" "LR_GDA_LLR_HEC"
## [18,] "38.4349057071761" "18" "LR_GDA_LLR_HEC"
## [19,] "38.2610024308783" "19" "LR_GDA_LLR_HEC"
## [20,] "38.1048009669459" "20" "LR_GDA_LLR_HEC"
## [21,] "37.9629129868599" "21" "LR_GDA_LLR_HEC"
## [22,] "37.8329749643241" "22" "LR_GDA_LLR_HEC"
## [23,] "37.7132881309214" "23" "LR_GDA_LLR_HEC"
## [24,] "37.6025894071465" "24" "LR_GDA_LLR_HEC"
## [25,] "37.4999053476352" "25" "LR_GDA_LLR_HEC"
## [26,] "37.4044586694964" "26" "LR_GDA_LLR_HEC"
## [27,] "37.3156081003806" "27" "LR_GDA_LLR_HEC"
## [28,] "37.2328093671795" "28" "LR_GDA_LLR_HEC"
## [29,] "37.1555896282248" "29" "LR_GDA_LLR_HEC"
## [30,] "37.0835304829609" "30" "LR_GDA_LLR_HEC"
## [31,] "37.0162564799321" "31" "LR_GDA_LLR_HEC"
## [32,] "36.9534271714769" "32" "LR_GDA_LLR_HEC"
## [33,] "36.8947314752451" "33" "LR_GDA_LLR_HEC"
## [34,] "36.8398835522538" "34" "LR_GDA_LLR_HEC"
## [35,] "36.7886196955981" "35" "LR_GDA_LLR_HEC"
## [36,] "36.7406959041593" "36" "LR_GDA_LLR_HEC"
## [37,] "36.6958859301698" "37" "LR_GDA_LLR_HEC"
## [38,] "36.6539796624992" "38" "LR_GDA_LLR_HEC"
## [39,] "36.6147817542687" "39" "LR_GDA_LLR_HEC"
## [40,] "36.5781104334971" "40" "LR_GDA_LLR_HEC"
## [41,] "36.5437964549883" "41" "LR_GDA_LLR_HEC"
## [42,] "36.5116821644221" "42" "LR_GDA_LLR_HEC"
## [43,] "36.4816206540301" "43" "LR_GDA_LLR_HEC"
## [44,] "36.4534749948634" "44" "LR_GDA_LLR_HEC"
## [45,] "36.4271175344747" "45" "LR_GDA_LLR_HEC"
## [46,] "36.4024292514622" "46" "LR_GDA_LLR_HEC"
## [47,] "36.3792991601602" "47" "LR_GDA_LLR_HEC"
## [48,] "36.3576237600811" "48" "LR_GDA_LLR_HEC"
## [49,] "36.3373065256698" "49" "LR_GDA_LLR_HEC"
## [50,] "36.3182574326452" "50" "LR_GDA_LLR_HEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with low learning rate and high epoch count
##################################
xyplot(L2Loss ~ Epoch,
data = LR_GDA_LLR_HEC_Summary,
main = "loss function Optimization Profile : LR_GDA_LLR_HEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))

1.6 Linear Regression Model Coefficient Estimation Evaluation Summary
[A] The gradient descent algorithms which produced coefficients sufficiently comparable with the baseline, despite not achieving fully optimized coefficients, are as follows:
[A.1] LR_GDA_HLR_HEC : Linear Regression - Gradient Descent Algorithm with High Learning Rate and High Epoch Count
[A.2] LR_GDA_LLR_HEC : Linear Regression - Gradient Descent Algorithm with Low Learning Rate and High Epoch Count
[A.3] LR_GDA_VHLR_LEC : Linear Regression - Gradient Descent Algorithm with Very High Learning Rate and Low Epoch Count
[B] The choice of the gradient norm minimum threshold for convergence, the learning rate and the epoch count in the implementation of the gradient descent algorithm is critical to achieving fully optimized coefficients while maintaining a sensibly minimal loss function (a brief screening sketch is shown below).
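Because these three settings interact, a simple way to explore them is to screen a small grid of learning rates and epoch counts and record the final L2 loss of each run. The sketch below reuses the GradientDescent_LREstimation function and the y and X objects defined earlier; the grid values mirror the configurations examined in this document, and treating the last element of the returned L2Loss vector as the final loss is an assumption about the function's output structure.
##################################
# Screening learning rate and epoch count combinations
# by their final L2 loss (illustrative sketch)
##################################
TuningGrid <- expand.grid(LearningRate = c(50, 100, 250),
                          Epochs = c(10, 50))
TuningGrid$FinalL2Loss <- mapply(function(lr, ep) {
  Run <- GradientDescent_LREstimation(y = y,
                                      X = X,
                                      LearningRate = lr,
                                      Epochs = ep)
  tail(Run$L2Loss, 1)                                  # final loss of the run
}, TuningGrid$LearningRate, TuningGrid$Epochs)
TuningGrid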
##################################
# Consolidating the loss function optimization data
# for all gradient descent algorithms
# with different learning rates and epoch counts
##################################
LR_GDA_ConsolidatedSummary <- rbind(LR_GDA_VHLR_LEC_ConsolidatedSummary,
LR_GDA_VHLR_HEC_ConsolidatedSummary,
LR_GDA_HLR_LEC_ConsolidatedSummary,
LR_GDA_HLR_HEC_ConsolidatedSummary,
LR_GDA_LLR_LEC_ConsolidatedSummary,
LR_GDA_LLR_HEC_ConsolidatedSummary)
LR_GDA_ConsolidatedSummary <- as.data.frame(LR_GDA_ConsolidatedSummary)
LR_GDA_ConsolidatedSummary$L2Loss <- as.numeric(as.character(LR_GDA_ConsolidatedSummary$L2Loss))
LR_GDA_ConsolidatedSummary$Epoch <- as.numeric(as.character(LR_GDA_ConsolidatedSummary$Epoch))
LR_GDA_ConsolidatedSummary$Method <- factor(LR_GDA_ConsolidatedSummary$Method,
levels = c("LR_GDA_LLR_LEC",
"LR_GDA_HLR_LEC",
"LR_GDA_VHLR_LEC",
"LR_GDA_LLR_HEC",
"LR_GDA_HLR_HEC",
"LR_GDA_VHLR_HEC"))
##################################
# Plotting the loss function optimization data
# for all gradient descent algorithms
# with different learning rates and epoch counts
##################################
xyplot(L2Loss ~ Epoch | Method,
data = LR_GDA_ConsolidatedSummary,
main = "loss function Optimization Profile for Gradient Descent Algorithm with Different Learning Rates and Epoch Counts",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))

##################################
# Gathering the estimated coefficients
# for normal equations and all gradient descent algorithms
# with different learning rates and epoch counts
##################################
LR_NE_VS_LR_GDA_VHLR_LEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_VHLR_HEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_HLR_LEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_HLR_HEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_LLR_LEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_LLR_HEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_VHLR_LEC$Group <- rep("LR_NE Versus LR_GDA_VHLR_LEC",nrow(LR_NE_VS_LR_GDA_VHLR_LEC))
LR_NE_VS_LR_GDA_VHLR_HEC$Group <- rep("LR_NE Versus LR_GDA_VHLR_HEC",nrow(LR_NE_VS_LR_GDA_VHLR_HEC))
LR_NE_VS_LR_GDA_HLR_LEC$Group <- rep("LR_NE Versus LR_GDA_HLR_LEC",nrow(LR_NE_VS_LR_GDA_HLR_LEC))
LR_NE_VS_LR_GDA_HLR_HEC$Group <- rep("LR_NE Versus LR_GDA_HLR_HEC",nrow(LR_NE_VS_LR_GDA_HLR_HEC))
LR_NE_VS_LR_GDA_LLR_LEC$Group <- rep("LR_NE Versus LR_GDA_LLR_LEC",nrow(LR_NE_VS_LR_GDA_LLR_LEC))
LR_NE_VS_LR_GDA_LLR_HEC$Group <- rep("LR_NE Versus LR_GDA_LLR_HEC",nrow(LR_NE_VS_LR_GDA_LLR_HEC))
LR_GDA_VHLR_LEC$Group <- rep("LR_NE Versus LR_GDA_VHLR_LEC",nrow(LR_GDA_VHLR_LEC))
LR_GDA_VHLR_HEC$Group <- rep("LR_NE Versus LR_GDA_VHLR_HEC",nrow(LR_GDA_VHLR_HEC))
LR_GDA_HLR_LEC$Group <- rep("LR_NE Versus LR_GDA_HLR_LEC",nrow(LR_GDA_HLR_LEC))
LR_GDA_HLR_HEC$Group <- rep("LR_NE Versus LR_GDA_HLR_HEC",nrow(LR_GDA_HLR_HEC))
LR_GDA_LLR_LEC$Group <- rep("LR_NE Versus LR_GDA_LLR_LEC",nrow(LR_GDA_LLR_LEC))
LR_GDA_LLR_HEC$Group <- rep("LR_NE Versus LR_GDA_LLR_HEC",nrow(LR_GDA_LLR_HEC))
##################################
# Consolidating the estimated coefficients
# for normal equations and all gradient descent algorithms
# with different learning rates and epoch counts
##################################
LR_NE_GDA_ConsolidatedSummary <- rbind(LR_NE_VS_LR_GDA_VHLR_LEC,
LR_NE_VS_LR_GDA_VHLR_HEC,
LR_NE_VS_LR_GDA_HLR_LEC,
LR_NE_VS_LR_GDA_HLR_HEC,
LR_NE_VS_LR_GDA_LLR_LEC,
LR_NE_VS_LR_GDA_LLR_HEC,
LR_GDA_VHLR_LEC,
LR_GDA_VHLR_HEC,
LR_GDA_HLR_LEC,
LR_GDA_HLR_HEC,
LR_GDA_LLR_LEC,
LR_GDA_LLR_HEC)
LR_NE_GDA_ConsolidatedSummary <- as.data.frame(LR_NE_GDA_ConsolidatedSummary)
LR_NE_GDA_ConsolidatedSummary$LRCoefficients <- as.numeric(as.character(LR_NE_GDA_ConsolidatedSummary$LRCoefficients))
LR_NE_GDA_ConsolidatedSummary$Group <- factor(LR_NE_GDA_ConsolidatedSummary$Group,
levels = c("LR_NE Versus LR_GDA_LLR_LEC",
"LR_NE Versus LR_GDA_HLR_LEC",
"LR_NE Versus LR_GDA_VHLR_LEC",
"LR_NE Versus LR_GDA_LLR_HEC",
"LR_NE Versus LR_GDA_HLR_HEC",
"LR_NE Versus LR_GDA_VHLR_HEC"))
LR_NE_GDA_ConsolidatedSummary$LRCoefficientNames <- factor(LR_NE_GDA_ConsolidatedSummary$LRCoefficientNames,
levels = c("NumMultBonds",
"NumHalogen",
"NumChlorine",
"NumCarbon",
"MolWeight",
"Intercept"))
LR_NE_GDA_ConsolidatedSummary$EstimationMethod <- factor(LR_NE_GDA_ConsolidatedSummary$EstimationMethod,
levels = c("LR_NE",
"LR_GDA_LLR_LEC",
"LR_GDA_LLR_HEC",
"LR_GDA_HLR_LEC",
"LR_GDA_HLR_HEC",
"LR_GDA_VHLR_LEC",
"LR_GDA_VHLR_HEC"))
print(LR_NE_GDA_ConsolidatedSummary)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7185699 Intercept LR_NE
## 2 0.2049318 MolWeight LR_NE
## 3 -1.2542520 NumCarbon LR_NE
## 4 -0.1441934 NumChlorine LR_NE
## 5 -1.0135099 NumHalogen LR_NE
## 6 -0.3304828 NumMultBonds LR_NE
## 7 -2.7185699 Intercept LR_NE
## 8 0.2049318 MolWeight LR_NE
## 9 -1.2542520 NumCarbon LR_NE
## 10 -0.1441934 NumChlorine LR_NE
## 11 -1.0135099 NumHalogen LR_NE
## 12 -0.3304828 NumMultBonds LR_NE
## 13 -2.7185699 Intercept LR_NE
## 14 0.2049318 MolWeight LR_NE
## 15 -1.2542520 NumCarbon LR_NE
## 16 -0.1441934 NumChlorine LR_NE
## 17 -1.0135099 NumHalogen LR_NE
## 18 -0.3304828 NumMultBonds LR_NE
## 19 -2.7185699 Intercept LR_NE
## 20 0.2049318 MolWeight LR_NE
## 21 -1.2542520 NumCarbon LR_NE
## 22 -0.1441934 NumChlorine LR_NE
## 23 -1.0135099 NumHalogen LR_NE
## 24 -0.3304828 NumMultBonds LR_NE
## 25 -2.7185699 Intercept LR_NE
## 26 0.2049318 MolWeight LR_NE
## 27 -1.2542520 NumCarbon LR_NE
## 28 -0.1441934 NumChlorine LR_NE
## 29 -1.0135099 NumHalogen LR_NE
## 30 -0.3304828 NumMultBonds LR_NE
## 31 -2.7185699 Intercept LR_NE
## 32 0.2049318 MolWeight LR_NE
## 33 -1.2542520 NumCarbon LR_NE
## 34 -0.1441934 NumChlorine LR_NE
## 35 -1.0135099 NumHalogen LR_NE
## 36 -0.3304828 NumMultBonds LR_NE
## 37 -2.7185699 Intercept LR_GDA_VHLR_LEC
## 38 0.2684003 MolWeight LR_GDA_VHLR_LEC
## 39 -1.4126207 NumCarbon LR_GDA_VHLR_LEC
## 40 -0.6009731 NumChlorine LR_GDA_VHLR_LEC
## 41 -0.6815776 NumHalogen LR_GDA_VHLR_LEC
## 42 -0.3415085 NumMultBonds LR_GDA_VHLR_LEC
## 43 -2.7185699 Intercept LR_GDA_VHLR_HEC
## 44 -1.2694291 MolWeight LR_GDA_VHLR_HEC
## 45 -2.3691976 NumCarbon LR_GDA_VHLR_HEC
## 46 -1.2181515 NumChlorine LR_GDA_VHLR_HEC
## 47 -2.0024255 NumHalogen LR_GDA_VHLR_HEC
## 48 -1.4272592 NumMultBonds LR_GDA_VHLR_HEC
## 49 -2.7055537 Intercept LR_GDA_HLR_LEC
## 50 0.6047579 MolWeight LR_GDA_HLR_LEC
## 51 -1.6363423 NumCarbon LR_GDA_HLR_LEC
## 52 -0.8489932 NumChlorine LR_GDA_HLR_LEC
## 53 -0.4926584 NumHalogen LR_GDA_HLR_LEC
## 54 -0.2958283 NumMultBonds LR_GDA_HLR_LEC
## 55 -2.7185699 Intercept LR_GDA_HLR_HEC
## 56 0.1698594 MolWeight LR_GDA_HLR_HEC
## 57 -1.2219687 NumCarbon LR_GDA_HLR_HEC
## 58 -0.2977972 NumChlorine LR_GDA_HLR_HEC
## 59 -0.8462708 NumHalogen LR_GDA_HLR_HEC
## 60 -0.3310979 NumMultBonds LR_GDA_HLR_HEC
## 61 -2.3922400 Intercept LR_GDA_LLR_LEC
## 62 0.8939686 MolWeight LR_GDA_LLR_LEC
## 63 -1.8435282 NumCarbon LR_GDA_LLR_LEC
## 64 -1.0691289 NumChlorine LR_GDA_LLR_LEC
## 65 -0.3806898 NumHalogen LR_GDA_LLR_LEC
## 66 -0.3634163 NumMultBonds LR_GDA_LLR_LEC
## 67 -2.7185441 Intercept LR_GDA_LLR_HEC
## 68 0.2813439 MolWeight LR_GDA_LLR_HEC
## 69 -1.3371016 NumCarbon LR_GDA_LLR_HEC
## 70 -0.5127712 NumChlorine LR_GDA_LLR_HEC
## 71 -0.6839773 NumHalogen LR_GDA_LLR_HEC
## 72 -0.3113283 NumMultBonds LR_GDA_LLR_HEC
## Group
## 1 LR_NE Versus LR_GDA_VHLR_LEC
## 2 LR_NE Versus LR_GDA_VHLR_LEC
## 3 LR_NE Versus LR_GDA_VHLR_LEC
## 4 LR_NE Versus LR_GDA_VHLR_LEC
## 5 LR_NE Versus LR_GDA_VHLR_LEC
## 6 LR_NE Versus LR_GDA_VHLR_LEC
## 7 LR_NE Versus LR_GDA_VHLR_HEC
## 8 LR_NE Versus LR_GDA_VHLR_HEC
## 9 LR_NE Versus LR_GDA_VHLR_HEC
## 10 LR_NE Versus LR_GDA_VHLR_HEC
## 11 LR_NE Versus LR_GDA_VHLR_HEC
## 12 LR_NE Versus LR_GDA_VHLR_HEC
## 13 LR_NE Versus LR_GDA_HLR_LEC
## 14 LR_NE Versus LR_GDA_HLR_LEC
## 15 LR_NE Versus LR_GDA_HLR_LEC
## 16 LR_NE Versus LR_GDA_HLR_LEC
## 17 LR_NE Versus LR_GDA_HLR_LEC
## 18 LR_NE Versus LR_GDA_HLR_LEC
## 19 LR_NE Versus LR_GDA_HLR_HEC
## 20 LR_NE Versus LR_GDA_HLR_HEC
## 21 LR_NE Versus LR_GDA_HLR_HEC
## 22 LR_NE Versus LR_GDA_HLR_HEC
## 23 LR_NE Versus LR_GDA_HLR_HEC
## 24 LR_NE Versus LR_GDA_HLR_HEC
## 25 LR_NE Versus LR_GDA_LLR_LEC
## 26 LR_NE Versus LR_GDA_LLR_LEC
## 27 LR_NE Versus LR_GDA_LLR_LEC
## 28 LR_NE Versus LR_GDA_LLR_LEC
## 29 LR_NE Versus LR_GDA_LLR_LEC
## 30 LR_NE Versus LR_GDA_LLR_LEC
## 31 LR_NE Versus LR_GDA_LLR_HEC
## 32 LR_NE Versus LR_GDA_LLR_HEC
## 33 LR_NE Versus LR_GDA_LLR_HEC
## 34 LR_NE Versus LR_GDA_LLR_HEC
## 35 LR_NE Versus LR_GDA_LLR_HEC
## 36 LR_NE Versus LR_GDA_LLR_HEC
## 37 LR_NE Versus LR_GDA_VHLR_LEC
## 38 LR_NE Versus LR_GDA_VHLR_LEC
## 39 LR_NE Versus LR_GDA_VHLR_LEC
## 40 LR_NE Versus LR_GDA_VHLR_LEC
## 41 LR_NE Versus LR_GDA_VHLR_LEC
## 42 LR_NE Versus LR_GDA_VHLR_LEC
## 43 LR_NE Versus LR_GDA_VHLR_HEC
## 44 LR_NE Versus LR_GDA_VHLR_HEC
## 45 LR_NE Versus LR_GDA_VHLR_HEC
## 46 LR_NE Versus LR_GDA_VHLR_HEC
## 47 LR_NE Versus LR_GDA_VHLR_HEC
## 48 LR_NE Versus LR_GDA_VHLR_HEC
## 49 LR_NE Versus LR_GDA_HLR_LEC
## 50 LR_NE Versus LR_GDA_HLR_LEC
## 51 LR_NE Versus LR_GDA_HLR_LEC
## 52 LR_NE Versus LR_GDA_HLR_LEC
## 53 LR_NE Versus LR_GDA_HLR_LEC
## 54 LR_NE Versus LR_GDA_HLR_LEC
## 55 LR_NE Versus LR_GDA_HLR_HEC
## 56 LR_NE Versus LR_GDA_HLR_HEC
## 57 LR_NE Versus LR_GDA_HLR_HEC
## 58 LR_NE Versus LR_GDA_HLR_HEC
## 59 LR_NE Versus LR_GDA_HLR_HEC
## 60 LR_NE Versus LR_GDA_HLR_HEC
## 61 LR_NE Versus LR_GDA_LLR_LEC
## 62 LR_NE Versus LR_GDA_LLR_LEC
## 63 LR_NE Versus LR_GDA_LLR_LEC
## 64 LR_NE Versus LR_GDA_LLR_LEC
## 65 LR_NE Versus LR_GDA_LLR_LEC
## 66 LR_NE Versus LR_GDA_LLR_LEC
## 67 LR_NE Versus LR_GDA_LLR_HEC
## 68 LR_NE Versus LR_GDA_LLR_HEC
## 69 LR_NE Versus LR_GDA_LLR_HEC
## 70 LR_NE Versus LR_GDA_LLR_HEC
## 71 LR_NE Versus LR_GDA_LLR_HEC
## 72 LR_NE Versus LR_GDA_LLR_HEC
dotplot(LRCoefficientNames ~ LRCoefficients | Group,
data = LR_NE_GDA_ConsolidatedSummary,
groups = EstimationMethod,
main = "Estimated Linear Regression Coefficient Value Comparison",
ylab = "Linear Regression Coefficients",
xlab = "Estimated Linear Regression Coefficient Values",
auto.key = list(adj = 1),
type = c("p", "h"),
# origin = 0,
alpha = 0.45,
pch = 16,
cex = 2)
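Beyond the visual comparison in the dotplot, the agreement of each configuration with the normal equations baseline can also be summarized numerically. The sketch below computes the mean absolute deviation of each method's coefficients from the baseline using the consolidated data assembled above; this deviation summary is an illustrative addition, not part of the original analysis.
##################################
# Summarizing coefficient deviation from the normal equations baseline
# (illustrative; mean absolute deviation per estimation method)
##################################
Baseline <- LR_NE_GDA_ConsolidatedSummary[LR_NE_GDA_ConsolidatedSummary$EstimationMethod == "LR_NE", ]
Baseline <- Baseline[1:6, c("LRCoefficientNames", "LRCoefficients")]      # one row per coefficient
names(Baseline)[2] <- "BaselineCoefficient"
GDA_Estimates <- LR_NE_GDA_ConsolidatedSummary[LR_NE_GDA_ConsolidatedSummary$EstimationMethod != "LR_NE",
                                               c("EstimationMethod", "LRCoefficientNames", "LRCoefficients")]
Comparison <- merge(GDA_Estimates, Baseline, by = "LRCoefficientNames")
Comparison$AbsoluteDeviation <- abs(Comparison$LRCoefficients - Comparison$BaselineCoefficient)
aggregate(AbsoluteDeviation ~ EstimationMethod, data = Comparison, FUN = mean)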
