Selected molecular descriptors from the Dragon chemoinformatics application were used to predict bioconcentration factors for 779 chemicals in order to evaluate QSAR (Quantitative Structure-Activity Relationship) models. This dataset was obtained from the UCI Machine Learning Repository.
The dataset consists of 779 observations of 10 attributes. Below is a brief description of each feature and the response variable (logBCF) in our dataset:
Note that all predictors with the exception of B02[C-N] are quantitative. For the purpose of this assignment, DO NOT CONVERT B02[C-N] to factor. Leave the data in its original format - numeric in R.
Please load the dataset “Bio_pred” and then split the dataset into a train and test set in an 80:20 ratio. Use the training set to build the models in Questions 1-6. Use the test set to help evaluate model performance in Question 7. Please make sure that you are using R version 3.6.X or above (i.e. version 4.X is also acceptable).
# Clear variables in memory
rm(list=ls())
# Import the libraries
library(CombMSC)
library(boot)
library(leaps)
library(MASS)
library(glmnet)
# Ensure that the sampling type is correct
RNGkind(sample.kind="Rejection")
# Set a seed for reproducibility
set.seed(100)
# Read data
fullData = read.csv("~/Documents/ISYE6414/Bio_pred.csv",header=TRUE)
# Split data for training and testing
testRows = sample(nrow(fullData),0.2*nrow(fullData))
testData = fullData[testRows, ]
trainData = fullData[-testRows, ]
Note: Use the training set to build the models in Questions 1-6. Use the test set to help evaluate model performance in Question 7.
model1 <- lm(logBCF~., data = trainData)
summary(model1)
##
## Call:
## lm(formula = logBCF ~ ., data = trainData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2577 -0.5180 0.0448 0.5117 4.0423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.001422 0.138057 0.010 0.99179
## nHM 0.137022 0.022462 6.100 1.88e-09 ***
## piPC09 0.031158 0.020874 1.493 0.13603
## PCD 0.055655 0.063874 0.871 0.38391
## X2Av -0.031890 0.253574 -0.126 0.89996
## MLOGP 0.506088 0.034211 14.793 < 2e-16 ***
## ON1V 0.140595 0.066810 2.104 0.03575 *
## N.072 -0.073334 0.070993 -1.033 0.30202
## B02.C.N. -0.158231 0.080143 -1.974 0.04879 *
## F04.C.O. -0.030763 0.009667 -3.182 0.00154 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7957 on 614 degrees of freedom
## Multiple R-squared: 0.6672, Adjusted R-squared: 0.6623
## F-statistic: 136.8 on 9 and 614 DF, p-value: < 2.2e-16
nHM, MLOGP, and F04.C.O. are significant at the 99% confidence level or higher; ON1V and B02.C.N. are significant at the 95% level. The remaining coefficients (piPC09, PCD, X2Av, and N.072) are not statistically significant at the 95% level.
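As a quick check, the same conclusion can be read off the coefficient p-values programmatically (a minimal sketch using the fitted model1):
# p-values for each coefficient in the full model
pvals <- summary(model1)$coefficients[, "Pr(>|t|)"]
# predictors significant at the 99% level (alpha = 0.01)
names(pvals)[pvals < 0.01]
# predictors significant at the 95% level (alpha = 0.05)
names(pvals)[pvals < 0.05]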
library(CombMSC)
n = nrow(trainData)
set.seed(100)
#Mallow's CP
Cp(model1, S2=summary(model1)$sigma^2)
## [,1]
## [1,] 10
#AIC
AIC(model1, k=2)
## [1] 1497.477
#BIC
AIC(model1, k=log(n))
## [1] 1546.274
set.seed(100)
model2 <- lm(logBCF~ nHM + MLOGP + ON1V + B02.C.N. + F04.C.O., data = trainData)
summary(model2)
##
## Call:
## lm(formula = logBCF ~ nHM + MLOGP + ON1V + B02.C.N. + F04.C.O.,
## data = trainData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4897 -0.5157 0.0361 0.5436 4.0525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.035709 0.097138 -0.368 0.7133
## nHM 0.124981 0.019202 6.509 1.57e-10 ***
## MLOGP 0.569653 0.026171 21.767 < 2e-16 ***
## ON1V 0.129941 0.054767 2.373 0.0180 *
## B02.C.N. -0.119519 0.072402 -1.651 0.0993 .
## F04.C.O. -0.023428 0.009311 -2.516 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8002 on 618 degrees of freedom
## Multiple R-squared: 0.6612, Adjusted R-squared: 0.6585
## F-statistic: 241.2 on 5 and 618 DF, p-value: < 2.2e-16
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: logBCF ~ nHM + piPC09 + PCD + X2Av + MLOGP + ON1V + N.072 + B02.C.N. +
## F04.C.O.
## Model 2: logBCF ~ nHM + MLOGP + ON1V + B02.C.N. + F04.C.O.
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 614 388.70
## 2 618 395.67 -4 -6.9653 2.7506 0.02745 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The partial F-test is statistically significant at alpha = 0.05 (p = 0.027), so model1 and model2 differ from one another; the four dropped predictors are jointly informative. However, relying on p-values alone for variable selection is unwise, since some variables may be jointly significant, or become significant only after others are removed from the model, and p-values alone also ignore possible multicollinearity among the predictors.
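The partial F-statistic reported by anova() can be reproduced by hand from the residual sums of squares above (a small sketch):
# Partial F-test of model2 (reduced) against model1 (full), computed by hand
rss.full <- sum(resid(model1)^2)      # ~388.70 on 614 df
rss.red  <- sum(resid(model2)^2)      # ~395.67 on 618 df
df.diff  <- model2$df.residual - model1$df.residual   # 4 dropped predictors
Fstat <- ((rss.red - rss.full) / df.diff) / (rss.full / model1$df.residual)
Fstat                                                  # ~2.75, matching anova()
pf(Fstat, df.diff, model1$df.residual, lower.tail = FALSE)  # ~0.027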
Hint: You can use nbest parameter.
set.seed(100)
library(leaps)
out = leaps(trainData[,-c(10)], trainData$logBCF, method = "Cp")
cbind(as.matrix(out$which),out$Cp)
## 1 2 3 4 5 6 7 8 9
## 1 0 0 0 0 1 0 0 0 0 58.596851
## 1 1 0 0 0 0 0 0 0 0 821.003439
## 1 0 1 0 0 0 0 0 0 0 875.911282
## 1 0 0 1 0 0 0 0 0 0 986.721460
## 1 0 0 0 0 0 0 0 1 0 1004.917654
## 1 0 0 0 0 0 0 1 0 0 1156.059904
## 1 0 0 0 0 0 1 0 0 0 1180.996680
## 1 0 0 0 0 0 0 0 0 1 1192.010393
## 1 0 0 0 1 0 0 0 0 0 1220.973066
## 2 1 0 0 0 1 0 0 0 0 17.737801
## 2 0 0 0 0 1 0 0 1 0 53.997513
## 2 0 0 0 0 1 0 0 0 1 54.037436
## 2 0 0 0 1 1 0 0 0 0 55.412689
## 2 0 0 0 0 1 1 0 0 0 56.858903
## 2 0 1 0 0 1 0 0 0 0 58.947203
## 2 0 0 0 0 1 0 1 0 0 60.204935
## 2 0 0 1 0 1 0 0 0 0 60.407567
## 2 1 1 0 0 0 0 0 0 0 550.795979
## 2 1 0 1 0 0 0 0 0 0 567.222115
## 3 1 1 0 0 1 0 0 0 0 15.184626
## 3 1 0 0 0 1 0 0 0 1 16.654014
## 3 1 0 0 0 1 0 0 1 0 16.686004
## 3 1 0 1 0 1 0 0 0 0 17.784159
## 3 1 0 0 0 1 0 1 0 0 19.090334
## 3 1 0 0 0 1 1 0 0 0 19.214708
## 3 1 0 0 1 1 0 0 0 0 19.550838
## 3 0 1 0 0 1 0 0 0 1 49.903187
## 3 0 1 0 0 1 0 0 1 0 51.654468
## 3 0 0 0 0 1 0 0 1 1 51.892920
## 4 1 1 0 0 1 0 0 0 1 9.495041
## 4 1 1 0 0 1 0 0 1 0 11.403720
## 4 1 0 0 0 1 1 0 0 1 13.758279
## 4 1 0 1 0 1 0 0 1 0 14.757627
## 4 1 1 0 0 1 0 1 0 0 14.873550
## 4 1 0 1 0 1 0 0 0 1 15.715948
## 4 1 1 0 1 1 0 0 0 0 16.339710
## 4 1 0 0 0 1 0 0 1 1 16.695502
## 4 1 1 0 0 1 1 0 0 0 17.174340
## 4 1 1 1 0 1 0 0 0 0 17.184441
## 5 1 1 0 0 1 0 0 1 1 7.240754
## 5 1 1 0 0 1 1 0 0 1 8.895893
## 5 1 1 0 0 1 0 1 0 1 9.818088
## 5 1 1 0 1 1 0 0 0 1 10.676736
## 5 1 0 1 0 1 1 0 0 1 10.711183
## 5 1 1 1 0 1 0 0 0 1 11.372175
## 5 1 1 0 0 1 0 1 1 0 12.600629
## 5 1 0 0 0 1 1 0 1 1 13.002396
## 5 1 1 0 1 1 0 0 1 0 13.257140
## 5 1 1 1 0 1 0 0 1 0 13.275843
## 6 1 1 0 0 1 1 0 1 1 6.116174
## 6 1 0 1 0 1 1 0 1 1 6.820940
## 6 1 1 0 0 1 1 1 0 1 8.147412
## 6 1 1 0 0 1 0 1 1 1 8.642006
## 6 1 1 0 1 1 0 0 1 1 9.042626
## 6 1 1 1 0 1 0 0 1 1 9.240749
## 6 1 1 1 0 1 1 0 0 1 10.665490
## 6 1 1 0 1 1 1 0 0 1 10.746628
## 6 1 0 1 0 1 1 1 0 1 10.865153
## 6 1 1 0 1 1 0 1 0 1 11.090540
## 7 1 1 0 0 1 1 1 1 1 6.831852
## 7 1 1 1 0 1 1 0 1 1 7.069867
## 7 1 1 0 1 1 1 0 1 1 8.072045
## 7 1 0 1 0 1 1 1 1 1 8.263250
## 7 1 0 1 1 1 1 0 1 1 8.804787
## 7 1 1 1 0 1 1 1 0 1 9.980191
## 7 1 1 0 1 1 1 1 0 1 10.110409
## 7 1 1 0 1 1 0 1 1 1 10.428510
## 7 1 1 1 0 1 0 1 1 1 10.620322
## 7 1 1 1 1 1 0 0 1 1 11.021592
## 8 1 1 1 0 1 1 1 1 1 8.015816
## 8 1 1 0 1 1 1 1 1 1 8.759216
## 8 1 1 1 1 1 1 0 1 1 9.067038
## 8 1 0 1 1 1 1 1 1 1 10.228149
## 8 1 1 1 1 1 1 1 0 1 11.898038
## 8 1 1 1 1 1 0 1 1 1 12.428478
## 8 1 1 1 1 1 1 1 1 0 18.126038
## 8 0 1 1 1 1 1 1 1 1 45.211381
## 8 1 1 1 1 0 1 1 1 1 226.837691
## 9 1 1 1 1 1 1 1 1 1 10.000000
set.seed(100)
best.model = which(out$Cp==min(out$Cp))
cbind(as.matrix(out$which),out$Cp)[best.model,]
## 1 2 3 4 5 6 7 8
## 1.000000 1.000000 0.000000 0.000000 1.000000 1.000000 0.000000 1.000000
## 9
## 1.000000 6.116174
names(trainData[c(1,2,5,6,8,9)])
## [1] "nHM" "piPC09" "MLOGP" "ON1V" "B02.C.N." "F04.C.O."
model3 <- lm(logBCF~ nHM + piPC09 + MLOGP + ON1V + B02.C.N. + F04.C.O., data = trainData)
summary(model3)
##
## Call:
## lm(formula = logBCF ~ nHM + piPC09 + MLOGP + ON1V + B02.C.N. +
## F04.C.O., data = trainData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2364 -0.5234 0.0421 0.5196 4.1159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.035785 0.099454 0.360 0.71911
## nHM 0.124086 0.019083 6.502 1.63e-10 ***
## piPC09 0.042167 0.014135 2.983 0.00297 **
## MLOGP 0.528522 0.029434 17.956 < 2e-16 ***
## ON1V 0.098099 0.055457 1.769 0.07740 .
## B02.C.N. -0.160204 0.073225 -2.188 0.02906 *
## F04.C.O. -0.028644 0.009415 -3.042 0.00245 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7951 on 617 degrees of freedom
## Multiple R-squared: 0.666, Adjusted R-squared: 0.6628
## F-statistic: 205.1 on 6 and 617 DF, p-value: < 2.2e-16
set.seed(100)
minmodel <- glm(logBCF~1, data = trainData)
model4 <- step(model1,
scope = list(lower = minmodel,
upper = model1),
direction = "backward",
k=log(n)) #to make AIC column calculate BIC instead
## Start: AIC=-231
## logBCF ~ nHM + piPC09 + PCD + X2Av + MLOGP + ON1V + N.072 + B02.C.N. +
## F04.C.O.
##
## Df Sum of Sq RSS AIC
## - X2Av 1 0.010 388.71 -237.417
## - PCD 1 0.481 389.18 -236.662
## - N.072 1 0.676 389.38 -236.350
## - piPC09 1 1.411 390.11 -235.173
## - B02.C.N. 1 2.468 391.17 -233.484
## - ON1V 1 2.804 391.51 -232.949
## <none> 388.70 -230.997
## - F04.C.O. 1 6.410 395.11 -227.226
## - nHM 1 23.557 412.26 -200.718
## - MLOGP 1 138.539 527.24 -47.211
##
## Step: AIC=-237.42
## logBCF ~ nHM + piPC09 + PCD + MLOGP + ON1V + N.072 + B02.C.N. +
## F04.C.O.
##
## Df Sum of Sq RSS AIC
## - PCD 1 0.517 389.23 -243.025
## - N.072 1 0.667 389.38 -242.783
## - piPC09 1 1.423 390.14 -241.574
## - B02.C.N. 1 2.510 391.22 -239.838
## - ON1V 1 2.915 391.63 -239.192
## <none> 388.71 -237.417
## - F04.C.O. 1 6.491 395.21 -233.520
## - nHM 1 25.431 414.15 -204.309
## - MLOGP 1 146.081 534.80 -44.772
##
## Step: AIC=-243.02
## logBCF ~ nHM + piPC09 + MLOGP + ON1V + N.072 + B02.C.N. + F04.C.O.
##
## Df Sum of Sq RSS AIC
## - N.072 1 0.813 390.04 -248.159
## - B02.C.N. 1 2.099 391.33 -246.105
## - ON1V 1 2.412 391.64 -245.606
## <none> 389.23 -243.025
## - F04.C.O. 1 6.088 395.32 -239.776
## - piPC09 1 6.203 395.43 -239.594
## - nHM 1 27.541 416.77 -206.800
## - MLOGP 1 181.833 571.06 -10.264
##
## Step: AIC=-248.16
## logBCF ~ nHM + piPC09 + MLOGP + ON1V + B02.C.N. + F04.C.O.
##
## Df Sum of Sq RSS AIC
## - ON1V 1 1.978 392.02 -251.438
## - B02.C.N. 1 3.026 393.07 -249.773
## <none> 390.04 -248.159
## - piPC09 1 5.626 395.67 -245.659
## - F04.C.O. 1 5.851 395.89 -245.304
## - nHM 1 26.728 416.77 -213.236
## - MLOGP 1 203.819 593.86 7.728
##
## Step: AIC=-251.44
## logBCF ~ nHM + piPC09 + MLOGP + B02.C.N. + F04.C.O.
##
## Df Sum of Sq RSS AIC
## - B02.C.N. 1 2.693 394.72 -253.602
## - F04.C.O. 1 3.902 395.92 -251.695
## <none> 392.02 -251.438
## - piPC09 1 7.252 399.27 -246.437
## - nHM 1 25.197 417.22 -219.003
## - MLOGP 1 247.006 639.03 47.031
##
## Step: AIC=-253.6
## logBCF ~ nHM + piPC09 + MLOGP + F04.C.O.
##
## Df Sum of Sq RSS AIC
## <none> 394.72 -253.602
## - F04.C.O. 1 4.868 399.58 -252.390
## - piPC09 1 5.798 400.51 -250.939
## - nHM 1 26.847 421.56 -218.977
## - MLOGP 1 302.931 697.65 95.359
summary(model4)
##
## Call:
## lm(formula = logBCF ~ nHM + piPC09 + MLOGP + F04.C.O., data = trainData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2611 -0.5126 0.0517 0.5353 4.3488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.008695 0.078196 -0.111 0.91150
## nHM 0.114029 0.017574 6.489 1.78e-10 ***
## piPC09 0.041119 0.013636 3.015 0.00267 **
## MLOGP 0.566473 0.025990 21.796 < 2e-16 ***
## F04.C.O. -0.022104 0.008000 -2.763 0.00590 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7985 on 619 degrees of freedom
## Multiple R-squared: 0.662, Adjusted R-squared: 0.6599
## F-statistic: 303.1 on 4 and 619 DF, p-value: < 2.2e-16
There are four predictor variables in model4. All are significant at a 99% confidence level.
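This can be verified directly from the fitted model (a minimal check of the p-values reported above):
# confirm that every predictor in model4 has p < 0.01
summary(model4)$coefficients[-1, "Pr(>|t|)"] < 0.01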
set.seed(100)
minmodel <- glm(logBCF~1, data = trainData)
model5 <- step(minmodel,
scope = list(lower = minmodel,
upper = model1),
direction = "forward")
## Start: AIC=2165.97
## logBCF ~ 1
##
## Df Deviance AIC
## + MLOGP 1 429.60 1543.9
## + nHM 1 912.25 2013.8
## + piPC09 1 947.02 2037.2
## + PCD 1 1017.17 2081.7
## + B02.C.N. 1 1028.68 2088.8
## + N.072 1 1124.37 2144.3
## + ON1V 1 1140.16 2153.0
## + F04.C.O. 1 1147.13 2156.8
## <none> 1167.92 2166.0
## + X2Av 1 1165.46 2166.7
##
## Step: AIC=1543.9
## logBCF ~ MLOGP
##
## Df Deviance AIC
## + nHM 1 402.47 1505.2
## + B02.C.N. 1 425.42 1539.8
## + F04.C.O. 1 425.45 1539.8
## + X2Av 1 426.32 1541.1
## + ON1V 1 427.23 1542.5
## <none> 429.60 1543.9
## + piPC09 1 428.55 1544.4
## + N.072 1 429.35 1545.5
## + PCD 1 429.48 1545.7
##
## Step: AIC=1505.19
## logBCF ~ MLOGP + nHM
##
## Df Deviance AIC
## + piPC09 1 399.58 1502.7
## + F04.C.O. 1 400.51 1504.2
## + B02.C.N. 1 400.53 1504.2
## <none> 402.47 1505.2
## + PCD 1 401.23 1505.3
## + N.072 1 402.06 1506.5
## + ON1V 1 402.13 1506.7
## + X2Av 1 402.35 1507.0
##
## Step: AIC=1502.7
## logBCF ~ MLOGP + nHM + piPC09
##
## Df Deviance AIC
## + F04.C.O. 1 394.72 1497.0
## + B02.C.N. 1 395.92 1499.0
## + N.072 1 398.12 1502.4
## <none> 399.58 1502.7
## + X2Av 1 399.05 1503.9
## + ON1V 1 399.58 1504.7
## + PCD 1 399.58 1504.7
##
## Step: AIC=1497.05
## logBCF ~ MLOGP + nHM + piPC09 + F04.C.O.
##
## Df Deviance AIC
## + B02.C.N. 1 392.02 1494.8
## + ON1V 1 393.07 1496.5
## <none> 394.72 1497.0
## + N.072 1 393.65 1497.4
## + X2Av 1 394.20 1498.2
## + PCD 1 394.64 1498.9
##
## Step: AIC=1494.78
## logBCF ~ MLOGP + nHM + piPC09 + F04.C.O. + B02.C.N.
##
## Df Deviance AIC
## + ON1V 1 390.04 1493.6
## <none> 392.02 1494.8
## + N.072 1 391.64 1496.2
## + X2Av 1 391.90 1496.6
## + PCD 1 392.02 1496.8
##
## Step: AIC=1493.62
## logBCF ~ MLOGP + nHM + piPC09 + F04.C.O. + B02.C.N. + ON1V
##
## Df Deviance AIC
## <none> 390.04 1493.6
## + N.072 1 389.23 1494.3
## + PCD 1 389.38 1494.6
## + X2Av 1 390.02 1495.6
summary(model5)
##
## Call:
## glm(formula = logBCF ~ MLOGP + nHM + piPC09 + F04.C.O. + B02.C.N. +
## ON1V, data = trainData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2364 -0.5234 0.0421 0.5196 4.1159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.035785 0.099454 0.360 0.71911
## MLOGP 0.528522 0.029434 17.956 < 2e-16 ***
## nHM 0.124086 0.019083 6.502 1.63e-10 ***
## piPC09 0.042167 0.014135 2.983 0.00297 **
## F04.C.O. -0.028644 0.009415 -3.042 0.00245 **
## B02.C.N. -0.160204 0.073225 -2.188 0.02906 *
## ON1V 0.098099 0.055457 1.769 0.07740 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.6321621)
##
## Null deviance: 1167.92 on 623 degrees of freedom
## Residual deviance: 390.04 on 617 degrees of freedom
## AIC: 1493.6
##
## Number of Fisher Scoring iterations: 2
model4: logBCF ~ nHM + piPC09 + MLOGP + F04.C.O.
model5: logBCF ~ MLOGP + nHM + piPC09 + F04.C.O. + B02.C.N. + ON1V
Backward stepwise regression using BIC yielded a model with only four variables (model4), whereas forward stepwise regression using AIC yielded a model with six variables (model5). The models are otherwise the same, but model5 includes B02.C.N. and ON1V as additional variables.
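The difference between the two selected models can be confirmed from their coefficient names (a small sketch):
# variables kept by forward/AIC selection but not by backward/BIC selection
setdiff(names(coef(model5)), names(coef(model4)))   # expected: "B02.C.N." "ON1V"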
set.seed(100)
Model = c("Model1 - Full Model",
"Model3 - Leaps",
"Model4 - BWD stepwise w/BIC")
Adj_R2 = c(summary(model1)$adj.r.squared,
summary(model3)$adj.r.squared,
summary(model4)$adj.r.squared)
Mallows_Cp = c(Cp(model1, S2=summary(model1)$sigma^2),
Cp(model3, S2=summary(model3)$sigma^2),
Cp(model4, S2=summary(model4)$sigma^2))
AICs = c(AIC(model1, k=2),
AIC(model3, k=2),
AIC(model4, k=2))
BICs = c(AIC(model1, k=log(n)),
AIC(model3, k=log(n)),
AIC(model4, k=log(n)))
Num_Vars = c(length(model1$coefficients)-1,
length(model3$coefficients)-1,
length(model4$coefficients)-1)
comp_table <- data.frame(Model, Adj_R2, Mallows_Cp, AICs, BICs, Num_Vars)
comp_table
## Model Adj_R2 Mallows_Cp AICs BICs Num_Vars
## 1 Model1 - Full Model 0.6623027 10 1497.477 1546.274 9
## 2 Model3 - Leaps 0.6627864 7 1493.623 1529.113 6
## 3 Model4 - BWD stepwise w/BIC 0.6598504 5 1497.052 1523.669 4
Model3 is the best model based on AIC and adjusted R-squared, while Model4 is the best model based on BIC. This is not surprising, since BIC penalizes model complexity most heavily: Model4 is the least complex, with only four predictor variables, and BIC was the criterion used to select it.
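The same conclusion can be read off the comparison table programmatically (a sketch using the comp_table built above):
# best model under each criterion
comp_table$Model[which.min(comp_table$AICs)]     # Model3 - Leaps
comp_table$Model[which.min(comp_table$BICs)]     # Model4 - BWD stepwise w/BIC
comp_table$Model[which.max(comp_table$Adj_R2)]   # Model3 - Leaps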
set.seed(100)
ridge.model.cv <- cv.glmnet(x = as.matrix(trainData[,-10]),
y= trainData$logBCF,
alpha=0,
nfolds = 10,
type.measure="deviance")
## Fit ridge regression model (glmnet fits a path of 100 lambda values by default)
ridge.model <- glmnet(x = as.matrix(trainData[,-10]),
                      y = trainData$logBCF,
                      alpha = 0)
set.seed(100)
## Plot coefficient paths
plot(ridge.model, xvar="lambda", label=TRUE, lwd=2)
abline(v=log(ridge.model.cv$lambda.min), col='black', lty=2, lwd=2)
coef(ridge.model, s=ridge.model.cv$lambda.min)
## (Intercept) MLOGP nHM piPC09 PCD X2Av
## 0.001421691 0.506087878 0.137022099 0.031157852 0.055655100 -0.031889999
## B02.C.N. ON1V F04.C.O. N.072
## -0.158230644 0.140594747 -0.030762988 -0.073334022
ridge.model.cv$lambda.min
## [1] 0.108775
All nine predictor variables were retained. On the surface this might seem surprising, since the other variable selection methods all reduced the model. However, ridge regression shrinks coefficients toward zero without setting any of them exactly to zero, so it does not drop variables; the cross-validation here is used only to choose the amount of shrinkage (lambda), not to select predictors.
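Assuming ridge.model is the glmnet ridge fit above, the count of non-zero coefficients at the CV-optimal lambda confirms that nothing is dropped (a minimal sketch):
# number of non-zero ridge coefficients at lambda.min (excluding the intercept)
ridge.coefs <- coef(ridge.model, s = ridge.model.cv$lambda.min)[-1, ]
sum(ridge.coefs != 0)   # 9: ridge shrinks coefficients but does not zero them out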
set.seed(100)
cv.lasso <- cv.glmnet(as.matrix(trainData[,-10]),
trainData$logBCF,
alpha = 1,
nfolds =10,
nlambda = 10,
standardize = TRUE)
#optimal lambda value
cv.lasso$lambda.min
## [1] 0.007854436
coef(cv.lasso, s = cv.lasso$lambda.min)
## 10 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.02722838
## nHM 0.12543866
## piPC09 0.03387665
## PCD 0.03194878
## X2Av .
## MLOGP 0.52174346
## ON1V 0.09633951
## N.072 -0.05487196
## B02.C.N. -0.13961811
## F04.C.O. -0.02535576
# Refit by least squares using only the variables selected by lasso (X2Av dropped)
lasso <- glm(logBCF ~ MLOGP + nHM + piPC09 + PCD + B02.C.N. + ON1V + F04.C.O. + N.072, data = trainData)
set.seed(100)
plot(cv.lasso)
set.seed(100)
coef(cv.lasso, s = cv.lasso$lambda.min)
## 10 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.02722838
## nHM 0.12543866
## piPC09 0.03387665
## PCD 0.03194878
## X2Av .
## MLOGP 0.52174346
## ON1V 0.09633951
## N.072 -0.05487196
## B02.C.N. -0.13961811
## F04.C.O. -0.02535576
set.seed(100)
cv.en <- cv.glmnet(as.matrix(trainData[,-10]),
trainData$logBCF,
alpha = .5,
nfolds =10,
nlambda = 10,
standardize = TRUE)
#optimal lambda value
cv.en$lambda.min
## [1] 0.0207662
coef(cv.en, s = cv.en$lambda.min)
## 10 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.04903516
## nHM 0.12397290
## piPC09 0.03470891
## PCD 0.03060034
## X2Av .
## MLOGP 0.51776470
## ON1V 0.08901088
## N.072 -0.05236840
## B02.C.N. -0.14155538
## F04.C.O. -0.02420217
# Refit by least squares using only the variables selected by elastic net (X2Av dropped)
en <- glm(logBCF ~ MLOGP + nHM + piPC09 + PCD + B02.C.N. + ON1V + F04.C.O. + N.072, data = trainData)
set.seed(100)
coef(cv.en, s = cv.en$lambda.min)
## 10 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.04903516
## nHM 0.12397290
## piPC09 0.03470891
## PCD 0.03060034
## X2Av .
## MLOGP 0.51776470
## ON1V 0.08901088
## N.072 -0.05236840
## B02.C.N. -0.14155538
## F04.C.O. -0.02420217
The elastic net selected the same variables as the lasso: every predictor except X2Av, which is shrunk to exactly zero in both models.
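The two selected sets can be compared directly from the CV fits (a small sketch):
# variables with non-zero coefficients at lambda.min for lasso and elastic net
lasso.coef <- coef(cv.lasso, s = cv.lasso$lambda.min)
en.coef    <- coef(cv.en, s = cv.en$lambda.min)
lasso.sel  <- rownames(lasso.coef)[as.vector(lasso.coef) != 0]
en.sel     <- rownames(en.coef)[as.vector(en.coef) != 0]
setdiff(lasso.sel, en.sel)   # character(0): both drop only X2Av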
set.seed(100)
pred.full = predict(model1, newdata = testData, type = "response")
pred.full[c(1,2,3)]
## 714 503 358
## 2.446479 4.333759 3.266892
pred.bwd = predict(model4, newdata = testData, type = "response")
pred.bwd[c(1,2,3)]
## 714 503 358
## 2.424916 4.353167 3.274192
pred.ridge = predict(ridge.model, newx = as.matrix(testData[,-10]), s = ridge.model.cv$lambda.min, type = "response")
pred.ridge[c(1,2,3)]
## 2 3 4
## 0.7507973 2.4450643 0.7545745
pred.lasso = predict(lasso, newdata = testData, type = "response")
pred.lasso[c(1,2,3)]
## 714 503 358
## 2.446392 4.333415 3.267254
pred.en = predict(en, newdata = testData, type = "response")
pred.en[c(1,2,3)]
## 714 503 358
## 2.446392 4.333415 3.267254
set.seed(100)
#MSPE(model1, testData)
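The commented-out MSPE() call is left as a placeholder; the test-set mean squared prediction error can be computed directly from the predictions above (a minimal sketch; the helper mspe() is introduced here only for illustration):
# test-set mean squared prediction error for each model
mspe <- function(pred) mean((testData$logBCF - as.vector(pred))^2)
c(Full     = mspe(pred.full),
  Backward = mspe(pred.bwd),
  Ridge    = mspe(pred.ridge),
  Lasso    = mspe(pred.lasso),
  Elastic  = mspe(pred.en))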
Variables selected by each method (X = variable included in the final model):

| Variable | Backward Stepwise | Ridge | Lasso | Elastic Net |
|---|---|---|---|---|
| nHM | X | X | X | X |
| piPC09 | X | X | X | X |
| PCD | | X | X | X |
| X2Av | | X | | |
| MLOGP | X | X | X | X |
| ON1V | | X | X | X |
| N.072 | | X | X | X |
| B02.C.N. | | X | X | X |
| F04.C.O. | X | X | X | X |