## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.006061 0.024242 0.154767 0.181818 1.000000
| 5 percent or less unique | 719 |
| 4 percent or less unique | 673 |
| 3 percent or less unique | 595 |
| 2 percent or less unique | 544 |
| 1 percent or less unique | 334 |
| .5 percent or less unique | 38 |
##
## Call:
## lm(formula = permeability_set ~ fingerprint_set[, "X623"])
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.961 -10.471 -7.111 2.964 43.579
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.021 1.201 10.012 <2e-16 ***
## fingerprint_set[, "X623"] 35.644 15.423 2.311 0.0221 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 163 degrees of freedom
## Multiple R-squared: 0.03173, Adjusted R-squared: 0.02579
## F-statistic: 5.341 on 1 and 163 DF, p-value: 0.02208
## Data: X dimension: 123 388
## Y dimension: 123 1
## Fit method: oscorespls
## Number of components considered: 7
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 21.93 35.74 40.65 45.69 51.69 58.10 62.01
## .outcome 36.22 52.22 61.24 68.27 73.65 77.19 79.51
## [1] "Correlation and RMSE for pls - oscorepls - method"
## [1] 0.5561009
## [1] 12.8519
## Data: X dimension: 123 388
## Y dimension: 123 1
## Fit method: simpls
## Number of components considered: 7
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 21.93 35.74 40.65 45.69 51.69 58.10 62.01
## .outcome 36.22 52.22 61.24 68.27 73.65 77.19 79.51
## [1] "Correlation and RMSE for simpls method"
## [1] 0.5561009
## [1] 12.8519
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 13.22052 0.3350436 10.301431 1.821787 0.1533368 1.639688
## 2 2 11.96823 0.4850946 8.472994 2.889233 0.1836436 2.028952
## 3 3 12.07081 0.4683421 9.136912 3.144100 0.1962813 1.989383
## 4 4 12.07134 0.4721820 9.359592 3.112994 0.2074927 2.011005
## 5 5 11.75915 0.4922229 9.083641 2.633979 0.1800913 1.663858
## 6 6 11.42003 0.5324685 8.663322 2.662231 0.1632630 2.062575
## 7 7 11.32409 0.5458431 8.716005 2.679489 0.1641727 2.190432
## 8 8 11.37425 0.5531723 8.867989 2.465868 0.1552896 1.895296
## 9 9 11.36217 0.5640565 8.842134 2.619179 0.1770499 2.022334
## 10 10 11.71048 0.5510060 9.016586 2.365549 0.1588582 1.834260
## Data: X dimension: 123 388
## Y dimension: 123 1
## Fit method: widekernelpls
## Number of components considered: 7
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 21.93 35.74 40.65 45.69 51.69 58.10 62.01
## .outcome 36.22 52.22 61.24 68.27 73.65 77.19 79.51
## [1] -2.293922 -1.944875 13.362850 13.362850 10.054137 -3.260217 5.397643
## [8] 39.369655 -7.451232 26.632461 4.255509 5.268727 3.650328 11.949756
## [15] 9.632420 4.431480 2.702082 2.289478 37.243797 5.606970 21.901297
## [22] 6.212466 6.048486 22.693437 15.981558 33.822810 4.284673 5.260306
## [29] -5.030670 2.702082 6.213221 56.577543 5.261391 4.702815 15.273992
## [36] -3.466780 6.151865 3.647331 3.564829 -1.907079 -2.894031 22.288174
## [1] "Correlation and RMSE for widekernelpls method"
## [1] 0.5561009
## [1] 12.8519
## [1] "mean of permeability set"
## [1] 12.23744
## [1] "X6 and X157 are the only variables kept by the elastic net:"
## [1] 2.423346
## [1] -11.2059
## [1] "elastic net"
## [1] 12.41594
## [1] 0.5940502
## [1] "lasso"
## [1] 12.58246
## [1] 0.5789845
## [1] 21.23789
## [1] "BiologicalMaterial07"
## [1] 100 100 100 100 100 100
## Data: X dimension: 132 56
## Y dimension: 132 1
## Fit method: widekernelpls
## Number of components considered: 1
## TRAINING: % variance explained
## 1 comps
## X 15.69
## .outcome 50.62
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 1.430232 0.4744976 1.158520 0.3036160 0.1910804 0.2234675
## 2 2 1.576323 0.5189120 1.135020 0.9583382 0.2128707 0.3719558
## 3 3 1.464198 0.5690430 1.098241 0.7175804 0.2009408 0.2945041
## 4 4 1.746167 0.5349570 1.191058 1.4485594 0.2101670 0.4635368
## 5 5 2.222099 0.5006889 1.315328 2.5818162 0.2487586 0.7657938
## 6 6 2.349867 0.4901791 1.357472 2.8908251 0.2435563 0.8479381
## 7 7 2.446425 0.4823594 1.392927 3.0299201 0.2428746 0.8670859
## 8 8 2.597359 0.4731310 1.462833 3.2768486 0.2429011 0.9523778
## 9 9 2.682542 0.4590501 1.520770 3.3270509 0.2429632 0.9521031
## 10 10 2.604087 0.4641339 1.479641 3.0642853 0.2335499 0.8815459
## [1] 41.32283 39.28775 38.88566 37.92321 41.06554 40.63112 39.84869
## [8] 43.29190 42.03808 41.42337 40.51995 42.47684 39.90009 42.99435
## [15] 40.36238 52.44590 40.73774 39.33747 37.73276 40.34661 41.03542
## [22] 39.07838 38.97543 42.36834 42.08060 39.30223 43.03458 40.20742
## [29] 41.59102 40.91167 40.02780 41.75982 39.67569 39.70147 39.87578
## [36] 39.16241 39.03622 38.20845 39.74488 37.97763 38.65844 41.78918
## [43] 40.36646 42.77719
## [1] 0.5447276
## [1] 2.093934
## [1] 39.98136
## [1] 10.82873
## [1] 0.260081
exercises are from: Applied Predictive Modeling, Max Kuhn and Kjell Johnson .
data(permeability) data(fingerprints) permeability_set<-permeability summary(apply(fingerprints, 2, mean)) fingerprint_set<-as.data.frame(fingerprints) set.seed(135) near_Zer_list<-rep(0,6) near_Zer_names<-rep(0,6)
near_Zer_list[1]<-length(nearZeroVar(fingerprint_set,freqCut = 95/5)) near_Zer_list[2]<-length(nearZeroVar(fingerprint_set,freqCut = 96/4)) near_Zer_list[3]<-length(nearZeroVar(fingerprint_set,freqCut = 97/3)) near_Zer_list[4]<-length(nearZeroVar(fingerprint_set,freqCut = 98/2)) near_Zer_list[5]<-length(nearZeroVar(fingerprint_set,freqCut = 99/1)) near_Zer_list[6]<-length(nearZeroVar(fingerprint_set,freqCut = 99.5/.5)) near_Zer_names[1]<-‘5 percent or less unique’ near_Zer_names[2]<-‘4 percent or less unique’ near_Zer_names[3]<-‘3 percent or less unique’ near_Zer_names[4]<-‘2 percent or less unique’ near_Zer_names[5]<-‘1 percent or less unique’ near_Zer_names[6]<-‘.5 percent or less unique’ near_Zer_list<-cbind(near_Zer_names,near_Zer_list) colnames(near_Zer_list)<-c(‘’,’’) kable_input<-kable(near_Zer_list, “html”) %>% kable_styling(“striped”, full_width = F) %>% column_spec(1, bold = T, color = “white”, background = “#3dc666”) %>% column_spec(2, bold = T, color = “#3dc666”, background = “white”) add_header_above(kable_input, header = c(“Number Removed, by threshold”=2), bold = TRUE, italic = TRUE)%>% kable_styling(bootstrap_options = “striped”, font_size = 22)
test_model<-lm(permeability_set~fingerprint_set[,‘X623’]) summary(test_model)
near_zero_var_indices<-nearZeroVar(fingerprint_set,freqCut = 95/5) fingerprint_set<-fingerprint_set[,-near_zero_var_indices]
Splits and associated correlations and RMSE: 93 ___ .5232 12.237 94 ___ .6045 12.198 95 ___ .6046 12.259 96 ___ .5436 12.335 97 ___ .5739 13.400 98 ___ .5631 13.861 99 ___ .5194 12.588
99.5 __ .5212 12.51
ctrl<- trainControl(method= ‘CV’, number = 10) test_set_indices<-sample.int(165,size=42) test_permeability_set<-permeability_set[test_set_indices,] training_permeability_set<-permeability_set[-test_set_indices,] testing_set<-fingerprint_set[test_set_indices,] training_set<-fingerprint_set[-test_set_indices,] pls_model<-train(training_set,training_permeability_set, method=‘pls’, tuneLength=40, trControl=ctrl, preProc = c(‘center’,‘scale’)) summary(pls_model) print(‘Correlation and RMSE for pls - oscorepls - method’) cor(predict(pls_model,newdata=testing_set),test_permeability_set) RMSE(predict(pls_model,newdata=testing_set),test_permeability_set) gbmImp <- varImp(pls_model, scale = FALSE) plot(gbmImp, top = 10) pls_model<-train(training_set,training_permeability_set, method=‘simpls’, tuneLength=40, trControl=ctrl, preProc = c(‘center’,‘scale’)) summary(pls_model) print(‘Correlation and RMSE for simpls method’) cor(predict(pls_model,newdata=testing_set),test_permeability_set) RMSE(predict(pls_model,newdata=testing_set),test_permeability_set)
pls_model<-train(training_set,training_permeability_set, method=‘widekernelpls’, tuneLength=40, trControl=ctrl, preProc = c(‘center’,‘scale’)) pls_model$results[1:10,] summary(pls_model)
plot(pls_model)
predict(pls_model,newdata=testing_set) print(‘Correlation and RMSE for widekernelpls method’) cor(predict(pls_model,newdata=testing_set),test_permeability_set) RMSE(predict(pls_model,newdata=testing_set),test_permeability_set) print(‘mean of permeability set’) mean(permeability_set)
gbmImp <- varImp(pls_model, scale = FALSE) plot(gbmImp, top = 10)
training_permeability_set<-as.matrix(training_permeability_set) training_set<-as.matrix(training_set)
lambda<-cv.glmnet(training_set, training_permeability_set) print(‘X6 and X157 are the only variables kept by the elastic net:’) lambda<-coef(lambda, s = “lambda.1se”, family=“gaussian”, alpha=.5) lambda[7,] lambda[95,] #which(lambda>0) #which(lambda<0)
testing_set_matrix<-as.matrix(testing_set) #plot(cv.glmnet(training_set, training_permeability_set))
#elastic net and lasso models print(‘elastic net’)
elastic_net_model<-glmnet(training_set, training_permeability_set, family=“gaussian”, alpha=.5) plot(elastic_net_model) elastic_net_pred <- predict(elastic_net_model, s=elastic_net_model$lambda.1se, newx=testing_set_matrix) RMSE(elastic_net_pred,test_permeability_set) cor_set<-cor(elastic_net_pred,test_permeability_set) max(cor_set[,1],na.rm=TRUE)
print(‘lasso’) lasso_model<-glmnet(training_set, training_permeability_set, family=“gaussian”, alpha=1) lasso_pred <- predict(lasso_model, s=lasso_model$lambda.1se, newx=testing_set_matrix) RMSE(lasso_pred,test_permeability_set) cor_set<-cor(lasso_pred,test_permeability_set) max(cor_set[,1],na.rm=TRUE) #lasso_model
simple_lin_model<-lm(training_permeability_set~training_set) simple_lin_pred <- predict(simple_lin_model, newx=testing_set_matrix) RMSE(simple_lin_pred,test_permeability_set) #binom_model<-glmnet(training_set, training_permeability_set, family=“poisson”) #binom_pred <- predict(binom_model, s=lasso_model$lambda.1se, newx=testing_set_matrix) #RMSE(binom_pred,test_permeability_set)
#####
#####
#####
###Question 6:3
data(ChemicalManufacturingProcess) column_names<-names(ChemicalManufacturingProcess) chem_set<-as.matrix(unlist(ChemicalManufacturingProcess)) chem_set<-matrix(chem_set,ncol=58) colnames(chem_set)<-column_names
chem_set<-knn.impute(chem_set, k = 5)
print(‘BiologicalMaterial07’) head(chem_set[,8]) chem_set<-chem_set[,-8] test_set_indices<-sample.int(176,size=44) test_set<-chem_set[test_set_indices,] training_set<-chem_set[-test_set_indices,]
pls_model<-train(training_set[,-1],training_set[,1], method=‘widekernelpls’, tuneLength=40, trControl=ctrl, preProc = c(‘center’,‘scale’)) summary(pls_model)
gbmImp <- varImp(pls_model, scale = FALSE) plot(gbmImp, top = 15)
pls_model$results[1:10,] plot(pls_model) predict(pls_model,newdata=test_set[,-1]) cor(predict(pls_model,newdata=test_set[,-1]),test_set[,1]) RMSE(predict(pls_model,newdata=test_set[,-1]),test_set[,1]) mean(test_set[,1]) simple_lin_model<-lm(training_set[,1]~.,data=data.frame(training_set[,-1])) simple_lin_pred <-predict(simple_lin_model,data.frame(test_set[,-1]))
RMSE(simple_lin_pred,test_set[,1]) cor(simple_lin_pred,test_set[,1])
cor_set<-training_set[,c(‘Yield’,‘ManufacturingProcess32’,‘ManufacturingProcess13’,‘ManufacturingProcess09’,‘ManufacturingProcess36’,‘ManufacturingProcess17’,‘BiologicalMaterial06’,‘BiologicalMaterial03’,‘BiologicalMaterial02’,‘ManufacturingProcess06’,‘ManufacturingProcess33’,‘BiologicalMaterial12’,‘ManufacturingProcess12’,‘BiologicalMaterial11’,‘BiologicalMaterial08’,‘ManufacturingProcess11’)] correl.matrix<-cor(cor_set, use= “complete.obs”) corrplot::corrplot(correl.matrix,method= “color” , type= “upper”)