## StudRes Hat CookD
## 367 -4.0520330 0.008196838 0.003853476
## 494 2.5018604 0.052924672 0.009973036
## 1053 0.6497145 0.117639071 0.001608355
## 1384 2.0121671 0.085853058 0.010851139
## 1662 -0.1486110 0.164553254 0.000124334
## 2557 -3.9046434 0.012993535 0.005702387
##
## Call:
## lm(formula = ph ~ ., data = beverage_set[, c(1, 3:36)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52781 -0.07668 0.01003 0.08698 0.41612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.117e+01 1.077e+00 10.379 < 2e-16 ***
## brandcode.B 8.871e-02 2.131e-02 4.163 3.25e-05 ***
## brandcode.C -4.207e-02 2.113e-02 -1.991 0.04662 *
## brandcode.D 5.192e-02 1.750e-02 2.967 0.00304 **
## carbvolume -1.461e-01 8.944e-02 -1.633 0.10250
## fillounces -8.211e-02 3.201e-02 -2.565 0.01037 *
## pcvolume -1.037e-01 5.387e-02 -1.924 0.05441 .
## carbpressure 3.096e-03 4.177e-03 0.741 0.45867
## carbtemp -1.668e-03 3.294e-03 -0.506 0.61257
## psc -7.863e-02 5.626e-02 -1.397 0.16240
## pscfill -3.531e-02 2.309e-02 -1.529 0.12643
## pscco2 -1.224e-01 6.248e-02 -1.959 0.05017 .
## mnfflow -7.495e-04 4.592e-05 -16.320 < 2e-16 ***
## carbpressure1 6.185e-03 6.880e-04 8.990 < 2e-16 ***
## fillpressure 1.332e-03 1.210e-03 1.102 0.27077
## hydpressure1 -4.086e-05 3.630e-04 -0.113 0.91039
## hydpressure2 -1.188e-03 5.309e-04 -2.237 0.02535 *
## hydpressure3 3.581e-03 5.830e-04 6.142 9.44e-10 ***
## hydpressure4 -8.337e-05 3.187e-04 -0.262 0.79366
## fillerlevel -8.664e-04 5.605e-04 -1.546 0.12231
## fillerspeed 1.930e-05 9.941e-06 1.941 0.05231 .
## temperature -1.612e-02 2.314e-03 -6.966 4.15e-12 ***
## usagecont -7.252e-03 1.135e-03 -6.387 2.00e-10 ***
## carbflow 8.946e-06 3.745e-06 2.389 0.01697 *
## density -1.181e-01 2.777e-02 -4.252 2.20e-05 ***
## mfr -7.817e-05 5.576e-05 -1.402 0.16105
## balling -1.464e-01 2.605e-02 -5.618 2.15e-08 ***
## pressurevacuum -3.240e-02 7.885e-03 -4.110 4.09e-05 ***
## oxygenfiller -3.800e-01 7.231e-02 -5.256 1.60e-07 ***
## bowlsetpoint 2.861e-03 5.905e-04 4.845 1.34e-06 ***
## pressuresetpoint -7.750e-03 1.962e-03 -3.950 8.02e-05 ***
## airpressurer -1.756e-03 2.343e-03 -0.749 0.45364
## alchrel 8.013e-02 2.658e-02 3.014 0.00260 **
## carbrel 4.685e-02 4.802e-02 0.976 0.32933
## ballinglvl 1.852e-01 2.628e-02 7.049 2.32e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1312 on 2523 degrees of freedom
## Multiple R-squared: 0.4221, Adjusted R-squared: 0.4143
## F-statistic: 54.21 on 34 and 2523 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = ph ~ ., data = beverage_vif_set[, -c(27, 35, 8,
## 9, 33, 30, 18, 20)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52968 -0.07927 0.01079 0.09029 0.40807
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.113e+01 8.859e-01 12.565 < 2e-16 ***
## brandcode.A 7.023e-03 1.617e-02 0.434 0.66399
## brandcode.C -1.179e-01 8.761e-03 -13.460 < 2e-16 ***
## brandcode.D 8.929e-02 1.847e-02 4.834 1.42e-06 ***
## carbvolume -1.624e-01 4.993e-02 -3.253 0.00116 **
## fillounces -7.451e-02 3.284e-02 -2.269 0.02336 *
## pcvolume -3.888e-02 5.438e-02 -0.715 0.47465
## psc -8.918e-02 5.789e-02 -1.540 0.12357
## pscfill -2.668e-02 2.375e-02 -1.123 0.26150
## pscco2 -1.163e-01 6.420e-02 -1.812 0.07014 .
## mnfflow -7.557e-04 4.127e-05 -18.310 < 2e-16 ***
## carbpressure1 5.240e-03 6.908e-04 7.585 4.64e-14 ***
## fillpressure 3.409e-03 1.218e-03 2.798 0.00518 **
## hydpressure1 6.995e-04 3.615e-04 1.935 0.05312 .
## hydpressure2 6.086e-04 3.617e-04 1.683 0.09258 .
## hydpressure4 -5.019e-05 3.209e-04 -0.156 0.87575
## fillerspeed 3.140e-05 1.004e-05 3.128 0.00178 **
## temperature -1.565e-02 2.324e-03 -6.733 2.05e-11 ***
## usagecont -8.276e-03 1.145e-03 -7.230 6.35e-13 ***
## carbflow -4.608e-07 3.437e-06 -0.134 0.89336
## density -1.147e-01 2.103e-02 -5.453 5.43e-08 ***
## mfr -1.435e-04 5.682e-05 -2.525 0.01163 *
## pressurevacuum -1.058e-02 6.516e-03 -1.624 0.10446
## oxygenfiller -4.056e-01 7.385e-02 -5.491 4.38e-08 ***
## pressuresetpoint -9.615e-03 1.983e-03 -4.849 1.32e-06 ***
## airpressurer -7.940e-04 2.354e-03 -0.337 0.73590
## carbrel 2.115e-01 4.653e-02 4.546 5.72e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1352 on 2531 degrees of freedom
## Multiple R-squared: 0.3846, Adjusted R-squared: 0.3783
## F-statistic: 60.85 on 26 and 2531 DF, p-value: < 2.2e-16
## brandcode.A brandcode.C brandcode.D carbvolume fillounces pcvolume psc pscfill pscco2
## 3.743757 1.219848 8.772027 3.952100 1.152495 1.526463 1.147506 1.104754 1.061105
## mnfflow carbpressure1 fillpressure hydpressure1 hydpressure2 hydpressure4 fillerspeed temperature usagecont
## 3.410814 1.488327 2.094948 2.831987 4.903888 2.526039 9.656904 1.373179 1.623619
## carbflow density mfr pressurevacuum oxygenfiller pressuresetpoint airpressurer carbrel
## 1.875702 8.786870 8.094441 1.938064 1.506902 2.282114 1.143300 4.986685
## [1] "RMSE for multi-regression model"
## [1] 0.1279363
## [1] "RMSE for model with high-vif variables removed"
## [1] 0.1314487
## Data: X dimension: 2048 35
## Y dimension: 2048 1
## Fit method: oscorespls
## Number of components considered: 10
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps 9 comps 10 comps
## X 15.51 25.69 40.66 48.22 51.91 53.99 56.48 60.17 63.05 65.33
## .outcome 24.23 32.31 35.04 36.84 38.59 39.56 40.08 40.34 40.55 40.71
## [1] "Correlation and RMSE for pls - oscorepls - method"
## [1] 0.1295915
## Data: X dimension: 2048 35
## Y dimension: 2048 1
## Fit method: simpls
## Number of components considered: 10
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps 9 comps 10 comps
## X 15.51 25.69 40.66 48.22 51.91 53.99 56.48 60.17 63.05 65.33
## .outcome 24.23 32.31 35.04 36.84 38.59 39.56 40.08 40.34 40.55 40.71
## [1] "Correlation and RMSE for pls - simpls - method"
## [1] 0.1295915
## [1] "kmeans augmented regression RMSE:"
## [1] 0.1299986
## [1] 0.1369162
source for Cook’s distance plot: https://www.statmethods.net/stats/rdiagnostics.html #gvif cutoff: https://stats.stackexchange.com/questions/70679/which-variance-inflation-factor-should-i-be-using-textgvi#f-or-textgvif –answer by fox
set.seed(123)
beverage_set<-read.csv(‘https://raw.githubusercontent.com/brian-cuny/624-project2/master/data/student_data_complete.csv’)
beverage_corr_set<-beverage_set
beverage_set<-beverage_set[-c(2354,2082,690,1896,1841,475,2562,2149),]
#690,1093,2355,1942,1897,2083,475,1842 –original vals
multi_model<-lm(data = beverage_set[,c(1,3:36)], ph~.)
cutoff <- 4/((nrow(beverage_set)-length(multi_model$coefficients)-2))
plot(multi_model, which=4, cook.levels=cutoff)
influencePlot(multi_model, id.method=“identify”, main=“Influence Plot”, sub=“Circle size is proportial to Cook’s Distance” )
summary(multi_model)
correl.matrix<-cor(beverage_corr_set, use= “complete.obs”)
corrplot(correl.matrix,method= “color” , type= “upper”)
beverage_vif_set<-beverage_set[,c(1:2,4:36)]
vif_model<-lm(data = beverage_vif_set, ph~.)
vif_values<-vif(vif_model)
vif_values_rows<-names(vif(vif_model))
vif_values<-as.data.frame(cbind.data.frame(as.character(vif_values_rows),as.numeric(vif_values)))
colnames(vif_values)<-c(‘variable’,‘variable_vif’)
vif_values <-vif_values[which(vif_values$variable_vif>10),]
ggplot(data=vif_values, aes(y=variable_vif,x=variable)) + geom_bar(stat=‘identity’,fill=‘#b5c6fc’) + theme(panel.background = element_rect(fill = ‘#707996’),text = element_text(family = ‘corben’,color=‘#249382’,size=38),axis.text.x = element_text(angle = 30, hjust = .9)) + ggtitle(‘Variables with the highest vif’)
#lower_vif_model<-lm(data = beverage_vif_set[,-c(4,8,9,18,27,30,35)], ph~.) #these columns matched the first set
lower_vif_model<-lm(data = beverage_vif_set[,-c(27,35,8,9,33,30,18,20)], ph~.)
summary(lower_vif_model)
vif(lower_vif_model)[1:9]
vif(lower_vif_model)[10:18]
vif(lower_vif_model)[19:26]
label_set<-c(‘ph’,‘brandcode.A’,‘brandcode.B’,‘brandcode.C’,‘brandcode.D’, ‘carbvolume’, ‘fillounces’, ‘pcvolume’, ‘carbpressure’, ‘carbtemp’, ‘psc’, ‘pscfill’, ‘pscco2’, ‘mnfflow’, ‘carbpressure1’, ‘fillpressure’, ‘hydpressure1’, ‘hydpressure2’, ‘hydpressure3’, ‘hydpressure4’, ‘fillerlevel’, ‘fillerspeed’, ‘temperature’, ‘usagecont’, ‘carbflow’, ‘density’, ‘mfr’, ‘balling’, ‘pressurevacuum’, ‘oxygenfiller’, ‘bowlsetpoint’, ‘pressuresetpoint’, ‘airpressurer’, ‘alchrel’, ‘carbrel’, ‘ballinglvl’)
plotter <- function(df, label_set,lab_num,colorbar=‘#dee253’) {ggplot(df, aes_string(x = label_set[lab_num],y = df$ph)) + geom_point(alpha=.9,color=‘#65b285’) + ylab(‘ph level’)+xlab(label_set[lab_num])+ theme(axis.text.x = element_text(angle = 30, hjust = .9),text = element_text(family =‘corben’,color=‘#249382’,size=16)) }
plotter(beverage_set,label_set,2)
plotter(beverage_set,label_set,3)
plotter(beverage_set,label_set,4)
plotter(beverage_set,label_set,5)
plotter(beverage_set,label_set,6)
plotter(beverage_set,label_set,7)
plotter(beverage_set,label_set,8)
plotter(beverage_set,label_set,9)
plotter(beverage_set,label_set,10)
plotter(beverage_set,label_set,11)
plotter(beverage_set,label_set,12)
plotter(beverage_set,label_set,13)
plotter(beverage_set,label_set,14)
plotter(beverage_set,label_set,15)
plotter(beverage_set,label_set,16)
plotter(beverage_set,label_set,17)
plotter(beverage_set,label_set,18)
plotter(beverage_set,label_set,19)
plotter(beverage_set,label_set,20)
plotter(beverage_set,label_set,21)
plotter(beverage_set,label_set,22)
plotter(beverage_set,label_set,23)
plotter(beverage_set,label_set,24)
plotter(beverage_set,label_set,25)
plotter(beverage_set,label_set,26)
plotter(beverage_set,label_set,27)
plotter(beverage_set,label_set,28)
plotter(beverage_set,label_set,29)
plotter(beverage_set,label_set,30)
plotter(beverage_set,label_set,31)
plotter(beverage_set,label_set,32)
plotter(beverage_set,label_set,33)
plotter(beverage_set,label_set,34)
plotter(beverage_set,label_set,35)
plotter(beverage_set,label_set,36)
part <- createDataPartition(beverage_set$ph, p=0.8, list=FALSE)
training_set <- beverage_set %>%
filter(row_number() %in% part)
validation_set <- beverage_set %>%
filter(!row_number() %in% part)
#RMSE with the validation set for regression-based models {.tabset .tabset-dropdown}
##basic multi-regression
print(‘RMSE for multi-regression model’)
RMSE(predict(multi_model,validation_set),validation_set[,1])
##regression without high-vif variables
print(‘RMSE for model with high-vif variables removed’)
RMSE(predict(lower_vif_model,validation_set[,-c(3,28,36,9,10,34,31,19,21)]),validation_set[,1])
##partial least squares regression
#We set a control set to create a 10-fold cross validation ctrl<- trainControl(method= ‘CV’, number = 10) training_set_matrix<-as.matrix(training_set) pls_model<-train(ph~.,data=training_set_matrix, method=‘pls’, tuneLength=10, trControl=ctrl, preProc = c(‘center’,‘scale’)) summary(pls_model) print(‘Correlation and RMSE for pls - oscorepls - method’) RMSE(predict(pls_model,newdata=validation_set),validation_set[,1]) #gbmImp <- varImp(pls_model, scale = FALSE) #plot(gbmImp, top = 10) pls_model<-train(ph~.,data=training_set_matrix, method=‘simpls’, tuneLength=10, trControl=ctrl, preProc = c(‘center’,‘scale’)) summary(pls_model) print(‘Correlation and RMSE for pls - simpls - method’) RMSE(predict(pls_model,newdata=validation_set),validation_set[,1])
adjusted_r_squared_set<-rep(0,10)
wss_set<-rep(0,10)
km_train_set<-as.data.frame(training_set[,c(2:36)])#This dataframe allows kmeans to run
for (i in 1:10){
km_model<-kmeans(km_train_set,i*10-5)
means_group<-matrix(km_model)
wss_set[i]<-km_model$tot.withinss
training_set_kmeans<-cbind.data.frame (training_set,means_group[1])#This dataframe is augmented with the results of the kmeans category creation
colnames(training_set_kmeans)[36]<-’means_group’
kmeans_model<-lm(data=training_set_kmeans,ph~.)
adjusted_r_squared_set[i]<-summary(kmeans_model)[9]
}
#print(‘R squared for different numbers of means:’)
#rows_for_table<-c(‘5 means’,‘15 means’,‘25 means’,‘35 means’,‘45 means’,‘55 means’,‘65 means’,‘75 means’,‘85 means’,‘95 means’)
#adjusted_r_squared_set<-cbind(rows_for_table,adjusted_r_squared_set)
plot(y=wss_set,x=seq(5,95, by=10),main=“within sum of squares by number of means”,xlab=“number of clusters”,ylab=“wss”)
#The best model by wss is the 65 means cluster
km_model<-kmeans(km_train_set,65)
training_set_kmeans<-cbind(training_set,means_group[1])
colnames(training_set_kmeans)[37]<-’means_group’
kmeans_model<-lm(data=training_set_kmeans,ph~.)
#create a cluster in the validation set
km_validation_set<-as.data.frame(validation_set[,c(2:36)])#as dataframe so kmeans works
means_group<-kmeans(km_validation_set,65)
validation_set_km<-cbind(validation_set,means_group[1])#augment with kmeans group
colnames(validation_set_km)[37]<-’means_group’
print(‘kmeans augmented regression RMSE:’)
RMSE( predict(kmeans_model, validation_set_km), validation_set_km[,1])
#Brand was taken out so elasticnet would run. It needs to be changed to numeric
#training_set_elnet<-training_set[,-(2)]
elastic_net_model<-glmnet(as.matrix(training_set_matrix[,-1]), training_set_matrix[,1], family=“gaussian”, alpha=.65, standardize = TRUE)
plot(elastic_net_model)
elnet_predict<-predict(elastic_net_model, s=elastic_net_model$lambda.1se, newx=as.matrix(validation_set[,-1]))
RMSE(elnet_predict,validation_set[,1])
source for Cook’s distance plot: https://www.statmethods.net/stats/rdiagnostics.html #gvif cutoff: https://stats.stackexchange.com/questions/70679/which-variance-inflation-factor-should-i-be-using-textgvi#f-or-textgvif –answer by fox —This changed because of a change in the brand variable