A Load in and visualize the dataset
- The book's data-load command does not match the package, so we create the variables it assumes are already loaded.
- We will have to choose a model later, so we first look at some descriptive statistics for the dataset (see the sketch after this list).
- The near-zero-variance check tells us to remove BiologicalMaterial07.
- The biological material predictors appear to be highly correlated with one another.
- After removing many predictors, I look at how the remaining predictors correlate with our target, Yield.
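A minimal sketch of those descriptive statistics, assuming the e1071 package is available for skewness (this check is illustrative and not part of the modeling below):
## summary of the target and the most skewed predictors (sketch)
library(e1071)
summary(ChemicalManufacturingProcess$Yield)
pred_skew <- apply(ChemicalManufacturingProcess[, -1], 2,
                   function(x) skewness(x, na.rm = TRUE))
sort(pred_skew, decreasing = TRUE)[1:5]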
data("ChemicalManufacturingProcess")
completedData <- read.csv("imputed.csv")
complete_df_imputed <- completedData
my_df <- ChemicalManufacturingProcess[,-1]
processPredictors <- as.matrix(my_df)
yield <- ChemicalManufacturingProcess[,1]
## eliminate high correlation and near zero var
corrplot::corrplot(cor(completedData))

nearZeroVar(completedData)
## [1] 8
correlations <- cor(completedData)
highCorr <- findCorrelation(correlations, cutoff = .75)
completedData <- completedData[, -highCorr]
completedData <- completedData[,-3]
## Biological material
corrplot::corrplot(cor(completedData[,1:5]))

##
corr_df <- as.data.frame(cbind(completedData,yield))
corrplot::corrplot(cor(corr_df[,1:37])[1:15,37, drop=FALSE], cl.pos='n')

corrplot::corrplot(cor(corr_df[,1:37])[16:37,37, drop=FALSE], cl.pos='n')

B Impute
- Using the VIM and mice packages we can visualize and impute the missing data. The missing-value plot shows that all of the missing values occur in the manufacturing process predictors, with about 8.5% of ManufacturingProcess03 missing; every other predictor is missing roughly 5% or less.
- I would have considered removing ManufacturingProcess03; however, it has a strong correlation with our target variable, so I believe it is worth keeping in the model (a quick check of this correlation is sketched below).
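A quick, hedged way to check that correlation before imputing, assuming the raw ChemicalManufacturingProcess data frame is still in memory:
## correlation of ManufacturingProcess03 with Yield on the complete pairs (sketch)
cor(ChemicalManufacturingProcess$ManufacturingProcess03,
    ChemicalManufacturingProcess$Yield,
    use = "pairwise.complete.obs")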
## print missing values
aggr_plot <- aggr(my_df, col = c('navyblue', 'red'), numbers = TRUE, sortVars = TRUE,
                  labels = names(my_df), cex.axis = .7, gap = 3,
                  ylab = c("Histogram of missing data", "Pattern"))

##
## Variables sorted by number of missings:
## Variable Count
## ManufacturingProcess03 0.085227273
## ManufacturingProcess11 0.056818182
## ManufacturingProcess10 0.051136364
## ManufacturingProcess25 0.028409091
## ManufacturingProcess26 0.028409091
## ManufacturingProcess27 0.028409091
## ManufacturingProcess28 0.028409091
## ManufacturingProcess29 0.028409091
## ManufacturingProcess30 0.028409091
## ManufacturingProcess31 0.028409091
## ManufacturingProcess33 0.028409091
## ManufacturingProcess34 0.028409091
## ManufacturingProcess35 0.028409091
## ManufacturingProcess36 0.028409091
## ManufacturingProcess02 0.017045455
## ManufacturingProcess06 0.011363636
## ManufacturingProcess01 0.005681818
## ManufacturingProcess04 0.005681818
## ManufacturingProcess05 0.005681818
## ManufacturingProcess07 0.005681818
## ManufacturingProcess08 0.005681818
## ManufacturingProcess12 0.005681818
## ManufacturingProcess14 0.005681818
## ManufacturingProcess22 0.005681818
## ManufacturingProcess23 0.005681818
## ManufacturingProcess24 0.005681818
## ManufacturingProcess40 0.005681818
## ManufacturingProcess41 0.005681818
## BiologicalMaterial01 0.000000000
## BiologicalMaterial02 0.000000000
## BiologicalMaterial03 0.000000000
## BiologicalMaterial04 0.000000000
## BiologicalMaterial05 0.000000000
## BiologicalMaterial06 0.000000000
## BiologicalMaterial07 0.000000000
## BiologicalMaterial08 0.000000000
## BiologicalMaterial09 0.000000000
## BiologicalMaterial10 0.000000000
## BiologicalMaterial11 0.000000000
## BiologicalMaterial12 0.000000000
## ManufacturingProcess09 0.000000000
## ManufacturingProcess13 0.000000000
## ManufacturingProcess15 0.000000000
## ManufacturingProcess16 0.000000000
## ManufacturingProcess17 0.000000000
## ManufacturingProcess18 0.000000000
## ManufacturingProcess19 0.000000000
## ManufacturingProcess20 0.000000000
## ManufacturingProcess21 0.000000000
## ManufacturingProcess32 0.000000000
## ManufacturingProcess37 0.000000000
## ManufacturingProcess38 0.000000000
## ManufacturingProcess39 0.000000000
## ManufacturingProcess42 0.000000000
## ManufacturingProcess43 0.000000000
## ManufacturingProcess44 0.000000000
## ManufacturingProcess45 0.000000000
# tempData <- mice(my_df,m=5,maxit=50,meth='pmm',seed=500)
# summary(tempData)
# completedData <- complete(tempData,1)
#
#
# write.csv(completedData, file = "imputed.csv")
Elastic net on trimmed df
- Below I run a cross-validated elastic net with a grid search over alpha and lambda.
- The elastic net gives an RMSE of 1.41 on the training (cross-validation) set and 1.16 on the test set.
- Better performance on the test set suggests the regularization is doing its job. Perhaps it is doing it too well, and we have sacrificed low bias for reduced variance (a sketch counting the surviving coefficients appears at the end of this section).
## make split
set.seed(123)
smp_size <- floor(0.75 * nrow(completedData))
train_ind <- sample(seq_len(nrow(completedData)), size = smp_size)
train <- completedData[train_ind, ]
test <- completedData[-train_ind, ]
train_y <- yield[train_ind]
test_y <- yield[-train_ind]
## make grid
myControl <- trainControl(
  method = "cv",
  number = 10
)
my_df <- as.data.frame(cbind(train_y, train))
model <- train(
  train_y ~ .,
  data = my_df,
  tuneGrid = expand.grid(
    alpha = seq(.05, 1, length = 20),
    lambda = seq(0.0001, 1, length = 20)
  ),
  method = "glmnet",
  trControl = myControl
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## plot and grab best model
model$bestTune
## alpha lambda
## 263 0.7 0.1053526

results <- model$results
## print model results
kableExtra::kable(results[367,])
|     | alpha | lambda    | RMSE     | Rsquared  | MAE     | RMSESD    | RsquaredSD | MAESD     |
|-----|-------|-----------|----------|-----------|---------|-----------|------------|-----------|
| 367 | 0.95  | 0.3158579 | 1.412605 | 0.4924437 | 1.16116 | 0.3629435 | 0.226485   | 0.2873425 |
## get RMSE on the test set
predictions <- predict(model,test)
RMSE(predictions,test_y)
## [1] 1.162893
## plot residuals
xyplot(train_y ~ predict(model),
type = c("p", "r"),
xlab = "Predicted", ylab = "Observed")

## residuals on the test set
predicted <- predict(model, test)
actual <- test_y
xyplot((predicted - actual) ~ predicted,
       type = c("p", "g"),
       xlab = "Predicted", ylab = "Residuals")

## extract coefficients at the selected lambda, then overwrite with caret's variable importance
## (the importance object is what gets plotted in section E)
trimmed_coeff_df_elastic <- coef(model$finalModel, model$finalModel$lambdaOpt)
trimmed_coeff_df_elastic <- varImp(model, scale = TRUE)
#plot(trimmed_coeff_df_elastic, top = 10)
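To gauge how hard the chosen penalty is shrinking the trimmed predictor set (the regularization concern raised above), a minimal sketch counting the nonzero coefficients at the selected lambda; it assumes model is the glmnet fit from this section:
## number of predictors kept at the selected lambda, excluding the intercept (sketch)
enet_coefs <- coef(model$finalModel, model$finalModel$lambdaOpt)
sum(enet_coefs[-1, 1] != 0)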
PLS on trimmed df
- PLS produced an RMSE of 1.63 with one component on the training (cross-validation) data and an RMSE of 1.29 on the test data (the selected number of components is confirmed in a sketch at the end of this section).
- As before, this looks like too much regularization.
train <- as.matrix(train)
test <- as.matrix(test)
ctrl <- caret::trainControl(method = "cv", number = 10)
set.seed(100)
lmFit1 <- train(train, train_y,
                method = "pls",
                tuneLength = 50,
                trControl = ctrl,
                preProc = c("center", "scale"))
plot(lmFit1)

## predict on the test set and compute RMSE
predictions <- predict(lmFit1,test)
RMSE(predictions,test_y)
## [1] 1.290248
## plot resids
xyplot(train_y ~ predict(lmFit1),
type = c("p", "r"),
xlab = "Predicted", ylab = "Observed")

a <- resid(lmFit1)
xyplot(resid(lmFit1) ~ predict(lmFit1),
type = c("p", "g"),
xlab = "Predicted", ylab = "Residuals")

## pull the PLS coefficients and keep the two most negative and seven most positive
a <- lmFit1$finalModel$coefficients
aa <- sort(a[1:36, 1, 1])
coeff1 <- head(aa, 2)
coeff2 <- tail(aa, 7)
trimmed_df_pls_coeff <- paste(names(coeff2), names(coeff1))
gbmImp <- varImp(lmFit1,scale=TRUE)
#plot(gbmImp, top = 20)
#vif_trimmeddf_elastic <- varImp(lmFit1, lambda = lmFit1$lambda.min)
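To confirm the single component mentioned above, a small sketch reading the tuning result back out of the fit; it assumes lmFit1 is the PLS model from this section:
## number of components chosen by cross-validation and its resampled performance (sketch)
lmFit1$bestTune
subset(lmFit1$results, ncomp == lmFit1$bestTune$ncomp)[, c("ncomp", "RMSE", "Rsquared")]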
LM model on trimmed dataset
- The cross-validated RMSE on the training data is a poor 4.25, but the test RMSE is about 1.2 (the per-fold results are inspected in a sketch at the end of this section).
# make split
set.seed(123)
smp_size <- floor(0.75 * nrow(completedData))
train_ind <- sample(seq_len(nrow(completedData)), size = smp_size)
train <- completedData[train_ind, ]
test <- completedData[-train_ind, ]
train_y <- yield[train_ind]
test_y <- yield[-train_ind]
## set up cross-validation
myControl <- trainControl(
  method = "cv",
  number = 10
)
my_df <- as.data.frame(cbind(train_y, train))
model <- train(
  train_y ~ .,
  data = my_df,
  method = "lm",
  trControl = myControl
)
## print the fitted model
model
## Linear Regression
##
## 132 samples
## 36 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 118, 120, 120, 118, 118, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.254199 0.372825 2.084139
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
## get RMSE on the test set
predictions <- predict(model,test)
RMSE(predictions,test_y)
## [1] 1.197764
## plot residuals
xyplot(train_y ~ predict(model),
type = c("p", "r"),
xlab = "Predicted", ylab = "Observed")

## residuals on the test set
predicted <- predict(model, test)
actual <- test_y
xyplot((predicted - actual) ~ predicted,
       type = c("p", "g"),
       xlab = "Predicted", ylab = "Residuals")

#trimmed_coeff_df_elastic <- coef(model$finalModel, model$finalModel$lambdaOpt)
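The gap between the resampled RMSE (4.25) and the test RMSE (about 1.2) suggests a few bad folds are inflating the cross-validation estimate. A minimal sketch inspecting the per-fold results, assuming model is the lm fit above:
## per-fold RMSE for the cross-validated linear model, worst folds first (sketch)
model$resample[order(model$resample$RMSE, decreasing = TRUE),
               c("RMSE", "Rsquared", "Resample")]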
ELASTIC NET FULL DATASET
- Cross-validated RMSE of 1.14 with 23 predictors retained; the test RMSE is 1.03 (the predictor count is checked in a sketch at the end of this section).
set.seed(123)
## make train/test split
smp_size <- floor(0.75 * nrow(complete_df_imputed))
train_ind <- sample(seq_len(nrow(complete_df_imputed)), size = smp_size)
train <- complete_df_imputed[train_ind, ]
test <- complete_df_imputed[-train_ind, ]
train_y <- yield[train_ind]
test_y <- yield[-train_ind]
myControl <- trainControl(
  method = "cv",
  number = 10
)
complete_df_imputed <- as.data.frame(cbind(train_y, train))
model <- train(
  train_y ~ .,
  data = complete_df_imputed,
  tuneGrid = expand.grid(
    alpha = seq(.05, 1, length = 20),
    lambda = seq(0.0001, 1, length = 20)
  ),
  method = "glmnet",
  trControl = myControl
)
model$bestTune
## alpha lambda
## 66 0.2 0.2632316

results <- model$results
## print model results
kableExtra::kable(results[66,])
|    | alpha | lambda    | RMSE     | Rsquared  | MAE       | RMSESD  | RsquaredSD | MAESD     |
|----|-------|-----------|----------|-----------|-----------|---------|------------|-----------|
| 66 | 0.2   | 0.2632316 | 1.144105 | 0.6639513 | 0.9484295 | 0.34128 | 0.1494338  | 0.2958873 |
predictions <- predict(model,test)
RMSE(predictions,test_y)
## [1] 1.029555
xyplot(train_y ~ predict(model),
type = c("p", "r"),
xlab = "Predicted", ylab = "Observed")

## residuals on the test set
predicted <- predict(model, test)
actual <- test_y
xyplot((predicted - actual) ~ predicted,
       type = c("p", "g"),
       xlab = "Predicted", ylab = "Residuals")

full_df_coeff_elastic <- coef(model$finalModel, model$finalModel$lambdaOpt)
vif_fulldf_elastic <- varImp(model, scale = FALSE)
#plot(vif_fulldf_elastic, top = 40)
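As a check on the "23 predictors" figure quoted at the top of this section, a one-line sketch counting the nonzero coefficients in full_df_coeff_elastic (the intercept is excluded):
## number of predictors with nonzero coefficients at the selected lambda (sketch)
sum(full_df_coeff_elastic[-1, 1] != 0)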
PLS FULL DATASET
- The cross-validated RMSE on the training set is 1.55 with one component; the RMSE on the test set is 1.01.
train <- as.matrix(train)
test <- as.matrix(test)
ctrl <- caret::trainControl(method = "cv", number = 10)
set.seed(100)
lmFit1 <- train(train, train_y,
                method = "pls",
                tuneLength = 50,
                trControl = ctrl,
                preProc = c("center", "scale"))
plot(lmFit1)

## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 1.552594 0.3921650 1.209544 0.4124142 0.1820347 0.2525004
## 2 2 2.058701 0.4230206 1.304422 1.5293874 0.2620351 0.5277752
## 3 3 1.490754 0.5138102 1.127335 0.5825268 0.1870572 0.2675844
## 4 4 1.699364 0.4834722 1.232303 0.9009505 0.2251233 0.4107088
## 5 5 2.155600 0.4573780 1.365375 1.8750159 0.2352405 0.6294696
## predict on the test set and compute RMSE
predictions <- predict(lmFit1,test)
plot(predictions)

RMSE(predictions,test_y)
## [1] 1.00796
xyplot(train_y ~ predict(lmFit1),
type = c("p", "r"),
xlab = "Predicted", ylab = "Observed")

a <- resid(lmFit1)
predict(lmFit1)
## [1] 42.16519 38.83836 40.62348 38.63827 39.72907 42.79438 38.38109
## [8] 39.22384 39.24237 40.61099 39.45135 39.59081 40.55557 39.25299
## [15] 41.10172 39.27550 42.57414 42.86000 41.66412 39.81627 39.15234
## [22] 38.94366 37.99911 38.92530 38.42385 39.31155 41.50877 39.71693
## [29] 42.05367 40.96423 38.92472 40.72651 40.17118 39.90778 41.51291
## [36] 41.13780 39.70574 40.89406 41.84864 42.17006 39.37243 40.63704
## [43] 38.59988 42.21322 39.66085 39.46519 37.84796 39.80112 40.32086
## [50] 39.46606 43.54452 39.85262 40.15755 40.97714 39.26967 41.79547
## [57] 40.87700 39.79899 39.08738 38.74099 40.37218 41.31790 40.98142
## [64] 38.69335 39.74506 40.60059 39.15750 38.02706 40.17124 43.16924
## [71] 39.48711 41.58739 40.11136 38.49683 44.35341 40.36322 42.93887
## [78] 40.86362 39.39762 39.87609 41.98394 38.58907 39.14966 38.61419
## [85] 41.80021 39.78389 40.54743 38.32227 40.01923 39.06757 42.40990
## [92] 39.82310 38.22424 42.04551 36.19080 39.56180 38.37833 40.76995
## [99] 43.20860 39.15549 42.88362 39.19323 40.76300 38.72370 39.97285
## [106] 39.93132 41.69907 38.79407 36.38516 39.51768 37.96447 39.44864
## [113] 40.09195 40.48718 42.72904 42.19010 42.28546 39.72290 40.39007
## [120] 39.60549 37.86695 40.35823 37.79553 39.17294 40.98604 40.04032
## [127] 39.80159 43.26842 38.38811 38.38968 40.57585 42.42684
xyplot(resid(lmFit1) ~ predict(lmFit1),
type = c("p", "g"),
xlab = "Predicted", ylab = "Residuals")

## pull the PLS coefficients and keep the two most negative and seven most positive
## (note: only the first 36 coefficients are examined, as in the trimmed-data section)
a <- lmFit1$finalModel$coefficients
aa <- sort(a[1:36, 1, 1])
coeff1 <- head(aa, 2)
coeff2 <- tail(aa, 7)
full_coeff_pls <- paste(names(coeff2), names(coeff1))
gbmImp2 <- varImp(lmFit1)
#plot(gbmImp2, top = 40)
E best predictors
- It is tough to say whether the biological or the manufacturing process predictors are more important. At first I wanted to say the biological ones, because given how few biological predictors there are, a higher percentage of them make it into the models' importance rankings.
- However, ManufacturingProcess36 is identified by every model as a top-5 predictor in terms of importance, and on 3 of our 4 cross-validated models (PLS and elastic net) it is the most important term. In the elastic net, only two terms have importance levels over 1, and both are manufacturing processes (36 and 34). A tabulated comparison is sketched at the end of this section.
## full df
# full_coeff_pls
# full_df_coeff_elastic
#
# ## trimmed df
# trimmed_coeff_df_elastic
# trimmed_df_pls_coeff
#
#
# vif_trimmeddf_elastic
# vif_fulldf_elastic
plot(trimmed_coeff_df_elastic, top = 10)

plot(vif_fulldf_elastic, top = 10)



#
# Manufacturing processes: 36, 33, 06, 17, 12, 28, 11, 04, 02, 37, 30, 34
# Biological materials: 03, 11, 05, 10
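To make the comparison above easier to read than the plots alone, a small sketch tabulating the top-ranked rows of the two varImp objects; top_n_imp is an illustrative helper, and vif_fulldf_elastic and trimmed_coeff_df_elastic are assumed to come from the earlier chunks:
## top-ranked predictors from each importance object (sketch)
top_n_imp <- function(vi, n = 5) {
  imp <- vi$importance
  head(imp[order(imp$Overall, decreasing = TRUE), , drop = FALSE], n)
}
top_n_imp(vif_fulldf_elastic)
top_n_imp(trimmed_coeff_df_elastic)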
F Yield versus the top predictors
- Looking at yield against our best predictors, they all have a high correlation with yield.
- Also interesting: the biological materials all correlate positively with yield, whereas the manufacturing processes can push yield up or down.
- So the biological material appears to act as some sort of accelerant for yield.
- We can also identify manufacturing processes that may need improvement in our chain, for instance 17 and 36.
- Since ManufacturingProcess36 is our most important predictor, as mentioned earlier, we need to look at what is happening at that point in the production line that is hurting our yield.
corrplot::corrplot(cor(corr_df[,1:37])[c(1,2,4,5,11),37, drop=FALSE], cl.pos='n')

corrplot::corrplot(cor(corr_df[,1:37])[c(18,31,29,28),37, drop=FALSE], cl.pos='n')

Try XGBoost on trimmed data
- Cross-validated train RMSE of 1.02.
- Test RMSE of about 0.97.
- The training code is commented out; the fitted model is loaded in the next code block (its variable importance is sketched at the end of this section).
# xgb_trcontrol = trainControl(
# method = "cv",
# number = 5,
# allowParallel = TRUE,
# verboseIter = FALSE,
# returnData = FALSE
# )
#
# xgbGrid <- expand.grid(nrounds = c(100,200),
# max_depth = c(10, 15, 20, 25,35,50),
# colsample_bytree = seq(0.5, 0.9, length.out = 5),
# eta = 0.1,
# gamma=0,
# min_child_weight = 1,
# subsample = 1
# )
# set.seed(0)
# xgb_model = train(
# train, train_y,
# trControl = xgb_trcontrol,
# tuneGrid = xgbGrid,
# method = "xgbTree"
# )
#
# save(xgb_model, file = "my_model1.rda")
load("my_model1.rda")
xgb_model
## eXtreme Gradient Boosting
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 106, 107, 105, 104, 106
## Resampling results across tuning parameters:
##
## max_depth colsample_bytree nrounds RMSE Rsquared MAE
## 10 0.5 100 1.089833 0.6561387 0.8400149
## 10 0.5 200 1.087939 0.6568919 0.8390295
## 10 0.5 300 1.087939 0.6568919 0.8390295
## 10 0.6 100 1.024427 0.6969343 0.7891147
## 10 0.6 200 1.021033 0.6984054 0.7854744
## 10 0.6 300 1.021033 0.6984054 0.7854744
## 10 0.7 100 1.120550 0.6317680 0.8590185
## 10 0.7 200 1.119162 0.6323874 0.8591020
## 10 0.7 300 1.119162 0.6323874 0.8591020
## 10 0.8 100 1.085781 0.6549901 0.8547885
## 10 0.8 200 1.083727 0.6561141 0.8546676
## 10 0.8 300 1.083727 0.6561141 0.8546676
## 10 0.9 100 1.118152 0.6324301 0.9008768
## 10 0.9 200 1.114427 0.6347232 0.8987003
## 10 0.9 300 1.114427 0.6347232 0.8987003
## 15 0.5 100 1.077720 0.6635686 0.8262174
## 15 0.5 200 1.075672 0.6643636 0.8245807
## 15 0.5 300 1.075672 0.6643636 0.8245807
## 15 0.6 100 1.085069 0.6524938 0.8339228
## 15 0.6 200 1.083333 0.6530858 0.8331233
## 15 0.6 300 1.083333 0.6530858 0.8331233
## 15 0.7 100 1.080467 0.6570163 0.8496478
## 15 0.7 200 1.077580 0.6585723 0.8481070
## 15 0.7 300 1.077580 0.6585723 0.8481070
## 15 0.8 100 1.067447 0.6719647 0.8445498
## 15 0.8 200 1.064282 0.6737366 0.8420498
## 15 0.8 300 1.064282 0.6737366 0.8420498
## 15 0.9 100 1.162100 0.6058302 0.9047678
## 15 0.9 200 1.160799 0.6064144 0.9037859
## 15 0.9 300 1.160799 0.6064144 0.9037859
## 20 0.5 100 1.050546 0.6794086 0.8220994
## 20 0.5 200 1.048580 0.6800621 0.8214521
## 20 0.5 300 1.048580 0.6800621 0.8214521
## 20 0.6 100 1.094224 0.6605705 0.8286628
## 20 0.6 200 1.090678 0.6620653 0.8267434
## 20 0.6 300 1.090678 0.6620653 0.8267434
## 20 0.7 100 1.074859 0.6680933 0.8547456
## 20 0.7 200 1.073583 0.6683181 0.8548453
## 20 0.7 300 1.073583 0.6683181 0.8548453
## 20 0.8 100 1.092043 0.6496214 0.8525358
## 20 0.8 200 1.090770 0.6499214 0.8529834
## 20 0.8 300 1.090770 0.6499214 0.8529834
## 20 0.9 100 1.102817 0.6478829 0.8705654
## 20 0.9 200 1.101679 0.6481754 0.8719678
## 20 0.9 300 1.101679 0.6481754 0.8719678
## 25 0.5 100 1.062410 0.6798330 0.7927068
## 25 0.5 200 1.058454 0.6817518 0.7895530
## 25 0.5 300 1.058454 0.6817518 0.7895530
## 25 0.6 100 1.108388 0.6436938 0.8435180
## 25 0.6 200 1.106055 0.6448920 0.8427075
## 25 0.6 300 1.106055 0.6448920 0.8427075
## 25 0.7 100 1.130128 0.6267448 0.8615937
## 25 0.7 200 1.127409 0.6282064 0.8600612
## 25 0.7 300 1.127409 0.6282064 0.8600612
## 25 0.8 100 1.117212 0.6335799 0.8609957
## 25 0.8 200 1.115252 0.6343281 0.8600376
## 25 0.8 300 1.115252 0.6343281 0.8600376
## 25 0.9 100 1.083957 0.6606273 0.8492154
## 25 0.9 200 1.082721 0.6608264 0.8501550
## 25 0.9 300 1.082721 0.6608264 0.8501550
## 35 0.5 100 1.084220 0.6600635 0.8295226
## 35 0.5 200 1.080902 0.6616495 0.8261701
## 35 0.5 300 1.080902 0.6616495 0.8261701
## 35 0.6 100 1.083892 0.6600015 0.8532577
## 35 0.6 200 1.081348 0.6611191 0.8501705
## 35 0.6 300 1.081348 0.6611191 0.8501705
## 35 0.7 100 1.082064 0.6632483 0.8380982
## 35 0.7 200 1.078056 0.6651158 0.8359993
## 35 0.7 300 1.078056 0.6651158 0.8359993
## 35 0.8 100 1.105220 0.6487173 0.8487422
## 35 0.8 200 1.102260 0.6503868 0.8469165
## 35 0.8 300 1.102260 0.6503868 0.8469165
## 35 0.9 100 1.116250 0.6376606 0.8753835
## 35 0.9 200 1.114659 0.6383147 0.8748488
## 35 0.9 300 1.114659 0.6383147 0.8748488
## 50 0.5 100 1.132415 0.6291219 0.8551262
## 50 0.5 200 1.129727 0.6304619 0.8539035
## 50 0.5 300 1.129727 0.6304619 0.8539035
## 50 0.6 100 1.069398 0.6744416 0.8406457
## 50 0.6 200 1.067015 0.6754039 0.8395638
## 50 0.6 300 1.067015 0.6754039 0.8395638
## 50 0.7 100 1.072629 0.6713598 0.8010924
## 50 0.7 200 1.069646 0.6723456 0.7993633
## 50 0.7 300 1.069646 0.6723456 0.7993633
## 50 0.8 100 1.079503 0.6576901 0.8377318
## 50 0.8 200 1.077362 0.6587970 0.8378671
## 50 0.8 300 1.077362 0.6587970 0.8378671
## 50 0.9 100 1.127813 0.6304260 0.8992666
## 50 0.9 200 1.127238 0.6306405 0.8989618
## 50 0.9 300 1.127238 0.6306405 0.8989618
##
## Tuning parameter 'eta' was held constant at a value of 0.1
##
## Tuning parameter 'min_child_weight' was held constant at a value of
## 1
## Tuning parameter 'subsample' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 200, max_depth =
## 10, eta = 0.1, gamma = 0, colsample_bytree = 0.6, min_child_weight =
## 1 and subsample = 1.
xgb_model$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 5 200 10 0.1 0 0.6 1 1
predicted = predict(xgb_model, test)
residuals = test_y - predicted
RMSE(predicted,test_y)
## [1] 0.9659133
#options(repr.plot.width=8, repr.plot.height=4)
my_data <- as.data.frame(cbind(predicted = predicted,
                               observed = test_y))
# Plot predictions vs test data
ggplot(my_data, aes(predicted, observed)) +
  geom_point(color = "darkred", alpha = 0.5) +
  geom_smooth(method = lm) +
  ggtitle("Extreme Gradient Boosting: Prediction vs Test Data") +
  xlab("Predicted Yield") + ylab("Observed Yield") +
  theme(plot.title = element_text(color = "darkgreen", size = 16, hjust = 0.5),
        axis.text.y = element_text(size = 12), axis.text.x = element_text(size = 12, hjust = .5),
        axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))

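For comparison with the rankings discussed in section E, a short sketch of the boosted model's variable importance using caret's varImp; this assumes xgb_model loaded above and was not part of the original analysis:
## variable importance for the XGBoost fit (sketch)
xgb_imp <- varImp(xgb_model, scale = TRUE)
plot(xgb_imp, top = 10)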