df <- read.xlsx("train.xlsx",1)
test <- read.xlsx("test.xlsx",1)| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Brand.Code* | 1 | 2451 | 2.51 | 1.00 | 2.00 | 2.51 | 0.00 | 1.00 | 4.00 | 3.00 | 0.38 | -1.06 | 0.02 |
| Carb.Volume | 2 | 2561 | 5.37 | 0.11 | 5.35 | 5.37 | 0.11 | 5.04 | 5.70 | 0.66 | 0.39 | -0.47 | 0.00 |
| Fill.Ounces | 3 | 2533 | 23.97 | 0.09 | 23.97 | 23.98 | 0.08 | 23.63 | 24.32 | 0.69 | -0.02 | 0.86 | 0.00 |
| PC.Volume | 4 | 2532 | 0.28 | 0.06 | 0.27 | 0.27 | 0.05 | 0.08 | 0.48 | 0.40 | 0.34 | 0.67 | 0.00 |
| Carb.Pressure | 5 | 2544 | 68.19 | 3.54 | 68.20 | 68.12 | 3.56 | 57.00 | 79.40 | 22.40 | 0.18 | -0.01 | 0.07 |
| Carb.Temp | 6 | 2545 | 141.09 | 4.04 | 140.80 | 140.99 | 3.85 | 128.60 | 154.00 | 25.40 | 0.25 | 0.24 | 0.08 |
| PSC | 7 | 2538 | 0.08 | 0.05 | 0.08 | 0.08 | 0.05 | 0.00 | 0.27 | 0.27 | 0.85 | 0.65 | 0.00 |
| PSC.Fill | 8 | 2548 | 0.20 | 0.12 | 0.18 | 0.18 | 0.12 | 0.00 | 0.62 | 0.62 | 0.93 | 0.77 | 0.00 |
| PSC.CO2 | 9 | 2532 | 0.06 | 0.04 | 0.04 | 0.05 | 0.03 | 0.00 | 0.24 | 0.24 | 1.73 | 3.73 | 0.00 |
| Mnf.Flow | 10 | 2569 | 24.57 | 119.48 | 65.20 | 21.07 | 169.02 | -100.20 | 229.40 | 329.60 | 0.00 | -1.87 | 2.36 |
| Carb.Pressure1 | 11 | 2539 | 122.59 | 4.74 | 123.20 | 122.54 | 4.45 | 105.60 | 140.20 | 34.60 | 0.05 | 0.14 | 0.09 |
| Fill.Pressure | 12 | 2549 | 47.92 | 3.18 | 46.40 | 47.71 | 2.37 | 34.60 | 60.40 | 25.80 | 0.55 | 1.41 | 0.06 |
| Hyd.Pressure1 | 13 | 2560 | 12.44 | 12.43 | 11.40 | 10.84 | 16.90 | -0.80 | 58.00 | 58.80 | 0.78 | -0.14 | 0.25 |
| Hyd.Pressure2 | 14 | 2556 | 20.96 | 16.39 | 28.60 | 21.05 | 13.34 | 0.00 | 59.40 | 59.40 | -0.30 | -1.56 | 0.32 |
| Hyd.Pressure3 | 15 | 2556 | 20.46 | 15.98 | 27.60 | 20.51 | 13.94 | -1.20 | 50.00 | 51.20 | -0.32 | -1.57 | 0.32 |
| Hyd.Pressure4 | 16 | 2541 | 96.29 | 13.12 | 96.00 | 95.45 | 11.86 | 52.00 | 142.00 | 90.00 | 0.55 | 0.63 | 0.26 |
| Filler.Level | 17 | 2551 | 109.25 | 15.70 | 118.40 | 111.04 | 9.19 | 55.80 | 161.20 | 105.40 | -0.85 | 0.05 | 0.31 |
| Filler.Speed | 18 | 2514 | 3687.20 | 770.82 | 3982.00 | 3919.99 | 47.44 | 998.00 | 4030.00 | 3032.00 | -2.87 | 6.71 | 15.37 |
| Temperature | 19 | 2557 | 65.97 | 1.38 | 65.60 | 65.80 | 0.89 | 63.60 | 76.20 | 12.60 | 2.39 | 10.16 | 0.03 |
| Usage.cont | 20 | 2566 | 20.99 | 2.98 | 21.79 | 21.25 | 3.19 | 12.08 | 25.90 | 13.82 | -0.54 | -1.02 | 0.06 |
| Carb.Flow | 21 | 2569 | 2468.35 | 1073.70 | 3028.00 | 2601.14 | 326.17 | 26.00 | 5104.00 | 5078.00 | -0.99 | -0.58 | 21.18 |
| Density | 22 | 2570 | 1.17 | 0.38 | 0.98 | 1.15 | 0.15 | 0.24 | 1.92 | 1.68 | 0.53 | -1.20 | 0.01 |
| MFR | 23 | 2359 | 704.05 | 73.90 | 724.00 | 718.16 | 15.42 | 31.40 | 868.60 | 837.20 | -5.09 | 30.46 | 1.52 |
| Balling | 24 | 2570 | 2.20 | 0.93 | 1.65 | 2.13 | 0.37 | -0.17 | 4.01 | 4.18 | 0.59 | -1.39 | 0.02 |
| Pressure.Vacuum | 25 | 2571 | -5.22 | 0.57 | -5.40 | -5.25 | 0.59 | -6.60 | -3.60 | 3.00 | 0.53 | -0.03 | 0.01 |
| PH | 26 | 2567 | 8.55 | 0.17 | 8.54 | 8.55 | 0.18 | 7.88 | 9.36 | 1.48 | -0.29 | 0.06 | 0.00 |
| Oxygen.Filler | 27 | 2559 | 0.05 | 0.05 | 0.03 | 0.04 | 0.02 | 0.00 | 0.40 | 0.40 | 2.66 | 11.09 | 0.00 |
| Bowl.Setpoint | 28 | 2569 | 109.33 | 15.30 | 120.00 | 111.35 | 0.00 | 70.00 | 140.00 | 70.00 | -0.97 | -0.06 | 0.30 |
| Pressure.Setpoint | 29 | 2559 | 47.62 | 2.04 | 46.00 | 47.60 | 0.00 | 44.00 | 52.00 | 8.00 | 0.20 | -1.60 | 0.04 |
| Air.Pressurer | 30 | 2571 | 142.83 | 1.21 | 142.60 | 142.58 | 0.59 | 140.80 | 148.20 | 7.40 | 2.25 | 4.73 | 0.02 |
| Alch.Rel | 31 | 2562 | 6.90 | 0.51 | 6.56 | 6.84 | 0.06 | 5.28 | 8.62 | 3.34 | 0.88 | -0.85 | 0.01 |
| Carb.Rel | 32 | 2561 | 5.44 | 0.13 | 5.40 | 5.43 | 0.12 | 4.96 | 6.06 | 1.10 | 0.50 | -0.29 | 0.00 |
| Balling.Lvl | 33 | 2570 | 2.05 | 0.87 | 1.48 | 1.98 | 0.21 | 0.00 | 3.66 | 3.66 | 0.59 | -1.49 | 0.02 |
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## Warning: Removed 4 rows containing non-finite values (stat_count).
## Grab observations where PH is NA
kable(df[(which(is.na(df$PH))),],digits =2,'markdown',booktabs =T)| Brand.Code | Carb.Volume | Fill.Ounces | PC.Volume | Carb.Pressure | Carb.Temp | PSC | PSC.Fill | PSC.CO2 | Mnf.Flow | Carb.Pressure1 | Fill.Pressure | Hyd.Pressure1 | Hyd.Pressure2 | Hyd.Pressure3 | Hyd.Pressure4 | Filler.Level | Filler.Speed | Temperature | Usage.cont | Carb.Flow | Density | MFR | Balling | Pressure.Vacuum | PH | Oxygen.Filler | Bowl.Setpoint | Pressure.Setpoint | Air.Pressurer | Alch.Rel | Carb.Rel | Balling.Lvl | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 738 | B | 5.34 | 23.95 | 0.11 | 65.2 | 138.0 | 0.01 | 0.18 | 0.04 | -100.0 | 138.4 | NA | 0.0 | 0.0 | 0.0 | 90 | NA | NA | NA | 14.76 | 38 | 0.24 | NA | -0.17 | -5.2 | NA | NA | 120 | 46 | 143.2 | NA | NA | 0.00 |
| 1535 | B | 5.22 | 23.78 | 0.30 | 73.2 | 151.8 | 0.11 | 0.10 | 0.04 | NA | 128.2 | NA | 0.2 | 0.6 | 31.8 | 52 | NA | NA | 71.2 | 18.34 | 272 | 0.62 | NA | 0.75 | -5.2 | NA | 0.4 | 90 | 50 | 142.6 | 6.24 | 5.46 | 2.72 |
| 1719 | B | 5.31 | 23.85 | 0.16 | 65.4 | 138.6 | 0.02 | 0.34 | 0.02 | NA | 133.2 | NA | 0.0 | 0.0 | 1.6 | NA | NA | NA | NA | 23.94 | 32 | NA | NA | NA | -4.8 | NA | 0.4 | 70 | 50 | 143.4 | NA | NA | 0.00 |
| 2214 | B | 5.27 | 23.90 | 0.23 | 67.2 | 142.8 | 0.02 | 0.34 | 0.06 | 0.2 | 131.0 | NA | -0.6 | 0.2 | -1.2 | NA | NA | 1406 | 67.6 | 23.72 | 44 | 0.60 | NA | 0.70 | -5.4 | NA | 0.4 | 110 | 50 | 142.2 | 6.54 | 5.38 | 1.34 |
## Visualizations for Brand code factor column
a <- df %>%
select_if(is.factor) %>%
gather %>%
ggplot(aes(x = value)) +
geom_histogram(stat='count') +
facet_wrap(~key)## Warning: Ignoring unknown parameters: binwidth, bins, pad
b <- df %>%
select_if(is.factor) %>%
gather %>%
ggplot(aes(x = value)) +
geom_boxplot(aes(x = df$Brand.Code,y = df$PH)) +
facet_wrap(~key)
plot_grid(a,b)## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## Warning: Removed 329 rows containing missing values (geom_point).
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 291 rows containing non-finite values (stat_count).
## Warning: Removed 418 rows containing missing values (geom_point).
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 383 rows containing non-finite values (stat_count).
## Warning: Removed 69 rows containing missing values (geom_point).
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 46 rows containing non-finite values (stat_count).
kable(table(df$Bowl.Setpoint), digits = 2,'markdown', booktabs =T)| Var1 | Freq |
|---|---|
| 70 | 99 |
| 80 | 96 |
| 90 | 434 |
| 100 | 112 |
| 110 | 437 |
| 120 | 1307 |
| 122 | 1 |
| 126 | 10 |
| 130 | 51 |
| 134 | 2 |
| 140 | 20 |
kable(table(df$Pressure.Setpoint), digits = 2, 'markdown', booktabs =T)| Var1 | Freq |
|---|---|
| 44 | 96 |
| 46 | 1322 |
| 46.4 | 1 |
| 46.6 | 1 |
| 46.8 | 1 |
| 48 | 125 |
| 50 | 1002 |
| 52 | 11 |
kable(table(df$PSC.CO2), digits = 2, 'markdown',booktabs =T)| Var1 | Freq |
|---|---|
| 0 | 108 |
| 0.0199999999999996 | 325 |
| 0.0200000000000005 | 288 |
| 0.0399999999999991 | 37 |
| 0.04 | 624 |
| 0.0599999999999996 | 283 |
| 0.0600000000000005 | 219 |
| 0.0799999999999992 | 23 |
| 0.08 | 234 |
| 0.0999999999999996 | 79 |
| 0.100000000000001 | 65 |
| 0.119999999999999 | 10 |
| 0.12 | 72 |
| 0.14 | 30 |
| 0.140000000000001 | 19 |
| 0.159999999999999 | 3 |
| 0.16 | 36 |
| 0.18 | 12 |
| 0.180000000000001 | 8 |
| 0.199999999999999 | 2 |
| 0.2 | 16 |
| 0.22 | 22 |
| 0.24 | 17 |
df$PSC.CO2 <- round(df$PSC.CO2,2)
## rounded new table for PSC CO2
table(df$PSC.CO2)##
## 0 0.02 0.04 0.06 0.08 0.1 0.12 0.14 0.16 0.18 0.2 0.22 0.24
## 108 613 661 502 257 144 82 49 39 20 18 22 17
## rounded to integer new table for Pressure setpoint
df$Pressure.Setpoint <- round(df$Pressure.Setpoint,0)
table(df$Pressure.Setpoint)##
## 44 46 47 48 50 52
## 96 1323 2 125 1002 11
## After rounding visualize Pressure.Setpoint
ggplot(df, aes(Pressure.Setpoint, PH)) +
geom_point()## Warning: Removed 16 rows containing missing values (geom_point).
## After rounding visualize PSC.CO2
ggplot(df, aes(PSC.CO2, PH)) +
geom_point()## Warning: Removed 43 rows containing missing values (geom_point).
df %>%
filter(Bowl.Setpoint> 120 & Bowl.Setpoint <130) %>%
dplyr::select(PH)## PH
## 1 8.74
## 2 8.74
## 3 8.74
## 4 8.76
## 5 8.62
## 6 8.62
## 7 8.60
## 8 8.62
## 9 8.56
## 10 8.58
## 11 8.44
ggplot(df, aes(Bowl.Setpoint, PH)) +
geom_point()## Warning: Removed 6 rows containing missing values (geom_point).
## missing values
pMiss <- function(x){sum(is.na(x))/length(x)}
apply(df,2,pMiss)## Brand.Code Carb.Volume Fill.Ounces PC.Volume
## 0.0466744457 0.0038895371 0.0147802412 0.0151691949
## Carb.Pressure Carb.Temp PSC PSC.Fill
## 0.0105017503 0.0101127966 0.0128354726 0.0089459354
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## 0.0151691949 0.0007779074 0.0124465189 0.0085569817
## Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4
## 0.0042784909 0.0058343057 0.0058343057 0.0116686114
## Filler.Level Filler.Speed Temperature Usage.cont
## 0.0077790743 0.0221703617 0.0054453520 0.0019447686
## Carb.Flow Density MFR Balling
## 0.0007779074 0.0003889537 0.0824581875 0.0003889537
## Pressure.Vacuum PH Oxygen.Filler Bowl.Setpoint
## 0.0000000000 0.0015558149 0.0046674446 0.0007779074
## Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 0.0046674446 0.0000000000 0.0035005834 0.0038895371
## Balling.Lvl
## 0.0003889537
aggr_plot <- aggr(df, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)
##
## Variables sorted by number of missings:
## Variable Count
## MFR 0.0824581875
## Brand.Code 0.0466744457
## Filler.Speed 0.0221703617
## PC.Volume 0.0151691949
## PSC.CO2 0.0151691949
## Fill.Ounces 0.0147802412
## PSC 0.0128354726
## Carb.Pressure1 0.0124465189
## Hyd.Pressure4 0.0116686114
## Carb.Pressure 0.0105017503
## Carb.Temp 0.0101127966
## PSC.Fill 0.0089459354
## Fill.Pressure 0.0085569817
## Filler.Level 0.0077790743
## Hyd.Pressure2 0.0058343057
## Hyd.Pressure3 0.0058343057
## Temperature 0.0054453520
## Oxygen.Filler 0.0046674446
## Pressure.Setpoint 0.0046674446
## Hyd.Pressure1 0.0042784909
## Carb.Volume 0.0038895371
## Carb.Rel 0.0038895371
## Alch.Rel 0.0035005834
## Usage.cont 0.0019447686
## PH 0.0015558149
## Mnf.Flow 0.0007779074
## Carb.Flow 0.0007779074
## Bowl.Setpoint 0.0007779074
## Density 0.0003889537
## Balling 0.0003889537
## Balling.Lvl 0.0003889537
## Pressure.Vacuum 0.0000000000
## Air.Pressurer 0.0000000000
#tempData <- mice(df,m=5,maxit=50,meth='pmm',seed=500)
#summary(tempData)
#completedData <- complete(tempData,1)
#write.csv(completedData, file = "imputeddf.csv")
completedData <- read.csv("imputeddf.csv")
completedData <- completedData[,-1]# for_removal <- caret::nearZeroVar(df)
# for_removal
#
# my_df <- as.data.frame(df)
# for_removal <- caret::nearZeroVar(my_df)
# my_df <- my_df[,-for_removal]
# dim(my_df)## ncomp
## 13 13
## [1] 0.1496209
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## alpha lambda
## 381 1 1e-04
## alpha lambda
## 381 1 1e-04
## [1] 0.1491899
## Warning in predicted - actual: longer object length is not a multiple of
## shorter object length
# smp_size <- floor(0.75 * nrow(completedData))
# train_ind <- sample(seq_len(nrow(completedData)), size = smp_size)
# train <- bb[train_ind, ]
# test <- bb[-train_ind, ]
# train_y <- completedData[train_ind,c('PH')]
# test_y <- completedData[-train_ind,c('PH')]
#
# xgb_trcontrol = trainControl(
# method = "cv",
# number = 5,
# allowParallel = TRUE,
# verboseIter = FALSE,
# returnData = FALSE
# )
#
# xgbGrid <- expand.grid(nrounds = c(100,200),
# max_depth = c(10, 15, 20, 25,35,50),
# colsample_bytree = seq(0.5, 0.9, length.out = 5),
# eta = 0.1,
# gamma=0,
# min_child_weight = 1,
# subsample = 1
# )
# set.seed(0)
# xgb_model = train(
# train, train_y,
# trControl = xgb_trcontrol,
# tuneGrid = xgbGrid,
# method = "xgbTree")#save(xgb_model, file = "PH_model1.rda")
load("PH_model1.rda")
set.seed(123)
xgb_model$bestTune## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 12 200 15 0.1 0 0.5 1 1
predicted = predict(xgb_model, test)
residuals = test_y - predicted
RMSE(predicted, test_y)## [1] 0.117223
my_data = as.data.frame(cbind(predicted = predicted,
observed = test_y))
# Plot predictions vs test data
ggplot(my_data,aes(predicted, test_y)) + geom_point(color = "darkred", alpha = 0.5) +
geom_smooth(method=lm)+ ggtitle('Linear Regression ') + ggtitle("Extreme Gradient Boosting: Prediction vs Test Data") +
xlab("Predecited PH ") + ylab("PH") +
theme(plot.title = element_text(color="darkgreen",size=16,hjust = 0.5),
axis.text.y = element_text(size=12), axis.text.x = element_text(size=12,hjust=.5),
axis.title.x = element_text(size=14), axis.title.y = element_text(size=14))# col_index <- varImp(xgb_model)$importance %>%
# mutate(names=row.names(.)) %>%
# arrange(-Overall)
# col_index$names
#write.csv(col_index,"full_tree.csv")varImp(lmFit1)## Warning: package 'pls' was built under R version 3.5.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:corrplot':
##
## corrplot
## The following object is masked from 'package:stats':
##
## loadings
## pls variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Mnf.Flow 100.00
## Usage.cont 65.22
## Bowl.Setpoint 61.71
## Hyd.Pressure3 54.87
## Filler.Level 51.75
## Temperature 50.58
## Pressure.Setpoint 44.91
## Hyd.Pressure2 41.84
## Fill.Pressure 39.54
## Carb.Pressure1 39.07
## Pressure.Vacuum 32.79
## Hyd.Pressure4 29.98
## Carb.Flow 29.54
## Oxygen.Filler 28.93
## Fill.Ounces 28.50
## Alch.Rel 26.02
## Density 22.36
## PSC 20.26
## Carb.Rel 19.01
## Hyd.Pressure1 18.83
varImp(model)## glmnet variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Oxygen.Filler 100.0000
## Density 47.4890
## PSC.CO2 44.8748
## Alch.Rel 36.7271
## PSC 33.8469
## Fill.Ounces 31.8064
## PC.Volume 21.1337
## PSC.Fill 14.8922
## Balling 14.7425
## Balling.Lvl 14.3287
## Carb.Volume 13.5275
## Carb.Rel 6.0500
## Temperature 5.9405
## Pressure.Setpoint 2.7717
## Usage.cont 2.2711
## Carb.Pressure1 2.2437
## Pressure.Vacuum 1.1567
## Hyd.Pressure3 1.1243
## Bowl.Setpoint 1.0981
## Carb.Temp 0.7988
varImp(xgb_model)## xgbTree variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Mnf.Flow 100.00
## Usage.cont 54.37
## Temperature 38.81
## Pressure.Vacuum 35.93
## Balling.Lvl 35.42
## Carb.Flow 35.17
## Filler.Speed 33.71
## Oxygen.Filler 32.77
## Carb.Rel 32.49
## Alch.Rel 32.20
## Carb.Volume 25.82
## Bowl.Setpoint 24.88
## Filler.Level 24.46
## Balling 22.56
## Density 22.16
## Carb.Pressure1 21.73
## Fill.Ounces 19.33
## Hyd.Pressure1 16.18
## Carb.Pressure 11.32
## MFR 10.08
# smp_size <- floor(0.75 * nrow(completedData))
# train_ind <- sample(seq_len(nrow(completedData)), size = smp_size)
# train <- bb[train_ind,-c(31,29) ]
# test <- bb[-train_ind,-c(31,29) ]
# train_y <- completedData[train_ind,c('PH')]
# test_y <- completedData[-train_ind,c('PH')]
#
# xgb_trcontrol = trainControl(
# method = "cv",
# number = 5,
# allowParallel = TRUE,
# verboseIter = FALSE,
# returnData = FALSE
# )
#
# xgbGrid <- expand.grid(nrounds = c(100,200),
# max_depth = c(10, 15, 20, 25,35,50),
# colsample_bytree = seq(0.5, 0.9, length.out = 5),
# eta = 0.1,
# gamma=0,
# min_child_weight = 1,
# subsample = 1
# )
# set.seed(0)
# xgb_model = train(
# train, train_y,
# trControl = xgb_trcontrol,
# tuneGrid = xgbGrid,
# method = "xgbTree")# completedData <- read.csv("imputed.csv")
# complete_df_imputed <- completedData
#
# complete_df_imputed <- complete_df_imputed[,-1]
# set.seed(123)
# smp_size <- floor(0.75 * nrow(complete_df_imputed))
# train_ind <- sample(seq_len(nrow(complete_df_imputed)), size = smp_size)
# train <- bb[train_ind,-1 ]
# test <- bb[-train_ind,-1 ]
# train_y <- completedData[train_ind,c('PH')]
# test_y <- completedData[-train_ind,c('PH')]
#
#
#
# library(doParallel)
# library(caret)
# registerDoParallel(4)
# getDoParWorkers()
#
# set.seed(123)
# my_control <- trainControl(method = 'cv', # for “cross-validation”
# number = 3, # number of k-folds
# savePredictions = 'final',
# allowParallel = TRUE)
#
# model_list <- caretList(train,
# train_y,
# trControl = my_control,
# methodList = c('lm', 'svmRadial', 'rf',"pls",
# 'xgbTree', 'xgbLinear',"bagEarth",'glmnet',"knn" ),
# tuneList = NULL,
# continue_on_fail = FALSE,
# preProcess = c('center','scale'))
#
# options(digits = 3)
# model_results <- data.frame(LM = min(model_list$lm$results$RMSE),
# SVM = min(model_list$svmRadial$results$RMSE),
# RF = min(model_list$rf$results$RMSE),
# XGBT = min(model_list$xgbTree$results$RMSE),
# XGBL = min(model_list$xgbLinear$results$RMSE),
# Mars = min(model_list$bagEarth$results$RMSE),
# GLMNET = min(model_list$glmnet$results$RMSE),
# PLS =min(model_list$pls$results$RMSE),
# KNN =min(model_list$knn$results$RMSE))
#
# print(model_results)# train <- readxl::read_excel('Data//StudentData.xlsx')
# eval <- readxl::read_excel('Data//StudentEvaluation.xlsx')
#
# # Summary stats table
# summary_stats <- function(df){
# minimum <- round(apply(df, 2, min, na.rm=T),2)
# maximum <- round(apply(df, 2, max, na.rm=T),2)
# quantile1 <- round(apply(df, 2, quantile, 0.25, na.rm=T),2)
# quantile3 <- round(apply(df, 2, quantile, 0.75, na.rm=T),2)
# medians <- round(apply(df, 2, median, na.rm=T),2)
# means <- round(apply(df, 2, mean, na.rm=T),2)
# stdev <- round(apply(df, 2, sd, na.rm=T),2)
# nas <- apply(apply(df, 2, is.na), 2, sum, na.rm=T)
# nas_p <- paste(round(nas / dim(df)[1], 4) * 100, '%')
# temp <- data.frame(means, stdev, minimum, maximum, quantile1, medians, quantile3, nas, nas_p)
# colnames(temp) <- c('Mean', 'St.Dev', 'Min.', 'Max.', '1st.Qu.', 'Median', '3rd.Qu.', "NA", "% NA")
# return(temp)
# }
# df$Brand.Code
# # Summary stats for train and eval sets
# train_summary <- df %>% subset(select=-c(Brand.Code)) %>% summary_stats()
# eval_summary <- eval %>% subset(select=-c(Brand.Code, PH)) %>% summary_stats()
# train_summary
# eval_summary