HW 9 A

exploration

Notes from EDA

  • Some clear multicolinearity issues(see corrplot section)
    • last 3 columns seems to share identical correlations with other prdictors
    • some of Hydrolic pressure have high correlation
    • Bailing Density, have high correlations with each other and last 3 variables
    • carb temp and Carb pressure share high correlation with each other
  • Our variables dont have very strong correlation scores with ph
    • Carb rel seems to have higheest correltaion, might be one oflast 3 columns to keep
    • MNF flow has highest correltion with PH(negative)

Ritesh things to deal with

  • PH is missing in 4 observations( see bottom of “look at entire dataset PH” section )
    • If you want to drop these or impute these, it’s up to you. i’d vote drop them
  • I think we should round off PSC.CO2 and Pressure.setpoint( see “Variables that seem discrete” section)
df <- read.xlsx("train.xlsx",1)
test <- read.xlsx("test.xlsx",1)

Look at entire dataset and PH

vars n mean sd median trimmed mad min max range skew kurtosis se
Brand.Code* 1 2451 2.51 1.00 2.00 2.51 0.00 1.00 4.00 3.00 0.38 -1.06 0.02
Carb.Volume 2 2561 5.37 0.11 5.35 5.37 0.11 5.04 5.70 0.66 0.39 -0.47 0.00
Fill.Ounces 3 2533 23.97 0.09 23.97 23.98 0.08 23.63 24.32 0.69 -0.02 0.86 0.00
PC.Volume 4 2532 0.28 0.06 0.27 0.27 0.05 0.08 0.48 0.40 0.34 0.67 0.00
Carb.Pressure 5 2544 68.19 3.54 68.20 68.12 3.56 57.00 79.40 22.40 0.18 -0.01 0.07
Carb.Temp 6 2545 141.09 4.04 140.80 140.99 3.85 128.60 154.00 25.40 0.25 0.24 0.08
PSC 7 2538 0.08 0.05 0.08 0.08 0.05 0.00 0.27 0.27 0.85 0.65 0.00
PSC.Fill 8 2548 0.20 0.12 0.18 0.18 0.12 0.00 0.62 0.62 0.93 0.77 0.00
PSC.CO2 9 2532 0.06 0.04 0.04 0.05 0.03 0.00 0.24 0.24 1.73 3.73 0.00
Mnf.Flow 10 2569 24.57 119.48 65.20 21.07 169.02 -100.20 229.40 329.60 0.00 -1.87 2.36
Carb.Pressure1 11 2539 122.59 4.74 123.20 122.54 4.45 105.60 140.20 34.60 0.05 0.14 0.09
Fill.Pressure 12 2549 47.92 3.18 46.40 47.71 2.37 34.60 60.40 25.80 0.55 1.41 0.06
Hyd.Pressure1 13 2560 12.44 12.43 11.40 10.84 16.90 -0.80 58.00 58.80 0.78 -0.14 0.25
Hyd.Pressure2 14 2556 20.96 16.39 28.60 21.05 13.34 0.00 59.40 59.40 -0.30 -1.56 0.32
Hyd.Pressure3 15 2556 20.46 15.98 27.60 20.51 13.94 -1.20 50.00 51.20 -0.32 -1.57 0.32
Hyd.Pressure4 16 2541 96.29 13.12 96.00 95.45 11.86 52.00 142.00 90.00 0.55 0.63 0.26
Filler.Level 17 2551 109.25 15.70 118.40 111.04 9.19 55.80 161.20 105.40 -0.85 0.05 0.31
Filler.Speed 18 2514 3687.20 770.82 3982.00 3919.99 47.44 998.00 4030.00 3032.00 -2.87 6.71 15.37
Temperature 19 2557 65.97 1.38 65.60 65.80 0.89 63.60 76.20 12.60 2.39 10.16 0.03
Usage.cont 20 2566 20.99 2.98 21.79 21.25 3.19 12.08 25.90 13.82 -0.54 -1.02 0.06
Carb.Flow 21 2569 2468.35 1073.70 3028.00 2601.14 326.17 26.00 5104.00 5078.00 -0.99 -0.58 21.18
Density 22 2570 1.17 0.38 0.98 1.15 0.15 0.24 1.92 1.68 0.53 -1.20 0.01
MFR 23 2359 704.05 73.90 724.00 718.16 15.42 31.40 868.60 837.20 -5.09 30.46 1.52
Balling 24 2570 2.20 0.93 1.65 2.13 0.37 -0.17 4.01 4.18 0.59 -1.39 0.02
Pressure.Vacuum 25 2571 -5.22 0.57 -5.40 -5.25 0.59 -6.60 -3.60 3.00 0.53 -0.03 0.01
PH 26 2567 8.55 0.17 8.54 8.55 0.18 7.88 9.36 1.48 -0.29 0.06 0.00
Oxygen.Filler 27 2559 0.05 0.05 0.03 0.04 0.02 0.00 0.40 0.40 2.66 11.09 0.00
Bowl.Setpoint 28 2569 109.33 15.30 120.00 111.35 0.00 70.00 140.00 70.00 -0.97 -0.06 0.30
Pressure.Setpoint 29 2559 47.62 2.04 46.00 47.60 0.00 44.00 52.00 8.00 0.20 -1.60 0.04
Air.Pressurer 30 2571 142.83 1.21 142.60 142.58 0.59 140.80 148.20 7.40 2.25 4.73 0.02
Alch.Rel 31 2562 6.90 0.51 6.56 6.84 0.06 5.28 8.62 3.34 0.88 -0.85 0.01
Carb.Rel 32 2561 5.44 0.13 5.40 5.43 0.12 4.96 6.06 1.10 0.50 -0.29 0.00
Balling.Lvl 33 2570 2.05 0.87 1.48 1.98 0.21 0.00 3.66 3.66 0.59 -1.49 0.02
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## Warning: Removed 4 rows containing non-finite values (stat_count).

Display PH is null training data observations

## Grab observations where PH is NA
kable(df[(which(is.na(df$PH))),],digits =2,'markdown',booktabs =T)
Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow Density MFR Balling Pressure.Vacuum PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
738 B 5.34 23.95 0.11 65.2 138.0 0.01 0.18 0.04 -100.0 138.4 NA 0.0 0.0 0.0 90 NA NA NA 14.76 38 0.24 NA -0.17 -5.2 NA NA 120 46 143.2 NA NA 0.00
1535 B 5.22 23.78 0.30 73.2 151.8 0.11 0.10 0.04 NA 128.2 NA 0.2 0.6 31.8 52 NA NA 71.2 18.34 272 0.62 NA 0.75 -5.2 NA 0.4 90 50 142.6 6.24 5.46 2.72
1719 B 5.31 23.85 0.16 65.4 138.6 0.02 0.34 0.02 NA 133.2 NA 0.0 0.0 1.6 NA NA NA NA 23.94 32 NA NA NA -4.8 NA 0.4 70 50 143.4 NA NA 0.00
2214 B 5.27 23.90 0.23 67.2 142.8 0.02 0.34 0.06 0.2 131.0 NA -0.6 0.2 -1.2 NA NA 1406 67.6 23.72 44 0.60 NA 0.70 -5.4 NA 0.4 110 50 142.2 6.54 5.38 1.34

Look at our brand code factor column

## Visualizations for Brand code factor column
a <- df %>% 
  select_if(is.factor) %>% 
  gather %>% 
  ggplot(aes(x = value)) + 
  geom_histogram(stat='count') + 
  facet_wrap(~key)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
b <- df %>% 
  select_if(is.factor) %>% 
  gather %>% 
  ggplot(aes(x = value)) + 
  geom_boxplot(aes(x = df$Brand.Code,y = df$PH)) + 
  facet_wrap(~key)

plot_grid(a,b)
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

Corrplots

Looking at PH scatterplots versus predictors and histograms of predictors

## Warning: Removed 329 rows containing missing values (geom_point).

## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 291 rows containing non-finite values (stat_count).

## Warning: Removed 418 rows containing missing values (geom_point).

## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 383 rows containing non-finite values (stat_count).

## Warning: Removed 69 rows containing missing values (geom_point).

## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 46 rows containing non-finite values (stat_count).

variables that seem discrete

  • Bowl.Setpoint
  • Pressure.Setpoint
  • PSC.CO2
kable(table(df$Bowl.Setpoint), digits = 2,'markdown', booktabs =T)
Var1 Freq
70 99
80 96
90 434
100 112
110 437
120 1307
122 1
126 10
130 51
134 2
140 20
kable(table(df$Pressure.Setpoint), digits = 2, 'markdown', booktabs =T)
Var1 Freq
44 96
46 1322
46.4 1
46.6 1
46.8 1
48 125
50 1002
52 11
kable(table(df$PSC.CO2), digits = 2, 'markdown',booktabs =T)
Var1 Freq
0 108
0.0199999999999996 325
0.0200000000000005 288
0.0399999999999991 37
0.04 624
0.0599999999999996 283
0.0600000000000005 219
0.0799999999999992 23
0.08 234
0.0999999999999996 79
0.100000000000001 65
0.119999999999999 10
0.12 72
0.14 30
0.140000000000001 19
0.159999999999999 3
0.16 36
0.18 12
0.180000000000001 8
0.199999999999999 2
0.2 16
0.22 22
0.24 17

Rounding

  • I rounded PSC.CO2 and Pressure.setpoint. I don’t think these changes will make any difference in modeling results(all seem distributed evenly around the mean of PH), but I think we should still include it in EDA as it shows how “deeply” we looked into the data.
df$PSC.CO2 <- round(df$PSC.CO2,2)
## rounded new table for PSC CO2
table(df$PSC.CO2)
## 
##    0 0.02 0.04 0.06 0.08  0.1 0.12 0.14 0.16 0.18  0.2 0.22 0.24 
##  108  613  661  502  257  144   82   49   39   20   18   22   17
## rounded to integer new table for Pressure setpoint
df$Pressure.Setpoint <- round(df$Pressure.Setpoint,0)
table(df$Pressure.Setpoint)
## 
##   44   46   47   48   50   52 
##   96 1323    2  125 1002   11

PSC.CO2 and Pressure.setpoint

  • Scatterplots with new rounded off data
## After rounding visualize Pressure.Setpoint
ggplot(df, aes(Pressure.Setpoint, PH)) + 
  geom_point()
## Warning: Removed 16 rows containing missing values (geom_point).

## After rounding visualize PSC.CO2
ggplot(df, aes(PSC.CO2, PH)) + 
  geom_point()
## Warning: Removed 43 rows containing missing values (geom_point).

bowl.Setpoint predictor

  • Setpoint seems to sequence by 10 integers, and then has random integers towards the end. I filtered for those random observations to see any trend. I then visualize the variable. I don’t believe we need to make any changes to it
df %>% 
    filter(Bowl.Setpoint> 120 & Bowl.Setpoint <130) %>% 
    dplyr::select(PH)
##      PH
## 1  8.74
## 2  8.74
## 3  8.74
## 4  8.76
## 5  8.62
## 6  8.62
## 7  8.60
## 8  8.62
## 9  8.56
## 10 8.58
## 11 8.44
ggplot(df, aes(Bowl.Setpoint, PH)) + 
  geom_point()
## Warning: Removed 6 rows containing missing values (geom_point).

Imputation and visualizing missing data

  • The rest of the analysis would require the imputed csv
## missing values
pMiss <- function(x){sum(is.na(x))/length(x)}
apply(df,2,pMiss)
##        Brand.Code       Carb.Volume       Fill.Ounces         PC.Volume 
##      0.0466744457      0.0038895371      0.0147802412      0.0151691949 
##     Carb.Pressure         Carb.Temp               PSC          PSC.Fill 
##      0.0105017503      0.0101127966      0.0128354726      0.0089459354 
##           PSC.CO2          Mnf.Flow    Carb.Pressure1     Fill.Pressure 
##      0.0151691949      0.0007779074      0.0124465189      0.0085569817 
##     Hyd.Pressure1     Hyd.Pressure2     Hyd.Pressure3     Hyd.Pressure4 
##      0.0042784909      0.0058343057      0.0058343057      0.0116686114 
##      Filler.Level      Filler.Speed       Temperature        Usage.cont 
##      0.0077790743      0.0221703617      0.0054453520      0.0019447686 
##         Carb.Flow           Density               MFR           Balling 
##      0.0007779074      0.0003889537      0.0824581875      0.0003889537 
##   Pressure.Vacuum                PH     Oxygen.Filler     Bowl.Setpoint 
##      0.0000000000      0.0015558149      0.0046674446      0.0007779074 
## Pressure.Setpoint     Air.Pressurer          Alch.Rel          Carb.Rel 
##      0.0046674446      0.0000000000      0.0035005834      0.0038895371 
##       Balling.Lvl 
##      0.0003889537
aggr_plot <- aggr(df, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)

## 
##  Variables sorted by number of missings: 
##           Variable        Count
##                MFR 0.0824581875
##         Brand.Code 0.0466744457
##       Filler.Speed 0.0221703617
##          PC.Volume 0.0151691949
##            PSC.CO2 0.0151691949
##        Fill.Ounces 0.0147802412
##                PSC 0.0128354726
##     Carb.Pressure1 0.0124465189
##      Hyd.Pressure4 0.0116686114
##      Carb.Pressure 0.0105017503
##          Carb.Temp 0.0101127966
##           PSC.Fill 0.0089459354
##      Fill.Pressure 0.0085569817
##       Filler.Level 0.0077790743
##      Hyd.Pressure2 0.0058343057
##      Hyd.Pressure3 0.0058343057
##        Temperature 0.0054453520
##      Oxygen.Filler 0.0046674446
##  Pressure.Setpoint 0.0046674446
##      Hyd.Pressure1 0.0042784909
##        Carb.Volume 0.0038895371
##           Carb.Rel 0.0038895371
##           Alch.Rel 0.0035005834
##         Usage.cont 0.0019447686
##                 PH 0.0015558149
##           Mnf.Flow 0.0007779074
##          Carb.Flow 0.0007779074
##      Bowl.Setpoint 0.0007779074
##            Density 0.0003889537
##            Balling 0.0003889537
##        Balling.Lvl 0.0003889537
##    Pressure.Vacuum 0.0000000000
##      Air.Pressurer 0.0000000000
#tempData <- mice(df,m=5,maxit=50,meth='pmm',seed=500)
#summary(tempData)
#completedData <- complete(tempData,1)
#write.csv(completedData, file = "imputeddf.csv")

completedData <- read.csv("imputeddf.csv")
completedData <- completedData[,-1]
# for_removal <- caret::nearZeroVar(df)
# for_removal
# 
# my_df <- as.data.frame(df)
# for_removal <- caret::nearZeroVar(my_df)
# my_df <- my_df[,-for_removal]
# dim(my_df)

Modeling

  • I needed to hot encode (Brand.code), instead i just dropped it. Figure Jun going in whatever direction he chooses anyway, was more just to get an idea of what is happening

PLS

##    ncomp
## 13    13
## [1] 0.1496209

GLMNET

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.

##     alpha lambda
## 381     1  1e-04

Visualize GLMNET

##     alpha lambda
## 381     1  1e-04
## [1] 0.1491899

## Warning in predicted - actual: longer object length is not a multiple of
## shorter object length

Full xgboost tree

# smp_size <- floor(0.75 * nrow(completedData))
# train_ind <- sample(seq_len(nrow(completedData)), size = smp_size)
# train <- bb[train_ind, ]
# test <- bb[-train_ind, ]
# train_y <- completedData[train_ind,c('PH')]
# test_y <- completedData[-train_ind,c('PH')]
# 
# xgb_trcontrol = trainControl(
#   method = "cv",
#   number = 5,
#   allowParallel = TRUE,
#   verboseIter = FALSE,
#   returnData = FALSE
# )
# 
# xgbGrid <- expand.grid(nrounds = c(100,200),
#                        max_depth = c(10, 15, 20, 25,35,50),
#                        colsample_bytree = seq(0.5, 0.9, length.out = 5),
#                        eta = 0.1,
#                        gamma=0,
#                        min_child_weight = 1,
#                        subsample = 1
#                       )
# set.seed(0)
# xgb_model = train(
#   train, train_y,
#   trControl = xgb_trcontrol,
#   tuneGrid = xgbGrid,
#   method = "xgbTree")

Visualize xgboost

#save(xgb_model, file = "PH_model1.rda")
load("PH_model1.rda")
set.seed(123)


xgb_model$bestTune
##    nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 12     200        15 0.1     0              0.5                1         1
predicted = predict(xgb_model, test)
residuals = test_y - predicted
RMSE(predicted, test_y)
## [1] 0.117223
my_data = as.data.frame(cbind(predicted = predicted,
                            observed = test_y))
# Plot predictions vs test data
ggplot(my_data,aes(predicted, test_y)) + geom_point(color = "darkred", alpha = 0.5) + 
    geom_smooth(method=lm)+ ggtitle('Linear Regression ') + ggtitle("Extreme Gradient Boosting: Prediction vs Test Data") +
      xlab("Predecited PH ") + ylab("PH") + 
        theme(plot.title = element_text(color="darkgreen",size=16,hjust = 0.5),
         axis.text.y = element_text(size=12), axis.text.x = element_text(size=12,hjust=.5),
         axis.title.x = element_text(size=14), axis.title.y = element_text(size=14))

# col_index <- varImp(xgb_model)$importance %>% 
#   mutate(names=row.names(.)) %>% 
#   arrange(-Overall)
# col_index$names
#write.csv(col_index,"full_tree.csv")

Variable importance

  • pls, elastic net, xgboost
varImp(lmFit1)
## Warning: package 'pls' was built under R version 3.5.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:corrplot':
## 
##     corrplot
## The following object is masked from 'package:stats':
## 
##     loadings
## pls variable importance
## 
##   only 20 most important variables shown (out of 31)
## 
##                   Overall
## Mnf.Flow           100.00
## Usage.cont          65.22
## Bowl.Setpoint       61.71
## Hyd.Pressure3       54.87
## Filler.Level        51.75
## Temperature         50.58
## Pressure.Setpoint   44.91
## Hyd.Pressure2       41.84
## Fill.Pressure       39.54
## Carb.Pressure1      39.07
## Pressure.Vacuum     32.79
## Hyd.Pressure4       29.98
## Carb.Flow           29.54
## Oxygen.Filler       28.93
## Fill.Ounces         28.50
## Alch.Rel            26.02
## Density             22.36
## PSC                 20.26
## Carb.Rel            19.01
## Hyd.Pressure1       18.83
varImp(model)
## glmnet variable importance
## 
##   only 20 most important variables shown (out of 31)
## 
##                    Overall
## Oxygen.Filler     100.0000
## Density            47.4890
## PSC.CO2            44.8748
## Alch.Rel           36.7271
## PSC                33.8469
## Fill.Ounces        31.8064
## PC.Volume          21.1337
## PSC.Fill           14.8922
## Balling            14.7425
## Balling.Lvl        14.3287
## Carb.Volume        13.5275
## Carb.Rel            6.0500
## Temperature         5.9405
## Pressure.Setpoint   2.7717
## Usage.cont          2.2711
## Carb.Pressure1      2.2437
## Pressure.Vacuum     1.1567
## Hyd.Pressure3       1.1243
## Bowl.Setpoint       1.0981
## Carb.Temp           0.7988
varImp(xgb_model)
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 31)
## 
##                 Overall
## Mnf.Flow         100.00
## Usage.cont        54.37
## Temperature       38.81
## Pressure.Vacuum   35.93
## Balling.Lvl       35.42
## Carb.Flow         35.17
## Filler.Speed      33.71
## Oxygen.Filler     32.77
## Carb.Rel          32.49
## Alch.Rel          32.20
## Carb.Volume       25.82
## Bowl.Setpoint     24.88
## Filler.Level      24.46
## Balling           22.56
## Density           22.16
## Carb.Pressure1    21.73
## Fill.Ounces       19.33
## Hyd.Pressure1     16.18
## Carb.Pressure     11.32
## MFR               10.08

Run models without collinearity issues

  • didnt get to this yet, but maybe i can run some models in subsetted dataset, while Jun chooses to run on entire dataset?
# smp_size <- floor(0.75 * nrow(completedData))
# train_ind <- sample(seq_len(nrow(completedData)), size = smp_size)
# train <- bb[train_ind,-c(31,29) ]
# test <- bb[-train_ind,-c(31,29) ]
# train_y <- completedData[train_ind,c('PH')]
# test_y <- completedData[-train_ind,c('PH')]
# 
# xgb_trcontrol = trainControl(
#   method = "cv",
#   number = 5,
#   allowParallel = TRUE,
#   verboseIter = FALSE,
#   returnData = FALSE
# )
# 
# xgbGrid <- expand.grid(nrounds = c(100,200),
#                        max_depth = c(10, 15, 20, 25,35,50),
#                        colsample_bytree = seq(0.5, 0.9, length.out = 5),
#                        eta = 0.1,
#                        gamma=0,
#                        min_child_weight = 1,
#                        subsample = 1
#                       )
# set.seed(0)
# xgb_model = train(
#   train, train_y,
#   trControl = xgb_trcontrol,
#   tuneGrid = xgbGrid,
#   method = "xgbTree")

Caret ensemble

  • wasn’t working
# completedData <- read.csv("imputed.csv")
# complete_df_imputed <- completedData
# 
# complete_df_imputed <- complete_df_imputed[,-1]
# set.seed(123)
# smp_size <- floor(0.75 * nrow(complete_df_imputed))
# train_ind <- sample(seq_len(nrow(complete_df_imputed)), size = smp_size)
# train <- bb[train_ind,-1 ]
# test <- bb[-train_ind,-1 ]
# train_y <- completedData[train_ind,c('PH')]
# test_y <- completedData[-train_ind,c('PH')]
# 
# 
# 
# library(doParallel)
# library(caret)
# registerDoParallel(4)
# getDoParWorkers()
# 
# set.seed(123)
# my_control <- trainControl(method = 'cv', # for “cross-validation”
#                            number = 3, # number of k-folds
#                            savePredictions = 'final',
#                            allowParallel = TRUE)
# 
# model_list <- caretList(train,
#                         train_y,
#                         trControl = my_control,
#                         methodList = c('lm', 'svmRadial', 'rf',"pls", 
#                                        'xgbTree', 'xgbLinear',"bagEarth",'glmnet',"knn" ),
#                         tuneList = NULL,
#                         continue_on_fail = FALSE, 
#                         preProcess = c('center','scale'))
# 
# options(digits = 3)
# model_results <- data.frame(LM = min(model_list$lm$results$RMSE),
#  SVM = min(model_list$svmRadial$results$RMSE),
#  RF = min(model_list$rf$results$RMSE),
#  XGBT = min(model_list$xgbTree$results$RMSE),
#  XGBL = min(model_list$xgbLinear$results$RMSE),
#  Mars = min(model_list$bagEarth$results$RMSE),
#  GLMNET = min(model_list$glmnet$results$RMSE),
#  PLS =min(model_list$pls$results$RMSE),
#  KNN =min(model_list$knn$results$RMSE))
#  
# print(model_results)

Jun’s Code

  • this isn’t Jun’s complete slack code. I added it becuase i wanted to take a look at the functions he posted to slack
# train <- readxl::read_excel('Data//StudentData.xlsx')
# eval <- readxl::read_excel('Data//StudentEvaluation.xlsx')
# 
# # Summary stats table
# summary_stats <- function(df){
#   minimum <- round(apply(df, 2, min, na.rm=T),2)
#   maximum <- round(apply(df, 2, max, na.rm=T),2)
#   quantile1 <- round(apply(df, 2, quantile, 0.25, na.rm=T),2)
#   quantile3 <- round(apply(df, 2, quantile, 0.75, na.rm=T),2)
#   medians <- round(apply(df, 2, median, na.rm=T),2)
#   means <- round(apply(df, 2, mean, na.rm=T),2)
#   stdev <- round(apply(df, 2, sd, na.rm=T),2)
#   nas <- apply(apply(df, 2, is.na), 2, sum, na.rm=T)
#   nas_p <- paste(round(nas / dim(df)[1], 4) * 100, '%')
#   temp <- data.frame(means, stdev, minimum, maximum, quantile1, medians, quantile3, nas, nas_p)
#   colnames(temp) <- c('Mean', 'St.Dev', 'Min.', 'Max.', '1st.Qu.', 'Median', '3rd.Qu.', "NA", "% NA")
#   return(temp)
# }
# df$Brand.Code
# # Summary stats for train and eval sets
# train_summary <- df %>% subset(select=-c(Brand.Code)) %>% summary_stats()
# eval_summary <- eval %>% subset(select=-c(Brand.Code, PH)) %>% summary_stats()
# train_summary
# eval_summary