Libraries

library(fpp2)
library(ggplot2)
library(tidyr)
library(dplyr)
library(stringr)
library(broom)
library(seasonal)
library(imputeTS)
library(tidymodels)
library(mice)
library(inspectdf)
library(lubridate)
library(corrplot)
library(caret)
library(fpp3)
library(randomForest)
library(Cubist)
library(iterators)
library(parallel)
library(doMC)

Data Load

data_raw <- readr::read_csv('https://raw.githubusercontent.com/christianthieme/Predictive-Analytics/main/Project2/data/StudentData%20-%20TO%20MODEL.csv')
data_raw$obs_type <- "train"
eval_raw <- readr::read_csv('https://raw.githubusercontent.com/christianthieme/Predictive-Analytics/main/Project2/data/StudentEvaluation-%20TO%20PREDICT.csv')
eval_raw$obs_type <- "test"
# stack the train and eval rows so the column cleaning below applies to both
combined <- data_raw %>%
  bind_rows(eval_raw)
# convert column names to all lowercase
names(combined) <- tolower(names(combined))
# convert column name spaces to underscore
names(combined) <- str_replace_all(names(combined), ' ', '_')
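
The two renaming steps can also be collapsed into a single call with janitor::clean_names(), if that package is available:

# equivalent one-liner (assumes the janitor package is installed)
combined <- janitor::clean_names(combined)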

df <- combined %>%
  filter(obs_type == 'train') %>%
  select(-obs_type)
eval <- combined %>%
  filter(obs_type == 'test') %>%
  select(-obs_type)
glimpse(combined)
## Rows: 2,838
## Columns: 34
## $ brand_code        <chr> "B", "A", "B", "A", "A", "A", "A", "B", "B", "B", "B…
## $ carb_volume       <dbl> 5.340000, 5.426667, 5.286667, 5.440000, 5.486667, 5.…
## $ fill_ounces       <dbl> 23.96667, 24.00667, 24.06000, 24.00667, 24.31333, 23…
## $ pc_volume         <dbl> 0.2633333, 0.2386667, 0.2633333, 0.2933333, 0.111333…
## $ carb_pressure     <dbl> 68.2, 68.4, 70.8, 63.0, 67.2, 66.6, 64.2, 67.6, 64.2…
## $ carb_temp         <dbl> 141.2, 139.6, 144.8, 132.6, 136.8, 138.4, 136.8, 141…
## $ psc               <dbl> 0.104, 0.124, 0.090, NA, 0.026, 0.090, 0.128, 0.154,…
## $ psc_fill          <dbl> 0.26, 0.22, 0.34, 0.42, 0.16, 0.24, 0.40, 0.34, 0.12…
## $ psc_co2           <dbl> 0.04, 0.04, 0.16, 0.04, 0.12, 0.04, 0.04, 0.04, 0.14…
## $ mnf_flow          <dbl> -100, -100, -100, -100, -100, -100, -100, -100, -100…
## $ carb_pressure1    <dbl> 118.8, 121.6, 120.2, 115.2, 118.4, 119.6, 122.2, 124…
## $ fill_pressure     <dbl> 46.0, 46.0, 46.0, 46.4, 45.8, 45.6, 51.8, 46.8, 46.0…
## $ hyd_pressure1     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hyd_pressure2     <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hyd_pressure3     <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hyd_pressure4     <dbl> 118, 106, 82, 92, 92, 116, 124, 132, 90, 108, 94, 86…
## $ filler_level      <dbl> 121.2, 118.6, 120.0, 117.8, 118.6, 120.2, 123.4, 118…
## $ filler_speed      <dbl> 4002, 3986, 4020, 4012, 4010, 4014, NA, 1004, 4014, …
## $ temperature       <dbl> 66.0, 67.6, 67.0, 65.6, 65.6, 66.2, 65.8, 65.2, 65.4…
## $ usage_cont        <dbl> 16.18, 19.90, 17.76, 17.42, 17.68, 23.82, 20.74, 18.…
## $ carb_flow         <dbl> 2932, 3144, 2914, 3062, 3054, 2948, 30, 684, 2902, 3…
## $ density           <dbl> 0.88, 0.92, 1.58, 1.54, 1.54, 1.52, 0.84, 0.84, 0.90…
## $ mfr               <dbl> 725.0, 726.8, 735.0, 730.6, 722.8, 738.8, NA, NA, 74…
## $ balling           <dbl> 1.398, 1.498, 3.142, 3.042, 3.042, 2.992, 1.298, 1.2…
## $ pressure_vacuum   <dbl> -4.0, -4.0, -3.8, -4.4, -4.4, -4.4, -4.4, -4.4, -4.4…
## $ ph                <dbl> 8.36, 8.26, 8.94, 8.24, 8.26, 8.32, 8.40, 8.38, 8.38…
## $ oxygen_filler     <dbl> 0.022, 0.026, 0.024, 0.030, 0.030, 0.024, 0.066, 0.0…
## $ bowl_setpoint     <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 12…
## $ pressure_setpoint <dbl> 46.4, 46.8, 46.6, 46.0, 46.0, 46.0, 46.0, 46.0, 46.0…
## $ air_pressurer     <dbl> 142.6, 143.0, 142.0, 146.2, 146.2, 146.6, 146.2, 146…
## $ alch_rel          <dbl> 6.58, 6.56, 7.66, 7.14, 7.14, 7.16, 6.54, 6.52, 6.52…
## $ carb_rel          <dbl> 5.32, 5.30, 5.84, 5.42, 5.44, 5.44, 5.38, 5.34, 5.34…
## $ balling_lvl       <dbl> 1.48, 1.56, 3.28, 3.04, 3.04, 3.02, 1.44, 1.44, 1.44…
## $ obs_type          <chr> "train", "train", "train", "train", "train", "train"…

Imputation

# remove rows without a response variable
df3 <- df %>%
  filter(!is.na(ph))

# encode the four brands as integers; missing brands stay NA and are imputed below
df3$brand_code <- case_when(
  df3$brand_code == "A" ~ 1,
  df3$brand_code == "B" ~ 2,
  df3$brand_code == "C" ~ 3,
  df3$brand_code == "D" ~ 4
)
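
Mapping the four brands to the integers 1 through 4 imposes an artificial ordering on a nominal variable. Tree-based learners are largely insensitive to this, but one-hot indicator columns are a defensible alternative; a minimal sketch (df3_onehot and the brand_* names are illustrative and not used below):

# hypothetical alternative: indicator columns instead of integer codes
df3_onehot <- df3 %>%
  mutate(brand_a = as.numeric(brand_code == 1),
         brand_b = as.numeric(brand_code == 2),
         brand_c = as.numeric(brand_code == 3),
         brand_d = as.numeric(brand_code == 4)) %>%
  select(-brand_code)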

# most frequent value of a vector (base R's mode() reports the storage
# mode, so we define our own); extra arguments such as na.rm are ignored
stat_mode <- function(x, ...) {
  tb <- table(x)            # table() drops NAs by default
  names(tb)[which.max(tb)]  # first value with the highest count
}

# replace NAs in x with a summary statistic of the non-missing values
impute <- function(x, fun) {
  fval <- fun(x, na.rm = TRUE)
  ifelse(is.na(x), fval, x)
}
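
A quick illustration of the two helpers on toy vectors:

impute(c(1, NA, 3), mean)                # -> 1 2 3
impute(c("a", "a", NA, "b"), stat_mode)  # -> "a" "a" "a" "b"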

# numeric columns to impute with the column mean
mean_cols <- c("carb_volume", "fill_ounces", "pc_volume", "carb_pressure",
               "carb_temp", "psc", "psc_fill", "psc_co2", "carb_pressure1",
               "fill_pressure", "hyd_pressure1", "hyd_pressure2",
               "hyd_pressure3", "hyd_pressure4", "filler_level",
               "filler_speed", "temperature", "usage_cont", "carb_flow",
               "mfr", "oxygen_filler", "bowl_setpoint", "pressure_setpoint",
               "alch_rel", "carb_rel", "balling_lvl")

df3 <- df3 %>%
  mutate(brand_code = impute(brand_code, stat_mode),
         across(all_of(mean_cols), ~ impute(.x, mean))) %>%
  # ifelse() coerces brand_code to character when the mode (a string) is
  # substituted, so convert character columns back to numeric
  mutate_if(is.character, as.numeric)

visdat::vis_miss(df3)

summary(df3$ph)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.880   8.440   8.540   8.546   8.680   9.360

Split

set.seed(101)
trainIndex <-  createDataPartition(df3$ph,
                                   p = 0.8,
                                   list = F)
train <- df3[trainIndex, ]
test <- df3[-trainIndex, ]

dim(train)
## [1] 2055   33
dim(test)
## [1] 512  33
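
createDataPartition() draws a stratified sample, grouping a numeric response by percentiles and sampling within those groups, so the pH distribution should be similar in the two subsets. A quick sanity check (output not shown):

# compare response distributions across the split
summary(train$ph)
summary(test$ph)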

XGBoost

set.seed(101)

# xgbLinear tunes L2/L1 regularization (lambda/alpha), the number of
# boosting rounds, and the learning rate (eta)
grid <- expand.grid(lambda = c(1.39 * 10^-5, 10^-5, 1.90 * 10^-5),
                    alpha = c(0.0001, 0.00046, 0.001),
                    nrounds = c(80, 90, 100),
                    eta = 0.1)
# named xgb_model to avoid shadowing the xgboost package's function
xgb_model <- train(ph ~ ., data = train,
                   method = "xgbLinear",
                   preProcess = c("center", "scale"),
                   tuneGrid = grid,
                   verbosity = 0,
                   trControl = trainControl(method = 'cv'))

xgboost_predict <- predict(xgb_model, newdata = test)
xgboost_results <- postResample(pred = xgboost_predict, obs = test$ph)
xgboost_results
##       RMSE   Rsquared        MAE 
## 0.10074766 0.63053678 0.07130061
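
Which inputs the boosted model relies on can be inspected with caret's variable importance (plot not shown):

# top 10 predictors by xgbLinear importance score
plot(varImp(xgb_model), top = 10)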
actual <- test$ph

MAPE(actual, xgboost_predict)
## [1] 99.94865
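
A note on the MAPE figures: with a hold-out MAE of roughly 0.07 on a response near 8.5, the textbook mean absolute percentage error would be well under 1%, so the MAPE() called here evidently reports a different quantity. For reference, a standard percentage implementation (mape_pct is our own name, not a package function):

# textbook MAPE, expressed as a percentage
mape_pct <- function(actual, predicted) {
  mean(abs((actual - predicted) / actual)) * 100
}
mape_pct(actual, xgboost_predict)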

MARS

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(101)
mars <- train(ph ~ .,
              data = train,
              method = 'earth',
              tuneGrid = marsGrid,
              trControl = trainControl(method = 'cv')
              )

mars_predict <- predict(mars, newdata = test)
summary(mars_predict)
##        y        
##  Min.   :8.259  
##  1st Qu.:8.455  
##  Median :8.543  
##  Mean   :8.549  
##  3rd Qu.:8.647  
##  Max.   :8.935
mars_results <- postResample(pred = mars_predict, obs = test$ph)
print(mars_results)
##       RMSE   Rsquared        MAE 
## 0.11767991 0.49524523 0.08878862
MAPE(actual, mars_predict)
## [1] 99.97576
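
Side by side, the hold-out metrics favor the boosted linear model over MARS:

# stack both models' hold-out metrics for comparison
rbind(xgboost = xgboost_results, mars = mars_results)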

Neural Nets

We follow three steps to build the model, as sketched in the code below.

Step 1 - Set the tuning grid. The tuning parameters are decay, the weight decay penalty (three candidate values), and size, the number of hidden units.

Step 2 - Use the trainControl() function to customize how the tuning/complexity parameters are selected and how the final model is built.

Step 3 - Pass those control settings to train().

avNNet fits the same neural network model several times with different random number seeds and uses all of the resulting models for prediction; for regression, the outputs of the networks are averaged.

The results show that the optimal number of hidden units is 8 and the decay parameter is 0.1.
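
The avNNet code chunk itself does not appear in this write-up; the following is a minimal sketch of a fit matching the description above, with illustrative grid values chosen to contain the reported optimum (size = 8, decay = 0.1):

# averaged neural network: the same nnet fit under several random seeds,
# with predictions averaged across the fits
nnetGrid <- expand.grid(decay = c(0, 0.01, 0.1),  # weight decay candidates
                        size = 1:10,              # hidden units
                        bag = FALSE)              # no bootstrap aggregation
set.seed(101)
nnet_model <- train(ph ~ ., data = train,
                    method = 'avNNet',
                    preProcess = c('center', 'scale'),
                    tuneGrid = nnetGrid,
                    trControl = trainControl(method = 'cv'),
                    linout = TRUE,   # linear output unit for regression
                    trace = FALSE,   # silence nnet's iteration log
                    maxit = 500)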

Final Predictions

The boosted linear model had the best hold-out RMSE, so it is used to score the evaluation set, which is rebuilt with the same brand encoding used in training.

eval2 <- combined %>%
  select(-ph)

eval2$brand_code <- case_when(
  eval2$brand_code == "A" ~ 1,
  eval2$brand_code == "B" ~ 2,
  eval2$brand_code == "C" ~ 3,
  eval2$brand_code == "D" ~ 4
)

eval2 <- eval2 %>%
  filter(obs_type == 'test') %>%
  select(-obs_type)

final_predictions <- predict(xgb_model, eval2)
summary(final_predictions)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.156   8.457   8.537   8.547   8.663   8.863
head(final_predictions, n = 10)
##  [1] 8.400931 8.512258 8.449882 8.453444 8.593222 8.417555 8.421186 8.552644
##  [9] 8.291843 8.624135
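
If the predictions need to be delivered as a file, a write-out along these lines would complete the workflow (the output file name is illustrative):

# attach predictions to the evaluation rows and save
eval_out <- eval2 %>% mutate(ph = final_predictions)
readr::write_csv(eval_out, 'StudentEvaluation-predicted.csv')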