library(fpp2)
library(ggplot2)
library(tidyr)
library(dplyr)
library(stringr)
library(broom)
library(seasonal)
library(imputeTS)
library(tidymodels)
library(mice)
library(inspectdf)
library(lubridate)
library(corrplot)
library(caret)
library(fpp3)
library(randomForest)
library(Cubist)
library(iterators)
library(parallel)
library(doMC)
data_raw <- readr::read_csv('https://raw.githubusercontent.com/christianthieme/Predictive-Analytics/main/Project2/data/StudentData%20-%20TO%20MODEL.csv')
data_raw$obs_type <- "train"
eval_raw <- readr::read_csv('https://raw.githubusercontent.com/christianthieme/Predictive-Analytics/main/Project2/data/StudentEvaluation-%20TO%20PREDICT.csv')
eval_raw$obs_type <- "test"
combined <- data_raw %>%
rbind(eval_raw)
# convert column names to all lowercase
names(combined) <- tolower(names(combined))
# convert column name spaces to underscore
names(combined) <- str_replace_all(names(combined), ' ', '_')
df <- combined %>%
filter(obs_type == 'train') %>%
select(-obs_type)
eval <- combined %>%
filter(obs_type == 'test') %>%
select(-obs_type)
glimpse(combined)
## Rows: 2,838
## Columns: 34
## $ brand_code <chr> "B", "A", "B", "A", "A", "A", "A", "B", "B", "B", "B…
## $ carb_volume <dbl> 5.340000, 5.426667, 5.286667, 5.440000, 5.486667, 5.…
## $ fill_ounces <dbl> 23.96667, 24.00667, 24.06000, 24.00667, 24.31333, 23…
## $ pc_volume <dbl> 0.2633333, 0.2386667, 0.2633333, 0.2933333, 0.111333…
## $ carb_pressure <dbl> 68.2, 68.4, 70.8, 63.0, 67.2, 66.6, 64.2, 67.6, 64.2…
## $ carb_temp <dbl> 141.2, 139.6, 144.8, 132.6, 136.8, 138.4, 136.8, 141…
## $ psc <dbl> 0.104, 0.124, 0.090, NA, 0.026, 0.090, 0.128, 0.154,…
## $ psc_fill <dbl> 0.26, 0.22, 0.34, 0.42, 0.16, 0.24, 0.40, 0.34, 0.12…
## $ psc_co2 <dbl> 0.04, 0.04, 0.16, 0.04, 0.12, 0.04, 0.04, 0.04, 0.14…
## $ mnf_flow <dbl> -100, -100, -100, -100, -100, -100, -100, -100, -100…
## $ carb_pressure1 <dbl> 118.8, 121.6, 120.2, 115.2, 118.4, 119.6, 122.2, 124…
## $ fill_pressure <dbl> 46.0, 46.0, 46.0, 46.4, 45.8, 45.6, 51.8, 46.8, 46.0…
## $ hyd_pressure1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hyd_pressure2 <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hyd_pressure3 <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hyd_pressure4 <dbl> 118, 106, 82, 92, 92, 116, 124, 132, 90, 108, 94, 86…
## $ filler_level <dbl> 121.2, 118.6, 120.0, 117.8, 118.6, 120.2, 123.4, 118…
## $ filler_speed <dbl> 4002, 3986, 4020, 4012, 4010, 4014, NA, 1004, 4014, …
## $ temperature <dbl> 66.0, 67.6, 67.0, 65.6, 65.6, 66.2, 65.8, 65.2, 65.4…
## $ usage_cont <dbl> 16.18, 19.90, 17.76, 17.42, 17.68, 23.82, 20.74, 18.…
## $ carb_flow <dbl> 2932, 3144, 2914, 3062, 3054, 2948, 30, 684, 2902, 3…
## $ density <dbl> 0.88, 0.92, 1.58, 1.54, 1.54, 1.52, 0.84, 0.84, 0.90…
## $ mfr <dbl> 725.0, 726.8, 735.0, 730.6, 722.8, 738.8, NA, NA, 74…
## $ balling <dbl> 1.398, 1.498, 3.142, 3.042, 3.042, 2.992, 1.298, 1.2…
## $ pressure_vacuum <dbl> -4.0, -4.0, -3.8, -4.4, -4.4, -4.4, -4.4, -4.4, -4.4…
## $ ph <dbl> 8.36, 8.26, 8.94, 8.24, 8.26, 8.32, 8.40, 8.38, 8.38…
## $ oxygen_filler <dbl> 0.022, 0.026, 0.024, 0.030, 0.030, 0.024, 0.066, 0.0…
## $ bowl_setpoint <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 12…
## $ pressure_setpoint <dbl> 46.4, 46.8, 46.6, 46.0, 46.0, 46.0, 46.0, 46.0, 46.0…
## $ air_pressurer <dbl> 142.6, 143.0, 142.0, 146.2, 146.2, 146.6, 146.2, 146…
## $ alch_rel <dbl> 6.58, 6.56, 7.66, 7.14, 7.14, 7.16, 6.54, 6.52, 6.52…
## $ carb_rel <dbl> 5.32, 5.30, 5.84, 5.42, 5.44, 5.44, 5.38, 5.34, 5.34…
## $ balling_lvl <dbl> 1.48, 1.56, 3.28, 3.04, 3.04, 3.02, 1.44, 1.44, 1.44…
## $ obs_type <chr> "train", "train", "train", "train", "train", "train"…
# remove rows without a response variable
df3 <- df %>%
filter(!is.na(ph))
# encode brand_code as a numeric label (A = 1, B = 2, C = 3, D = 4)
df3$brand_code <- case_when(
df3$brand_code == "A" ~ 1,
df3$brand_code == "B" ~ 2,
df3$brand_code == "C" ~ 3,
df3$brand_code == "D" ~ 4
)
#summary(df3)
# statistical mode: the most frequent value, returned as a character string
# (note: this masks base::mode())
mode <- function(x, ...) {
tb <- table(x)
names(which.max(tb))
}
# replace NAs in a vector with fun() computed over the non-missing values
impute <- function(x, fun) {
fval <- fun(x, na.rm = TRUE)
ifelse(is.na(x), fval, x)
}
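As a quick sanity check of these helpers, here is how they behave on a small hypothetical vector (not part of the data):
# illustrative only: x is a made-up vector
x <- c(2, 2, 3, NA, 2, 3)
mode(x)          # "2" (the most frequent value, as a string)
impute(x, mode)  # NA replaced by "2"; ifelse() coerces the result to character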
df3 <- df3 %>% mutate(
brand_code = impute(brand_code, mode),
carb_volume = impute(carb_volume, mean),
fill_ounces = impute(fill_ounces, mean),
pc_volume = impute(pc_volume, mean),
carb_pressure = impute(carb_pressure, mean),
carb_temp = impute(carb_temp, mean),
psc = impute(psc, mean),
psc_fill = impute(psc_fill, mean),
psc_co2 = impute(psc_co2, mean),
carb_pressure1 = impute(carb_pressure1, mean),
fill_pressure = impute(fill_pressure, mean),
hyd_pressure1 = impute(hyd_pressure1, mean),
hyd_pressure2 = impute(hyd_pressure2, mean),
hyd_pressure3 = impute(hyd_pressure3, mean),
hyd_pressure4 = impute(hyd_pressure4, mean),
filler_level = impute(filler_level, mean),
filler_speed = impute(filler_speed, mean),
temperature = impute(temperature, mean),
usage_cont = impute(usage_cont, mean),
carb_flow = impute(carb_flow, mean),
mfr = impute(mfr, mean),
oxygen_filler = impute(oxygen_filler, mean),
bowl_setpoint = impute(bowl_setpoint, mean),
pressure_setpoint = impute(pressure_setpoint, mean),
alch_rel = impute(alch_rel, mean),
carb_rel = impute(carb_rel, mean),
balling_lvl = impute(balling_lvl, mean)
) %>%
# mode() returns characters, so convert any character columns back to numeric
mutate_if(is.character, as.numeric)
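For reference, the same imputation can be written more compactly with dplyr::across(); the following is a sketch equivalent to the block above, assuming dplyr >= 1.0. The excluded columns match the ones not imputed above (mnf_flow, density, balling, pressure_vacuum, air_pressurer) plus the response.
# sketch only: mean-impute every column except brand_code (mode-imputed),
# the response, and the columns left untouched above
mean_cols <- setdiff(names(df3),
c("brand_code", "ph", "mnf_flow", "density",
"balling", "pressure_vacuum", "air_pressurer"))
df3 <- df3 %>%
mutate(brand_code = impute(brand_code, mode),
across(all_of(mean_cols), ~ impute(.x, mean))) %>%
mutate_if(is.character, as.numeric)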
visdat::vis_miss(df3)
summary(df3$ph)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.880 8.440 8.540 8.546 8.680 9.360
set.seed(101)
trainIndex <- createDataPartition(df3$ph,
p = 0.8,
list = FALSE)
train <- df3[trainIndex, ]
test <- df3[-trainIndex, ]
# brand_code is retained; to drop it, add select(-brand_code) to both sets
dim(train)
## [1] 2055 33
dim(test)
## [1] 512 33
set.seed(101)
grid <- expand.grid(lambda = c(1.39 * 10^-5, 10^-5, 1.90 * 10^-5),  # L2 regularization
alpha = c(0.0001, 0.00046, 0.001),                                  # L1 regularization
nrounds = c(80, 90, 100),                                           # boosting iterations
eta = 0.1)                                                          # learning rate
xgboost <- train(ph ~ ., data = train,
method = "xgbLinear",
preProcess = c("center", "scale"),
tuneGrid = grid,
verbosity = 0,
trControl = trainControl(method = 'cv'))
xgboost_predict <- predict(xgboost, newdata = test)
xgboost_results <- postResample(pred = xgboost_predict, obs = test$ph)
xgboost_results
## RMSE Rsquared MAE
## 0.10074766 0.63053678 0.07130061
# avoid masking stats::predict(); compare the prediction vector directly
actual <- test$ph
MAPE(actual, xgboost_predict)
## [1] 99.94865
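MAPE implementations vary across packages in argument order and reporting scale; for reference, the conventional formula written out as our own helper (not a package function):
# mean absolute percentage error, reported in percent
mape_pct <- function(actual, predicted) {
mean(abs((actual - predicted) / actual)) * 100
}
mape_pct(actual, xgboost_predict)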
marsGrid <- expand.grid(.degree = 1:2,   # maximum interaction degree
.nprune = 2:38)                          # candidate numbers of retained terms
set.seed(101)
mars <- train(ph ~ .,
data = train,
method = 'earth',
tuneGrid = marsGrid,
trControl = trainControl(method = 'cv')
)
mars_predict <- predict(mars, newdata = test)
summary(mars_predict)
## y
## Min. :8.259
## 1st Qu.:8.455
## Median :8.543
## Mean :8.549
## 3rd Qu.:8.647
## Max. :8.935
mars_results <- postResample(pred = mars_predict, obs = test$ph)
print(mars_results)
## RMSE Rsquared MAE
## 0.11767991 0.49524523 0.08878862
actual <- test$ph
MAPE(actual, mars_predict)
## [1] 99.97576
There are three steps we follow to build the neural network model.
Step 1 - Set the tuning grid. The tuning parameters are the weight decay (decay), with three candidate values, and the number of hidden units (size).
Step 2 - Use the trainControl() function to customize how the tuning/complexity parameters are selected and the final model is built.
Step 3 - Pass those control settings to train(), as sketched below.
avNNet is a model in which the same neural network is fit several times using different random-number seeds, and all of the resulting models are used for prediction. For regression, the outputs of the individual networks are averaged.
The results show that the optimal number of hidden units is 8 and the optimal decay parameter is 0.1.
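The avNNet fit itself is not shown above; the following is a minimal sketch of the call being described, assuming the same train/test objects. The grid values are assumptions, chosen only to cover the reported optimum (size = 8, decay = 0.1).
nnetGrid <- expand.grid(decay = c(0, 0.01, 0.1),  # weight-decay candidates (assumed)
size = 1:10,                                      # hidden-unit range (assumed)
bag = FALSE)                                      # no bagging of the base fits
set.seed(101)
avnnet <- train(ph ~ ., data = train,
method = 'avNNet',
tuneGrid = nnetGrid,
preProcess = c('center', 'scale'),
trControl = trainControl(method = 'cv'),
linout = TRUE,   # linear output units for regression
trace = FALSE,
maxit = 500)     # iteration cap (assumed)
avnnet_predict <- predict(avnnet, newdata = test)
postResample(pred = avnnet_predict, obs = test$ph)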
# prepare the evaluation set: drop the response, then encode brand_code as above
eval2 <- combined %>%
select(-ph)
eval2$brand_code <- case_when(
eval2$brand_code == "A" ~ 1,
eval2$brand_code == "B" ~ 2,
eval2$brand_code == "C" ~ 3,
eval2$brand_code == "D" ~ 4
)
#eval2$brand_code <- as.factor(eval2$brand_code)
eval2 <- eval2 %>%
filter(obs_type == 'test') %>%
select(-obs_type)
#table(eval2$brand_code)
final_predictions <- predict(xgboost, eval2)
summary(final_predictions)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.156 8.457 8.537 8.547 8.663 8.863
head(final_predictions, n = 10)
## [1] 8.400931 8.512258 8.449882 8.453444 8.593222 8.417555 8.421186 8.552644
## [9] 8.291843 8.624135
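To hand the results off, the predictions can be attached to the evaluation rows and written out; a sketch, where the output file name is an assumption:
# attach predictions to the evaluation set and export (file name illustrative)
eval_out <- eval2 %>%
mutate(ph = final_predictions)
readr::write_csv(eval_out, 'eval_predictions.csv')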