# Attach dplyr: the unqualified `%>%`, mutate_if() and rename() calls below need
# it, and nothing visible in this file attaches it.
library(dplyr)

# Load the Excel workbook with rio (absolute, machine-specific path).
UptimeData <- rio::import("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/Time series/Manpower/Uptimebycategories08-29-2017b.xlsx")
# Replace white space in column names with "_".
names(UptimeData) <- stringr::str_replace_all(names(UptimeData), "\\s", "_")
UptimeData
# Month = first three characters of `Month-Yr`; every character column is then
# converted to a factor.
UptimeData <- UptimeData %>%
  dplyr::mutate(Month = stringr::str_sub(`Month-Yr`, 1, 3)) %>%
  dplyr::mutate_if(is.character, factor)
UptimeData %>% head()
# Keep the rows whose `Dur/EBRG` value is "Durability" and rename that column
# to `Dur`.
Durability <- UptimeData %>%
  dplyr::filter(`Dur/EBRG` == "Durability") %>%
  dplyr::rename(Dur = `Dur/EBRG`)
# Drop every row that contains a missing value.
Durability <- na.omit(Durability)
library(h2o)
# Connect to a local H2O cluster (h2o.init starts one if none is running).
h2o.init(
  nthreads = -1,       # -1 means use all cores on the machine
  max_mem_size = "8G"  # maximum memory to allocate to H2O
)
 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         2 hours 3 minutes 
    H2O cluster version:        3.10.5.3 
    H2O cluster version age:    2 months and 3 days  
    H2O cluster name:           H2O_started_from_R_nanaakwasiabayieboateng_sux705 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   6.87 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    R Version:                  R version 3.4.0 (2017-04-21) 
#==========================================================================================================
# Upload the R data frame `Durability` to the H2O cluster as an H2O frame.
#==========================================================================================================
datah20 <- as.h2o(x = Durability)

  |                                                                      
  |                                                                |   0%
  |                                                                      
  |================================================================| 100%
#==========================================================================================================
# Split the H2O frame into training, validation and test frames
#==========================================================================================================
splits <- h2o.splitFrame(
  data = datah20,
  ratios = c(0.7, 0.15),  # 70% / 15% / remaining 15%
  seed = 1                # fixed seed so the split is reproducible
)
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]
# Response column, and every other column as a predictor
y <- "APG_Uptime"
x <- setdiff(names(datah20), y)
print(x)
[1] "Month-Yr"             "Dur"                  "priority_category_cd"
[4] "build_phase"          "Build_Category"       "Day_of_Week"         
[7] "vehicle_family_ref"   "Month"               
#====================================================================================================================================
# Fit a regularised Gaussian GLM.  The `lambda` parameter controls the amount
# of regularisation; with `lambda_search = TRUE` H2O searches for a good value
# automatically, using the validation frame to evaluate each candidate and so
# limit overfitting.
#=====================================================================================================================================
glm_fit2 <- h2o.glm(
  x = x,
  y = y,
  training_frame = train,
  validation_frame = valid,
  model_id = "glm_fit2",
  family = "gaussian",
  lambda_search = TRUE
)
Dropping bad and constant columns: [Dur].

  |                                                                      
  |                                                                |   0%
  |                                                                      
  |===================================                             |  55%
  |                                                                      
  |================================================================| 100%
#==========================================================================================================
# Score the GLM on the held-out test frame
#==========================================================================================================
glm_perf2 <- h2o.performance(glm_fit2, newdata = test)
#==========================================================================================================
# Show the regression metrics recorded on the validation frame during training.
# (The test-frame metrics are held in `glm_perf2` above.)
#==========================================================================================================
glm_fit2@model$validation_metrics
H2ORegressionMetrics: glm
** Reported on validation data. **

MSE:  0.08187123
RMSE:  0.2861315
MAE:  0.219839
RMSLE:  0.1992424
Mean Residual Deviance :  0.08187123
R^2 :  0.2184632
Null Deviance :13.81938
Null D.o.F. :126
Residual Deviance :10.39765
Residual D.o.F. :79
AIC :140.5792
#============================================================================================
# Train a GBM with `ntrees = 500`.
# Early stopping is NOT used in this fit: the validation frame is commented out
# and none of `stopping_rounds`, `stopping_metric`, `stopping_tolerance` or
# `score_tree_interval` is set.  To enable it, pass `validation_frame = valid`
# together with `stopping_rounds > 0` and a regression stopping metric; AUC is
# a classification metric and does not apply to this regression.
#===============================================================================================
gbm_fit3 <- h2o.gbm(x = x,
                    y = y,
                    training_frame = train,
                    model_id = "gbm_fit3",  # was "gbm_fit2"; keep the H2O key in line with the R object name
                    #validation_frame = valid,  #only used if stopping_rounds > 0
                    ntrees = 500,
                    seed = 1)
Dropping bad and constant columns: [Dur].

  |                                                                      
  |                                                                |   0%
  |                                                                      
  |==                                                              |   2%
  |                                                                      
  |======================                                          |  34%
  |                                                                      
  |=================================================               |  77%
  |                                                                      
  |================================================================| 100%
#==========================================================================================================
# Score the GBM on the held-out test frame
#==========================================================================================================
gbm_perf3 <- h2o.performance(gbm_fit3, newdata = test)
#==========================================================================================================
# Print the test-frame performance metrics
#==========================================================================================================
gbm_perf3
H2ORegressionMetrics: gbm

MSE:  0.07664392
RMSE:  0.2768464
MAE:  0.2075424
RMSLE:  0.1820383
Mean Residual Deviance :  0.07664392
#==========================================================================================================
# Plot the GBM scoring history against the number of trees
#==========================================================================================================
plot(gbm_fit3, timestep = "number_of_trees")

#==========================================================================================================
# Train a deep learning model with two hidden layers of 10 units for 20 epochs.
# Early stopping is explicitly disabled (`stopping_rounds = 0`) and no
# validation frame is passed.  To use early stopping, pass
# `validation_frame = valid` and set `stopping_rounds > 0`.
# NOTE(review): H2O documents deep learning seeds as reproducible only when
# running single-threaded -- confirm if exact reproducibility matters.
#==========================================================================================================
dl_fit3 <- h2o.deeplearning(x = x,
                            y = y,
                            training_frame = train,
                            model_id = "dl_fit3",  # was "dl_fit2"; keep the H2O key in line with the R object name
                            #validation_frame = valid,  #only used if stopping_rounds > 0
                            epochs = 20,
                            hidden = c(10, 10),
                            stopping_rounds = 0,  # disable early stopping
                            seed = 1)
Dropping bad and constant columns: [Dur].

  |                                                                      
  |                                                                |   0%
  |                                                                      
  |================================================================| 100%
# Score the deep learning model on the held-out test frame
dl_perf3 <- h2o.performance(dl_fit3, newdata = test)
#==========================================================================================================
# Print the test-frame performance metrics
#==========================================================================================================
dl_perf3
H2ORegressionMetrics: deeplearning

MSE:  0.07325338
RMSE:  0.2706536
MAE:  0.2157389
RMSLE:  0.1793604
Mean Residual Deviance :  0.07325338
#==========================================================================================================
# Scoring history of the deep learning model
#==========================================================================================================
h2o.scoreHistory(object = dl_fit3)
Scoring History: 
            timestamp   duration training_speed   epochs iterations
1 2017-09-02 19:29:39  0.000 sec                 0.00000          0
2 2017-09-02 19:29:39  0.027 sec 231200 obs/sec  2.00000          1
3 2017-09-02 19:29:39  0.084 sec 214074 obs/sec 20.00000         10
       samples training_rmse training_deviance training_mae
1     0.000000                                             
2  1156.000000       0.28044           0.07865      0.22619
3 11560.000000       0.23176           0.05371      0.18164
#==========================================================================================================
# Plot the deep learning scoring history against epochs
#==========================================================================================================
plot(dl_fit3, timestep = "epochs")

---
title: "Machine Learning with H2O"
output: html_notebook
author: Nana Boateng
---


```{r}
# Attach dplyr: the unqualified `%>%`, mutate_if() and rename() calls below need
# it, and nothing visible in this notebook attaches it, so the chunk would fail
# in a fresh session.
library(dplyr)

# Load the Excel workbook with rio (absolute, machine-specific path).
UptimeData <- rio::import("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/Time series/Manpower/Uptimebycategories08-29-2017b.xlsx")

# Replace white space in column names with "_".
names(UptimeData) <- stringr::str_replace_all(names(UptimeData), "\\s", "_")

UptimeData

# Month = first three characters of `Month-Yr`; every character column is then
# converted to a factor.
UptimeData <- UptimeData %>%
  dplyr::mutate(Month = stringr::str_sub(`Month-Yr`, 1, 3)) %>%
  dplyr::mutate_if(is.character, factor)

UptimeData %>% head()

# Keep the rows whose `Dur/EBRG` value is "Durability" and rename that column
# to `Dur`.
Durability <- UptimeData %>%
  dplyr::filter(`Dur/EBRG` == "Durability") %>%
  dplyr::rename(Dur = `Dur/EBRG`)

# Drop every row that contains a missing value.
Durability <- na.omit(Durability)


```




```{r,message=FALSE,warning=FALSE}



library(h2o)

# Connect to a local H2O cluster (h2o.init starts one if none is running).
h2o.init(
  nthreads = -1,       # -1 means use all cores on the machine
  max_mem_size = "8G"  # maximum memory to allocate to H2O
)

#==========================================================================================================
# Upload the R data frame `Durability` to the H2O cluster as an H2O frame.
#==========================================================================================================

datah20 <- as.h2o(x = Durability)

#==========================================================================================================
# Split the H2O frame into training, validation and test frames
#==========================================================================================================

splits <- h2o.splitFrame(
  data = datah20,
  ratios = c(0.7, 0.15),  # 70% / 15% / remaining 15%
  seed = 1                # fixed seed so the split is reproducible
)
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]

# Response column, and every other column as a predictor
y <- "APG_Uptime"
x <- setdiff(names(datah20), y)
print(x)


```



```{r}
#====================================================================================================================================
# Fit a regularised Gaussian GLM.  The `lambda` parameter controls the amount
# of regularisation; with `lambda_search = TRUE` H2O searches for a good value
# automatically, using the validation frame to evaluate each candidate and so
# limit overfitting.
#=====================================================================================================================================

glm_fit2 <- h2o.glm(x = x,
                    y = y,
                    training_frame = train,
                    model_id = "glm_fit2",
                    validation_frame = valid,
                    family = "gaussian",
                    lambda_search = TRUE)

#==========================================================================================================
# Score the GLM on the held-out test frame
#==========================================================================================================

glm_perf2 <- h2o.performance(model = glm_fit2,
                             newdata = test)

#==========================================================================================================
# Print the test-frame regression metrics.  The GBM and deep learning chunks
# below report test-frame metrics too, so these are the numbers to compare.
#==========================================================================================================

glm_perf2

#==========================================================================================================
# Validation-frame metrics recorded during training (the frame used by the
# lambda search).  This is a regression, so there is no AUC to retrieve.
#==========================================================================================================

glm_fit2@model$validation_metrics

```


```{r}
#============================================================================================
# Train a GBM with `ntrees = 500`.
# Early stopping is NOT used in this fit: the validation frame is commented out
# and none of `stopping_rounds`, `stopping_metric`, `stopping_tolerance` or
# `score_tree_interval` is set.  To enable it, pass `validation_frame = valid`
# together with `stopping_rounds > 0` and a regression stopping metric; AUC is
# a classification metric and does not apply to this regression.
#===============================================================================================

gbm_fit3 <- h2o.gbm(x = x,
                    y = y,
                    training_frame = train,
                    model_id = "gbm_fit3",  # was "gbm_fit2"; keep the H2O key in line with the R object name
                    #validation_frame = valid,  #only used if stopping_rounds > 0
                    ntrees = 500,
                    seed = 1)

#==========================================================================================================
# Score the GBM on the held-out test frame
#==========================================================================================================

gbm_perf3 <- h2o.performance(model = gbm_fit3,
                             newdata = test)

#==========================================================================================================
# Print the test-frame performance metrics
#==========================================================================================================

gbm_perf3

#==========================================================================================================
# Plot the GBM scoring history against the number of trees
#==========================================================================================================

plot(gbm_fit3,
     timestep = "number_of_trees")

```



```{r}
#==========================================================================================================
# Train a deep learning model with two hidden layers of 10 units for 20 epochs.
# Early stopping is explicitly disabled (`stopping_rounds = 0`) and no
# validation frame is passed.  To use early stopping, pass
# `validation_frame = valid` and set `stopping_rounds > 0`.
# NOTE(review): H2O documents deep learning seeds as reproducible only when
# running single-threaded -- confirm if exact reproducibility matters.
#==========================================================================================================

dl_fit3 <- h2o.deeplearning(x = x,
                            y = y,
                            training_frame = train,
                            model_id = "dl_fit3",  # was "dl_fit2"; keep the H2O key in line with the R object name
                            #validation_frame = valid,  #only used if stopping_rounds > 0
                            epochs = 20,
                            hidden = c(10, 10),
                            stopping_rounds = 0,  # disable early stopping
                            seed = 1)

# Score the deep learning model on the held-out test frame
dl_perf3 <- h2o.performance(model = dl_fit3,
                            newdata = test)

#==========================================================================================================
# Print the test-frame performance metrics
#==========================================================================================================

dl_perf3

#==========================================================================================================
# Scoring history of the deep learning model
#==========================================================================================================

h2o.scoreHistory(dl_fit3)

#==========================================================================================================
# Plot the deep learning scoring history against epochs
#==========================================================================================================

plot(dl_fit3,
     timestep = "epochs")


```



```{r}

```

