Load the libraries and start H2O

library(h2o)
library(data.table)
library(ggplot2)
h2o.init()
 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         2 hours 18 minutes 
    H2O cluster version:        3.10.5.3 
    H2O cluster version age:    2 months and 25 days  
    H2O cluster name:           H2O_started_from_R_r631758_qgy865 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   3.19 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    R Version:                  R version 3.4.1 (2017-06-30) 
# Import the airlines CSV into H2O in two steps (raw import, then parse) so
# that the guessed column types can be overridden before parsing.
# airlines<-h2o.importFile("C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\allyears2k_headers.csv")
airlines_csv <- "C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\allyears2k_headers.csv"
raw <- h2o.importFile(path = airlines_csv, parse = FALSE)
setup <- h2o.parseSetup(raw)

# Columns that must be parsed as numeric. The second entry used to be
# "AirDelay", which is not a column of this file (the delay columns are
# ArrDelay and DepDelay), so that override silently matched nothing.
# NOTE(review): "ArrDelay" is assumed to be the intended column -- confirm.
numeric_cols <- c("AirTime", "ArrDelay")
missing_cols <- setdiff(numeric_cols, setup$column_names)
if (length(missing_cols) > 0) {
  # Surface typos instead of letting the override be a silent no-op.
  warning(
    "Columns not found in parse setup: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
setup$column_types[setup$column_names %in% numeric_cols] <- "Numeric"
airlines.hex <- h2o.parseRaw(raw, col.types = setup$column_types)

  |                                                                                                           
  |                                                                                                     |   0%
  |                                                                                                           
  |=====================================================================================================| 100%
# Read the same CSV into an in-memory data.table for local exploration.
airlines <- fread(
  "C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\allyears2k_headers.csv"
)
# Show each column's type and its first few values.
str(airlines)
Classes ‘data.table’ and 'data.frame':  43978 obs. of  31 variables:
 $ Year             : int  1987 1987 1987 1987 1987 1987 1987 1987 1987 1987 ...
 $ Month            : int  10 10 10 10 10 10 10 10 10 10 ...
 $ DayofMonth       : int  14 15 17 18 19 21 22 23 24 25 ...
 $ DayOfWeek        : int  3 4 6 7 1 3 4 5 6 7 ...
 $ DepTime          : int  741 729 741 729 749 728 728 731 744 729 ...
 $ CRSDepTime       : int  730 730 730 730 730 730 730 730 730 730 ...
 $ ArrTime          : int  912 903 918 847 922 848 852 902 908 851 ...
 $ CRSArrTime       : int  849 849 849 849 849 849 849 849 849 849 ...
 $ UniqueCarrier    : chr  "PS" "PS" "PS" "PS" ...
 $ FlightNum        : int  1451 1451 1451 1451 1451 1451 1451 1451 1451 1451 ...
 $ TailNum          : chr  NA NA NA NA ...
 $ ActualElapsedTime: int  91 94 97 78 93 80 84 91 84 82 ...
 $ CRSElapsedTime   : int  79 79 79 79 79 79 79 79 79 79 ...
 $ AirTime          : int  NA NA NA NA NA NA NA NA NA NA ...
 $ ArrDelay         : int  23 14 29 -2 33 -1 3 13 19 2 ...
 $ DepDelay         : int  11 -1 11 -1 19 -2 -2 1 14 -1 ...
 $ Origin           : chr  "SAN" "SAN" "SAN" "SAN" ...
 $ Dest             : chr  "SFO" "SFO" "SFO" "SFO" ...
 $ Distance         : int  447 447 447 447 447 447 447 447 447 447 ...
 $ TaxiIn           : int  NA NA NA NA NA NA NA NA NA NA ...
 $ TaxiOut          : int  NA NA NA NA NA NA NA NA NA NA ...
 $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ CancellationCode : chr  NA NA NA NA ...
 $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ CarrierDelay     : int  NA NA NA NA NA NA NA NA NA NA ...
 $ WeatherDelay     : int  NA NA NA NA NA NA NA NA NA NA ...
 $ NASDelay         : int  NA NA NA NA NA NA NA NA NA NA ...
 $ SecurityDelay    : int  NA NA NA NA NA NA NA NA NA NA ...
 $ LateAircraftDelay: int  NA NA NA NA NA NA NA NA NA NA ...
 $ IsArrDelayed     : chr  "YES" "YES" "YES" "NO" ...
 $ IsDepDelayed     : chr  "YES" "NO" "YES" "NO" ...
 - attr(*, ".internal.selfref")=<externalptr> 
# Store the carrier code as a factor so that summary() tabulates its levels.
airlines$UniqueCarrier <- as.factor(airlines$UniqueCarrier)
# Per-column summary statistics, including NA counts.
summary(airlines)
      Year          Month          DayofMonth     DayOfWeek        DepTime       CRSDepTime      ArrTime    
 Min.   :1987   Min.   : 1.000   Min.   : 1.0   Min.   :1.000   Min.   :   1   Min.   :   0   Min.   :   1  
 1st Qu.:1992   1st Qu.: 1.000   1st Qu.: 6.0   1st Qu.:2.000   1st Qu.: 929   1st Qu.: 910   1st Qu.:1118  
 Median :1998   Median : 1.000   Median :14.0   Median :4.000   Median :1330   Median :1320   Median :1527  
 Mean   :1998   Mean   : 1.409   Mean   :14.6   Mean   :3.821   Mean   :1346   Mean   :1313   Mean   :1505  
 3rd Qu.:2003   3rd Qu.: 1.000   3rd Qu.:23.0   3rd Qu.:5.000   3rd Qu.:1735   3rd Qu.:1720   3rd Qu.:1917  
 Max.   :2008   Max.   :10.000   Max.   :31.0   Max.   :7.000   Max.   :2400   Max.   :2359   Max.   :2400  
                                                                NA's   :1086                  NA's   :1195  
   CRSArrTime   UniqueCarrier     FlightNum        TailNum          ActualElapsedTime CRSElapsedTime
 Min.   :   0   US     :18729   Min.   :   1.0   Length:43978       Min.   : 16.0     Min.   : 17   
 1st Qu.:1109   UA     : 9434   1st Qu.: 204.0   Class :character   1st Qu.: 71.0     1st Qu.: 71   
 Median :1516   WN     : 6170   Median : 557.0   Mode  :character   Median :101.0     Median :102   
 Mean   :1485   HP     : 3451   Mean   : 818.8                      Mean   :124.8     Mean   :125   
 3rd Qu.:1903   PS     : 3212   3rd Qu.:1242.0                      3rd Qu.:151.0     3rd Qu.:151   
 Max.   :2359   DL     :  935   Max.   :3949.0                      Max.   :475.0     Max.   :437   
                (Other): 2047                                       NA's   :1195      NA's   :13    
    AirTime         ArrDelay          DepDelay         Origin              Dest              Distance     
 Min.   : 14.0   Min.   :-63.000   Min.   :-16.00   Length:43978       Length:43978       Min.   :  11.0  
 1st Qu.: 61.0   1st Qu.: -6.000   1st Qu.: -2.00   Class :character   Class :character   1st Qu.: 326.0  
 Median : 91.0   Median :  2.000   Median :  1.00   Mode  :character   Mode  :character   Median : 541.0  
 Mean   :114.3   Mean   :  9.317   Mean   : 10.01                                         Mean   : 730.2  
 3rd Qu.:140.0   3rd Qu.: 14.000   3rd Qu.: 10.00                                         3rd Qu.: 920.0  
 Max.   :402.0   Max.   :475.000   Max.   :473.00                                         Max.   :3365.0  
 NA's   :16649   NA's   :1195      NA's   :1086                                           NA's   :35      
     TaxiIn           TaxiOut         Cancelled       CancellationCode      Diverted         CarrierDelay   
 Min.   :  0.000   Min.   :  0.00   Min.   :0.00000   Length:43978       Min.   :0.000000   Min.   :  0.00  
 1st Qu.:  3.000   1st Qu.:  9.00   1st Qu.:0.00000   Class :character   1st Qu.:0.000000   1st Qu.:  0.00  
 Median :  5.000   Median : 12.00   Median :0.00000   Mode  :character   Median :0.000000   Median :  0.00  
 Mean   :  5.381   Mean   : 14.17   Mean   :0.02469                      Mean   :0.002478   Mean   :  4.05  
 3rd Qu.:  6.000   3rd Qu.: 16.00   3rd Qu.:0.00000                      3rd Qu.:0.000000   3rd Qu.:  0.00  
 Max.   :128.000   Max.   :254.00   Max.   :1.00000                      Max.   :1.000000   Max.   :369.00  
 NA's   :16026     NA's   :16024                                                            NA's   :35045   
  WeatherDelay       NASDelay      SecurityDelay   LateAircraftDelay IsArrDelayed       IsDepDelayed      
 Min.   :  0.00   Min.   :  0.00   Min.   : 0.00   Min.   :  0.00    Length:43978       Length:43978      
 1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.: 0.00   1st Qu.:  0.00    Class :character   Class :character  
 Median :  0.00   Median :  0.00   Median : 0.00   Median :  0.00    Mode  :character   Mode  :character  
 Mean   :  0.29   Mean   :  4.86   Mean   : 0.02   Mean   :  7.62                                         
 3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.: 0.00   3rd Qu.:  0.00                                         
 Max.   :201.00   Max.   :323.00   Max.   :14.00   Max.   :373.00                                         
 NA's   :35045    NA's   :35045    NA's   :35045   NA's   :35045                                          
# Histogram of flight counts by year (ggplot2, default binning).
ggplot(data = airlines, mapping = aes(x = Year)) +
  geom_histogram()

# table(airlines$Year)
# Histogram of flight counts by month (ggplot2, default binning).
ggplot(data = airlines, mapping = aes(x = Month)) +
  geom_histogram()

# table(airlines$Month)
# The same two distributions drawn with base graphics.
hist(airlines$Year)

hist(airlines$Month)

# Upload the local data.table to the H2O cluster as an H2O frame.
airlines.h2o <- as.h2o(x = airlines)

  |                                                                                                           
  |                                                                                                     |   0%
  |                                                                                                           
  |=====================================================================================================| 100%

Create scatter plots by pulling a random sample of the H2O frame into R, plotting it, and overlaying a linear fit

#' Scatter plot of a random sample of an H2O frame, optionally with a linear fit
#'
#' Optionally fits a Gaussian GLM of `y` on `x` inside H2O, then pulls a random
#' sample of the two columns into R and draws them with base graphics.
#'
#' @param data H2O frame that contains the columns named by `x` and `y`.
#' @param x Name of the column plotted on the x axis (the GLM predictor).
#' @param y Name of the column plotted on the y axis (the GLM response).
#' @param max_points Approximate number of rows sampled for plotting.
#' @param fit If `TRUE`, fit the GLM and overlay its line on the plot.
#' @return `NULL`, invisibly; called for its plotting side effect.
scatter_plot <- function(data, x, y, max_points = 1000, fit = TRUE) {
  coeff <- NULL
  if (fit) {
    lr <- h2o.glm(x = x, y = y, training_frame = data, family = "gaussian")
    coeff <- lr@model$coefficients_table$coefficients
  }

  df <- data[, c(x, y)]
  # One uniform draw per row: keeping the rows whose draw falls below
  # max_points / nrow(data) retains roughly max_points rows. The local is not
  # called `runif`, so stats::runif is no longer shadowed.
  row_draw <- h2o.runif(df)
  df.subset <- df[row_draw < max_points / nrow(data), ]
  df.R <- as.data.frame(df.subset)

  # The sample and the model now live in R; drop them from the cluster.
  h2o.rm(df.subset)
  if (fit) h2o.rm(lr@model_id)

  plot(x = df.R[, x], y = df.R[, y], col = "blue", xlab = x, ylab = y)
  if (fit) {
    if (length(coeff) == 2L) {
      abline(coef = coeff, col = "black")
    } else {
      # abline() only uses the first two coefficients, so a model with any
      # other number of terms (e.g. a categorical predictor) would draw a
      # misleading line or none at all.
      warning(
        "GLM returned ", length(coeff),
        " coefficients; expected 2. No fit line drawn.",
        call. = FALSE
      )
    }
  }
  invisible(NULL)
}
# Air time against distance, with the fitted line overlaid. `TRUE` is spelled
# out because `T` is an ordinary variable that can be reassigned.
scatter_plot(data = airlines.h2o, x = "Distance", y = "AirTime", fit = TRUE)

  |                                                                                                           
  |                                                                                                     |   0%
  |                                                                                                           
  |=====================================================================================================| 100%

# lr<-h2o.glm(x="Distance",y="AirTime",training_frame = airlines.h2o,family="gaussian")
# Arrival delay per carrier on a larger sample; no regression line is drawn.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
scatter_plot(
  data = airlines.h2o,
  x = "UniqueCarrier",
  y = "ArrDelay",
  max_points = 5000,
  fit = FALSE
)

# lr<-h2o.glm(x="UniqueCarrier",y="ArrDelay",training_frame = airlines.h2o,family="gaussian")
# lr<-h2o.glm(x="Distance",y="AirTime",training_frame =airlines.h2o,family="gaussian")

Flight by Month calculated using H2O’s fast groupby

# Status message printed before the per-month aggregation below.
print("Splitting data into group of 12 month and aggregating on two columns...")
[1] "Splitting data into group of 12 month and aggregating on two columns..."
# Group the H2O frame by Month and compute, per group, a row count and the
# sum of the Cancelled column.
flightBymonth <- h2o.group_by(
  data = airlines.h2o,
  by = "Month",
  nrow("Month"),
  sum("Cancelled")
)
# Bring the aggregate back into R as a regular data.frame.
flightBymonth.R <- as.data.frame(flightBymonth)

Set Column Type for Enumerator or Factor Columns

# Convert each of these columns of the H2O frame to a factor (enum), in the
# same order as before.
for (factor_col in c(
  "Year", "Month", "DayOfWeek", "Cancelled",
  "FlightNum", "Origin", "Dest", "IsDepDelayed"
)) {
  airlines.h2o[[factor_col]] <- as.factor(airlines.h2o[[factor_col]])
}

Parameter Creation

# Scheduled arrival as minutes after midnight: the code splits CRSArrTime into
# an hour part (integer division by 100) and a minute part (remainder).
hour1 <- airlines.h2o[, "CRSArrTime"] %/% 100
mins1 <- airlines.h2o[, "CRSArrTime"] %% 100
arrTime <- hour1 * 60 + mins1

# Scheduled departure, converted the same way.
hour2 <- airlines.h2o[, "CRSDepTime"] %/% 100
mins2 <- airlines.h2o[, "CRSDepTime"] %% 100
depTime <- hour2 * 60 + mins2

# Scheduled travel time in minutes; non-positive differences become NA.
elapsed <- arrTime - depTime
travelTime <- ifelse(elapsed > 0, elapsed, NA)
airlines.h2o$TravelTime <- travelTime

# Plot the new feature against distance (fit line drawn by default).
scatter_plot(data = airlines.h2o, x = "Distance", y = "TravelTime")

  |                                                                                                           
  |                                                                                                     |   0%
  |                                                                                                           
  |=====================================================================================================| 100%

Imputation: you can also choose to impute missing values with the mean of each subset (here, the mean Distance of each Origin/Dest pair)

# Impute missing Distance values using groups defined by Origin and Dest. The
# printed result lists one mean_Distance per (Origin, Dest) group.
# NOTE(review): h2o.impute presumably fills the NAs of airlines.h2o in place
# (the return value is not assigned) -- confirm against the H2O documentation.
h2o.impute(data=airlines.h2o, column="Distance", by=c("Origin", "Dest"))
  Origin Dest mean_Distance
1    ABE  CLT           481
2    ABE  PIT           253
3    ABQ  AMA           277
4    ABQ  BWI          1670
5    ABQ  DAL           580
6    ABQ  DEN           349

[1517 rows x 3 columns] 
# Re-draw travel time against distance after the imputation step.
scatter_plot(data = airlines.h2o, x = "Distance", y = "TravelTime")

  |                                                                                                           
  |                                                                                                     |   0%
  |                                                                                                           
  |=====================================================================================================| 100%

create test/train split

# Randomly split the frame into a training part (about 80% of the rows) and a
# test part (the remainder).
# `ratios` is written out in full: the call used to pass `ratio`, which only
# worked through partial argument matching. A fixed seed makes the split, and
# therefore every model metric below, reproducible between runs.
data.split <- h2o.splitFrame(data = airlines.h2o, ratios = 0.8, seed = 1234)
data.train <- data.split[[1]]
data.test <- data.split[[2]]

set the predictor names and the response column name

Build GLM

# Response and predictor column names. They were used here without ever being
# defined earlier in the script, so a top-to-bottom run failed at this call.
# The response is the binary departure-delay flag, matching family =
# "binomial". The predictors are the same set the later model sections assign.
# NOTE(review): confirm this predictor set is the one intended for the GLM.
response <- "IsDepDelayed"
predictors <- c("Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance")

# Time the model build only.
start <- Sys.time()
data.glm <- h2o.glm(
  y = response,
  x = predictors,
  training_frame = data.train,
  validation_frame = data.test,
  family = "binomial",
  standardize = TRUE,
  model_id = "glm_model",
  alpha = 0.5,
  lambda = 1e-05
)

  |                                                                                                           
  |                                                                                                     |   0%
  |                                                                                                           
  |==                                                                                                   |   2%
  |                                                                                                           
  |=====================================================================================================| 100%
# Elapsed wall-clock time since `start`, reported together with its units.
glm_time <- Sys.time() - start
glm_msg <- paste(
  "Took",
  round(glm_time, digits = 2),
  units(glm_time),
  "to build logistic regression model."
)
print(glm_msg)
[1] "Took 1.62 secs to build logistic regression model."

Build GBM model

# Predictor columns for the GBM. Assignment uses `<-`, and logical arguments
# use `TRUE` because `T` is an ordinary variable that can be reassigned.
predictors <- c("Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance")
start <- Sys.time()
#for balance classes https://h2o-release.s3.amazonaws.com/h2o/rel-ueno/2/docs-website/h2o-docs/data-science/algo-params/balance_classes.html
data.gbm <- h2o.gbm(
  y = response,
  x = predictors,
  balance_classes = TRUE,
  training_frame = data.train,
  validation_frame = data.test,
  ntrees = 100,
  max_depth = 5,
  model_id = "gbm_model",
  distribution = "bernoulli",
  learn_rate = .1,
  min_rows = 2
)

  |                                                                                                           
  |                                                                                                     |   0%
  |                                                                                                           
  |=                                                                                                    |   1%
  |                                                                                                           
  |===============                                                                                      |  15%
  |                                                                                                           
  |=====================                                                                                |  21%
  |                                                                                                           
  |=========================                                                                            |  25%
  |                                                                                                           
  |=====================================                                                                |  37%
  |                                                                                                           
  |=====================================================================================================| 100%
# Elapsed wall-clock time since `start`, reported together with its units.
gbm_time <- Sys.time() - start
gbm_msg <- paste(
  "Took",
  round(gbm_time, digits = 2),
  units(gbm_time),
  "to build a GBM model."
)
print(gbm_msg)
[1] "Took 6.97 secs to build a GBM model."

Build random forest model

# Predictor columns for the H2O random forest. Assignment uses `<-`, and the
# logical argument uses `TRUE` because `T` is an ordinary variable that can be
# reassigned.
predictors <- c("Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance")
start <- Sys.time()
data.drf <- h2o.randomForest(
  y = response,
  x = predictors,
  training_frame = data.train,
  validation_frame = data.test,
  ntrees = 1500,
  max_depth = 5,
  model_id = "drf_model",
  balance_classes = TRUE
)

  |                                                                                                           
  |                                                                                                     |   0%
  |                                                                                                           
  |=                                                                                                    |   1%
  |                                                                                                           
  |==                                                                                                   |   2%
  |                                                                                                           
  |==================                                                                                   |  17%
  |                                                                                                           
  |==================================                                                                   |  34%
  |                                                                                                           
  |==================================================                                                   |  50%
  |                                                                                                           
  |=============================================================                                        |  60%
  |                                                                                                           
  |==========================================================================                           |  74%
  |                                                                                                           
  |===========================================================================================          |  90%
  |                                                                                                           
  |=====================================================================================================| 100%
# Elapsed wall-clock time since `start`, reported together with its units.
drf_time <- Sys.time() - start
drf_msg <- paste(
  "Took",
  round(drf_time, digits = 2),
  units(drf_time),
  "to build a Random Forest model."
)
print(drf_msg)
[1] "Took 29.27 secs to build a Random Forest model."

Comparing running time with the randomForest package

Note: the randomForest package cannot handle categorical predictors with more than 53 categories

library(randomForest)
library(caret)
# Row indices of a 70% training sample drawn with caret, based on the response.
# NOTE(review): the H2O models above were trained on an ~80% split, so the
# timing comparison is not on equally sized training sets -- confirm intent.
trainIndex <- createDataPartition(
  airlines$IsDepDelayed,
  p = 0.7,
  list = FALSE,
  times = 1
)
#https://www.rdocumentation.org/packages/caret/versions/6.0-76/topics/createDataPartition

# Convert the same columns to factors as was done for the H2O frame.
airlines$Year <- as.factor(airlines$Year)
airlines$Month <- as.factor(airlines$Month)
airlines$DayOfWeek <- as.factor(airlines$DayOfWeek)
airlines$Cancelled <- as.factor(airlines$Cancelled)
airlines$FlightNum <- as.factor(airlines$FlightNum)
airlines$Origin <- as.factor(airlines$Origin)
airlines$Dest <- as.factor(airlines$Dest)
airlines$IsDepDelayed <- as.factor(airlines$IsDepDelayed)

# Split into training and test sets and convert them to plain data.frames.
training <- as.data.frame(airlines[trainIndex, ])
testing <- as.data.frame(airlines[-trainIndex, ])

# Time the randomForest build only. Rows with missing values are dropped
# (na.action = na.omit).
start <- Sys.time()
fit <- randomForest(
  IsDepDelayed ~ Year + UniqueCarrier + DayOfWeek + Month + Distance,
  data = training,
  importance = TRUE,
  ntree = 1500,
  na.action = na.omit
)
# Keep this timing in its own variable. It used to be written to `drf_time`,
# which overwrote the H2O random forest timing this section compares against.
rf_time <- Sys.time() - start
print(paste(
  "Took",
  round(rf_time, digits = 2),
  units(rf_time),
  "to build a Random Forest model under randomForest package."
))
[1] "Took 44.74 secs to build a Random Forest model under randomForest package."

build deep learning model

# Time the deep learning build only.
start <- Sys.time()
# Two hidden layers of 10 units each, trained for 5 epochs. Logical arguments
# use `TRUE` because `T` is an ordinary variable that can be reassigned.
data.dl <- h2o.deeplearning(
  y = response,
  x = predictors,
  training_frame = data.train,
  validation_frame = data.test,
  hidden = c(10, 10),
  epochs = 5,
  balance_classes = TRUE,
  loss = "Automatic",
  variable_importances = TRUE
)
dl_time <- Sys.time() - start
print(paste(
  "Took",
  round(dl_time, digits = 2),
  units(dl_time),
  "to build a Deep Learning model."
))

Variable Importance - For feature selection and rerunning a model build

# Print, for each fitted model, a heading followed by the importance table
# stored in the model object's `model` slot.

# GLM: the standardized coefficient magnitudes table.
print("GLM: Sorted Standardized Coefficient Magnitudes To Find Nonzero Coefficients")
data.glm@model$standardized_coefficient_magnitudes
# GBM: the variable importances table.
print("GBM: Variable Importance")
data.gbm@model$variable_importances
# Random forest (H2O): the variable importances table.
print("Random Forest: Variable Importance")
data.drf@model$variable_importances
# Deep learning: the variable importances table (the model was built with
# variable_importances enabled).
print("Deep Learning: Variable Importance")
data.dl@model$variable_importances
