load library and start h2o
# Packages used throughout: h2o (cluster + models), data.table (fread),
# ggplot2 (exploratory plots).
library(h2o)
library(data.table)
library(ggplot2)
# Connect R to an H2O cluster with default settings (the recorded session
# below shows a local single-node cluster on localhost:54321).
h2o.init()
Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 2 hours 18 minutes
H2O cluster version: 3.10.5.3
H2O cluster version age: 2 months and 25 days
H2O cluster name: H2O_started_from_R_r631758_qgy865
H2O cluster total nodes: 1
H2O cluster total memory: 3.19 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 54321
H2O Connection proxy: NA
H2O Internal Security: FALSE
R Version: R version 3.4.1 (2017-06-30)
# airlines<-h2o.importFile("C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\allyears2k_headers.csv")
# Import the file without parsing so the guessed column types can be
# overridden before the actual parse.
raw <- h2o.importFile(
  path = "C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\allyears2k_headers.csv",
  parse = FALSE
)
setup <- h2o.parseSetup(raw)
# Columns forced to numeric. The second name used to be "AirDelay", which
# matches no column in this file (the delay columns are ArrDelay and
# DepDelay), so that override silently did nothing.
# NOTE(review): ArrDelay is assumed to be the intended column -- confirm.
numeric_cols <- c("AirTime", "ArrDelay")
# Fail loudly on a misspelt name instead of skipping the override.
stopifnot(all(numeric_cols %in% setup$column_names))
setup$column_types[setup$column_names %in% numeric_cols] <- "Numeric"
airlines.hex <- h2o.parseRaw(raw, col.types = setup$column_types)
|
| | 0%
|
|=====================================================================================================| 100%
# Read the same CSV into a local data.table for plain-R exploration, then
# inspect its column types.
airlines <- fread(
  input = "C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\allyears2k_headers.csv"
)
str(airlines)
Classes ‘data.table’ and 'data.frame': 43978 obs. of 31 variables:
$ Year : int 1987 1987 1987 1987 1987 1987 1987 1987 1987 1987 ...
$ Month : int 10 10 10 10 10 10 10 10 10 10 ...
$ DayofMonth : int 14 15 17 18 19 21 22 23 24 25 ...
$ DayOfWeek : int 3 4 6 7 1 3 4 5 6 7 ...
$ DepTime : int 741 729 741 729 749 728 728 731 744 729 ...
$ CRSDepTime : int 730 730 730 730 730 730 730 730 730 730 ...
$ ArrTime : int 912 903 918 847 922 848 852 902 908 851 ...
$ CRSArrTime : int 849 849 849 849 849 849 849 849 849 849 ...
$ UniqueCarrier : chr "PS" "PS" "PS" "PS" ...
$ FlightNum : int 1451 1451 1451 1451 1451 1451 1451 1451 1451 1451 ...
$ TailNum : chr NA NA NA NA ...
$ ActualElapsedTime: int 91 94 97 78 93 80 84 91 84 82 ...
$ CRSElapsedTime : int 79 79 79 79 79 79 79 79 79 79 ...
$ AirTime : int NA NA NA NA NA NA NA NA NA NA ...
$ ArrDelay : int 23 14 29 -2 33 -1 3 13 19 2 ...
$ DepDelay : int 11 -1 11 -1 19 -2 -2 1 14 -1 ...
$ Origin : chr "SAN" "SAN" "SAN" "SAN" ...
$ Dest : chr "SFO" "SFO" "SFO" "SFO" ...
$ Distance : int 447 447 447 447 447 447 447 447 447 447 ...
$ TaxiIn : int NA NA NA NA NA NA NA NA NA NA ...
$ TaxiOut : int NA NA NA NA NA NA NA NA NA NA ...
$ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
$ CancellationCode : chr NA NA NA NA ...
$ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
$ CarrierDelay : int NA NA NA NA NA NA NA NA NA NA ...
$ WeatherDelay : int NA NA NA NA NA NA NA NA NA NA ...
$ NASDelay : int NA NA NA NA NA NA NA NA NA NA ...
$ SecurityDelay : int NA NA NA NA NA NA NA NA NA NA ...
$ LateAircraftDelay: int NA NA NA NA NA NA NA NA NA NA ...
$ IsArrDelayed : chr "YES" "YES" "YES" "NO" ...
$ IsDepDelayed : chr "YES" "NO" "YES" "NO" ...
- attr(*, ".internal.selfref")=<externalptr>
# Treat the carrier code as categorical so summary() tabulates it instead of
# reporting it as a character column.
airlines$UniqueCarrier <- as.factor(airlines$UniqueCarrier)
summary(airlines)
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime
Min. :1987 Min. : 1.000 Min. : 1.0 Min. :1.000 Min. : 1 Min. : 0 Min. : 1
1st Qu.:1992 1st Qu.: 1.000 1st Qu.: 6.0 1st Qu.:2.000 1st Qu.: 929 1st Qu.: 910 1st Qu.:1118
Median :1998 Median : 1.000 Median :14.0 Median :4.000 Median :1330 Median :1320 Median :1527
Mean :1998 Mean : 1.409 Mean :14.6 Mean :3.821 Mean :1346 Mean :1313 Mean :1505
3rd Qu.:2003 3rd Qu.: 1.000 3rd Qu.:23.0 3rd Qu.:5.000 3rd Qu.:1735 3rd Qu.:1720 3rd Qu.:1917
Max. :2008 Max. :10.000 Max. :31.0 Max. :7.000 Max. :2400 Max. :2359 Max. :2400
NA's :1086 NA's :1195
CRSArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime
Min. : 0 US :18729 Min. : 1.0 Length:43978 Min. : 16.0 Min. : 17
1st Qu.:1109 UA : 9434 1st Qu.: 204.0 Class :character 1st Qu.: 71.0 1st Qu.: 71
Median :1516 WN : 6170 Median : 557.0 Mode :character Median :101.0 Median :102
Mean :1485 HP : 3451 Mean : 818.8 Mean :124.8 Mean :125
3rd Qu.:1903 PS : 3212 3rd Qu.:1242.0 3rd Qu.:151.0 3rd Qu.:151
Max. :2359 DL : 935 Max. :3949.0 Max. :475.0 Max. :437
(Other): 2047 NA's :1195 NA's :13
AirTime ArrDelay DepDelay Origin Dest Distance
Min. : 14.0 Min. :-63.000 Min. :-16.00 Length:43978 Length:43978 Min. : 11.0
1st Qu.: 61.0 1st Qu.: -6.000 1st Qu.: -2.00 Class :character Class :character 1st Qu.: 326.0
Median : 91.0 Median : 2.000 Median : 1.00 Mode :character Mode :character Median : 541.0
Mean :114.3 Mean : 9.317 Mean : 10.01 Mean : 730.2
3rd Qu.:140.0 3rd Qu.: 14.000 3rd Qu.: 10.00 3rd Qu.: 920.0
Max. :402.0 Max. :475.000 Max. :473.00 Max. :3365.0
NA's :16649 NA's :1195 NA's :1086 NA's :35
TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay
Min. : 0.000 Min. : 0.00 Min. :0.00000 Length:43978 Min. :0.000000 Min. : 0.00
1st Qu.: 3.000 1st Qu.: 9.00 1st Qu.:0.00000 Class :character 1st Qu.:0.000000 1st Qu.: 0.00
Median : 5.000 Median : 12.00 Median :0.00000 Mode :character Median :0.000000 Median : 0.00
Mean : 5.381 Mean : 14.17 Mean :0.02469 Mean :0.002478 Mean : 4.05
3rd Qu.: 6.000 3rd Qu.: 16.00 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.: 0.00
Max. :128.000 Max. :254.00 Max. :1.00000 Max. :1.000000 Max. :369.00
NA's :16026 NA's :16024 NA's :35045
WeatherDelay NASDelay SecurityDelay LateAircraftDelay IsArrDelayed IsDepDelayed
Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00 Length:43978 Length:43978
1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 Class :character Class :character
Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.00 Mode :character Mode :character
Mean : 0.29 Mean : 4.86 Mean : 0.02 Mean : 7.62
3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00
Max. :201.00 Max. :323.00 Max. :14.00 Max. :373.00
NA's :35045 NA's :35045 NA's :35045 NA's :35045
# Distribution of flights by year and by month, drawn with ggplot2.
ggplot(data = airlines, mapping = aes(x = Year)) +
  geom_histogram()

# table(airlines$Year)
ggplot(data = airlines, mapping = aes(x = Month)) +
  geom_histogram()

# table(airlines$Month)
# The same two distributions with base graphics.
hist(airlines$Year)

hist(airlines$Month)

# Convert the local table into an H2O frame for the modelling steps.
airlines.h2o <- as.h2o(airlines)
|
| | 0%
|
|=====================================================================================================| 100%
Create scatter plots by pulling a random sample into R and overlaying a linear fit
# Scatter-plot a random sample of two columns of an H2O frame, optionally
# overlaying a Gaussian GLM fit of `y` on `x`.
#
# Args:
#   data:       H2OFrame holding both columns.
#   x, y:       column names (character) for the horizontal / vertical axis.
#   max_points: approximate number of rows pulled into R; each row is kept
#               when its uniform draw is below max_points / nrow(data).
#   fit:        if TRUE, fit y ~ x with h2o.glm() on the full frame and draw
#               the fitted line on top of the sample.
#   seed:       forwarded to h2o.runif(); the default (-1) keeps the sample
#               different on every call, exactly as before.
#
# Returns NULL invisibly; the function is called for the plot it draws.
scatter_plot <- function(data, x, y, max_points = 1000, fit = TRUE,
                         seed = -1) {
  coeff <- NULL
  if (fit) {
    lr <- h2o.glm(x = x, y = y, training_frame = data, family = "gaussian")
    coeff <- lr@model$coefficients_table$coefficients
    # Only the coefficients are needed; drop the model from the cluster.
    h2o.rm(lr@model_id)
  }

  df <- data[, c(x, y)]
  # Named `draws` rather than `runif` so stats::runif is not shadowed.
  draws <- h2o.runif(df, seed = seed)
  df.subset <- df[draws < max_points / nrow(data), ]
  df.R <- as.data.frame(df.subset)
  h2o.rm(df.subset)

  plot(x = df.R[, x], y = df.R[, y], col = "blue", xlab = x, ylab = y)

  if (fit) {
    # abline(coef = ) expects exactly c(intercept, slope). A categorical `x`
    # yields one coefficient per level, which is not a single line.
    # NOTE(review): assumes the intercept is the first table row -- confirm.
    if (length(coeff) == 2L) {
      abline(coef = coeff, col = "black")
    } else {
      warning("Fit line skipped: expected 2 coefficients, got ",
              length(coeff), call. = FALSE)
    }
  }
  invisible(NULL)
}
# Air time against distance, with the linear fit drawn.
scatter_plot(
  data = airlines.h2o,
  x = "Distance",
  y = "AirTime",
  fit = TRUE
)
|
| | 0%
|
|=====================================================================================================| 100%

# lr<-h2o.glm(x="Distance",y="AirTime",training_frame = airlines.h2o,family="gaussian")
# Arrival delay per carrier; no fit line because the x column is categorical.
scatter_plot(
  data = airlines.h2o,
  x = "UniqueCarrier",
  y = "ArrDelay",
  max_points = 5000,
  fit = FALSE
)

# lr<-h2o.glm(x="UniqueCarrier",y="ArrDelay",training_frame = airlines.h2o,family="gaussian")
# lr<-h2o.glm(x="Distance",y="AirTime",training_frame =airlines.h2o,family="gaussian")
Flight by Month calculated using H2O’s fast groupby
# Status message printed before the group-by aggregation runs.
print("Splitting data into group of 12 month and aggregating on two columns...")
[1] "Splitting data into group of 12 month and aggregating on two columns..."
# Per-month row count and sum of the Cancelled column, computed on the
# cluster, then pulled into a local data.frame.
flightBymonth <- h2o.group_by(
  data = airlines.h2o,
  by = "Month",
  nrow("Month"),
  sum("Cancelled")
)
flightBymonth.R <- as.data.frame(flightBymonth)
Set Column Type for Enumerator or Factor Columns
# Calendar fields: convert to categorical on the H2O frame.
airlines.h2o$Year      <- as.factor(airlines.h2o$Year)
airlines.h2o$Month     <- as.factor(airlines.h2o$Month)
airlines.h2o$DayOfWeek <- as.factor(airlines.h2o$DayOfWeek)

# Flight identifiers and status flags.
airlines.h2o$Cancelled <- as.factor(airlines.h2o$Cancelled)
airlines.h2o$FlightNum <- as.factor(airlines.h2o$FlightNum)

# Airports.
airlines.h2o$Origin <- as.factor(airlines.h2o$Origin)
airlines.h2o$Dest   <- as.factor(airlines.h2o$Dest)

# Departure-delay flag.
airlines.h2o$IsDepDelayed <- as.factor(airlines.h2o$IsDepDelayed)
Parameter Creation
# Scheduled times are stored as HHMM integers; convert each to minutes
# since midnight.
hour1 <- airlines.h2o$CRSArrTime %/% 100
mins1 <- airlines.h2o$CRSArrTime %% 100
arrTime <- hour1 * 60 + mins1

hour2 <- airlines.h2o$CRSDepTime %/% 100
mins2 <- airlines.h2o$CRSDepTime %% 100
depTime <- hour2 * 60 + mins2

# Scheduled travel time in minutes; non-positive differences become NA.
travelTime <- ifelse(
  arrTime - depTime > 0,
  arrTime - depTime,
  NA
)
airlines.h2o$TravelTime <- travelTime

scatter_plot(airlines.h2o, "Distance", "TravelTime")
|
| | 0%
|
|=====================================================================================================| 100%

Imputation: you can also choose to impute missing values by taking the mean of subsets
# Impute Distance, grouping by the Origin and Dest columns.
h2o.impute(
  data = airlines.h2o,
  column = "Distance",
  by = c("Origin", "Dest")
)
Origin Dest mean_Distance
1 ABE CLT 481
2 ABE PIT 253
3 ABQ AMA 277
4 ABQ BWI 1670
5 ABQ DAL 580
6 ABQ DEN 349
[1517 rows x 3 columns]
# Re-draw travel time against distance after the imputation step.
scatter_plot(data = airlines.h2o, x = "Distance", y = "TravelTime")
|
| | 0%
|
|=====================================================================================================| 100%

create test/train split
# 80/20 train/test split. The argument is spelled `ratios` in full (the
# original `ratio` only worked through partial argument matching), and a
# fixed seed makes the split -- and therefore the model comparison below --
# reproducible across runs.
data.split <- h2o.splitFrame(data = airlines.h2o, ratios = 0.8, seed = 1234)
data.train <- data.split[[1]]
data.test <- data.split[[2]]
set the predictor names and the response column name
Build GLM
# Response and predictor columns. These were used below without ever being
# assigned in this script (`predictors` only appears after the GLM call), so
# a fresh session failed with "object 'response' not found".
# NOTE(review): the response is taken from the randomForest comparison
# formula and the predictor set from the later model sections -- confirm
# the GLM was meant to use the same columns.
response <- "IsDepDelayed"
predictors <- c("Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance")

# Elastic-net logistic regression, timed from just before the call.
start <- Sys.time()
data.glm <- h2o.glm(
  y = response,
  x = predictors,
  training_frame = data.train,
  validation_frame = data.test,
  family = "binomial",
  standardize = TRUE,
  model_id = "glm_model",
  alpha = 0.5,
  lambda = 1e-05
)
|
| | 0%
|
|== | 2%
|
|=====================================================================================================| 100%
# Elapsed time since `start`, reported with its unit.
glm_time <- Sys.time() - start
print(
  paste(
    "Took",
    round(glm_time, digits = 2),
    units(glm_time),
    "to build logistic regression model."
  )
)
[1] "Took 1.62 secs to build logistic regression model."
Build GBM model
predictors <- c("Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance")
start <- Sys.time()
# For balance_classes see:
# https://h2o-release.s3.amazonaws.com/h2o/rel-ueno/2/docs-website/h2o-docs/data-science/algo-params/balance_classes.html
data.gbm <- h2o.gbm(
  y = response,
  x = predictors,
  balance_classes = TRUE,
  training_frame = data.train,
  validation_frame = data.test,
  ntrees = 100,
  max_depth = 5,
  model_id = "gbm_model",
  distribution = "bernoulli",
  learn_rate = .1,
  min_rows = 2
)
|
| | 0%
|
|= | 1%
|
|=============== | 15%
|
|===================== | 21%
|
|========================= | 25%
|
|===================================== | 37%
|
|=====================================================================================================| 100%
# Elapsed time since `start`, reported with its unit.
gbm_time <- Sys.time() - start
print(
  paste(
    "Took",
    round(gbm_time, digits = 2),
    units(gbm_time),
    "to build a GBM model."
  )
)
[1] "Took 6.97 secs to build a GBM model."
Build random forest model
predictors <- c("Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance")
start <- Sys.time()
# Distributed random forest on the same train/validation frames.
data.drf <- h2o.randomForest(
  y = response,
  x = predictors,
  training_frame = data.train,
  validation_frame = data.test,
  ntrees = 1500,
  max_depth = 5,
  model_id = "drf_model",
  balance_classes = TRUE
)
|
| | 0%
|
|= | 1%
|
|== | 2%
|
|================== | 17%
|
|================================== | 34%
|
|================================================== | 50%
|
|============================================================= | 60%
|
|========================================================================== | 74%
|
|=========================================================================================== | 90%
|
|=====================================================================================================| 100%
# Elapsed time since `start`, reported with its unit.
drf_time <- Sys.time() - start
print(
  paste(
    "Took",
    round(drf_time, digits = 2),
    units(drf_time),
    "to build a Random Forest model."
  )
)
[1] "Took 29.27 secs to build a Random Forest model."
Comparing running time with the randomForest package
(randomForest cannot handle categorical predictors with more than 53 categories)
library(randomForest)
library(caret)

# 70% of the rows go to training, partitioned on the IsDepDelayed column.
trainIndex <- createDataPartition(
  airlines$IsDepDelayed,
  p = 0.7,
  list = FALSE,
  times = 1
)
#https://www.rdocumentation.org/packages/caret/versions/6.0-76/topics/createDataPartition

# Convert the same columns to factors on the local table as was done on the
# H2O frame.
airlines$Year         <- as.factor(airlines$Year)
airlines$Month        <- as.factor(airlines$Month)
airlines$DayOfWeek    <- as.factor(airlines$DayOfWeek)
airlines$Cancelled    <- as.factor(airlines$Cancelled)
airlines$FlightNum    <- as.factor(airlines$FlightNum)
airlines$Origin       <- as.factor(airlines$Origin)
airlines$Dest         <- as.factor(airlines$Dest)
airlines$IsDepDelayed <- as.factor(airlines$IsDepDelayed)

# Split, then convert both parts to plain data.frames.
training <- airlines[trainIndex, ]
testing <- airlines[-trainIndex, ]
training <- as.data.frame(training)
testing <- as.data.frame(testing)

# Time a randomForest fit with the same tree count as the H2O forest.
start <- Sys.time()
fit <- randomForest(
  IsDepDelayed ~ Year + UniqueCarrier + DayOfWeek + Month + Distance,
  data = training,
  importance = TRUE,
  ntree = 1500,
  na.action = na.omit
)
drf_time <- Sys.time() - start
print(
  paste(
    "Took",
    round(drf_time, digits = 2),
    units(drf_time),
    "to build a Random Forest model under randomForest package."
  )
)
[1] "Took 44.74 secs to build a Random Forest model under randomForest package."
build deep learning model
# Two hidden layers of 10 units each, 5 epochs, variable importances enabled;
# timed like the other models.
start <- Sys.time()
data.dl <- h2o.deeplearning(
  y = response,
  x = predictors,
  training_frame = data.train,
  validation_frame = data.test,
  hidden = c(10, 10),
  epochs = 5,
  balance_classes = TRUE,
  loss = "Automatic",
  variable_importances = TRUE
)
dl_time <- Sys.time() - start
print(
  paste(
    "Took",
    round(dl_time, digits = 2),
    units(dl_time),
    "to build a Deep Learning model."
  )
)
Variable Importance - For feature selection and rerunning a model build
# Show each model's importance table, using exact-name `[[` extraction
# from the model slot.
print("GLM: Sorted Standardized Coefficient Magnitudes To Find Nonzero Coefficients")
data.glm@model[["standardized_coefficient_magnitudes"]]

print("GBM: Variable Importance")
data.gbm@model[["variable_importances"]]

print("Random Forest: Variable Importance")
data.drf@model[["variable_importances"]]

print("Deep Learning: Variable Importance")
data.dl@model[["variable_importances"]]
