Data Cleaning/Preparation:

By far the most challenging part of the project, cleaning the data consisted of these steps:

Access Seasonal Player Data from API
Select appropriate fields for main dataframe (i.e. GoalsScored, GamesPlayed, etc.)
Access Salary Data from API
Merge datasets on playerID
Discard NA cells and duplicate instances.
Repeat steps 1-5 for a different NHL season to try and increase dataset size.

authenticate('2ef2faaa-ba07-4c49-8320-8d8656', 'MYSPORTSFEEDS', type = "basic")

## <request>
## Options:
## * httpauth: 1
## * userpwd: 2ef2faaa-ba07-4c49-8320-8d8656:MYSPORTSFEEDS

#####2016

players2016 = GET("https://api.mysportsfeeds.com/v2.1/pull/nhl/2015-2016-regular/player_stats_totals.csv", 
                  authenticate("2ef2faaa-ba07-4c49-8320-8d8656", "MYSPORTSFEEDS"))

#str(players2016, max.level = 1)
players2016_content = content(players2016)
#str(players2016_content, max.level = 1, nchar.max = 17)
players2016_df = as.data.frame(players2016_content)
players2016_df = players2016_df %>%
  select("#Player ID",
         "#LastName",
         "#FirstName",
         "#Team Name",
         "#Position",
         "#GamesPlayed",
         "#Goals",
         "#Assists",
         "#Points",
         "#PowerplayPoints",
         "#ShorthandedPoints",
         "#GameWinningGoals",
         "#PlusMinus",
         "#Shots",
         "#Hits",
         "#PenaltyMinutes")

salary2016 = GET("https://api.mysportsfeeds.com/v2.1/pull/nhl/2015-2016-regular/date/20160215/dfs.csv", 
                 authenticate("2ef2faaa-ba07-4c49-8320-8d8656", "MYSPORTSFEEDS"))
#str(salary2016, max.level = 1)
salary2016_content = content(salary2016)
#str(salary2016_content, max.level = 1, nchar.max = 17)
salary2016_df = as.data.frame(salary2016_content) 
salary2016_df = salary2016_df %>%
  select("#Player ID", "#Salary")

df2016 = merge(x = players2016_df, y = salary2016_df, by = "#Player ID", all.x = TRUE)
df2016 = na.omit(df2016)

##### 2017
players2017 = GET("https://api.mysportsfeeds.com/v2.1/pull/nhl/2016-2017-regular/player_stats_totals.csv", 
                  authenticate("2ef2faaa-ba07-4c49-8320-8d8656", "MYSPORTSFEEDS"))
#str(players2017, max.level = 1)
players2017_content = content(players2017)
#str(players2017_content, max.level = 1, nchar.max = 17)
players2017_df = as.data.frame(players2017_content)
players2017_df = players2017_df %>%
  select("#Player ID",
         "#LastName",
         "#FirstName",
         "#Team Name",
         "#Position",
         "#GamesPlayed",
         "#Goals",
         "#Assists",
         "#Points",
         "#PowerplayPoints",
         "#ShorthandedPoints",
         "#GameWinningGoals",
         "#PlusMinus",
         "#Shots",
         "#Hits",
         "#PenaltyMinutes")

salary2017 = GET("https://api.mysportsfeeds.com/v2.1/pull/nhl/2016-2017-regular/date/20161123/dfs.csv", 
                 authenticate("2ef2faaa-ba07-4c49-8320-8d8656", "MYSPORTSFEEDS"))
#str(salary2017, max.level = 1)
salary2017_content = content(salary2017)
#str(salary2017_content, max.level = 1, nchar.max = 17)
salary2017_df = as.data.frame(salary2017_content)
salary2017_df = salary2017_df %>%
  select("#Player ID", "#Salary")
df2017 = merge(x = players2017_df, y = salary2017_df, by = "#Player ID", all.x = TRUE)
df2017 = na.omit(df2017)


##### 2018
players2018 = GET("https://api.mysportsfeeds.com/v2.1/pull/nhl/2017-2018-regular/player_stats_totals.csv", 
                  authenticate("2ef2faaa-ba07-4c49-8320-8d8656", "MYSPORTSFEEDS"))

#str(players2018, max.level = 1)
players2018_content = content(players2018)
#str(players2018_content, max.level = 1, nchar.max = 17)
players2018_df = as.data.frame(players2018_content)
players2018_df = players2018_df %>%
  select("#Player ID",
         "#LastName",
         "#FirstName",
         "#Team Name",
         "#Position",
         "#GamesPlayed",
         "#Goals",
         "#Assists",
         "#Points",
         "#PowerplayPoints",
         "#ShorthandedPoints",
         "#GameWinningGoals",
         "#PlusMinus",
         "#Shots",
         "#Hits",
         "#PenaltyMinutes")

salary2018 = GET("https://api.mysportsfeeds.com/v2.1/pull/nhl/2017-2018-regular/date/20180327/dfs.csv", 
                 authenticate("2ef2faaa-ba07-4c49-8320-8d8656", "MYSPORTSFEEDS"))
#str(salary2018, max.level = 1)
salary2018_content = content(salary2018)
#str(salary2018_content, max.level = 1, nchar.max = 17)
salary2018_df = as.data.frame(salary2018_content)
salary2018_df = salary2018_df %>%
  select("#Player ID", "#Salary")
df2018 = merge(x = players2018_df, y = salary2018_df, by = "#Player ID", all.x = TRUE)
df2018 = na.omit(df2018)


##### 2019
players2019 = GET("https://api.mysportsfeeds.com/v2.1/pull/nhl/2018-2019-regular/player_stats_totals.csv", 
              authenticate("2ef2faaa-ba07-4c49-8320-8d8656", "MYSPORTSFEEDS"))

#str(players2019, max.level = 1)
players2019_content = content(players2019)
#str(players2019_content, max.level = 1, nchar.max = 17)
players2019_df = as.data.frame(players2019_content)
players2019_df = players2019_df %>%
  select("#Player ID",
         "#LastName",
         "#FirstName",
         "#Team Name",
         "#Position",
         "#GamesPlayed",
         "#Goals",
         "#Assists",
         "#Points",
         "#PowerplayPoints",
         "#ShorthandedPoints",
         "#GameWinningGoals",
         "#PlusMinus",
         "#Shots",
         "#Hits",
         "#PenaltyMinutes")

salary2019 = GET("https://api.mysportsfeeds.com/v2.1/pull/nhl/2018-2019-regular/date/20190214/dfs.csv", 
              authenticate("2ef2faaa-ba07-4c49-8320-8d8656", "MYSPORTSFEEDS"))

#str(salary2019, max.level = 1)
salary2019_content = content(salary2019)
#str(salary2019_content, max.level = 1, nchar.max = 17)
salary2019_df = as.data.frame(salary2019_content)
salary2019_df = salary2019_df %>%
  select("#Player ID", "#Salary")
df2019 = merge(x = players2019_df, y = salary2019_df, by = "#Player ID", all.x = TRUE)
df2019 = na.omit(df2019)

####MERGING DATA
merged_df = bind_rows(df2016, df2017, df2018, df2019)
NHL_df = merged_df[!duplicated(merged_df$`#Player ID`), ]

write.csv(NHL_df, file = "NHL_Data.csv")
fullData<-read.csv("NHL_Data.csv")
head(fullData)

##    X X.Player.ID X.LastName X.FirstName X.Team.Name X.Position
## 1  1           1   Mitchell      Willie    Panthers          D
## 2  3          29       Moen      Travis       Stars         LW
## 3  5          63      Lupul     Joffrey Maple Leafs         LW
## 4  7         139     Gordon        Boyd     Coyotes          C
## 5  9         143       Jagr     Jaromir    Panthers         RW
## 6 11         206     Markov      Andrei   Canadiens          D
##   X.GamesPlayed X.Goals X.Assists X.Points X.PowerplayPoints
## 1            46       1         6        7                 0
## 2            23       0         2        2                 0
## 3            46      11         3       14                 4
## 4            65       2         2        4                 0
## 5            79      27        39       66                11
## 6            82       5        39       44                17
##   X.ShorthandedPoints X.GameWinningGoals X.PlusMinus X.Shots X.Hits
## 1                   0                  1          -2      33     68
## 2                   1                  0          -3      12     30
## 3                   0                  2         -10     102     58
## 4                   1                  0          -7      53     21
## 5                   0                  4          23     143     31
## 6                   0                  0          -6     117     57
##   X.PenaltyMinutes X.Salary
## 1               18     2600
## 2               21     2500
## 3               12     3200
## 4               10     2500
## 5               48     5400
## 6               38     4100

Data Exploration

The visualizations below help us understand the dataset better.

head(fullData, 5)

##   X X.Player.ID X.LastName X.FirstName X.Team.Name X.Position
## 1 1           1   Mitchell      Willie    Panthers          D
## 2 3          29       Moen      Travis       Stars         LW
## 3 5          63      Lupul     Joffrey Maple Leafs         LW
## 4 7         139     Gordon        Boyd     Coyotes          C
## 5 9         143       Jagr     Jaromir    Panthers         RW
##   X.GamesPlayed X.Goals X.Assists X.Points X.PowerplayPoints
## 1            46       1         6        7                 0
## 2            23       0         2        2                 0
## 3            46      11         3       14                 4
## 4            65       2         2        4                 0
## 5            79      27        39       66                11
##   X.ShorthandedPoints X.GameWinningGoals X.PlusMinus X.Shots X.Hits
## 1                   0                  1          -2      33     68
## 2                   1                  0          -3      12     30
## 3                   0                  2         -10     102     58
## 4                   1                  0          -7      53     21
## 5                   0                  4          23     143     31
##   X.PenaltyMinutes X.Salary
## 1               18     2600
## 2               21     2500
## 3               12     3200
## 4               10     2500
## 5               48     5400

summary(fullData)

##        X         X.Player.ID       X.LastName    X.FirstName 
##  Min.   :   1   Min.   :    1   Smith   :   6   Ryan   : 24  
##  1st Qu.: 566   1st Qu.: 4671   Brown   :   5   Michael: 15  
##  Median :1402   Median : 5264   Johnson :   4   Nick   : 15  
##  Mean   :1669   Mean   : 6386   Lindholm:   4   Alex   : 14  
##  3rd Qu.:2653   3rd Qu.: 5886   Miller  :   4   Mark   : 14  
##  Max.   :4313   Max.   :15475   Mitchell:   4   Matt   : 14  
##                                 (Other) :1034   (Other):965  
##      X.Team.Name  X.Position X.GamesPlayed      X.Goals      
##  Red Wings : 47   C :322     Min.   : 1.00   Min.   : 0.000  
##  Blackhawks: 46   D :365     1st Qu.:20.00   1st Qu.: 1.000  
##  Coyotes   : 46   LW:200     Median :56.00   Median : 4.000  
##  Panthers  : 44   RW:174     Mean   :48.04   Mean   : 7.264  
##  Canucks   : 43              3rd Qu.:76.00   3rd Qu.:11.000  
##  Flames    : 41              Max.   :82.00   Max.   :46.000  
##  (Other)   :794                                              
##    X.Assists        X.Points      X.PowerplayPoints X.ShorthandedPoints
##  Min.   : 0.00   Min.   :  0.00   Min.   : 0.000    Min.   :0.0000     
##  1st Qu.: 2.00   1st Qu.:  4.00   1st Qu.: 0.000    1st Qu.:0.0000     
##  Median : 8.00   Median : 12.00   Median : 1.000    Median :0.0000     
##  Mean   :12.15   Mean   : 19.41   Mean   : 4.432    Mean   :0.4006     
##  3rd Qu.:19.00   3rd Qu.: 32.00   3rd Qu.: 6.000    3rd Qu.:0.0000     
##  Max.   :70.00   Max.   :106.00   Max.   :39.000    Max.   :8.0000     
##                                                                        
##  X.GameWinningGoals  X.PlusMinus          X.Shots           X.Hits      
##  Min.   :0.00       Min.   :-35.0000   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:0.00       1st Qu.: -5.0000   1st Qu.: 21.00   1st Qu.: 17.00  
##  Median :0.00       Median :  0.0000   Median : 64.00   Median : 45.00  
##  Mean   :1.21       Mean   : -0.3855   Mean   : 79.64   Mean   : 58.32  
##  3rd Qu.:2.00       3rd Qu.:  4.0000   3rd Qu.:126.00   3rd Qu.: 86.00  
##  Max.   :9.00       Max.   : 33.0000   Max.   :320.00   Max.   :365.00  
##                                                                         
##  X.PenaltyMinutes    X.Salary   
##  Min.   :  0.00   Min.   :2500  
##  1st Qu.:  6.00   1st Qu.:2900  
##  Median : 18.00   Median :3200  
##  Mean   : 24.52   Mean   :3698  
##  3rd Qu.: 34.00   3rd Qu.:4200  
##  Max.   :409.00   Max.   :9100  
##

goalCount<-fullData %>%
  count(X.Goals, sort = TRUE)
  goalCount %>% 
  arrange(desc(n)) %>%
  top_n(n = 30) %>%
  ggplot(aes(x = factor(X.Goals,levels=X.Goals),y = n)) + 
  geom_col(col="white",fill="blue",size=1) + 
  coord_flip() + 
  labs(x = "No. of Players", y = "Goal Count", title = "Most Common No. of Goals Scored during the NHL Season")

salaryCount <- fullData %>%
  count(X.Salary, sort = FALSE)
salaryCount %>% 
  arrange(desc(n)) %>%
  top_n(n=10) %>%
  ggplot(aes(x = factor(X.Salary, levels = X.Salary), y = n)) + 
  geom_col(col="white",fill="red",size=1) + 
  coord_flip() + 
  labs(x = "Salary", y = "No. of Players", title="Most Common NHL Players' Salaries")

labelDf <- fullData
labelDf =  labelDf %>% 
  unite(Name, X.FirstName, X.LastName, sep = " ")

plot_ly(data = labelDf, x = ~X.Salary, y = ~X.Shots, color = ~X.Team.Name, text = ~paste("Player: ", Name)) %>%
  layout(xaxis = list(title = "Salary"), yaxis = list(title = "No. of Shots Played", range = 0,10,20,30,40,50,100,150,200,250,300,350,400))

plot_ly(data = labelDf, x = ~X.Salary, y = ~X.GamesPlayed, color = ~X.Team.Name, text = ~paste("Player: ", Name)) %>%
  layout(xaxis = list(title = "Salary"), yaxis = list(title = "No. of Games Played", range = 0,10,20,30,40,50,100,150,200,250,300,350,400))

h2o Model

data <- fullData

data$X.LastName <- as.character(data$X.LastName)
data$X.FirstName <- as.character(data$X.FirstName)
data <- data %>%
  drop_na(X.Salary)

data$Salary_Bracket <- cut(data$X.Salary, c(-Inf,3000,3500,4000,4500,5000,5500,6000,6500,7000,7500,8000,8500,Inf), 
                           c("0-3000", "3000-3500", "3500-4000" ,"4000-4500", "4500-5000", "5000-5500", "5500-6000", "6000-6500", "6500-7000", "7000-7500", "7500-8000", "8000-8500", "8500-9000"))
data$Salary_Bracket <- as.character(data$Salary_Bracket)

smp_size <- floor(0.75*nrow(data))
set.seed(123)
train_ind <- sample(seq_len(nrow(data)), size = smp_size)

train <- data[train_ind, ]
test <- data[-train_ind, ]

x_train <- train %>% select(-Salary_Bracket)
y_train <- train %>% select(Salary_Bracket)

x_test <- test

rm(train)
rm(test)

x_train_skim = skim_to_list(x_train)
names(x_train_skim)

## [1] "character" "factor"    "integer"

string_2_factor_names <- x_train %>%
  select_if(is.character) %>%
  names()


unique_numeric_values_tbl <- x_train %>%
  select_if(is.numeric) %>%
  purrr::map_df(~ unique(.) %>% length()) %>%
  gather() %>%
  arrange(value) %>%
  mutate(key = as_factor(key))
unique_numeric_values_tbl

## # A tibble: 14 x 2
##    key                 value
##    <fct>               <int>
##  1 X.ShorthandedPoints     9
##  2 X.GameWinningGoals     10
##  3 X.PowerplayPoints      33
##  4 X.Goals                38
##  5 X.Assists              54
##  6 X.Salary               57
##  7 X.PlusMinus            62
##  8 X.Points               75
##  9 X.GamesPlayed          81
## 10 X.PenaltyMinutes       99
## 11 X.Hits                182
## 12 X.Shots               222
## 13 X                     795
## 14 X.Player.ID           795

rec_obj <- recipe(~ ., data = x_train) %>%  
  step_string2factor(string_2_factor_names) %>%  
  step_meanimpute(all_numeric()) %>%  
  step_modeimpute(all_nominal()) %>%  
  prep(stringsAsFactors = FALSE)


x_train_processed <- bake(rec_obj, x_train)
x_test_processed <- bake(rec_obj, x_test)

rec_obj_for_y <- recipe(~ ., data = y_train) %>%
  prep(stringsAsFactors = FALSE)
y_train_processed  <- bake(rec_obj_for_y, y_train)

h2o.init(nthreads = -1)

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         8 minutes 23 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.22.1.1 
##     H2O cluster version age:    4 months and 3 days !!! 
##     H2O cluster name:           H2O_started_from_R_Devyani_udh155 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   2.00 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)

data_h2o <- as.h2o(
  bind_cols(y_train_processed, x_train_processed),
  destination_frame = "train.hex")

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

new_data_h2o <- as.h2o(
  x_test_processed,
  destination_frame = "test.hex")

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

data_h2o_no_destination <- as.h2o(
  bind_cols(y_train_processed, x_train_processed)
)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

h2o_keys = as.character(h2o.ls()$key)
h2o_keys = as.character(h2o.ls()$key)
h2o.rm(h2o_keys[str_detect(h2o_keys, "^file")])
splits <- h2o.splitFrame(data = data_h2o,
                         ratios = c(0.9, 0.05),
                         seed = 1234)

train_h2o <- splits[[1]]
valid_h2o <- splits[[2]]
test_h2o <- splits[[3]]

y = "Salary_Bracket"
x <- setdiff(names(train_h2o), y)

m1 <- h2o.deeplearning(
  model_id = "dl_model_first",
  x = x,
  y = y,
  training_frame = train_h2o,
  validation_frame = valid_h2o,
  
  hidden = c(32,32,32),
  epochs = 1000000,
  
  score_validation_samples = 1000,
  stopping_metric = "misclassification",
  stopping_rounds = 2,
  stopping_tolerance = 0.01
)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

h2o Results

h2o.confusionMatrix(m1)

## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##           0-3000 3000-3500 3500-4000 4000-4500 4500-5000 5000-5500
## 0-3000       309         0         0         0         0         0
## 3000-3500      0       123         0         0         0         0
## 3500-4000      0         0        93         0         0         0
## 4000-4500      0         0         0        49         0         0
## 4500-5000      0         0         0         0        38         0
## 5000-5500      0         0         0         0         0        27
## 5500-6000      0         0         0         0         0         0
## 6000-6500      0         0         0         0         0         0
## 6500-7000      0         0         0         0         0         0
## 7000-7500      0         0         0         0         0         0
## 7500-8000      0         0         0         0         0         0
## 8000-8500      0         0         0         0         0         0
## 8500-9000      0         0         0         0         0         0
## Totals       309       123        93        49        38        27
##           5500-6000 6000-6500 6500-7000 7000-7500 7500-8000 8000-8500
## 0-3000            0         0         0         0         0         0
## 3000-3500         0         0         0         0         0         0
## 3500-4000         0         0         0         0         0         0
## 4000-4500         0         0         0         0         0         0
## 4500-5000         0         0         0         0         0         0
## 5000-5500         0         0         0         0         0         0
## 5500-6000        22         0         0         0         0         0
## 6000-6500         0        18         0         0         0         0
## 6500-7000         0         0        11         0         0         0
## 7000-7500         0         0         0         3         0         0
## 7500-8000         0         0         0         0         3         0
## 8000-8500         0         0         0         0         0         6
## 8500-9000         0         0         0         0         0         0
## Totals           22        18        11         3         3         6
##           8500-9000  Error      Rate
## 0-3000            0 0.0000 = 0 / 309
## 3000-3500         0 0.0000 = 0 / 123
## 3500-4000         0 0.0000 =  0 / 93
## 4000-4500         0 0.0000 =  0 / 49
## 4500-5000         0 0.0000 =  0 / 38
## 5000-5500         0 0.0000 =  0 / 27
## 5500-6000         0 0.0000 =  0 / 22
## 6000-6500         0 0.0000 =  0 / 18
## 6500-7000         0 0.0000 =  0 / 11
## 7000-7500         0 0.0000 =   0 / 3
## 7500-8000         0 0.0000 =   0 / 3
## 8000-8500         0 0.0000 =   0 / 6
## 8500-9000         3 0.0000 =   0 / 3
## Totals            3 0.0000 = 0 / 705

More h2o:

data <- fullData

data$X.LastName <- as.character(data$X.LastName)
data$X.FirstName <- as.character(data$X.FirstName)
data <- data %>%
  drop_na(X.Salary)

smp_size <- floor(0.75*nrow(data))
set.seed(123)
train_ind <- sample(seq_len(nrow(data)), size = smp_size)

train <- data[train_ind, ]
test <- data[-train_ind, ]

x_train <- train %>% select(-X.Salary)
y_train <- train %>% select(X.Salary)

x_test <- test

rm(train)
rm(test)

x_train_skim = skim_to_list(x_train)
names(x_train_skim)

## [1] "character" "factor"    "integer"

string_2_factor_names <- x_train %>%
  select_if(is.character) %>%
  names()
string_2_factor_names

## [1] "X.LastName"  "X.FirstName"

unique_numeric_values_tbl <- x_train %>%
  select_if(is.numeric) %>%
  purrr::map_df(~ unique(.) %>% length()) %>%
  gather() %>%
  arrange(value) %>%
  mutate(key = as_factor(key))
unique_numeric_values_tbl

## # A tibble: 13 x 2
##    key                 value
##    <fct>               <int>
##  1 X.ShorthandedPoints     9
##  2 X.GameWinningGoals     10
##  3 X.PowerplayPoints      33
##  4 X.Goals                38
##  5 X.Assists              54
##  6 X.PlusMinus            62
##  7 X.Points               75
##  8 X.GamesPlayed          81
##  9 X.PenaltyMinutes       99
## 10 X.Hits                182
## 11 X.Shots               222
## 12 X                     795
## 13 X.Player.ID           795

rec_obj <- recipe(~ ., data = x_train) %>%  
  step_string2factor(string_2_factor_names) %>%  
  step_meanimpute(all_numeric()) %>%  
  step_modeimpute(all_nominal()) %>%  
  prep(stringsAsFactors = FALSE)
rec_obj

## Data Recipe
## 
## Inputs:
## 
##       role #variables
##  predictor         17
## 
## Training data contained 795 data points and no missing data.
## 
## Operations:
## 
## Factor variables from X.LastName, X.FirstName [trained]
## Mean Imputation for X, X.Player.ID, X.GamesPlayed, ... [trained]
## Mode Imputation for X.LastName, X.FirstName, ... [trained]

x_train_processed <- bake(rec_obj, x_train)
x_test_processed <- bake(rec_obj, x_test)

rec_obj_for_y <- recipe(~ ., data = y_train) %>%
  step_num2factor("X.Salary") %>%  
  prep(stringsAsFactors = FALSE)
y_train_processed  <- bake(rec_obj_for_y, y_train)

h2o.init(nthreads = -1)

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         8 minutes 32 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.22.1.1 
##     H2O cluster version age:    4 months and 3 days !!! 
##     H2O cluster name:           H2O_started_from_R_Devyani_udh155 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   2.00 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)

h2o.clusterInfo()

## R is connected to the H2O cluster: 
##     H2O cluster uptime:         8 minutes 32 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.22.1.1 
##     H2O cluster version age:    4 months and 3 days !!! 
##     H2O cluster name:           H2O_started_from_R_Devyani_udh155 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   2.00 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)

data_h2o <- as.h2o(
  bind_cols(y_train_processed, x_train_processed),
  destination_frame = "train.hex")

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

new_data_h2o <- as.h2o(
  x_test_processed,
  destination_frame = "test.hex")

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

data_h2o_no_destination <- as.h2o(
  bind_cols(y_train_processed, x_train_processed)
)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

h2o.ls()

##                                                                                         key
## 1                                                                          RTMP_sid_ace8_41
## 2                                                                          RTMP_sid_ace8_43
## 3                                                                          RTMP_sid_ace8_44
## 4                                                                          RTMP_sid_b2a4_29
## 5                                                                          RTMP_sid_b2a4_31
## 6                                                                          RTMP_sid_b2a4_32
## 7                                                                           RTMP_sid_bf07_7
## 8                                                                           RTMP_sid_bf07_8
## 9                                                                           RTMP_sid_bf07_9
## 10                                                        _8180ca317c2398c0371959fc2d8d47a3
## 11                                                        _85831db87c32e797b22518560b8e486e
## 12                                                         _86b051c2147ec96764e54c3f9bb40f6
## 13                                                        _87dcfc75d0dcb7aa9cb3157869594e00
## 14                                                        _b0cda5f20cc67ad6f06afbfe359048fc
## 15                                                        _ba9773838dc94e6862d82a8823f64804
## 16                                                        _bc89154a77ae4edf2137a467ea784a8d
## 17                                                                   data.frame_sid_8ab6_12
## 18                                                                    data.frame_sid_8c89_3
## 19                                                                   data.frame_sid_9a3d_12
## 20                                                                   data.frame_sid_ace8_35
## 21                                                                   data.frame_sid_b2a4_25
## 22                                                                   data.frame_sid_b6f6_13
## 23                                                                    data.frame_sid_b811_3
## 24                                                                    data.frame_sid_bf07_3
## 25                                                                           dl_model_first
## 26   modelmetrics_dl_model_first@-1256586210960057920_on_RTMP_sid_bf07_7@987808120465394070
## 27  modelmetrics_dl_model_first@-1256586210960057920_on_RTMP_sid_bf07_9@-987969087741660840
## 28 modelmetrics_dl_model_first@-3128150235245331512_on_RTMP_sid_9a3d_18@6095347544275611496
## 29 modelmetrics_dl_model_first@-3128150235245331512_on_RTMP_sid_9a3d_20@6095406911818342166
## 30  modelmetrics_dl_model_first@-6700541122777596928_on_RTMP_sid_b2a4_29@987808120465394070
## 31 modelmetrics_dl_model_first@-6700541122777596928_on_RTMP_sid_b2a4_31@-987969087741660840
## 32    modelmetrics_dl_model_first@1869597267419657920_on_RTMP_sid_8c89_7@987808120465394070
## 33   modelmetrics_dl_model_first@1869597267419657920_on_RTMP_sid_8c89_9@-987969087741660840
## 34  modelmetrics_dl_model_first@2115902726856146752_on_RTMP_sid_ace8_41@6095347544275611496
## 35  modelmetrics_dl_model_first@2115902726856146752_on_RTMP_sid_ace8_43@6095406911818342166
## 36    modelmetrics_dl_model_first@4538598244355895040_on_RTMP_sid_b811_7@987808120465394070
## 37   modelmetrics_dl_model_first@4538598244355895040_on_RTMP_sid_b811_9@-987969087741660840
## 38  modelmetrics_dl_model_first@9100978093220788788_on_RTMP_sid_b6f6_19@6095347544275611496
## 39  modelmetrics_dl_model_first@9100978093220788788_on_RTMP_sid_b6f6_21@6095406911818342166
## 40                                                                                 test.hex
## 41                                                                                train.hex

h2o_keys = as.character(h2o.ls()$key)
h2o_keys = as.character(h2o.ls()$key)
h2o.rm(h2o_keys[str_detect(h2o_keys, "^file")])
h2o.ls()

##                                                                                         key
## 1                                                                          RTMP_sid_ace8_41
## 2                                                                          RTMP_sid_ace8_43
## 3                                                                          RTMP_sid_ace8_44
## 4                                                                          RTMP_sid_b2a4_29
## 5                                                                          RTMP_sid_b2a4_31
## 6                                                                          RTMP_sid_b2a4_32
## 7                                                                           RTMP_sid_bf07_7
## 8                                                                           RTMP_sid_bf07_8
## 9                                                                           RTMP_sid_bf07_9
## 10                                                        _8180ca317c2398c0371959fc2d8d47a3
## 11                                                        _85831db87c32e797b22518560b8e486e
## 12                                                         _86b051c2147ec96764e54c3f9bb40f6
## 13                                                        _87dcfc75d0dcb7aa9cb3157869594e00
## 14                                                        _b0cda5f20cc67ad6f06afbfe359048fc
## 15                                                        _ba9773838dc94e6862d82a8823f64804
## 16                                                        _bc89154a77ae4edf2137a467ea784a8d
## 17                                                                   data.frame_sid_8ab6_12
## 18                                                                    data.frame_sid_8c89_3
## 19                                                                   data.frame_sid_9a3d_12
## 20                                                                   data.frame_sid_ace8_35
## 21                                                                   data.frame_sid_b2a4_25
## 22                                                                   data.frame_sid_b6f6_13
## 23                                                                    data.frame_sid_b811_3
## 24                                                                    data.frame_sid_bf07_3
## 25                                                                           dl_model_first
## 26   modelmetrics_dl_model_first@-1256586210960057920_on_RTMP_sid_bf07_7@987808120465394070
## 27  modelmetrics_dl_model_first@-1256586210960057920_on_RTMP_sid_bf07_9@-987969087741660840
## 28 modelmetrics_dl_model_first@-3128150235245331512_on_RTMP_sid_9a3d_18@6095347544275611496
## 29 modelmetrics_dl_model_first@-3128150235245331512_on_RTMP_sid_9a3d_20@6095406911818342166
## 30  modelmetrics_dl_model_first@-6700541122777596928_on_RTMP_sid_b2a4_29@987808120465394070
## 31 modelmetrics_dl_model_first@-6700541122777596928_on_RTMP_sid_b2a4_31@-987969087741660840
## 32    modelmetrics_dl_model_first@1869597267419657920_on_RTMP_sid_8c89_7@987808120465394070
## 33   modelmetrics_dl_model_first@1869597267419657920_on_RTMP_sid_8c89_9@-987969087741660840
## 34  modelmetrics_dl_model_first@2115902726856146752_on_RTMP_sid_ace8_41@6095347544275611496
## 35  modelmetrics_dl_model_first@2115902726856146752_on_RTMP_sid_ace8_43@6095406911818342166
## 36    modelmetrics_dl_model_first@4538598244355895040_on_RTMP_sid_b811_7@987808120465394070
## 37   modelmetrics_dl_model_first@4538598244355895040_on_RTMP_sid_b811_9@-987969087741660840
## 38  modelmetrics_dl_model_first@9100978093220788788_on_RTMP_sid_b6f6_19@6095347544275611496
## 39  modelmetrics_dl_model_first@9100978093220788788_on_RTMP_sid_b6f6_21@6095406911818342166
## 40                                                                                 test.hex
## 41                                                                                train.hex

splits <- h2o.splitFrame(data = data_h2o,
                         ratios = c(0.7, 0.15),
                         seed = 1234)

train_h2o <- splits[[1]]
valid_h2o <- splits[[2]]
test_h2o <- splits[[3]]

y = "X.Salary"
x <- setdiff(names(train_h2o), y)

m1 <- h2o.deeplearning(
  model_id = "dl_model_first",
  x = x,
  y = y,
  training_frame = train_h2o,
  validation_frame = valid_h2o,
  
  hidden = c(32,32,32),
  epochs = 100,
  
  score_validation_samples = 1000,
  stopping_metric = "misclassification",
  stopping_rounds = 2,
  stopping_tolerance = 0.01
)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |=================================================================| 100%

summary(m1)

## Model Details:
## ==============
## 
## H2OMultinomialModel: deeplearning
## Model Key:  dl_model_first 
## Status of Neuron Layers: predicting X.Salary, 57-class classification, multinomial distribution, CrossEntropy loss, 41,593 weights/biases, 517.2 KB, 56,500 training samples, mini-batch size 1
##   layer units      type dropout       l1       l2 mean_rate rate_rms
## 1     1  1174     Input  0.00 %       NA       NA        NA       NA
## 2     2    32 Rectifier  0.00 % 0.000000 0.000000  0.284389 0.444256
## 3     3    32 Rectifier  0.00 % 0.000000 0.000000  0.003774 0.001446
## 4     4    32 Rectifier  0.00 % 0.000000 0.000000  0.036235 0.187708
## 5     5    57   Softmax      NA 0.000000 0.000000  0.291052 0.394079
##   momentum mean_weight weight_rms mean_bias bias_rms
## 1       NA          NA         NA        NA       NA
## 2 0.000000   -0.000282   0.043839  0.392372 0.243121
## 3 0.000000   -0.005611   0.196147  0.958701 0.053061
## 4 0.000000   -0.022750   0.191254  0.975558 0.023454
## 5 0.000000   -0.180308   0.665736 -0.807177 0.792636
## 
## H2OMultinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("RTMP_sid_8ab6_18")`
## MSE: (Extract with `h2o.mse`) 0.7188521
## RMSE: (Extract with `h2o.rmse`) 0.8478515
## Logloss: (Extract with `h2o.logloss`) 2.665916
## Mean Per-Class Error: 0.9008287
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##      2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800
## 2500   44    0    0    0    0   16    1    0    0    0    1    0    0    0
## 2600    4    0    0    0    0   11    4    0    0    0    0    0    1    1
## 2700    9    0    1    0    0   18    1    0    0    0    1    0    0    0
## 2800    5    0    0    0    0    7    4    0    0    0    0    0    1    0
## 2900    2    0    0    0    0   10    1    0    0    0    0    0    0    1
##      3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200
## 2500    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2600    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2700    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2800    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2900    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##      5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600
## 2500    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2600    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2700    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2800    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2900    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##      6700 6800 6900 7000 7100 7400 7600 7800 8000 8100 8200 8400 8500 8600
## 2500    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2600    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2700    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2800    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2900    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##      9100  Error        Rate
## 2500    0 0.2903 =   18 / 62
## 2600    0 1.0000 =   21 / 21
## 2700    0 0.9667 =   29 / 30
## 2800    0 1.0000 =   17 / 17
## 2900    0 1.0000 =   14 / 14
## 
## ---
##        2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700
## 8200      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    0    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    0    0    0    0    0    0    0    0
## Totals   88    0    1    0    1  211   65    1    0    1    5    5   89
##        3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000
## 8200      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    2    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    1    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    1    0    0    0    0    0    0    0
## Totals   20    0    0    0    0   53    0    0    0    0    7    0    0
##        5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300
## 8200      0    0    0    1    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    1    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    0    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    0    0    0    0    0    0    0    0
## Totals    0    0    0   18    0    0    0    0    0    0    0    0    0
##        6400 6500 6600 6700 6800 6900 7000 7100 7400 7600 7800 8000 8100
## 8200      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    0    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    0    0    0    0    0    0    0    0
## Totals    0    0    0    0    0    0    0    0    0    0    0    0    0
##        8200 8400 8500 8600 9100  Error        Rate
## 8200      0    0    0    0    0 1.0000 =     1 / 1
## 8400      0    0    0    0    0 1.0000 =     1 / 1
## 8500      0    0    0    0    0 1.0000 =     2 / 2
## 8600      0    0    0    0    0 1.0000 =     1 / 1
## 9100      0    0    0    0    0 1.0000 =     1 / 1
## Totals    0    0    0    0    0 0.7044 = 398 / 565
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-10 Hit Ratios: 
##     k hit_ratio
## 1   1  0.295575
## 2   2  0.400000
## 3   3  0.477876
## 4   4  0.534513
## 5   5  0.580531
## 6   6  0.612389
## 7   7  0.656637
## 8   8  0.690266
## 9   9  0.716814
## 10 10  0.743363
## 
## 
## H2OMultinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on full validation frame **
## 
## Validation Set Metrics: 
## =====================
## 
## Extract validation frame with `h2o.getFrame("RTMP_sid_8ab6_20")`
## MSE: (Extract with `h2o.mse`) 0.8109977
## RMSE: (Extract with `h2o.rmse`) 0.9005541
## Logloss: (Extract with `h2o.logloss`) 3.290263
## Mean Per-Class Error: 0.5295739
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##      2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800
## 2500    5    0    0    0    0    2    0    0    0    0    0    0    0    0
## 2600    1    0    0    0    0    2    0    0    0    0    0    0    0    0
## 2700    1    0    0    0    0    8    0    0    0    0    0    0    0    0
## 2800    0    0    0    0    0    1    0    0    0    0    0    0    0    1
## 2900    0    0    0    0    0    4    1    0    0    0    0    0    0    0
##      3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200
## 2500    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2600    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2700    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2800    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2900    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##      5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600
## 2500    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2600    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2700    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2800    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2900    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##      6700 6800 6900 7000 7100 7400 7600 7800 8000 8100 8200 8400 8500 8600
## 2500    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2600    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2700    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2800    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2900    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##      9100  Error       Rate
## 2500    0 0.2857 =    2 / 7
## 2600    0 1.0000 =    3 / 3
## 2700    0 1.0000 =    9 / 9
## 2800    0 1.0000 =    2 / 2
## 2900    0 1.0000 =    5 / 5
## 
## ---
##        2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700
## 8200      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    0    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    0    0    0    0    0    0    0    0
## Totals   12    0    0    0    0   39   14    0    0    0    0    2   13
##        3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000
## 8200      0    0    0    0    0    2    0    0    0    0    0    0    0
## 8400      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    0    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    1    0    0    0    0    0    0    0
## Totals    7    0    0    0    0    9    0    0    0    0    0    0    0
##        5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300
## 8200      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    0    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    0    0    0    0    0    0    0    0
## Totals    0    0    0    6    0    0    0    0    0    0    0    0    0
##        6400 6500 6600 6700 6800 6900 7000 7100 7400 7600 7800 8000 8100
## 8200      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    0    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    0    0    0    0    0    0    0    0
## Totals    0    0    0    0    0    0    0    0    0    0    0    0    0
##        8200 8400 8500 8600 9100  Error       Rate
## 8200      0    0    0    0    0 1.0000 =    2 / 2
## 8400      0    0    0    0    0     NA =    0 / 0
## 8500      0    0    0    0    0     NA =    0 / 0
## 8600      0    0    0    0    0     NA =    0 / 0
## 9100      0    0    0    0    0 1.0000 =    1 / 1
## Totals    0    0    0    0    0 0.8039 = 82 / 102
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-10 Hit Ratios: 
##     k hit_ratio
## 1   1  0.196078
## 2   2  0.264706
## 3   3  0.323529
## 4   4  0.411765
## 5   5  0.441176
## 6   6  0.500000
## 7   7  0.529412
## 8   8  0.558824
## 9   9  0.588235
## 10 10  0.627451
## 
## 
## 
## 
## Scoring History: 
##             timestamp   duration training_speed    epochs iterations
## 1 2019-05-01 19:22:31  0.000 sec             NA   0.00000          0
## 2 2019-05-01 19:22:31  0.525 sec  11746 obs/sec  10.00000          1
## 3 2019-05-01 19:22:36  4.613 sec  12423 obs/sec 100.00000         10
## 4 2019-05-01 19:22:36  4.664 sec  12382 obs/sec 100.00000         10
##        samples training_rmse training_logloss training_r2
## 1     0.000000            NA               NA          NA
## 2  5650.000000       0.84785          2.66592     0.99479
## 3 56500.000000       0.14548          0.07591     0.99985
## 4 56500.000000       0.84785          2.66592     0.99479
##   training_classification_error validation_rmse validation_logloss
## 1                            NA              NA                 NA
## 2                       0.70442         0.90055            3.29026
## 3                       0.02124         0.93586            8.49430
## 4                       0.70442         0.90055            3.29026
##   validation_r2 validation_classification_error
## 1            NA                              NA
## 2       0.99465                         0.80392
## 3       0.99422                         0.90196
## 4       0.99465                         0.80392
## 
## Variable Importances: (Extract with `h2o.varimp`) 
## =================================================
## 
## Variable Importances: 
##             variable relative_importance scaled_importance percentage
## 1       X.Position.D            1.000000          1.000000   0.001855
## 2  X.PowerplayPoints            0.974560          0.974560   0.001807
## 3            X.Shots            0.973392          0.973392   0.001805
## 4 X.GameWinningGoals            0.852101          0.852101   0.001580
## 5            X.Goals            0.836885          0.836885   0.001552
## 
## ---
##                     variable relative_importance scaled_importance
## 1169      X.FirstName.Dakota            0.311681          0.311681
## 1170     X.LastName.Chartier            0.307948          0.307948
## 1171  X.LastName.missing(NA)            0.000000          0.000000
## 1172 X.FirstName.missing(NA)            0.000000          0.000000
## 1173 X.Team.Name.missing(NA)            0.000000          0.000000
## 1174  X.Position.missing(NA)            0.000000          0.000000
##      percentage
## 1169   0.000578
## 1170   0.000571
## 1171   0.000000
## 1172   0.000000
## 1173   0.000000
## 1174   0.000000

h2o.confusionMatrix(m1)

## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##      2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800
## 2500   44    0    0    0    0   16    1    0    0    0    1    0    0    0
## 2600    4    0    0    0    0   11    4    0    0    0    0    0    1    1
## 2700    9    0    1    0    0   18    1    0    0    0    1    0    0    0
## 2800    5    0    0    0    0    7    4    0    0    0    0    0    1    0
## 2900    2    0    0    0    0   10    1    0    0    0    0    0    0    1
##      3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200
## 2500    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2600    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2700    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2800    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2900    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##      5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600
## 2500    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2600    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2700    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2800    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2900    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##      6700 6800 6900 7000 7100 7400 7600 7800 8000 8100 8200 8400 8500 8600
## 2500    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2600    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2700    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2800    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## 2900    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##      9100  Error        Rate
## 2500    0 0.2903 =   18 / 62
## 2600    0 1.0000 =   21 / 21
## 2700    0 0.9667 =   29 / 30
## 2800    0 1.0000 =   17 / 17
## 2900    0 1.0000 =   14 / 14
## 
## ---
##        2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700
## 8200      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    0    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    0    0    0    0    0    0    0    0
## Totals   88    0    1    0    1  211   65    1    0    1    5    5   89
##        3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000
## 8200      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    2    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    1    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    1    0    0    0    0    0    0    0
## Totals   20    0    0    0    0   53    0    0    0    0    7    0    0
##        5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300
## 8200      0    0    0    1    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    1    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    0    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    0    0    0    0    0    0    0    0
## Totals    0    0    0   18    0    0    0    0    0    0    0    0    0
##        6400 6500 6600 6700 6800 6900 7000 7100 7400 7600 7800 8000 8100
## 8200      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8400      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8500      0    0    0    0    0    0    0    0    0    0    0    0    0
## 8600      0    0    0    0    0    0    0    0    0    0    0    0    0
## 9100      0    0    0    0    0    0    0    0    0    0    0    0    0
## Totals    0    0    0    0    0    0    0    0    0    0    0    0    0
##        8200 8400 8500 8600 9100  Error        Rate
## 8200      0    0    0    0    0 1.0000 =     1 / 1
## 8400      0    0    0    0    0 1.0000 =     1 / 1
## 8500      0    0    0    0    0 1.0000 =     2 / 2
## 8600      0    0    0    0    0 1.0000 =     1 / 1
## 9100      0    0    0    0    0 1.0000 =     1 / 1
## Totals    0    0    0    0    0 0.7044 = 398 / 565

NHL Analytics Project

Business Context:

Data Cleaning/Preparation:

Data Exploration

h2o Model

h2o Results

More h2o:

Multiple Linear Regression

Regression Results

Key Takeaways