#Packages used
library(tidyverse)

## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.3     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.4     v tibble    3.2.1
## v lubridate 1.9.3     v tidyr     1.3.0
## v purrr     1.0.2     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(MLmetrics)

## 
## Attaching package: 'MLmetrics'
## 
## The following object is masked from 'package:base':
## 
##     Recall

library(ggpubr)
library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:MLmetrics':
## 
##     MAE, RMSE
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(pROC)

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(caTools)
library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following object is masked from 'package:pROC':
## 
##     var
## 
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

#Uploading data sets for combination

# Every raw input lives in the same folder. Keeping that location in a single
# variable means the script can be pointed at another checkout by editing one
# line instead of seven.
data_dir <- "C:/Users/walki/Documents/GitHub/D698/Datasets"

# Topography and wildfire-hazard inputs
elevation <- read_csv(file.path(data_dir, "USGS_CACounties_elevation_2023 (1).csv"))
slope <- read_csv(file.path(data_dir, "SlopePercentage_Calitracts_LF2020.csv"))
whp <- read_csv(file.path(data_dir, "WHP2020_ZipCode_Summary - zipcode_summary.csv"))

# County-level climate inputs
weather <- read_csv(file.path(data_dir, "NOAA_CACounties_AverageTemp_2022.csv"))
rainfall <- read_csv(file.path(data_dir, "NOAA_CACounties_AveragePercipitation_2022.csv"))

# Vegetation dictionary and the per-tract vegetation summary it is joined to
LF_Vegdictonary <- read_csv(file.path(data_dir, "LF22_EVT_230 - LF22_EVT_230.csv.csv"))
cali_vegtype <- read_csv(file.path(data_dir, "CalifornianTracts_VegType_2022LF - test.csv.csv"))
# Combine temperature and precipitation first: both tables are joined on ID.
# After the join, drop the rank/anomaly/long-term-mean columns and the
# duplicated name/state columns, then give the remaining columns readable names.
cali_cweather <- weather %>%
  left_join(rainfall, by = join_by(ID)) %>%
  select(-c(
    "Rank.x",
    "Anomaly (1901-2000 base period).x",
    "1901-2000 Mean.x",
    "Name.y",
    "State.y",
    "Rank.y",
    "Anomaly (1901-2000 base period).y",
    "1901-2000 Mean.y"
  )) %>%
  rename(
    county_Id = ID,
    county_name = Name.x,
    State = State.x,
    avg_tempeture = Value.x,
    avg_precipitation = Value.y
  )

# Topographic data: the slope table has to be matched to the elevation table,
# so it first needs a county ID. The ID is built by concatenating the first two
# columns of the slope table (presumably the state and county FIPS codes --
# TODO confirm against the CSV header).
cali_topography <- slope %>%
  unite("countyID", 1:2, remove = FALSE, sep = "") %>%
  rename(
    tract_avgSlope = "_mean",
    tract_countSlope = "_count",
    tract_maxSlope = "_max"
  )

# 14 rows are missing slope data. They could be treated as zero slope, but the
# values are imputed with mice (CART method, fixed seed) instead.
# The calls are namespace-qualified because the script never attaches the mice
# package, and an unqualified complete() would resolve to tidyr::complete().
cali_topography <- mice::complete(
  mice::mice(cali_topography, method = "cart", seed = 333)
)

# Give the elevation table the same county key name as the slope table.
elevation <- rename(elevation, countyID = "County FIPS Code")

# Attach elevation to each tract: same county, and among that county's
# elevation rows the closest Longitude that is >= the tract's INTPTLON.
# inner_join() keeps only tracts for which such a row exists.
# Afterwards remove the columns that are not needed and rename the rest for
# readability.
cali_topography <- cali_topography %>%
  inner_join(
    elevation,
    by = join_by(countyID, closest(INTPTLON <= Longitude))
  ) %>%
  select(-c(
    "Latitude", "Longitude", "Bgn Decision Date", "Entry Date",
    "Census Code", "Census Classification Code", "GSA Code", "OPM Code",
    "State FIPS Code", "Map", "State", "Class", "tract_maxSlope", "Gaz ID",
    "Feature Name", "tract_countSlope", "MTFCC", "FUNCSTAT"
  )) %>%
  rename(
    tractID = NAME,
    land_Area = ALAND,
    water_Area = AWATER,
    latitude = INTPTLAT,
    longitude = INTPTLON,
    county_avgElevation = Elevation
  )

# Map each tract's average vegetation value to a LandFire vegetation type.
# Only California tracts are kept.
cali_vegetation <- filter(cali_vegtype, STUSPS == "CA")

# Only the dictionary columns needed for the lookup.
lf_small <- select(LF_Vegdictonary, "VALUE", "EVT_NAME", "EVT_LF", "EVT_CLASS")

# For every tract take the dictionary row with the largest VALUE that does not
# exceed the tract's `_mean`, then drop everything except the tract key and the
# vegetation life-form column (EVT_LF).
cali_vegetation <- cali_vegetation %>%
  left_join(lf_small, by = join_by(closest("_mean" >= VALUE))) %>%
  select(-c(
    "STATEFP", "COUNTYFP", "TRACTCE", "AFFGEOID", "NAME", "NAMELSAD",
    "STUSPS", "NAMELSADCO", "STATE_NAME", "LSAD", "ALAND", "AWATER",
    "_count", "_sum", "_mean", "VALUE", "EVT_NAME", "EVT_CLASS"
  ))

# Combining weather, topography, and vegetation

# The vegetation GEOID gets a leading "0" so it can be matched to the GEOID
# used in the topography table.
cali_vegetation <- cali_vegetation %>%
  mutate(GEOID = paste0("0", GEOID))

# County weather is attached by county name, tract vegetation by GEOID.
# Census Tract 9901 does not have vegetation as it is the shoreline, so the
# missing life-form values left by the join are replaced with "Water".
cali_features <- cali_topography %>%
  left_join(cali_cweather, by = join_by(County == county_name)) %>%
  select(-c("county_Id", "State")) %>%
  left_join(cali_vegetation, by = join_by(GEOID)) %>%
  mutate(EVT_LF = replace_na(EVT_LF, "Water"))

#saving export

# Write the file to the Datasets folder, which is where this script reads
# caliTracts_features.csv back from; writing to the working directory left the
# two locations out of sync. Row names are kept (write.csv default) because the
# later clean-up step expects the resulting unnamed index column.
write.csv(cali_features,"C:/Users/walki/Documents/GitHub/D698/Datasets/caliTracts_features.csv")

# Reducing predictors from NRI before the combination

nri_data <- read_csv("C:/Users/walki/Documents/GitHub/D698/Datasets/NRI_Table_CensusTracts_Subset.csv")

# Identification columns plus the drought, heat wave, lightning, strong wind
# and wildfire measures.
nri <- nri_data %>%
  select(
    "STATE", "STATEABBRV", "STATEFIPS", "COUNTY", "COUNTYTYPE", "COUNTYFIPS",
    "STCOFIPS", "TRACT", "TRACTFIPS", "POPULATION", "AREA",
    "DRGT_EVNTS", "DRGT_AFREQ", "DRGT_HLRA",
    "HWAV_EVNTS", "HWAV_AFREQ", "HWAV_HLRA",
    "LTNG_EVNTS", "LTNG_AFREQ",
    "SWND_EVNTS", "SWND_AFREQ", "SWND_HLRA",
    "WFIR_EVNTS", "WFIR_AFREQ", "WFIR_EXPA", "WFIR_EXPT", "WFIR_EXP_AREA",
    "WFIR_HLRB", "WFIR_HLRP", "WFIR_HLRA", "WFIR_HLRR", "WFIR_EALT",
    "WFIR_EALS", "WFIR_EALR", "WFIR_ALRA", "WFIR_RISKV", "WFIR_RISKS",
    "WFIR_RISKR"
  ) %>%
  # only CA cases
  filter(STATEABBRV == "CA") %>%
  # Binary response: 1 when WFIR_RISKV is at or above 13000, 0 below it.
  # The upper branch uses >= so a value of exactly 13000 is classified instead
  # of silently becoming NA (the original pair of strict comparisons matched
  # neither branch for that value). Missing WFIR_RISKV still yields NA.
  mutate(WFRI_R = case_when(
    WFIR_RISKV < 13000 ~ 0,
    WFIR_RISKV >= 13000 ~ 1
  )) %>%
  # Removal of extra columns
  select(-c(WFIR_EVNTS, WFIR_HLRR, WFIR_EALR))

# Combination of the NRI data with the California tract features

cali_features <- read_csv("C:/Users/walki/Documents/GitHub/D698/Datasets/caliTracts_features.csv")

# Tracts are matched on their FIPS code; inner_join() keeps only tracts that
# appear in both tables. Identification/geography columns and the index column
# created by the CSV round trip (...1) are dropped afterwards.
nri_cali <- nri %>%
  inner_join(cali_features, by = join_by(TRACTFIPS == GEOID)) %>%
  select(-c(
    STATE, STATEABBRV, STATEFIPS, COUNTY, COUNTYTYPE, COUNTYFIPS, STCOFIPS,
    TRACT, ...1, STATEFP, COUNTYFP, TRACTCE, tractID, NAMELSAD, land_Area,
    latitude, longitude, County, countyID
  )) %>%
  # renaming columns for reference
  rename(
    TRCT_WAREA = water_Area,
    TRCT_SLOPE = tract_avgSlope,
    CNTY_ELEV = county_avgElevation,
    CNTY_TEMP = avg_tempeture,
    CNTY_PRECIP = avg_precipitation,
    TRCT_VEGLF = EVT_LF
  )

# Write to the exact location the data-exploration step reads nri_cali.csv
# from, so the analysis always uses the file produced by this run rather than
# whatever copy happens to sit in the working directory.
write.csv(nri_cali, "C:/Users/walki/Documents/GitHub/D698/nri_cali.csv")

### Data Exploration
## Uses the combined NRI + California features data set (nri_cali.csv)

cali_data <- read_csv(
  file = "C:/Users/walki/Documents/GitHub/D698/nri_cali.csv"
)

## New names:
## Rows: 9098 Columns: 34
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (2): TRACTFIPS, TRCT_VEGLF dbl (32): ...1, POPULATION, AREA, DRGT_EVNTS,
## DRGT_AFREQ, DRGT_HLRA, HWAV_EV...
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...1`

# Removing columns that must not be used for modelling.
# Note: WFIR_RISKV and WFIR_RISKS hold the actual wildfire risk value/score
# (the binary response is derived from WFIR_RISKV), so they cannot be included
# as predictors. "...1" is the index column added by the CSV round trip.
drop_cols <- c("...1", "WFIR_RISKV", "WFIR_RISKS", "WFIR_HLRB")
c_data <- select(cali_data, -all_of(drop_cols))

# Summary statistics of every remaining variable
summary(c_data)

##   TRACTFIPS           POPULATION         AREA            DRGT_EVNTS  
##  Length:9098        Min.   :    0   Min.   :   0.008   Min.   :   0  
##  Class :character   1st Qu.: 3209   1st Qu.:   0.379   1st Qu.:1148  
##  Mode  :character   Median : 4194   Median :   0.693   Median :1372  
##                     Mean   : 4340   Mean   :  16.190   Mean   :1330  
##                     3rd Qu.: 5350   3rd Qu.:   1.644   3rd Qu.:1624  
##                     Max.   :37562   Max.   :7024.460   Max.   :2261  
##    DRGT_AFREQ       DRGT_HLRA           HWAV_EVNTS       HWAV_AFREQ    
##  Min.   :  0.00   Min.   :0.0000121   Min.   :  0.00   Min.   : 0.000  
##  1st Qu.: 52.18   1st Qu.:0.0000121   1st Qu.: 20.94   1st Qu.: 1.357  
##  Median : 62.36   Median :0.0000121   Median : 35.00   Median : 2.167  
##  Mean   : 60.47   Mean   :0.0006693   Mean   : 41.07   Mean   : 2.550  
##  3rd Qu.: 73.82   3rd Qu.:0.0016279   3rd Qu.: 66.85   3rd Qu.: 4.139  
##  Max.   :102.77   Max.   :0.0036879   Max.   :240.00   Max.   :14.861  
##    HWAV_HLRA           LTNG_EVNTS      LTNG_AFREQ        SWND_EVNTS    
##  Min.   :8.000e-10   Min.   :  0.0   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.:4.569e-05   1st Qu.: 12.0   1st Qu.: 0.5254   1st Qu.: 2.000  
##  Median :7.618e-05   Median : 17.0   Median : 0.7727   Median : 5.000  
##  Mean   :7.297e-05   Mean   : 21.6   Mean   : 0.9761   Mean   : 4.387  
##  3rd Qu.:1.057e-04   3rd Qu.: 24.0   3rd Qu.: 1.0909   3rd Qu.: 6.000  
##  Max.   :2.357e-04   Max.   :302.0   Max.   :13.7060   Max.   :11.000  
##    SWND_AFREQ        SWND_HLRA           WFIR_AFREQ         WFIR_EXPA        
##  Min.   :0.00000   Min.   :3.030e-08   Min.   :0.000000   Min.   :        0  
##  1st Qu.:0.02971   1st Qu.:1.078e-06   1st Qu.:0.000000   1st Qu.:        0  
##  Median :0.13369   Median :1.766e-06   Median :0.000000   Median :        0  
##  Mean   :0.11547   Mean   :2.775e-05   Mean   :0.002021   Mean   :   563272  
##  3rd Qu.:0.17430   3rd Qu.:2.114e-05   3rd Qu.:0.001067   3rd Qu.:        0  
##  Max.   :1.46035   Max.   :8.467e-04   Max.   :0.063501   Max.   :258377478  
##    WFIR_EXPT         WFIR_EXP_AREA        WFIR_HLRP           WFIR_HLRA        
##  Min.   :0.000e+00   Min.   : 0.00000   Min.   :1.784e-06   Min.   :4.620e-07  
##  1st Qu.:0.000e+00   1st Qu.: 0.00000   1st Qu.:6.677e-06   1st Qu.:6.600e-07  
##  Median :0.000e+00   Median : 0.00000   Median :1.960e-05   Median :8.470e-07  
##  Mean   :1.903e+09   Mean   : 0.15782   Mean   :4.423e-05   Mean   :9.784e-04  
##  3rd Qu.:1.102e+09   3rd Qu.: 0.02316   3rd Qu.:1.973e-05   3rd Qu.:5.797e-05  
##  Max.   :8.390e+10   Max.   :31.74707   Max.   :9.962e-04   Max.   :2.701e-02  
##    WFIR_EALT          WFIR_EALS        WFIR_ALRA             WFRI_R      
##  Min.   :       0   Min.   :  0.00   Min.   :0.000e+00   Min.   :0.0000  
##  1st Qu.:       0   1st Qu.:  0.00   1st Qu.:0.000e+00   1st Qu.:0.0000  
##  Median :       0   Median :  0.00   Median :0.000e+00   Median :0.0000  
##  Mean   :  152785   Mean   : 29.78   Mean   :7.176e-07   Mean   :0.2071  
##  3rd Qu.:    2518   3rd Qu.: 76.73   3rd Qu.:0.000e+00   3rd Qu.:0.0000  
##  Max.   :29548172   Max.   :100.00   Max.   :1.377e-04   Max.   :1.0000  
##    TRCT_WAREA          TRCT_SLOPE       CNTY_ELEV        CNTY_TEMP    
##  Min.   :0.000e+00   Min.   :0.0000   Min.   : -84.0   Min.   :45.50  
##  1st Qu.:0.000e+00   1st Qu.:0.4167   1st Qu.:  27.0   1st Qu.:60.50  
##  Median :0.000e+00   Median :0.7900   Median :  96.0   Median :63.70  
##  Mean   :8.072e+05   Mean   :1.6222   Mean   : 193.9   Mean   :62.85  
##  3rd Qu.:4.108e+03   3rd Qu.:2.4142   3rd Qu.: 234.0   3rd Qu.:64.20  
##  Max.   :1.098e+09   Max.   :9.0156   Max.   :2534.0   Max.   :75.50  
##   CNTY_PRECIP     TRCT_VEGLF       
##  Min.   : 2.17   Length:9098       
##  1st Qu.: 7.64   Class :character  
##  Median : 8.64   Mode  :character  
##  Mean   :11.04                     
##  3rd Qu.:13.45                     
##  Max.   :52.90

# Count the missing values in every column

c_data %>%
  is.na() %>%
  colSums()

##     TRACTFIPS    POPULATION          AREA    DRGT_EVNTS    DRGT_AFREQ 
##             0             0             0             0             0 
##     DRGT_HLRA    HWAV_EVNTS    HWAV_AFREQ     HWAV_HLRA    LTNG_EVNTS 
##             0             0             0             0             0 
##    LTNG_AFREQ    SWND_EVNTS    SWND_AFREQ     SWND_HLRA    WFIR_AFREQ 
##             0             0             0             0             0 
##     WFIR_EXPA     WFIR_EXPT WFIR_EXP_AREA     WFIR_HLRP     WFIR_HLRA 
##             0             0             0             0             0 
##     WFIR_EALT     WFIR_EALS     WFIR_ALRA        WFRI_R    TRCT_WAREA 
##             0             0             0             0             0 
##    TRCT_SLOPE     CNTY_ELEV     CNTY_TEMP   CNTY_PRECIP    TRCT_VEGLF 
##             0             0             0             0             0

# Review the current distribution of the predictors to see whether any of them
# may need a transformation later.

# Layers shared by every histogram: 20 bins and the light theme.
hist_layers <- list(geom_histogram(bins = 20), theme_light())

g1 <- ggplot(c_data, aes(x = POPULATION)) + hist_layers
g2 <- ggplot(c_data, aes(x = AREA)) + hist_layers
g3 <- ggplot(c_data, aes(x = DRGT_EVNTS)) + hist_layers
g4 <- ggplot(c_data, aes(x = HWAV_AFREQ)) + hist_layers
g7 <- ggplot(c_data, aes(x = WFIR_AFREQ)) + hist_layers
g10 <- ggplot(c_data, aes(x = WFIR_ALRA)) + hist_layers
g14 <- ggplot(c_data, aes(x = LTNG_AFREQ)) + hist_layers
g22 <- ggplot(c_data, aes(x = SWND_AFREQ)) + hist_layers

# Plot for the project write-up: only a selected few, on a 4 x 2 grid

plt1 <- ggarrange(
  g1, g2, g3, g4, g14, g22, g10, g7,
  nrow = 4,
  ncol = 2,
  align = "h",
  heights = 2,
  font.label = list(size = 3, color = "black")
)

annotate_figure(
  plt1,
  top = text_grob("Distribution of Selected WildFire Predictor variables ", size = 9)
)

# Boxplots of a few predictors split by the response variable WFRI_R

# Layers shared by every boxplot.
box_layers <- list(geom_boxplot(), theme_light())

g1 <- ggplot(c_data, aes(y = TRCT_SLOPE, x = factor(WFRI_R))) +
  box_layers +
  labs(x = "WildFire Present", y = "Tract Slope")
g2 <- ggplot(c_data, aes(y = SWND_EVNTS, x = factor(WFRI_R))) +
  box_layers +
  labs(x = "WildFire Present", y = "Annual Strong Wind Events")
g3 <- ggplot(c_data, aes(y = CNTY_PRECIP, x = factor(WFRI_R))) +
  box_layers +
  labs(x = "WildFire Present", y = "Annual County Precipitation")
# LTNG_EVNTS sits with the other lightning columns, so the axis label reads
# "Lightning" (the label previously said "Lighting").
g4 <- ggplot(c_data, aes(y = LTNG_EVNTS, x = factor(WFRI_R))) +
  box_layers +
  labs(x = "WildFire Present", y = "Annual Lightning Events")

plt1 <- ggarrange(
  g1, g2, g3, g4,
  nrow = 2,
  ncol = 2,
  align = "h",
  heights = 2,
  font.label = list(size = 3, color = "black")
)

annotate_figure(
  plt1,
  top = text_grob("Difference in Summary Statistics With Wildfire presence", size = 9)
)

# Checking if the data set is imbalanced

# WFRI_R is still numeric at this point. Mapping a numeric column to `fill`
# makes ggplot drop the aesthetic during the count statistic (with a warning),
# so the bars come out uncoloured. Converting it to a factor gives one coloured
# bar per class.
c_data %>%
  ggplot(aes(x = factor(WFRI_R), fill = factor(WFRI_R))) +
  geom_bar() +
  labs(
    title = "WildFire Cases in the Data Set",
    x = "WildFire Presence",
    fill = "WFRI_R"
  )

## Warning: The following aesthetics were dropped during statistical transformation: fill
## i This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

### Data Preparation

# The tract vegetation life form is turned into binary dummy variables below;
# list the distinct categories first.

unique(c_data[["TRCT_VEGLF"]])

## [1] "Shrub"       "Tree"        "Developed"   "Herb"        "Agriculture"
## [6] "Sparse"      "Barren"      "Snow-Ice"    "Water"

# One 0/1 indicator per vegetation life form; "Water" has no indicator of its
# own and is therefore the category with all indicators at 0.
# The agriculture indicator compares against "Agriculture", the spelling that
# actually occurs in TRCT_VEGLF. The comparison string was previously
# misspelled, so the column was 0 for every tract. The column keeps its
# existing name (.isArgiculture) because later steps refer to it by that name.
c_data <- c_data %>%
  mutate(
    .isShrub = if_else(TRCT_VEGLF == "Shrub", 1, 0),
    .isTree = if_else(TRCT_VEGLF == "Tree", 1, 0),
    .isDeveloped = if_else(TRCT_VEGLF == "Developed", 1, 0),
    .isHerb = if_else(TRCT_VEGLF == "Herb", 1, 0),
    .isArgiculture = if_else(TRCT_VEGLF == "Agriculture", 1, 0),
    .isSparse = if_else(TRCT_VEGLF == "Sparse", 1, 0),
    .isBarren = if_else(TRCT_VEGLF == "Barren", 1, 0),
    .isSnowIce = if_else(TRCT_VEGLF == "Snow-Ice", 1, 0)
  ) %>%
  # the text column is no longer needed once the indicators exist
  select(-c("TRCT_VEGLF"))

# Check the predictors for multicollinearity.
# Drought events and drought frequency are highly correlated; variable
# selection will show whether one of them drops out of the optimized
# predictor set.

temp <- c_data %>%
  select(-all_of(c("WFRI_R", "TRACTFIPS"))) %>%
  cor()

## Warning in stats::cor(x, ...): the standard deviation is zero

# Set the binary response and the vegetation indicators as factors before
# modeling. across() replaces the superseded mutate_at(); all_of() makes a
# missing column an error instead of a silent no-op.

factor_cols <- c(
  "WFRI_R", ".isShrub", ".isTree", ".isDeveloped", ".isHerb",
  ".isArgiculture", ".isSparse", ".isBarren", ".isSnowIce"
)
c_data <- c_data %>%
  mutate(across(all_of(factor_cols), as.factor))

# Downsample the non-wildfire cases so both classes are equally represented
# and the models can predict more fire cases.
#
# The predictors are selected by name: dropping "the last column" removed
# .isSnowIce rather than the response, which left WFRI_R inside `x` and made
# downSample() append a second copy of the response as `Class`. `yname` stores
# the sampled response as WFRI_R, so the column is now the last one in the
# result.
# downSample() draws rows at random; the seed makes the sample repeatable.

set.seed(3)
dwn_data <- downSample(
  x = c_data %>% select(-WFRI_R),
  y = c_data$WFRI_R,
  yname = "WFRI_R"
)

# Class distribution after downsampling

ggplot(dwn_data, aes(fill = WFRI_R)) +
  geom_bar(aes(x = WFRI_R)) +
  labs(
    title = "WildFire Cases in the Data Set",
    x = "WildFire Presence"
  )

# Split the downsampled data into training (70%) and test (30%) sets.
# sample.split() assigns rows at random while keeping the class ratio of
# WFRI_R, so a seed is set to make the split -- and every metric reported from
# it -- repeatable from run to run.

set.seed(3)
temp <- sample.split(dwn_data$WFRI_R, SplitRatio = 0.7)
training_data <- dwn_data[temp, ]
test_data <- dwn_data[!temp, ]

names(test_data)

##  [1] "TRACTFIPS"      "POPULATION"     "AREA"           "DRGT_EVNTS"    
##  [5] "DRGT_AFREQ"     "DRGT_HLRA"      "HWAV_EVNTS"     "HWAV_AFREQ"    
##  [9] "HWAV_HLRA"      "LTNG_EVNTS"     "LTNG_AFREQ"     "SWND_EVNTS"    
## [13] "SWND_AFREQ"     "SWND_HLRA"      "WFIR_AFREQ"     "WFIR_EXPA"     
## [17] "WFIR_EXPT"      "WFIR_EXP_AREA"  "WFIR_HLRP"      "WFIR_HLRA"     
## [21] "WFIR_EALT"      "WFIR_EALS"      "WFIR_ALRA"      "WFRI_R"        
## [25] "TRCT_WAREA"     "TRCT_SLOPE"     "CNTY_ELEV"      "CNTY_TEMP"     
## [29] "CNTY_PRECIP"    ".isShrub"       ".isTree"        ".isDeveloped"  
## [33] ".isHerb"        ".isArgiculture" ".isSparse"      ".isBarren"     
## [37] "Class"

# Every training column except the response.
# NOTE(review): this keeps identifier columns such as TRACTFIPS; confirm that
# is intended before x_traindata is used as a predictor list.
train_cols <- names(training_data)
x_traindata <- setdiff(train_cols, "WFRI_R")
names(training_data)

##  [1] "TRACTFIPS"      "POPULATION"     "AREA"           "DRGT_EVNTS"    
##  [5] "DRGT_AFREQ"     "DRGT_HLRA"      "HWAV_EVNTS"     "HWAV_AFREQ"    
##  [9] "HWAV_HLRA"      "LTNG_EVNTS"     "LTNG_AFREQ"     "SWND_EVNTS"    
## [13] "SWND_AFREQ"     "SWND_HLRA"      "WFIR_AFREQ"     "WFIR_EXPA"     
## [17] "WFIR_EXPT"      "WFIR_EXP_AREA"  "WFIR_HLRP"      "WFIR_HLRA"     
## [21] "WFIR_EALT"      "WFIR_EALS"      "WFIR_ALRA"      "WFRI_R"        
## [25] "TRCT_WAREA"     "TRCT_SLOPE"     "CNTY_ELEV"      "CNTY_TEMP"     
## [29] "CNTY_PRECIP"    ".isShrub"       ".isTree"        ".isDeveloped"  
## [33] ".isHerb"        ".isArgiculture" ".isSparse"      ".isBarren"     
## [37] "Class"

Data Analysis

# Connect to the H2O server (per the h2o start-up message, its web UI is at
# http://localhost:54321 once running)

h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         28 minutes 57 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.42.0.2 
##     H2O cluster version age:    4 months and 11 days 
##     H2O cluster name:           H2O_started_from_R_walki_ufu234 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.75 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.1.2 (2021-11-01)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (4 months and 11 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Upload the training portion of the 70/30 split (made earlier with
# sample.split) into H2O.

train.h2o <- as.h2o(x = training_data)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

test.h2o <- as.h2o(x = test_data)  # held-out test portion, uploaded into H2O

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Model 1 | Random Forest

# Predictor list shared by the first version of each model. The wildfire
# exposure columns (WFIR_EXPA, WFIR_EXPT, WFIR_EXP_AREA) are left out because
# they are possibly tied to the likelihood response.
features <- c(
  "POPULATION", "AREA",
  "DRGT_AFREQ", "DRGT_HLRA",
  "HWAV_EVNTS", "HWAV_AFREQ", "HWAV_HLRA",
  "LTNG_EVNTS", "LTNG_AFREQ",
  "SWND_EVNTS", "SWND_AFREQ", "SWND_HLRA",
  "WFIR_AFREQ", "WFIR_HLRP", "WFIR_HLRA",
  "TRCT_WAREA", "TRCT_SLOPE",
  "CNTY_ELEV", "CNTY_TEMP", "CNTY_PRECIP",
  ".isShrub", ".isTree", ".isDeveloped", ".isHerb", ".isSparse", ".isBarren"
)
response <- "WFRI_R"

## Version 1 of Random Forest

# V1: early stopping on the AUC score to guard against overfitting, plus
# 5-fold cross-validation on the training frame.
rf.model <- h2o.randomForest(
  x = features,
  y = response,
  training_frame = train.h2o,
  nfolds = 5,
  keep_cross_validation_predictions = TRUE,
  balance_classes = FALSE,
  score_tree_interval = 10,
  stopping_metric = "AUC",
  stopping_rounds = 5,
  stopping_tolerance = 0.001,
  seed = 3
)

## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [.isBarren].

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Structure of the first random forest
rf.model@model[["model_summary"]]

# Cross-validation results per fold (the mean and sd columns are dropped);
# not much difference between the folds
rf.cross <- select(
  rf.model@model[["cross_validation_metrics_summary"]],
  -c(mean, sd)
)
#rf.cross

# Variable importance of this model; the first ten variables of the table are
# carried into the final model
rf.features <- h2o.varimp(rf.model)
features_v2 <- as.vector(rf.features$variable[1:10])

## Version 2 | RF

# Smaller forest (25 trees, depth 5, at least 20 rows per leaf) for a more
# conservative AUC, trained only on the top-ranked features from version 1 and
# validated with 10 folds.
rf.v2 <- h2o.randomForest(
  x = features_v2,
  y = response,
  training_frame = train.h2o,
  ntrees = 25,
  max_depth = 5,
  min_rows = 20,
  nfolds = 10,
  keep_cross_validation_predictions = TRUE,
  balance_classes = FALSE,
  score_tree_interval = 10,
  stopping_metric = "AUC",
  stopping_rounds = 5,
  stopping_tolerance = 0.01,
  seed = 3
)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

RF Version 2 - Model Performance

# Performance metrics of the revised random forest on the held-out test frame
rf2_perf <- h2o.performance(model = rf.v2, newdata = test.h2o)
rf2_perf

## H2OBinomialMetrics: drf
## 
## MSE:  0.03501276
## RMSE:  0.187117
## LogLoss:  0.1388012
## Mean Per-Class Error:  0.03982301
## AUC:  0.9917065
## AUCPR:  0.9910499
## Gini:  0.983413
## R^2:  0.859949
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##          0   1    Error      Rate
## 0      539  26 0.046018   =26/565
## 1       19 546 0.033628   =19/565
## Totals 558 572 0.039823  =45/1130
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.623578   0.960422 237
## 2                       max f2  0.184414   0.976413 283
## 3                 max f0point5  0.682292   0.964247 219
## 4                 max accuracy  0.661591   0.960177 225
## 5                max precision  0.997350   1.000000   0
## 6                   max recall  0.078670   1.000000 331
## 7              max specificity  0.997350   1.000000   0
## 8             max absolute_mcc  0.661591   0.920425 225
## 9   max min_per_class_accuracy  0.636041   0.959292 231
## 10 max mean_per_class_accuracy  0.661591   0.960177 225
## 11                     max tns  0.997350 565.000000   0
## 12                     max fns  0.997350 562.000000   0
## 13                     max fps  0.005567 565.000000 399
## 14                     max tps  0.078670 565.000000 331
## 15                     max tnr  0.997350   1.000000   0
## 16                     max fnr  0.997350   0.994690   0
## 17                     max fpr  0.005567   1.000000 399
## 18                     max tpr  0.078670   1.000000 331
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

rf2_pred <- h2o.predict(object = rf.v2, newdata = test.h2o)  # scores for the test frame

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Share of test rows whose predicted class equals the observed class
pred1 <- as.data.frame(rf2_pred$predict)
pred1[["predict"]] <- factor(pred1[["predict"]], levels = c(0, 1))
rf2_hits <- pred1[["predict"]] == test_data[["WFRI_R"]]
mean(rf2_hits)

## [1] 0.9539823

# Plot the revised model with h2o's default plot method
# (NOTE(review): described here as the logloss curve -- confirm which metric
# the default plot shows for this model)
plot(rf.v2)

RF Version 2 - Predictions

# Predictions of the revised model on the test set.
# The test frame is scored once and the three result columns are taken from
# that single data frame, instead of running the identical h2o.predict() call
# three times.
rf.scores <- h2o.predict(rf.v2, test.h2o) %>% as.data.frame()

rf.pred <- rf.scores %>% pull(predict)     # predicted class
rf.precsnprob <- rf.scores %>% pull(p1)    # predicted probability of class 1
rf.reclprob <- rf.scores %>% pull(p0)      # predicted probability of class 0

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

RF Version 2 - confusion matrix of rf predictions

# Confusion matrix of the random forest predictions against the observed
# classes; class "1" is the positive class and precision/recall/F1 are reported
rf.con <- confusionMatrix(
  data = rf.pred,
  reference = test_data$WFRI_R,
  positive = "1",
  mode = "prec_recall"
)

rf.con

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 527  14
##          1  38 551
##                                           
##                Accuracy : 0.954           
##                  95% CI : (0.9401, 0.9654)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.908           
##                                           
##  Mcnemar's Test P-Value : 0.001425        
##                                           
##               Precision : 0.9355          
##                  Recall : 0.9752          
##                      F1 : 0.9549          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4876          
##    Detection Prevalence : 0.5212          
##       Balanced Accuracy : 0.9540          
##                                           
##        'Positive' Class : 1               
##

RF Version 2 - storing confusion matrix results

# Per-class statistics of the confusion matrix as a one-row data frame, with
# the column names that contain spaces replaced by underscore versions
by_class <- rf.con$byClass %>% as.data.frame()
confusionM <- as.data.frame(t(by_class)) %>%
  rename(
    Pos_pred_value = "Pos Pred Value",
    Neg_Pred_value = "Neg Pred Value",
    Detection_rate = "Detection Rate",
    Detection_prevalence = "Detection Prevalence",
    Balanced_Accuracy = "Balanced Accuracy"
  )
#confusionM

# Reference table for the random forest on the test data: observed class,
# predicted class and the predicted probabilities of class 0 (N) and class 1 (Y)
# all in one place.
# The columns are named with `=`. Using `<-` inside data.frame() created stray
# global variables (obs, pred, N, Y) and auto-generated column names that then
# had to be renamed by their deparsed text.

rf.summary <- data.frame(
  obs = test_data$WFRI_R,
  pred = rf.pred,
  N = rf.reclprob,
  Y = rf.precsnprob
)

# ROC curve of the class-1 probability against the observed class, read from
# the table itself rather than from a leaked global variable
rf.auc <- roc(rf.summary$obs, rf.summary$Y)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Test-set performance of the model, stored as a one-row data frame.
# Each metric is computed as a named value first and the data frame is then
# built with explicit column names. The earlier version assigned with `<-`
# inside data.frame() and renamed the columns by their auto-generated,
# deparsed-expression names, which breaks as soon as any expression is edited.

# Class labels as numbers for the regression-style metrics
pred_num <- as.numeric(as.character(rf.summary$pred))
obs_num <- as.numeric(as.character(rf.summary$obs))

t.R2 <- R2_Score(y_pred = pred_num, y_true = obs_num)
t.mse <- MSE(pred_num, obs_num)
t.RSME <- RMSE(pred_num, obs_num)
t.AUC <- rf.auc$auc
t.ClassError <- mean(rf.summary$pred != rf.summary$obs)

#Stores Test Performance of Models

test.results <- data.frame(
  R2 = t.R2,
  MSE = t.mse,
  RMSE = t.RSME,
  AUC = as.numeric(t.AUC),  # plain number instead of pROC's "auc" object
  classError = t.ClassError
)
test.results

##          R2       MSE      RMSE       AUC classError
## 1 0.8159292 0.0460177 0.2145174 0.9917049  0.0460177

# Plotting the ROC curve of Random Forest - Version 2.
# The AUC in the title is taken from the fitted roc object, so the title always
# matches the model that was actually evaluated (the hard-coded value no longer
# agreed with the computed AUC). The dotted red diagonal is the reference line.

ggroc(rf.auc) +
  ggtitle(sprintf("Random Forest ROC Curve of AUC= %.4f", as.numeric(rf.auc$auc))) +
  geom_segment(
    aes(x = 1, y = 0, xend = 0, yend = 1),
    linetype = "dotted",
    color = "red"
  ) +
  theme_light()

# Model 2 | Gradient Boosting Decision Tree

# Version 1: higher learning rate; early stopping on AUC (stopping on logloss
# gave a lower R^2); 5-fold cross-validation on the training set.

gb_model <- h2o.gbm(
  x = features,
  y = response,
  training_frame = train.h2o,
  ntrees = 1000,
  learn_rate = 0.1,
  nfolds = 5,
  score_tree_interval = 5,
  stopping_metric = "auc",
  stopping_rounds = 3,
  stopping_tolerance = 0.001,
  seed = 3
)

## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [.isBarren].

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Structure of the first gradient boosting model

gb_model@model[["model_summary"]]

# Cross-validation results per fold (mean and sd columns dropped)
gb.cross <- select(
  gb_model@model[["cross_validation_metrics_summary"]],
  -c(mean, sd)
)

# Variable importance of gb_model; the first ten variables of the table become
# the predictor set for version 2

gb.features <- h2o.varimp(gb_model)
features_v2 <- as.vector(gb.features$variable[1:10])

# Version 2: fewer stopping rounds for a quicker check of the AUC score and a
# smaller ensemble (45 trees) than version 1, using the top-ranked features.

gb_v2 <- h2o.gbm(
  x = features_v2,
  y = response,
  training_frame = train.h2o,
  ntrees = 45,
  learn_rate = 0.1,
  nfolds = 5,
  score_tree_interval = 5,
  stopping_metric = "auc",
  stopping_rounds = 2,
  stopping_tolerance = 0.001,
  seed = 3
)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

#see performance of prediction

#Score the test frame once and reuse the result. The same h2o.predict() call
#was previously repeated three times, once per extracted column.
gb.scored<-h2o.predict(gb_v2,test.h2o)%>%as.data.frame()

#predicted class label
gb.pred<-gb.scored%>%pull(predict)

#p1 column of the prediction frame
gb.precsnprob<-gb.scored%>%pull(p1)

#p0 column of the prediction frame
gb.reclprob<-gb.scored%>%pull(p0)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

#Confusion matrix of the gradient boosting predictions against the observed
#test labels; class "1" is treated as positive and precision/recall statistics
#are reported.

gb.con<-confusionMatrix(
  data=gb.pred,
  reference=test_data$WFRI_R,
  positive="1",
  mode="prec_recall"
)

gb.con

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 542  19
##          1  23 546
##                                           
##                Accuracy : 0.9628          
##                  95% CI : (0.9501, 0.9731)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9257          
##                                           
##  Mcnemar's Test P-Value : 0.6434          
##                                           
##               Precision : 0.9596          
##                  Recall : 0.9664          
##                      F1 : 0.9630          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4832          
##    Detection Prevalence : 0.5035          
##       Balanced Accuracy : 0.9628          
##                                           
##        'Positive' Class : 1               
##

#see summary results of the gradient boosting model on the test data
#Columns are named with `=`: using `<-` inside data.frame() assigns stray
#global variables and produces mangled column names that need renaming.

gb.summary<-data.frame(
      obs=test_data$WFRI_R,   #observed class
      pred=gb.pred,           #predicted class
      N=gb.reclprob,          #p0 column of the predictions
      Y=gb.precsnprob         #p1 column of the predictions
  )

#ROC of observed class against the p1 score, read from the data frame itself
#rather than from a global variable
gb.auc<-roc(gb.summary$obs,gb.summary$Y)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

#Test-set metrics for the gradient boosted model.
#Observed and predicted classes are converted to their numeric values once.
gb.obs.num<-as.numeric(as.character(gb.summary$obs))
gb.pred.num<-as.numeric(as.character(gb.summary$pred))

#Columns are named directly with `=`; the earlier `<-` form relied on a
#rename() keyed on auto-generated (deparsed and truncated) column names.
temp<-data.frame(
  R2=R2_Score(y_pred = gb.pred.num,y_true = gb.obs.num),
  MSE=MSE(gb.pred.num,gb.obs.num),
  RMSE=RMSE(gb.pred.num,gb.obs.num),
  AUC=as.numeric(gb.auc$auc),
  classError=mean(gb.summary$pred!=gb.summary$obs)
)

#plotting ROC curve of gradient boosted DT
#the AUC in the title is taken from gb.auc instead of being hard-coded

ggroc(gb.auc)+
  ggtitle(paste0("Gradient Boosted Decision Tree ROC Curve of AUC= ",sprintf("%.4f",temp$AUC)))+
  geom_segment(aes(x=1,y=0,xend=0,yend=1),linetype="dotted",color="red")+
  theme_light()

#adding gradient boosting to the results data frame

test.results[2,]<-c(R2=temp$R2,MSE=temp$MSE,RMSE=temp$RMSE,AUC=temp$AUC,classError=temp$classError)
rownames(test.results)<-c("Random Forest","Gradient Boosting")

#storing confusion matrix testing results
#byClass is reshaped into a one-row data frame so each statistic can be
#picked out by name

temp<-gb.con$byClass%>%as.data.frame()%>%t()%>%as.data.frame()

confusionM<-add_row(
  confusionM,
  Sensitivity=temp$Sensitivity,
  Specificity=temp$Specificity,
  Pos_pred_value=temp$`Pos Pred Value`,
  Neg_Pred_value=temp$`Neg Pred Value`,
  Precision=temp$Precision,
  Recall=temp$Recall,
  F1=temp$F1,
  Prevalence=temp$Prevalence,
  Detection_rate=temp$`Detection Rate`,
  Detection_prevalence=temp$`Detection Prevalence`,
  Balanced_Accuracy=temp$`Balanced Accuracy`
)

rownames(confusionM)<-c("Random Forest","Gradient Boosting")

##Part 1 of analysis RF VS GB

#Side-by-side confusion-matrix statistics of the two models so far

confusionM

##                   Sensitivity Specificity Pos_pred_value Neg_Pred_value
## Random Forest       0.9752212   0.9327434      0.9354839      0.9741220
## Gradient Boosting   0.9663717   0.9592920      0.9595782      0.9661319
##                   Precision    Recall        F1 Prevalence Detection_rate
## Random Forest     0.9354839 0.9752212 0.9549393        0.5      0.4876106
## Gradient Boosting 0.9595782 0.9663717 0.9629630        0.5      0.4831858
##                   Detection_prevalence Balanced_Accuracy
## Random Forest                0.5212389         0.9539823
## Gradient Boosting            0.5035398         0.9628319

#overlay the ROC curves of gradient boosted DT and random forest in one plot

rocs<-list(
  Gradient_Boost=gb.auc,
  Random_Forest=rf.auc
)

ggroc(rocs)+
  ggtitle("ROC Performance of current models")+
  geom_segment(aes(x=1,y=0,xend=0,yend=1),linetype="dotted",color="red")+
  theme_light()

#see Precision-Recall Plots of the two models
#For each model: evaluate on the test frame, take the metric table and keep
#only the recall and precision columns, then tag the rows with the model name.

rf.perf<-h2o.performance(rf.v2,test.h2o)%>%
  h2o.metric()%>%
  as.data.frame()%>%
  select(c(recall,precision))
rf.perf$model<-"Random Forest"

gb.perf<-h2o.performance(gb_v2,test.h2o)%>%
  h2o.metric()%>%
  as.data.frame()%>%
  select(c(recall,precision))
gb.perf$model<-"Gradient Boosted DT"

combine_rpplots<-rbind(rf.perf,gb.perf)

#The legend title is set through the `color` aesthetic: labs() only accepts
#aesthetic names (plus title/subtitle/caption/tag), so `legend=` had no effect.
ggplot(combine_rpplots,aes(recall,precision,group=model,color=model))+
  geom_line()+
  labs(title ="Precision-Recall AUC Curve",color="current ML models")+
  theme_light()

#Plotting residuals vs fitted
#Residual = observed class - predicted class. The classes are converted with
#as.numeric(as.character()) (as done for the R2/MSE/RMSE metrics) because
#as.numeric() on a factor returns level codes, which only line up when obs
#and pred happen to share exactly the same level set.

rf.summary<-rf.summary%>%mutate(resid=as.numeric(as.character(obs))-as.numeric(as.character(pred)))
gb.summary<-gb.summary%>%mutate(resid=as.numeric(as.character(obs))-as.numeric(as.character(pred)))

g1<-gb.summary%>%
  ggplot(aes(pred,resid))+
  geom_point()+
  labs(title="GBDT| Residuals vs Predicted",y="Residuals",x="Predicted")+
  theme_light()
g2<-rf.summary%>%
  ggplot(aes(pred,resid))+
  geom_point()+
  labs(title="RF| Residuals vs Predicted",y="Residuals",x="Predicted")+
  theme_light()

#both panels side by side under a shared heading
plt2<-ggarrange(g1,g2,ncol = 2)
annotate_figure(plt2,top = text_grob("Residuals vs Predicted values across Models",size=9))

#feature analysis. Plotting the top ten features of each model by the
#scaled_importance column of its variable-importance table

rf.features<-h2o.varimp(rf.v2)%>%as.data.frame()
gb.features<-h2o.varimp(gb_v2)%>%as.data.frame()

#Each panel keeps only the ten largest scaled importances (the full tables
#stay untouched in rf.features / gb.features), orders the bars by importance
#and is titled so the two panels can be told apart.
g1<-rf.features%>%
  slice_max(scaled_importance,n=10,with_ties=FALSE)%>%
  ggplot(aes(y=reorder(variable,scaled_importance),x=scaled_importance))+
  geom_col()+
  labs(title="RF| Feature importance",y="variable")+
  theme_light()
g2<-gb.features%>%
  slice_max(scaled_importance,n=10,with_ties=FALSE)%>%
  ggplot(aes(y=reorder(variable,scaled_importance),x=scaled_importance))+
  geom_col()+
  labs(title="GBDT| Feature importance",y="variable")+
  theme_light()

plt3<-ggarrange(g1,g2,ncol = 2)
annotate_figure(plt3,top = text_grob("Feature Importance Across Models",size=9))

Model 3 | AutoML Models

Run AutoML for 5 base models using the “features” data.

# AutoML run on the full `features` set, capped at 5 models, with a fixed seed
aml <- h2o.automl(
  x = features,
  y = response,
  training_frame = train.h2o,
  seed = 1,
  max_models = 5
)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
## 08:55:53.635: AutoML: XGBoost is not available; skipping it.
## 08:55:53.635: _train param, Dropping bad and constant columns: [.isBarren]
## 08:55:54.183: _train param, Dropping bad and constant columns: [.isBarren]
## 08:55:55.199: _train param, Dropping bad and constant columns: [.isBarren]
  |                                                                            
  |======                                                                |   9%
## 08:55:56.72: _train param, Dropping bad and constant columns: [.isBarren]
## 08:55:57.25: _train param, Dropping bad and constant columns: [.isBarren]
  |                                                                            
  |============                                                          |  18%
## 08:55:58.246: _train param, Dropping unused columns: [.isBarren]
  |                                                                            
  |==============                                                        |  21%
## 08:55:59.905: _train param, Dropping unused columns: [.isBarren]
  |                                                                            
  |======================================================================| 100%

View the AutoML Leaderboard

# Leaderboard of the AutoML run; n = nrow(lb) shows every row rather than
# head()'s default of 6
lb <- slot(aml, "leaderboard")
head(x = lb, n = nrow(lb))

##                                                 model_id       auc   logloss
## 1    StackedEnsemble_AllModels_1_AutoML_2_20231207_85553 0.9879412 0.1226840
## 2 StackedEnsemble_BestOfFamily_1_AutoML_2_20231207_85553 0.9875161 0.1233697
## 3                          GBM_1_AutoML_2_20231207_85553 0.9874417 0.1230330
## 4                          GBM_3_AutoML_2_20231207_85553 0.9872120 0.1300248
## 5                          GBM_2_AutoML_2_20231207_85553 0.9867102 0.1289274
## 6                          DRF_1_AutoML_2_20231207_85553 0.9849304 0.1568808
## 7                          GLM_1_AutoML_2_20231207_85553 0.9832471 0.1853065
##       aucpr mean_per_class_error      rmse        mse
## 1 0.9855875           0.04321456 0.1867403 0.03487193
## 2 0.9852610           0.04397271 0.1870281 0.03497950
## 3 0.9839523           0.04397271 0.1870764 0.03499757
## 4 0.9838299           0.04510993 0.1908446 0.03642167
## 5 0.9834360           0.04169826 0.1886203 0.03557763
## 6 0.9817261           0.04927976 0.1990331 0.03961417
## 7 0.9783591           0.05496588 0.2165196 0.04688074

View the Leader Model

# Keep a handle on the AutoML leader model and print its details
leader_model <- slot(aml, "leader")
leader_model

## Model Details:
## ==============
## 
## H2OBinomialModel: stackedensemble
## Model ID:  StackedEnsemble_AllModels_1_AutoML_2_20231207_85553 
## Model Summary for Stacked Ensemble: 
##                                     key            value
## 1                     Stacking strategy cross_validation
## 2  Number of base models (used / total)              5/5
## 3      # GBM base models (used / total)              3/3
## 4      # DRF base models (used / total)              1/1
## 5      # GLM base models (used / total)              1/1
## 6                 Metalearner algorithm              GLM
## 7    Metalearner fold assignment scheme           Random
## 8                    Metalearner nfolds                5
## 9               Metalearner fold_column               NA
## 10   Custom metalearner hyperparameters             None
## 
## 
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
## 
## MSE:  0.01500439
## RMSE:  0.1224924
## LogLoss:  0.05879297
## Mean Per-Class Error:  0.0166793
## AUC:  0.9989019
## AUCPR:  0.9988709
## Gini:  0.9978037
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           0    1    Error      Rate
## 0      1302   17 0.012889  =17/1319
## 1        27 1292 0.020470  =27/1319
## Totals 1329 1309 0.016679  =44/2638
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.673546    0.983257 218
## 2                       max f2  0.467763    0.991113 259
## 3                 max f0point5  0.743796    0.989156 196
## 4                 max accuracy  0.687783    0.983321 216
## 5                max precision  0.999509    1.000000   0
## 6                   max recall  0.395572    1.000000 274
## 7              max specificity  0.999509    1.000000   0
## 8             max absolute_mcc  0.687783    0.966681 216
## 9   max min_per_class_accuracy  0.646157    0.981046 226
## 10 max mean_per_class_accuracy  0.687783    0.983321 216
## 11                     max tns  0.999509 1319.000000   0
## 12                     max fns  0.999509 1275.000000   0
## 13                     max fps  0.000769 1319.000000 399
## 14                     max tps  0.395572 1319.000000 274
## 15                     max tnr  0.999509    1.000000   0
## 16                     max fnr  0.999509    0.966641   0
## 17                     max fpr  0.000769    1.000000 399
## 18                     max tpr  0.395572    1.000000 274
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## 
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.03487193
## RMSE:  0.1867403
## LogLoss:  0.122684
## Mean Per-Class Error:  0.04321456
## AUC:  0.9879412
## AUCPR:  0.9855875
## Gini:  0.9758823
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           0    1    Error       Rate
## 0      1238   81 0.061410   =81/1319
## 1        33 1286 0.025019   =33/1319
## Totals 1271 1367 0.043215  =114/2638
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.517357    0.957558 251
## 2                       max f2  0.156322    0.979312 313
## 3                 max f0point5  0.651170    0.949962 220
## 4                 max accuracy  0.517357    0.956785 251
## 5                max precision  0.999544    1.000000   0
## 6                   max recall  0.006095    1.000000 381
## 7              max specificity  0.999544    1.000000   0
## 8             max absolute_mcc  0.517357    0.914176 251
## 9   max min_per_class_accuracy  0.651170    0.949962 220
## 10 max mean_per_class_accuracy  0.517357    0.956785 251
## 11                     max tns  0.999544 1319.000000   0
## 12                     max fns  0.999544 1260.000000   0
## 13                     max fps  0.000730 1319.000000 399
## 14                     max tps  0.006095 1319.000000 381
## 15                     max tnr  0.999544    1.000000   0
## 16                     max fnr  0.999544    0.955269   0
## 17                     max fpr  0.000730    1.000000 399
## 18                     max tpr  0.006095    1.000000 381
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                mean       sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy   0.958769 0.009706   0.944134   0.958878   0.969524   0.956190
## auc        0.988100 0.007159   0.977428   0.984768   0.994309   0.989646
## err        0.041231 0.009706   0.055866   0.041121   0.030476   0.043810
## err_count 21.800000 5.403702  30.000000  22.000000  16.000000  23.000000
## f0point5   0.944906 0.013995   0.925267   0.939436   0.962567   0.944882
##           cv_5_valid
## accuracy    0.965116
## auc         0.994351
## err         0.034884
## err_count  18.000000
## f0point5    0.952381
## 
## ---
##                         mean        sd cv_1_valid cv_2_valid cv_3_valid
## precision           0.935174  0.017493   0.912281   0.925424   0.958175
## r2                  0.861339  0.035214   0.806819   0.856606   0.896717
## recall              0.986265  0.008294   0.981132   1.000000   0.980545
## residual_deviance 128.829390 34.619440 181.411250 139.068160  95.743210
## rmse                0.185002  0.023115   0.219743   0.189297   0.160653
## specificity         0.931058  0.020403   0.908088   0.916030   0.958955
##                   cv_4_valid cv_5_valid
## precision           0.936170   0.943820
## r2                  0.858638   0.887917
## recall              0.981413   0.988235
## residual_deviance 127.882900 100.041440
## rmse                0.187933   0.167383
## specificity         0.929688   0.942529

AutoML Leader-Board Predictions

# Score the test frame (test.h2o) with the AutoML leader model
pred <- h2o.predict(object = aml@leader, newdata = test.h2o)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Bring the H2O prediction frame into R as a regular data frame and preview
# its first rows
predictions_df <- pred %>% as.data.frame()

head(predictions_df)

##   predict        p0           p1
## 1       0 0.9895432 0.0104567651
## 2       0 0.9982961 0.0017038765
## 3       0 0.9949776 0.0050223601
## 4       0 0.9991782 0.0008217957
## 5       0 0.9988078 0.0011921620
## 6       0 0.9937353 0.0062646507

Top model with the “AUC” metric

# Ask AutoML for its best model ranked by AUC and print it
m <- h2o.get_best_model(object = aml, criterion = "auc")
m

## Model Details:
## ==============
## 
## H2OBinomialModel: stackedensemble
## Model ID:  StackedEnsemble_AllModels_1_AutoML_2_20231207_85553 
## Model Summary for Stacked Ensemble: 
##                                     key            value
## 1                     Stacking strategy cross_validation
## 2  Number of base models (used / total)              5/5
## 3      # GBM base models (used / total)              3/3
## 4      # DRF base models (used / total)              1/1
## 5      # GLM base models (used / total)              1/1
## 6                 Metalearner algorithm              GLM
## 7    Metalearner fold assignment scheme           Random
## 8                    Metalearner nfolds                5
## 9               Metalearner fold_column               NA
## 10   Custom metalearner hyperparameters             None
## 
## 
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
## 
## MSE:  0.01500439
## RMSE:  0.1224924
## LogLoss:  0.05879297
## Mean Per-Class Error:  0.0166793
## AUC:  0.9989019
## AUCPR:  0.9988709
## Gini:  0.9978037
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           0    1    Error      Rate
## 0      1302   17 0.012889  =17/1319
## 1        27 1292 0.020470  =27/1319
## Totals 1329 1309 0.016679  =44/2638
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.673546    0.983257 218
## 2                       max f2  0.467763    0.991113 259
## 3                 max f0point5  0.743796    0.989156 196
## 4                 max accuracy  0.687783    0.983321 216
## 5                max precision  0.999509    1.000000   0
## 6                   max recall  0.395572    1.000000 274
## 7              max specificity  0.999509    1.000000   0
## 8             max absolute_mcc  0.687783    0.966681 216
## 9   max min_per_class_accuracy  0.646157    0.981046 226
## 10 max mean_per_class_accuracy  0.687783    0.983321 216
## 11                     max tns  0.999509 1319.000000   0
## 12                     max fns  0.999509 1275.000000   0
## 13                     max fps  0.000769 1319.000000 399
## 14                     max tps  0.395572 1319.000000 274
## 15                     max tnr  0.999509    1.000000   0
## 16                     max fnr  0.999509    0.966641   0
## 17                     max fpr  0.000769    1.000000 399
## 18                     max tpr  0.395572    1.000000 274
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## 
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.03487193
## RMSE:  0.1867403
## LogLoss:  0.122684
## Mean Per-Class Error:  0.04321456
## AUC:  0.9879412
## AUCPR:  0.9855875
## Gini:  0.9758823
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           0    1    Error       Rate
## 0      1238   81 0.061410   =81/1319
## 1        33 1286 0.025019   =33/1319
## Totals 1271 1367 0.043215  =114/2638
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.517357    0.957558 251
## 2                       max f2  0.156322    0.979312 313
## 3                 max f0point5  0.651170    0.949962 220
## 4                 max accuracy  0.517357    0.956785 251
## 5                max precision  0.999544    1.000000   0
## 6                   max recall  0.006095    1.000000 381
## 7              max specificity  0.999544    1.000000   0
## 8             max absolute_mcc  0.517357    0.914176 251
## 9   max min_per_class_accuracy  0.651170    0.949962 220
## 10 max mean_per_class_accuracy  0.517357    0.956785 251
## 11                     max tns  0.999544 1319.000000   0
## 12                     max fns  0.999544 1260.000000   0
## 13                     max fps  0.000730 1319.000000 399
## 14                     max tps  0.006095 1319.000000 381
## 15                     max tnr  0.999544    1.000000   0
## 16                     max fnr  0.999544    0.955269   0
## 17                     max fpr  0.000730    1.000000 399
## 18                     max tpr  0.006095    1.000000 381
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                mean       sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy   0.958769 0.009706   0.944134   0.958878   0.969524   0.956190
## auc        0.988100 0.007159   0.977428   0.984768   0.994309   0.989646
## err        0.041231 0.009706   0.055866   0.041121   0.030476   0.043810
## err_count 21.800000 5.403702  30.000000  22.000000  16.000000  23.000000
## f0point5   0.944906 0.013995   0.925267   0.939436   0.962567   0.944882
##           cv_5_valid
## accuracy    0.965116
## auc         0.994351
## err         0.034884
## err_count  18.000000
## f0point5    0.952381
## 
## ---
##                         mean        sd cv_1_valid cv_2_valid cv_3_valid
## precision           0.935174  0.017493   0.912281   0.925424   0.958175
## r2                  0.861339  0.035214   0.806819   0.856606   0.896717
## recall              0.986265  0.008294   0.981132   1.000000   0.980545
## residual_deviance 128.829390 34.619440 181.411250 139.068160  95.743210
## rmse                0.185002  0.023115   0.219743   0.189297   0.160653
## specificity         0.931058  0.020403   0.908088   0.916030   0.958955
##                   cv_4_valid cv_5_valid
## precision           0.936170   0.943820
## r2                  0.858638   0.887917
## recall              0.981413   0.988235
## residual_deviance 127.882900 100.041440
## rmse                0.187933   0.167383
## specificity         0.929688   0.942529

AutoML Performance

# Performance metrics of the leader model computed on the test frame
perf <- h2o.performance(model = leader_model, newdata = test.h2o)
perf

## H2OBinomialMetrics: stackedensemble
## 
## MSE:  0.02698049
## RMSE:  0.1642574
## LogLoss:  0.09545156
## Mean Per-Class Error:  0.03185841
## AUC:  0.9944177
## AUCPR:  0.9939606
## Gini:  0.9888355
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##          0   1    Error      Rate
## 0      547  18 0.031858   =18/565
## 1       18 547 0.031858   =18/565
## Totals 565 565 0.031858  =36/1130
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.587435   0.968142 261
## 2                       max f2  0.069366   0.980222 314
## 3                 max f0point5  0.661526   0.970069 249
## 4                 max accuracy  0.587435   0.968142 261
## 5                max precision  0.999580   1.000000   0
## 6                   max recall  0.069366   1.000000 314
## 7              max specificity  0.999580   1.000000   0
## 8             max absolute_mcc  0.587435   0.936283 261
## 9   max min_per_class_accuracy  0.587435   0.968142 261
## 10 max mean_per_class_accuracy  0.587435   0.968142 261
## 11                     max tns  0.999580 565.000000   0
## 12                     max fns  0.999580 560.000000   0
## 13                     max fps  0.000689 565.000000 399
## 14                     max tps  0.069366 565.000000 314
## 15                     max tnr  0.999580   1.000000   0
## 16                     max fnr  0.999580   0.991150   0
## 17                     max fpr  0.000689   1.000000 399
## 18                     max tpr  0.069366   1.000000 314
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

#Predictions of the AutoML leader model on the test frame
aml_pred <- h2o.predict(object = leader_model, newdata = test.h2o)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

#Predicted labels as an R data frame, stored as a factor with levels 0 and 1
amlpred_df <- as.data.frame(aml_pred$predict)
amlpred_df[["predict"]] <- factor(amlpred_df[["predict"]], levels = c(0, 1))

#Confusion matrix of the AutoML predictions against the observed test labels
#(class "1" positive, precision/recall statistics)
aml_cm <- confusionMatrix(
  data = amlpred_df[["predict"]],
  reference = test_data$WFRI_R,
  positive = "1",
  mode = "prec_recall"
)

aml_cm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 540  14
##          1  25 551
##                                           
##                Accuracy : 0.9655          
##                  95% CI : (0.9531, 0.9753)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.931           
##                                           
##  Mcnemar's Test P-Value : 0.1093          
##                                           
##               Precision : 0.9566          
##                  Recall : 0.9752          
##                      F1 : 0.9658          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4876          
##    Detection Prevalence : 0.5097          
##       Balanced Accuracy : 0.9655          
##                                           
##        'Positive' Class : 1               
##

Model 4 - Naive-Bayes

# Naive Bayes on the full `features` set: no Laplace smoothing, 5-fold
# cross-validation with the fold predictions kept, fixed seed
pros_nb <- h2o.naiveBayes(
  x = features,
  y = response,
  training_frame = train.h2o,
  nfolds = 5,
  keep_cross_validation_predictions = TRUE,
  laplace = 0,
  seed = 1234
)

## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [.isBarren].

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Performance metrics of the Naive Bayes model on the test frame
nb_perf <- h2o.performance(model = pros_nb, newdata = test.h2o)
nb_perf

## H2OBinomialMetrics: naivebayes
## 
## MSE:  0.1279936
## RMSE:  0.3577619
## LogLoss:  1.062117
## Mean Per-Class Error:  0.08495575
## AUC:  0.9621114
## AUCPR:  0.9506444
## Gini:  0.9242227
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##          0   1    Error      Rate
## 0      498  67 0.118584   =67/565
## 1       29 536 0.051327   =29/565
## Totals 527 603 0.084956  =96/1130
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.000327   0.917808 267
## 2                       max f2  0.000105   0.947205 300
## 3                 max f0point5  0.005658   0.916821 199
## 4                 max accuracy  0.000374   0.915044 263
## 5                max precision  0.999994   0.973684   9
## 6                   max recall  0.000000   1.000000 399
## 7              max specificity  1.000000   0.984071   0
## 8             max absolute_mcc  0.000327   0.831972 267
## 9   max min_per_class_accuracy  0.001469   0.909735 229
## 10 max mean_per_class_accuracy  0.000374   0.915044 263
## 11                     max tns  1.000000 556.000000   0
## 12                     max fns  1.000000 259.000000   0
## 13                     max fps  0.000000 565.000000 399
## 14                     max tps  0.000000 565.000000 399
## 15                     max tnr  1.000000   0.984071   0
## 16                     max fnr  1.000000   0.458407   0
## 17                     max fpr  0.000000   1.000000 399
## 18                     max tpr  0.000000   1.000000 399
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

#Predictions of the Naive Bayes model on the test frame
nb_pred <- h2o.predict(object = pros_nb, newdata = test.h2o)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

#Predicted labels as an R data frame, stored as a factor with levels 0 and 1
nbpred_df <- as.data.frame(nb_pred$predict)
nbpred_df[["predict"]] <- factor(nbpred_df[["predict"]], levels = c(0, 1))

#Confusion matrix of the Naive Bayes predictions against the observed test
#labels (class "1" positive, precision/recall statistics)
nb_cm <- confusionMatrix(
  data = nbpred_df[["predict"]],
  reference = test_data$WFRI_R,
  positive = "1",
  mode = "prec_recall"
)

nb_cm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 511  51
##          1  54 514
##                                           
##                Accuracy : 0.9071          
##                  95% CI : (0.8886, 0.9234)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8142          
##                                           
##  Mcnemar's Test P-Value : 0.8453          
##                                           
##               Precision : 0.9049          
##                  Recall : 0.9097          
##                      F1 : 0.9073          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4549          
##    Detection Prevalence : 0.5027          
##       Balanced Accuracy : 0.9071          
##                                           
##        'Positive' Class : 1               
##

Model 5 - SVM Model

# H2O PSVM on the full `features` set with gamma 0.01 and rank ratio 0.1;
# training metrics are kept enabled and the seed is fixed
svm_model <- h2o.psvm(
  x = features,
  y = response,
  training_frame = train.h2o,
  gamma = 0.01,
  rank_ratio = 0.1,
  disable_training_metrics = FALSE,
  seed = 1
)

## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [.isBarren].

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Performance metrics of the SVM model on the test frame
svm_perf <- h2o.performance(model = svm_model, newdata = test.h2o)
svm_perf

## H2OBinomialMetrics: psvm
## 
## MSE:  0.4522124
## RMSE:  0.6724674
## LogLoss:  NaN
## Mean Per-Class Error:  0.4522124
## AUC:  NaN
## AUCPR:  NaN
## Gini:  NaN
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         0    1    Error       Rate
## 0      60  505 0.893805   =505/565
## 1       6  559 0.010619     =6/565
## Totals 66 1064 0.452212  =511/1130
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  1.000000   0.686311   0
## 2                       max f2  1.000000   0.840854   0
## 3                 max f0point5  1.000000   0.579755   0
## 4                 max accuracy  1.000000   0.547788   0
## 5                max precision  1.000000   0.525376   0
## 6                   max recall  1.000000   0.989381   0
## 7              max specificity  1.000000   0.106195   0
## 8             max absolute_mcc  1.000000   0.203775   0
## 9   max min_per_class_accuracy  1.000000   0.106195   0
## 10 max mean_per_class_accuracy  1.000000   0.547788   0
## 11                     max tns  1.000000  60.000000   0
## 12                     max fns  1.000000   6.000000   0
## 13                     max fps  1.000000 505.000000   0
## 14                     max tps  1.000000 559.000000   0
## 15                     max tnr  1.000000   0.106195   0
## 16                     max fnr  1.000000   0.010619   0
## 17                     max fpr  1.000000   0.893805   0
## 18                     max tpr  1.000000   0.989381   0
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

# Predict on the test frame with the SVM model; the result is an H2O frame
# whose `predict` column is used below.
svm_pred <- h2o.predict(object = svm_model, newdata = test.h2o)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Bring the predicted labels into R and fix the factor levels to 0/1 so they
# can be compared with the observed response.
svmpred_df <- as.data.frame(svm_pred$predict)
svmpred_df[["predict"]] <- factor(svmpred_df[["predict"]], levels = c(0, 1))

# Confusion matrix on the test data, reporting precision/recall/F1 with
# class "1" as the positive class.
svm_cm <- confusionMatrix(
  data = svmpred_df[["predict"]],
  reference = test_data$WFRI_R,
  positive = "1",
  mode = "prec_recall"
)

svm_cm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  60   6
##          1 505 559
##                                           
##                Accuracy : 0.5478          
##                  95% CI : (0.5182, 0.5771)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.0007232       
##                                           
##                   Kappa : 0.0956          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##               Precision : 0.5254          
##                  Recall : 0.9894          
##                      F1 : 0.6863          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4947          
##    Detection Prevalence : 0.9416          
##       Balanced Accuracy : 0.5478          
##                                           
##        'Positive' Class : 1               
##

Model 6 - Deep Learning

# Train the H2O deep learning model on the training frame.
# Every argument is passed by name; they are grouped by role below and the
# values are exactly those of the original call.
dl <- h2o.deeplearning(
  # data
  training_frame = train.h2o,
  x = features,
  y = response,
  # network: one hidden layer with a single Tanh unit
  hidden = c(1),
  activation = "Tanh",
  distribution = "AUTO",
  # training schedule
  epochs = 1000,
  train_samples_per_iteration = -1,
  stopping_rounds = 0,
  balance_classes = FALSE,
  # scoring
  score_training_samples = 0,
  score_validation_samples = 0,
  # reproducibility and execution
  seed = 23123,
  reproducible = TRUE,
  single_node_mode = FALSE,
  force_load_balance = FALSE,
  # NOTE(review): no nfolds is given in this call, so this flag presumably
  # has no effect here -- confirm.
  keep_cross_validation_predictions = TRUE
)

## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [.isBarren].

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%

# Score the deep learning model against the H2O test frame and display the
# metrics.
dl_perf <- h2o.performance(model = dl, newdata = test.h2o)
dl_perf

## H2OBinomialMetrics: deeplearning
## 
## MSE:  0.03297753
## RMSE:  0.1815972
## LogLoss:  0.1231176
## Mean Per-Class Error:  0.03982301
## AUC:  0.988093
## AUCPR:  0.983303
## Gini:  0.9761861
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##          0   1    Error      Rate
## 0      536  29 0.051327   =29/565
## 1       16 549 0.028319   =16/565
## Totals 552 578 0.039823  =45/1130
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.388935   0.960630 190
## 2                       max f2  0.169035   0.978375 218
## 3                 max f0point5  0.726136   0.960217 163
## 4                 max accuracy  0.388935   0.960177 190
## 5                max precision  0.976608   0.993769  10
## 6                   max recall  0.004824   1.000000 343
## 7              max specificity  0.977059   0.996460   0
## 8             max absolute_mcc  0.388935   0.920598 190
## 9   max min_per_class_accuracy  0.586926   0.955752 175
## 10 max mean_per_class_accuracy  0.388935   0.960177 190
## 11                     max tns  0.977059 563.000000   0
## 12                     max fns  0.977059 357.000000   0
## 13                     max fps  0.000499 565.000000 399
## 14                     max tps  0.004824 565.000000 343
## 15                     max tnr  0.977059   0.996460   0
## 16                     max fnr  0.977059   0.631858   0
## 17                     max fpr  0.000499   1.000000 399
## 18                     max tpr  0.004824   1.000000 343
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

# Predict on the test frame with the deep learning model; the result is an
# H2O frame whose `predict` column is used below.
dl_pred <- h2o.predict(object = dl, newdata = test.h2o)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Bring the predicted labels into R and fix the factor levels to 0/1 so they
# can be compared with the observed response.
dlpred_df <- as.data.frame(dl_pred$predict)
dlpred_df[["predict"]] <- factor(dlpred_df[["predict"]], levels = c(0, 1))

# Confusion matrix on the test data, reporting precision/recall/F1 with
# class "1" as the positive class.
dl_cm <- confusionMatrix(
  data = dlpred_df[["predict"]],
  reference = test_data$WFRI_R,
  positive = "1",
  mode = "prec_recall"
)

dl_cm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 537  22
##          1  28 543
##                                          
##                Accuracy : 0.9558         
##                  95% CI : (0.9421, 0.967)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9115         
##                                          
##  Mcnemar's Test P-Value : 0.4795         
##                                          
##               Precision : 0.9510         
##                  Recall : 0.9611         
##                      F1 : 0.9560         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4805         
##    Detection Prevalence : 0.5053         
##       Balanced Accuracy : 0.9558         
##                                          
##        'Positive' Class : 1              
##

Part 2 Analysis | Comparison of the top ML model with the other two models

# Keep the AutoML leader (top-performing model) in its own object.
# Model id noted by the author: GBM_1_AutoML_1_20231119_133450

winining_aml <- aml@leader
winining_aml@model[["model_summary"]]

# Predicted class labels of the top AutoML model on the test frame, kept as
# a plain vector for the performance calculations that follow.

automl.pred <- leader_model %>%
  h2o.predict(test.h2o) %>%
  as.data.frame() %>%
  pull(predict)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Predicted probability column `p1` from the same model and test frame.
automl.precsnprob <- leader_model %>%
  h2o.predict(test.h2o) %>%
  as.data.frame() %>%
  pull(p1)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Predicted probability column `p0` on the test frame.
# NOTE(review): this call uses aml@leader directly while the neighbouring
# prediction calls use leader_model; presumably the same model -- confirm.
automl.reclprob <- h2o.predict(object = aml@leader, newdata = test.h2o) %>%
  as.data.frame() %>%
  pull(p0)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Confusion matrix of the AutoML predictions against the observed response,
# reporting precision/recall/F1 with class "1" as the positive class.
automl.con <- confusionMatrix(
  data = automl.pred,
  reference = test_data$WFRI_R,
  positive = "1",
  mode = "prec_recall"
)

automl.con

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 540  14
##          1  25 551
##                                           
##                Accuracy : 0.9655          
##                  95% CI : (0.9531, 0.9753)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.931           
##                                           
##  Mcnemar's Test P-Value : 0.1093          
##                                           
##               Precision : 0.9566          
##                  Recall : 0.9752          
##                      F1 : 0.9658          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4876          
##    Detection Prevalence : 0.5097          
##       Balanced Accuracy : 0.9655          
##                                           
##        'Positive' Class : 1               
##

# Summary table of the AutoML leader's results on the test data:
#   obs  - observed class (test_data$WFRI_R)
#   pred - predicted class label
#   N    - predicted probability from h2o column p0
#   Y    - predicted probability from h2o column p1
#
# Columns are named with `=`. Writing `obs <- ...` inside data.frame() would
# assign stray variables (obs, pred, N, Y) in the workspace and give the
# columns auto-generated names that then have to be renamed by hand.
automl.summary <- data.frame(
  obs = test_data$WFRI_R,
  pred = automl.pred,
  N = automl.reclprob,
  Y = automl.precsnprob
)

# ROC curve / AUC of the p1 probability against the observed class. The
# predictor is read from the summary table rather than from a free-standing
# variable, so this line does not depend on workspace side effects.
automl.auc <- roc(automl.summary$obs, automl.summary$Y)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# One-row table of test-set metrics for the AutoML model. R2, MSE and RMSE
# are computed on the class labels (factor -> character -> numeric), not on
# the predicted probabilities.
# NOTE(review): `<-` inside data.frame() also assigns t.R2, t.mse, t.RSME,
# t.AUC and t.ClassError in the workspace and leaves the columns with
# auto-generated names; `=` was probably intended -- confirm that no later
# code relies on those variables or column names before changing it.
temp<-data.frame(
  # R-squared (MLmetrics::R2_Score) of predicted vs. observed labels
  t.R2<-R2_Score(y_pred = as.numeric(as.character(automl.summary$pred)),y_true =as.numeric(as.character( automl.summary$obs))),
  # mean squared error of predicted vs. observed labels
  t.mse<-MSE(as.numeric(as.character(automl.summary$pred)),as.numeric(as.character(automl.summary$obs))),
  # root mean squared error; per the package attach messages RMSE resolves
  # to caret's version. NOTE(review): "RSME" looks like a typo for "RMSE".
  t.RSME<-RMSE(as.numeric(as.character(automl.summary$pred)),as.numeric(as.character(automl.summary$obs))),
  # area under the ROC curve taken from the roc object built above
  t.AUC<-automl.auc$auc,
  # share of test rows whose predicted label differs from the observed one
  t.ClassError<-mean(automl.summary$pred!=automl.summary$obs)
  
)

# Print the full per-row summary table (obs, pred, N, Y).
automl.summary

##      obs pred            N            Y
## 1      0    0 0.9895432349 0.0104567651
## 2      0    0 0.9982961235 0.0017038765
## 3      0    0 0.9949776399 0.0050223601
## 4      0    0 0.9991782043 0.0008217957
## 5      0    0 0.9988078380 0.0011921620
## 6      0    0 0.9937353493 0.0062646507
## 7      0    0 0.9985137428 0.0014862572
## 8      0    0 0.9989045629 0.0010954371
## 9      0    0 0.9986348995 0.0013651005
## 10     0    0 0.9987930498 0.0012069502
## 11     0    0 0.9889149113 0.0110850887
## 12     0    0 0.9987097250 0.0012902750
## 13     0    0 0.9991022681 0.0008977319
## 14     0    0 0.9985754257 0.0014245743
## 15     0    0 0.9987527296 0.0012472704
## 16     0    0 0.6697250209 0.3302749791
## 17     0    0 0.9913066811 0.0086933189
## 18     0    0 0.9984382240 0.0015617760
## 19     0    0 0.9921664926 0.0078335074
## 20     0    0 0.9988686598 0.0011313402
## 21     0    0 0.9991037931 0.0008962069
## 22     0    0 0.9991967218 0.0008032782
## 23     0    0 0.9989556415 0.0010443585
## 24     0    0 0.9987594873 0.0012405127
## 25     0    0 0.9866672023 0.0133327977
## 26     0    0 0.9950500502 0.0049499498
## 27     0    0 0.9950297486 0.0049702514
## 28     0    0 0.9989539476 0.0010460524
## 29     0    0 0.9984985879 0.0015014121
## 30     0    0 0.9324323047 0.0675676953
## 31     0    0 0.9992791996 0.0007208004
## 32     0    0 0.9987974302 0.0012025698
## 33     0    0 0.9818073360 0.0181926640
## 34     0    0 0.9988483790 0.0011516210
## 35     0    0 0.9987372811 0.0012627189
## 36     0    0 0.9992137801 0.0007862199
## 37     0    0 0.9989747871 0.0010252129
## 38     0    0 0.9271395400 0.0728604600
## 39     0    0 0.9985418732 0.0014581268
## 40     0    0 0.9597763738 0.0402236262
## 41     0    0 0.9985948335 0.0014051665
## 42     0    0 0.8038142947 0.1961857053
## 43     0    0 0.9988046340 0.0011953660
## 44     0    0 0.9986624960 0.0013375040
## 45     0    1 0.4150574501 0.5849425499
## 46     0    0 0.9885971392 0.0114028608
## 47     0    0 0.9991352724 0.0008647276
## 48     0    0 0.9987302842 0.0012697158
## 49     0    0 0.9991005924 0.0008994076
## 50     0    0 0.9988798960 0.0011201040
## 51     0    0 0.9679612808 0.0320387192
## 52     0    0 0.9982610066 0.0017389934
## 53     0    0 0.9769907293 0.0230092707
## 54     0    0 0.9983200688 0.0016799312
## 55     0    0 0.9986815068 0.0013184932
## 56     0    0 0.9989898539 0.0010101461
## 57     0    0 0.9958375528 0.0041624472
## 58     0    0 0.9987809166 0.0012190834
## 59     0    0 0.9987812176 0.0012187824
## 60     0    0 0.5449593443 0.4550406557
## 61     0    0 0.9984196564 0.0015803436
## 62     0    0 0.9920086796 0.0079913204
## 63     0    0 0.9851427592 0.0148572408
## 64     0    0 0.9877102710 0.0122897290
## 65     0    0 0.9987064000 0.0012936000
## 66     0    0 0.9924520356 0.0075479644
## 67     0    0 0.9991637933 0.0008362067
## 68     0    0 0.9914069344 0.0085930656
## 69     0    0 0.9895625004 0.0104374996
## 70     0    0 0.9959194440 0.0040805560
## 71     0    0 0.9689244839 0.0310755161
## 72     0    0 0.9985137063 0.0014862937
## 73     0    0 0.9850725715 0.0149274285
## 74     0    0 0.9921433791 0.0078566209
## 75     0    0 0.9953832604 0.0046167396
## 76     0    0 0.9956551707 0.0043448293
## 77     0    0 0.9810555662 0.0189444338
## 78     0    0 0.8344957402 0.1655042598
## 79     0    0 0.9987495787 0.0012504213
## 80     0    0 0.9987211986 0.0012788014
## 81     0    0 0.9772258279 0.0227741721
## 82     0    0 0.9951841046 0.0048158954
## 83     0    0 0.9986615149 0.0013384851
## 84     0    0 0.9757758055 0.0242241945
## 85     0    0 0.9986538885 0.0013461115
## 86     0    0 0.9944887029 0.0055112971
## 87     0    0 0.9986479498 0.0013520502
## 88     0    0 0.9985165828 0.0014834172
## 89     0    0 0.9988884370 0.0011115630
## 90     0    0 0.9729482262 0.0270517738
## 91     0    0 0.9916552382 0.0083447618
## 92     0    0 0.9982953585 0.0017046415
## 93     0    0 0.9988306583 0.0011693417
## 94     0    1 0.4492049086 0.5507950914
## 95     0    1 0.2205441954 0.7794558046
## 96     0    0 0.9894182474 0.0105817526
## 97     0    0 0.9928129216 0.0071870784
## 98     0    0 0.9982352241 0.0017647759
## 99     0    0 0.9991126167 0.0008873833
## 100    0    0 0.9991763231 0.0008236769
## 101    0    0 0.9985623745 0.0014376255
## 102    0    0 0.9991981702 0.0008018298
## 103    0    0 0.9990309691 0.0009690309
## 104    0    0 0.9954314094 0.0045685906
## 105    0    0 0.9991269880 0.0008730120
## 106    0    0 0.9913563103 0.0086436897
## 107    0    0 0.9987577980 0.0012422020
## 108    0    0 0.9258867256 0.0741132744
## 109    0    0 0.9956442684 0.0043557316
## 110    0    0 0.9983955671 0.0016044329
## 111    0    0 0.9900073451 0.0099926549
## 112    0    0 0.9990693764 0.0009306236
## 113    0    0 0.9990779944 0.0009220056
## 114    0    0 0.9985738131 0.0014261869
## 115    0    0 0.5317294952 0.4682705048
## 116    0    0 0.9915444410 0.0084555590
## 117    0    0 0.9987732985 0.0012267015
## 118    0    0 0.9984376713 0.0015623287
## 119    0    1 0.1773661545 0.8226338455
## 120    0    0 0.9946406959 0.0053593041
## 121    0    0 0.9982991460 0.0017008540
## 122    0    0 0.9941285522 0.0058714478
## 123    0    0 0.9896799739 0.0103200261
## 124    0    0 0.9987724215 0.0012275785
## 125    0    0 0.9915494845 0.0084505155
## 126    0    0 0.9986068967 0.0013931033
## 127    0    0 0.9841381071 0.0158618929
## 128    0    0 0.9973084174 0.0026915826
## 129    0    0 0.9922315422 0.0077684578
## 130    0    0 0.9991236359 0.0008763641
## 131    0    0 0.9949389002 0.0050610998
## 132    0    0 0.9990602272 0.0009397728
## 133    0    0 0.9987830993 0.0012169007
## 134    0    0 0.5685634812 0.4314365188
## 135    0    0 0.9987880461 0.0012119539
## 136    0    0 0.9867851808 0.0132148192
## 137    0    0 0.9864052586 0.0135947414
## 138    0    0 0.6130669357 0.3869330643
## 139    0    0 0.9939099523 0.0060900477
## 140    0    0 0.9991486454 0.0008513546
## 141    0    0 0.9738408444 0.0261591556
## 142    0    0 0.9990236421 0.0009763579
## 143    0    0 0.9991847121 0.0008152879
## 144    0    0 0.9989745233 0.0010254767
## 145    0    0 0.9940116485 0.0059883515
## 146    0    0 0.9991784379 0.0008215621
## 147    0    0 0.9988129940 0.0011870060
## 148    0    0 0.9988713525 0.0011286475
## 149    0    0 0.9987804510 0.0012195490
## 150    0    0 0.9987136354 0.0012863646
## 151    0    0 0.9013391643 0.0986608357
## 152    0    0 0.9729469786 0.0270530214
## 153    0    0 0.9930698636 0.0069301364
## 154    0    0 0.9989482697 0.0010517303
## 155    0    0 0.5433270980 0.4566729020
## 156    0    0 0.9990950401 0.0009049599
## 157    0    0 0.9988537963 0.0011462037
## 158    0    0 0.9929554728 0.0070445272
## 159    0    0 0.9913354866 0.0086645134
## 160    0    0 0.9851277310 0.0148722690
## 161    0    0 0.9858895826 0.0141104174
## 162    0    0 0.9865326011 0.0134673989
## 163    0    0 0.9991364577 0.0008635423
## 164    0    0 0.9838243028 0.0161756972
## 165    0    0 0.9990320629 0.0009679371
## 166    0    1 0.2178767727 0.7821232273
## 167    0    0 0.9989864209 0.0010135791
## 168    0    0 0.9947986858 0.0052013142
## 169    0    0 0.9886180414 0.0113819586
## 170    0    0 0.9908299153 0.0091700847
## 171    0    0 0.9990261645 0.0009738355
## 172    0    0 0.9942632465 0.0057367535
## 173    0    0 0.9988134718 0.0011865282
## 174    0    0 0.9989046266 0.0010953734
## 175    0    0 0.9943817139 0.0056182861
## 176    0    0 0.9983464249 0.0016535751
## 177    0    0 0.9990648611 0.0009351389
## 178    0    0 0.9987097000 0.0012903000
## 179    0    0 0.9982149105 0.0017850895
## 180    0    0 0.9942976373 0.0057023627
## 181    0    0 0.9990127161 0.0009872839
## 182    0    0 0.9863012719 0.0136987281
## 183    0    0 0.9987494328 0.0012505672
## 184    0    0 0.9055877803 0.0944122197
## 185    0    1 0.1213968215 0.8786031785
## 186    0    0 0.9949517195 0.0050482805
## 187    0    0 0.9917646933 0.0082353067
## 188    0    0 0.9901418254 0.0098581746
## 189    0    0 0.9989386594 0.0010613406
## 190    0    0 0.9984439174 0.0015560826
## 191    0    0 0.9915336369 0.0084663631
## 192    0    0 0.9987807076 0.0012192924
## 193    0    0 0.9849825438 0.0150174562
## 194    0    0 0.9993566883 0.0006433117
## 195    0    0 0.9989321948 0.0010678052
## 196    0    0 0.9989625721 0.0010374279
## 197    0    0 0.6997670778 0.3002329222
## 198    0    0 0.9987536511 0.0012463489
## 199    0    0 0.9914427111 0.0085572889
## 200    0    0 0.9924734985 0.0075265015
## 201    0    1 0.1467503735 0.8532496265
## 202    0    0 0.9739652795 0.0260347205
## 203    0    0 0.9940011160 0.0059988840
## 204    0    0 0.9991277400 0.0008722600
## 205    0    1 0.3735751044 0.6264248956
## 206    0    0 0.9929667488 0.0070332512
## 207    0    0 0.9960238632 0.0039761368
## 208    0    0 0.9987609902 0.0012390098
## 209    0    0 0.9943904905 0.0056095095
## 210    0    0 0.9982747299 0.0017252701
## 211    0    0 0.9911391114 0.0088608886
## 212    0    0 0.9831770494 0.0168229506
## 213    0    0 0.9981809664 0.0018190336
## 214    0    0 0.9941340242 0.0058659758
## 215    0    0 0.9991477653 0.0008522347
## 216    0    0 0.9991660106 0.0008339894
## 217    0    0 0.9918818770 0.0081181230
## 218    0    0 0.9989834735 0.0010165265
## 219    0    0 0.9988476023 0.0011523977
## 220    0    0 0.9987741696 0.0012258304
## 221    0    0 0.9991161816 0.0008838184
## 222    0    0 0.9953185091 0.0046814909
## 223    0    0 0.9891398538 0.0108601462
## 224    0    0 0.9990369088 0.0009630912
## 225    0    0 0.9917953078 0.0082046922
## 226    0    0 0.9988669238 0.0011330762
## 227    0    0 0.9937266535 0.0062733465
## 228    0    0 0.9892554830 0.0107445170
## 229    0    0 0.9985463796 0.0014536204
## 230    0    0 0.9955688207 0.0044311793
## 231    0    0 0.9988642000 0.0011358000
## 232    0    0 0.9927910211 0.0072089789
## 233    0    0 0.9915785748 0.0084214252
## 234    0    0 0.8300432915 0.1699567085
## 235    0    0 0.9988329551 0.0011670449
## 236    0    0 0.9992481194 0.0007518806
## 237    0    0 0.9854265714 0.0145734286
## 238    0    1 0.1613950606 0.8386049394
## 239    0    0 0.9986200199 0.0013799801
## 240    0    0 0.9948665571 0.0051334429
## 241    0    0 0.9988066446 0.0011933554
## 242    0    0 0.9989601059 0.0010398941
## 243    0    0 0.9891339862 0.0108660138
## 244    0    0 0.9949205379 0.0050794621
## 245    0    0 0.9990595122 0.0009404878
## 246    0    0 0.9877938988 0.0122061012
## 247    0    0 0.9989925398 0.0010074602
## 248    0    0 0.9893268988 0.0106731012
## 249    0    0 0.9905360244 0.0094639756
## 250    0    0 0.9988021116 0.0011978884
## 251    0    0 0.9964580879 0.0035419121
## 252    0    0 0.9986708754 0.0013291246
## 253    0    1 0.0494289220 0.9505710780
## 254    0    0 0.9500905821 0.0499094179
## 255    0    0 0.9987838899 0.0012161101
## 256    0    0 0.9983913609 0.0016086391
## 257    0    0 0.9910107600 0.0089892400
## 258    0    0 0.9989930230 0.0010069770
## 259    0    0 0.9898062159 0.0101937841
## 260    0    0 0.9955893398 0.0044106602
## 261    0    1 0.0271526077 0.9728473923
## 262    0    0 0.9922164008 0.0077835992
## 263    0    0 0.5470614807 0.4529385193
## 264    0    0 0.9962105888 0.0037894112
## 265    0    0 0.9988845329 0.0011154671
## 266    0    0 0.9988610751 0.0011389249
## 267    0    0 0.9984300226 0.0015699774
## 268    0    0 0.8979596383 0.1020403617
## 269    0    0 0.9911275653 0.0088724347
## 270    0    0 0.9716460429 0.0283539571
## 271    0    0 0.9940959753 0.0059040247
## 272    0    0 0.9986843301 0.0013156699
## 273    0    0 0.9991632881 0.0008367119
## 274    0    0 0.9987260497 0.0012739503
## 275    0    0 0.9977951080 0.0022048920
## 276    0    0 0.9889715945 0.0110284055
## 277    0    0 0.9987001368 0.0012998632
## 278    0    0 0.9987895150 0.0012104850
## 279    0    0 0.7844317530 0.2155682470
## 280    0    0 0.9897884585 0.0102115415
## 281    0    0 0.9936156177 0.0063843823
## 282    0    0 0.9922134581 0.0077865419
## 283    0    1 0.2282267637 0.7717732363
## 284    0    0 0.9985824639 0.0014175361
## 285    0    0 0.9988628604 0.0011371396
## 286    0    0 0.9988240222 0.0011759778
## 287    0    0 0.9989828521 0.0010171479
## 288    0    0 0.9706452970 0.0293547030
## 289    0    0 0.9986095340 0.0013904660
## 290    0    0 0.9986320132 0.0013679868
## 291    0    0 0.9987514922 0.0012485078
## 292    0    0 0.9982204278 0.0017795722
## 293    0    0 0.9989694685 0.0010305315
## 294    0    0 0.9951064580 0.0048935420
## 295    0    0 0.9982460335 0.0017539665
## 296    0    0 0.9986646619 0.0013353381
## 297    0    0 0.9932077708 0.0067922292
## 298    0    0 0.9890213012 0.0109786988
## 299    0    0 0.9986699058 0.0013300942
## 300    0    1 0.1846371673 0.8153628327
## 301    0    0 0.9987407395 0.0012592605
## 302    0    0 0.9935209626 0.0064790374
## 303    0    0 0.9939897325 0.0060102675
## 304    0    0 0.9985275868 0.0014724132
## 305    0    0 0.9916144873 0.0083855127
## 306    0    0 0.9391438913 0.0608561087
## 307    0    0 0.9989559230 0.0010440770
## 308    0    0 0.9983751344 0.0016248656
## 309    0    0 0.9941888027 0.0058111973
## 310    0    0 0.9987537265 0.0012462735
## 311    0    0 0.9987360652 0.0012639348
## 312    0    0 0.9904247758 0.0095752242
## 313    0    0 0.9941015859 0.0058984141
## 314    0    1 0.3893417844 0.6106582156
## 315    0    0 0.9988500372 0.0011499628
## 316    0    0 0.9903061308 0.0096938692
## 317    0    0 0.9937943050 0.0062056950
## 318    0    0 0.9986618927 0.0013381073
## 319    0    0 0.9982161120 0.0017838880
## 320    0    0 0.9985905376 0.0014094624
## 321    0    0 0.7974696371 0.2025303629
## 322    0    0 0.9988777821 0.0011222179
## 323    0    0 0.9978657676 0.0021342324
## 324    0    0 0.9988180069 0.0011819931
## 325    0    0 0.9987616491 0.0012383509
## 326    0    0 0.9924812878 0.0075187122
## 327    0    0 0.9927878511 0.0072121489
## 328    0    0 0.9643280084 0.0356719916
## 329    0    0 0.9985806008 0.0014193992
## 330    0    0 0.9988846805 0.0011153195
## 331    0    0 0.9993708405 0.0006291595
## 332    0    0 0.7141128005 0.2858871995
## 333    0    0 0.9878159238 0.0121840762
## 334    0    0 0.9986382731 0.0013617269
## 335    0    0 0.9986103119 0.0013896881
## 336    0    0 0.9797951230 0.0202048770
## 337    0    0 0.6573805996 0.3426194004
## 338    0    0 0.9926956903 0.0073043097
## 339    0    0 0.9986508971 0.0013491029
## 340    0    0 0.9991582432 0.0008417568
## 341    0    0 0.9941898130 0.0058101870
## 342    0    0 0.9985092965 0.0014907035
## 343    0    0 0.9988706765 0.0011293235
## 344    0    0 0.9903287948 0.0096712052
## 345    0    0 0.9793915749 0.0206084251
## 346    0    1 0.4788539346 0.5211460654
## 347    0    0 0.9945678723 0.0054321277
## 348    0    0 0.9980478153 0.0019521847
## 349    0    0 0.9987822570 0.0012177430
## 350    0    0 0.9988857937 0.0011142063
## 351    0    0 0.9927207589 0.0072792411
## 352    0    0 0.9235941693 0.0764058307
## 353    0    0 0.9990726746 0.0009273254
## 354    0    0 0.9986670371 0.0013329629
## 355    0    0 0.9876404358 0.0123595642
## 356    0    0 0.9981756579 0.0018243421
## 357    0    0 0.9985495355 0.0014504645
## 358    0    0 0.9993382924 0.0006617076
## 359    0    0 0.9985481530 0.0014518470
## 360    0    0 0.9991402800 0.0008597200
## 361    0    0 0.9988109901 0.0011890099
## 362    0    0 0.9859752828 0.0140247172
## 363    0    0 0.9910436278 0.0089563722
## 364    0    0 0.9984929749 0.0015070251
## 365    0    0 0.9987685326 0.0012314674
## 366    0    0 0.9977961382 0.0022038618
## 367    0    0 0.9884020724 0.0115979276
## 368    0    0 0.9944592555 0.0055407445
## 369    0    1 0.1478320800 0.8521679200
## 370    0    0 0.9948712751 0.0051287249
## 371    0    0 0.9987663240 0.0012336760
## 372    0    0 0.9990420866 0.0009579134
## 373    0    0 0.9980174513 0.0019825487
## 374    0    0 0.9987941052 0.0012058948
## 375    0    0 0.9957345119 0.0042654881
## 376    0    0 0.9930332966 0.0069667034
## 377    0    0 0.9991521180 0.0008478820
## 378    0    0 0.9954065537 0.0045934463
## 379    0    0 0.9906353243 0.0093646757
## 380    0    0 0.9993372017 0.0006627983
## 381    0    0 0.9929769952 0.0070230048
## 382    0    0 0.9986028794 0.0013971206
## 383    0    0 0.9873868108 0.0126131892
## 384    0    1 0.0496007470 0.9503992530
## 385    0    0 0.6321202357 0.3678797643
## 386    0    0 0.9801570319 0.0198429681
## 387    0    0 0.9982053028 0.0017946972
## 388    0    0 0.9991364970 0.0008635030
## 389    0    0 0.9961723756 0.0038276244
## 390    0    0 0.9949266314 0.0050733686
## 391    0    0 0.9986989153 0.0013010847
## 392    0    0 0.9985582559 0.0014417441
## 393    0    0 0.9989502646 0.0010497354
## 394    0    0 0.9984854581 0.0015145419
## 395    0    0 0.9977412127 0.0022587873
## 396    0    0 0.9985072458 0.0014927542
## 397    0    0 0.9882902822 0.0117097178
## 398    0    0 0.9958535234 0.0041464766
## 399    0    0 0.9988300197 0.0011699803
## 400    0    0 0.9986581155 0.0013418845
## 401    0    0 0.9989446235 0.0010553765
## 402    0    0 0.9969574648 0.0030425352
## 403    0    0 0.9984160572 0.0015839428
## 404    0    0 0.9991383407 0.0008616593
## 405    0    0 0.9898036924 0.0101963076
## 406    0    0 0.9786484189 0.0213515811
## 407    0    0 0.9925287070 0.0074712930
## 408    0    0 0.9985566464 0.0014433536
## 409    0    1 0.4375369186 0.5624630814
## 410    0    0 0.9919499883 0.0080500117
## 411    0    0 0.9987392862 0.0012607138
## 412    0    0 0.9925914960 0.0074085040
## 413    0    0 0.9990179970 0.0009820030
## 414    0    0 0.9988337906 0.0011662094
## 415    0    0 0.9989517763 0.0010482237
## 416    0    0 0.9979612283 0.0020387717
## 417    0    0 0.9936452677 0.0063547323
## 418    0    0 0.5700630581 0.4299369419
## 419    0    0 0.9257218246 0.0742781754
## 420    0    0 0.9988100777 0.0011899223
## 421    0    0 0.9991052912 0.0008947088
## 422    0    0 0.9988732400 0.0011267600
## 423    0    0 0.9990429189 0.0009570811
## 424    0    0 0.9991472975 0.0008527025
## 425    0    0 0.9926150507 0.0073849493
## 426    0    0 0.9366266811 0.0633733189
## 427    0    0 0.9990877436 0.0009122564
## 428    0    0 0.9757361460 0.0242638540
## 429    0    0 0.9983839691 0.0016160309
## 430    0    1 0.3160532497 0.6839467503
## 431    0    0 0.9964412187 0.0035587813
## 432    0    0 0.9982162998 0.0017837002
## 433    0    0 0.9987451689 0.0012548311
## 434    0    0 0.9988653220 0.0011346780
## 435    0    1 0.1696554954 0.8303445046
## 436    0    0 0.8108802238 0.1891197762
## 437    0    0 0.9983704359 0.0016295641
## 438    0    0 0.9969343032 0.0030656968
## 439    0    0 0.9982620988 0.0017379012
## 440    0    0 0.9858462928 0.0141537072
## 441    0    0 0.9977485395 0.0022514605
## 442    0    0 0.9894581843 0.0105418157
## 443    0    0 0.9724650439 0.0275349561
## 444    0    0 0.9986483367 0.0013516633
## 445    0    0 0.9990208107 0.0009791893
## 446    0    0 0.9986830816 0.0013169184
## 447    0    0 0.9989748434 0.0010251566
## 448    0    0 0.6527122969 0.3472877031
## 449    0    0 0.9950892211 0.0049107789
## 450    0    0 0.9990455673 0.0009544327
## 451    0    0 0.9983034539 0.0016965461
## 452    0    0 0.9991474116 0.0008525884
## 453    0    0 0.9982230978 0.0017769022
## 454    0    0 0.9943514845 0.0056485155
## 455    0    0 0.8492759522 0.1507240478
## 456    0    0 0.9917891004 0.0082108996
## 457    0    0 0.9988760102 0.0011239898
## 458    0    0 0.9901027596 0.0098972404
## 459    0    0 0.9988654375 0.0011345625
## 460    0    0 0.9988921333 0.0011078667
## 461    0    0 0.9985327847 0.0014672153
## 462    0    0 0.9986843573 0.0013156427
## 463    0    0 0.9956653913 0.0043346087
## 464    0    0 0.9987874416 0.0012125584
## 465    0    1 0.5108509498 0.4891490502
## 466    0    0 0.9926237117 0.0073762883
## 467    0    0 0.9987630310 0.0012369690
## 468    0    0 0.9992499782 0.0007500218
## 469    0    0 0.9984951860 0.0015048140
## 470    0    0 0.9850150091 0.0149849909
## 471    0    0 0.9944341402 0.0055658598
## 472    0    0 0.9916235062 0.0083764938
## 473    0    0 0.9865685678 0.0134314322
## 474    0    0 0.9908857837 0.0091142163
## 475    0    1 0.5005105016 0.4994894984
## 476    0    0 0.9991448710 0.0008551290
## 477    0    0 0.9992164018 0.0007835982
## 478    0    0 0.6683567187 0.3316432813
## 479    0    0 0.9987320848 0.0012679152
## 480    0    0 0.9922466467 0.0077533533
## 481    0    0 0.9591935423 0.0408064577
## 482    0    1 0.3535332145 0.6464667855
## 483    0    0 0.9985958329 0.0014041671
## 484    0    0 0.9863166010 0.0136833990
## 485    0    0 0.5724125803 0.4275874197
## 486    0    0 0.9981324315 0.0018675685
## 487    0    0 0.9984743483 0.0015256517
## 488    0    0 0.9989640413 0.0010359587
## 489    0    0 0.9989783913 0.0010216087
## 490    0    0 0.9984576021 0.0015423979
## 491    0    0 0.9990731191 0.0009268809
## 492    0    0 0.9992003309 0.0007996691
## 493    0    0 0.9796826004 0.0203173996
## 494    0    0 0.9904657788 0.0095342212
## 495    0    0 0.9892274206 0.0107725794
## 496    0    0 0.9935166656 0.0064833344
## 497    0    0 0.9983740618 0.0016259382
## 498    0    0 0.9929561173 0.0070438827
## 499    0    0 0.9988173747 0.0011826253
## 500    0    0 0.9926136054 0.0073863946
## 501    0    0 0.9898274230 0.0101725770
## 502    0    0 0.9930765260 0.0069234740
## 503    0    1 0.4271216553 0.5728783447
## 504    0    0 0.9990699582 0.0009300418
## 505    0    0 0.9989700697 0.0010299303
## 506    0    0 0.9886363483 0.0113636517
## 507    0    0 0.9984157196 0.0015842804
## 508    0    0 0.6513679710 0.3486320290
## 509    0    0 0.9917233284 0.0082766716
## 510    0    0 0.9946782239 0.0053217761
## 511    0    0 0.9980087231 0.0019912769
## 512    0    0 0.9981030070 0.0018969930
## 513    0    0 0.9991040274 0.0008959726
## 514    0    0 0.9985105251 0.0014894749
## 515    0    0 0.9987543555 0.0012456445
## 516    0    0 0.9988888117 0.0011111883
## 517    0    0 0.9989253460 0.0010746540
## 518    0    0 0.9776878814 0.0223121186
## 519    0    0 0.9982668281 0.0017331719
## 520    0    0 0.9985580311 0.0014419689
## 521    0    0 0.9948061409 0.0051938591
## 522    0    0 0.6431704153 0.3568295847
## 523    0    0 0.9950345658 0.0049654342
## 524    0    0 0.9991122179 0.0008877821
## 525    0    0 0.9989520142 0.0010479858
## 526    0    0 0.9966204102 0.0033795898
## 527    0    0 0.9978651301 0.0021348699
## 528    0    0 0.9871807452 0.0128192548
## 529    0    0 0.9991051937 0.0008948063
## 530    0    0 0.9989561439 0.0010438561
## 531    0    0 0.9965526474 0.0034473526
## 532    0    0 0.9870351851 0.0129648149
## 533    0    1 0.3419777471 0.6580222529
## 534    0    0 0.9938486653 0.0061513347
## 535    0    0 0.9899132129 0.0100867871
## 536    0    0 0.9767235395 0.0232764605
## 537    0    0 0.9986607005 0.0013392995
## 538    0    0 0.9989644070 0.0010355930
## 539    0    0 0.9985643400 0.0014356600
## 540    0    0 0.9990447537 0.0009552463
## 541    0    0 0.9982745322 0.0017254678
## 542    0    0 0.9989946813 0.0010053187
## 543    0    0 0.7728366232 0.2271633768
## 544    0    0 0.9946835895 0.0053164105
## 545    0    0 0.9981622688 0.0018377312
## 546    0    0 0.9918185931 0.0081814069
## 547    0    0 0.9879827725 0.0120172275
## 548    0    0 0.9980131499 0.0019868501
## 549    0    0 0.9987286553 0.0012713447
## 550    0    0 0.9988373431 0.0011626569
## 551    0    0 0.9642414116 0.0357585884
## 552    0    0 0.9944390880 0.0055609120
## 553    0    0 0.9950665683 0.0049334317
## 554    0    0 0.9920126596 0.0079873404
## 555    0    0 0.9984420860 0.0015579140
## 556    0    0 0.9988532058 0.0011467942
## 557    0    0 0.9991754365 0.0008245635
## 558    0    0 0.9988783991 0.0011216009
## 559    0    0 0.9951166310 0.0048833690
## 560    0    0 0.9987418363 0.0012581637
## 561    0    0 0.9987326082 0.0012673918
## 562    0    0 0.9988537158 0.0011462842
## 563    0    0 0.9646186861 0.0353813139
## 564    0    0 0.9963838281 0.0036161719
## 565    0    0 0.9985097455 0.0014902545
## 566    1    1 0.0012008758 0.9987991242
## 567    1    1 0.0176439917 0.9823560083
## 568    1    1 0.0658192251 0.9341807749
## 569    1    1 0.0644160208 0.9355839792
## 570    1    1 0.0275773380 0.9724226620
## 571    1    1 0.0044635515 0.9955364485
## 572    1    1 0.1781184087 0.8218815913
## 573    1    1 0.0061392486 0.9938607514
## 574    1    1 0.0547099528 0.9452900472
## 575    1    1 0.1084132578 0.8915867422
## 576    1    1 0.0006569921 0.9993430079
## 577    1    1 0.0190147490 0.9809852510
## 578    1    1 0.0024123889 0.9975876111
## 579    1    1 0.3248878639 0.6751121361
## 580    1    1 0.0043454249 0.9956545751
## 581    1    1 0.2142357625 0.7857642375
## 582    1    1 0.0105175277 0.9894824723
## 583    1    1 0.0011938219 0.9988061781
## 584    1    1 0.0083940991 0.9916059009
## 585    1    1 0.0073722033 0.9926277967
## 586    1    1 0.0043425440 0.9956574560
## 587    1    1 0.0219272998 0.9780727002
## 588    1    1 0.0083599633 0.9916400367
## 589    1    1 0.0112578817 0.9887421183
## 590    1    1 0.0100608395 0.9899391605
## 591    1    1 0.0112315243 0.9887684757
## 592    1    1 0.0330515851 0.9669484149
## 593    1    1 0.0013398273 0.9986601727
## 594    1    1 0.0238882374 0.9761117626
## 595    1    1 0.0152457185 0.9847542815
## 596    1    1 0.0353800574 0.9646199426
## 597    1    1 0.0050139038 0.9949860962
## 598    1    1 0.2154765201 0.7845234799
## 599    1    1 0.0448318476 0.9551681524
## 600    1    1 0.3118720975 0.6881279025
## 601    1    1 0.0005487155 0.9994512845
## 602    1    1 0.1179308550 0.8820691450
## 603    1    1 0.0005375387 0.9994624613
## 604    1    1 0.2269729388 0.7730270612
## 605    1    1 0.0395840225 0.9604159775
## 606    1    1 0.0223057581 0.9776942419
## 607    1    1 0.0056252244 0.9943747756
## 608    1    1 0.0016968827 0.9983031173
## 609    1    1 0.0088456406 0.9911543594
## 610    1    1 0.0184235256 0.9815764744
## 611    1    1 0.0591961229 0.9408038771
## 612    1    1 0.0014512937 0.9985487063
## 613    1    1 0.0254820520 0.9745179480
## 614    1    1 0.0445262551 0.9554737449
## 615    1    0 0.9253974516 0.0746025484
## 616    1    1 0.0119192198 0.9880807802
## 617    1    1 0.0046908339 0.9953091661
## 618    1    1 0.0129341838 0.9870658162
## 619    1    0 0.5582834659 0.4417165341
## 620    1    1 0.3301276514 0.6698723486
## 621    1    1 0.0121459763 0.9878540237
## 622    1    1 0.2361040743 0.7638959257
## 623    1    1 0.0284947971 0.9715052029
## 624    1    1 0.0073301680 0.9926698320
## 625    1    1 0.0094217128 0.9905782872
## 626    1    1 0.0105127094 0.9894872906
## 627    1    1 0.2474701369 0.7525298631
## 628    1    1 0.1743919106 0.8256080894
## 629    1    1 0.0336325792 0.9663674208
## 630    1    1 0.0022958288 0.9977041712
## 631    1    1 0.1245726338 0.8754273662
## 632    1    1 0.3799354533 0.6200645467
## 633    1    1 0.1588323844 0.8411676156
## 634    1    1 0.0083825311 0.9916174689
## 635    1    1 0.0093727964 0.9906272036
## 636    1    1 0.0092844819 0.9907155181
## 637    1    1 0.0270351337 0.9729648663
## 638    1    1 0.0004741268 0.9995258732
## 639    1    1 0.0152610530 0.9847389470
## 640    1    1 0.0103764643 0.9896235357
## 641    1    1 0.0007848344 0.9992151656
## 642    1    1 0.0184909835 0.9815090165
## 643    1    1 0.0204612613 0.9795387387
## 644    1    1 0.0007543051 0.9992456949
## 645    1    1 0.0827872807 0.9172127193
## 646    1    1 0.0019381462 0.9980618538
## 647    1    1 0.2840805876 0.7159194124
## 648    1    1 0.0091960904 0.9908039096
## 649    1    1 0.0302668100 0.9697331900
## 650    1    1 0.1547094700 0.8452905300
## 651    1    1 0.0285561639 0.9714438361
## 652    1    1 0.0059478486 0.9940521514
## 653    1    1 0.0020546405 0.9979453595
## 654    1    1 0.0243399073 0.9756600927
## 655    1    1 0.0286090381 0.9713909619
## 656    1    1 0.0711897351 0.9288102649
## 657    1    1 0.0045081229 0.9954918771
## 658    1    1 0.3694538549 0.6305461451
## 659    1    1 0.2014744263 0.7985255737
## 660    1    1 0.1111350410 0.8888649590
## 661    1    1 0.0110297345 0.9889702655
## 662    1    1 0.0085437000 0.9914563000
## 663    1    1 0.2022048320 0.7977951680
## 664    1    1 0.0109231011 0.9890768989
## 665    1    1 0.0665222349 0.9334777651
## 666    1    1 0.3950427450 0.6049572550
## 667    1    1 0.0114006808 0.9885993192
## 668    1    1 0.1047811228 0.8952188772
## 669    1    1 0.0015756761 0.9984243239
## 670    1    1 0.0245562530 0.9754437470
## 671    1    1 0.0320524531 0.9679475469
## 672    1    1 0.0215801986 0.9784198014
## 673    1    1 0.0007243883 0.9992756117
## 674    1    1 0.0636011777 0.9363988223
## 675    1    1 0.2916247664 0.7083752336
## 676    1    1 0.0282364654 0.9717635346
## 677    1    1 0.0940940677 0.9059059323
## 678    1    1 0.0385130468 0.9614869532
## 679    1    1 0.0191493964 0.9808506036
## 680    1    0 0.8029123317 0.1970876683
## 681    1    1 0.2268154622 0.7731845378
## 682    1    1 0.3271931900 0.6728068100
## 683    1    1 0.0303353390 0.9696646610
## 684    1    1 0.0374917839 0.9625082161
## 685    1    1 0.0029696990 0.9970303010
## 686    1    1 0.2866310739 0.7133689261
## 687    1    1 0.0029333538 0.9970666462
## 688    1    1 0.0217061573 0.9782938427
## 689    1    1 0.0552020492 0.9447979508
## 690    1    1 0.0005517270 0.9994482730
## 691    1    1 0.0007443559 0.9992556441
## 692    1    1 0.0007248074 0.9992751926
## 693    1    1 0.0492566375 0.9507433625
## 694    1    1 0.0542247671 0.9457752329
## 695    1    1 0.0306987248 0.9693012752
## 696    1    1 0.0082948353 0.9917051647
## 697    1    1 0.0006815284 0.9993184716
## 698    1    1 0.0466999055 0.9533000945
## 699    1    1 0.0238210425 0.9761789575
## 700    1    1 0.1804594542 0.8195405458
## 701    1    1 0.3384736279 0.6615263721
## 702    1    1 0.2614447113 0.7385552887
## 703    1    1 0.0154977718 0.9845022282
## 704    1    1 0.0073045553 0.9926954447
## 705    1    1 0.0656498912 0.9343501088
## 706    1    1 0.0014358499 0.9985641501
## 707    1    1 0.0271615569 0.9728384431
## 708    1    1 0.0044317040 0.9955682960
## 709    1    1 0.0058965143 0.9941034857
## 710    1    1 0.0073931380 0.9926068620
## 711    1    1 0.0348225520 0.9651774480
## 712    1    1 0.0262111734 0.9737888266
## 713    1    1 0.1313342503 0.8686657497
## 714    1    1 0.0014924411 0.9985075589
## 715    1    1 0.0950596525 0.9049403475
## 716    1    1 0.0709494996 0.9290505004
## 717    1    1 0.0459792005 0.9540207995
## 718    1    1 0.0264465509 0.9735534491
## 719    1    1 0.0310681933 0.9689318067
## 720    1    1 0.0005944219 0.9994055781
## 721    1    1 0.0114798308 0.9885201692
## 722    1    1 0.0014106405 0.9985893595
## 723    1    1 0.0951668515 0.9048331485
## 724    1    1 0.0174781049 0.9825218951
## 725    1    1 0.0345315757 0.9654684243
## 726    1    1 0.1041067774 0.8958932226
## 727    1    1 0.0078436548 0.9921563452
## 728    1    1 0.0398247275 0.9601752725
## 729    1    1 0.0103583167 0.9896416833
## 730    1    1 0.0402939478 0.9597060522
## 731    1    1 0.0746357222 0.9253642778
## 732    1    1 0.0152900837 0.9847099163
## 733    1    1 0.0212760932 0.9787239068
## 734    1    1 0.2047113180 0.7952886820
## 735    1    1 0.0026052807 0.9973947193
## 736    1    1 0.0235019223 0.9764980777
## 737    1    1 0.0011298969 0.9988701031
## 738    1    1 0.0298998181 0.9701001819
## 739    1    1 0.0035553632 0.9964446368
## 740    1    1 0.0271241451 0.9728758549
## 741    1    1 0.0077562604 0.9922437396
## 742    1    1 0.0161505692 0.9838494308
## 743    1    1 0.0315692298 0.9684307702
## 744    1    1 0.3613400351 0.6386599649
## 745    1    1 0.0006088588 0.9993911412
## 746    1    1 0.0019641133 0.9980358867
## 747    1    1 0.0558517608 0.9441482392
## 748    1    1 0.0111056751 0.9888943249
## 749    1    1 0.0008289720 0.9991710280
## 750    1    0 0.6000117047 0.3999882953
## 751    1    1 0.0258882170 0.9741117830
## 752    1    1 0.0158999016 0.9841000984
## 753    1    1 0.0094388183 0.9905611817
## 754    1    1 0.1224133968 0.8775866032
## 755    1    1 0.1769885184 0.8230114816
## 756    1    1 0.2133107932 0.7866892068
## 757    1    1 0.0214111748 0.9785888252
## 758    1    1 0.0178541208 0.9821458792
## 759    1    1 0.0196130602 0.9803869398
## 760    1    1 0.0179728072 0.9820271928
## 761    1    1 0.0411599468 0.9588400532
## 762    1    1 0.0972256926 0.9027743074
## 763    1    1 0.0872317833 0.9127682167
## 764    1    1 0.0198059353 0.9801940647
## 765    1    1 0.0048431755 0.9951568245
## 766    1    1 0.0917577736 0.9082422264
## 767    1    1 0.0085219866 0.9914780134
## 768    1    1 0.0012331151 0.9987668849
## 769    1    1 0.0016479137 0.9983520863
## 770    1    1 0.0451942941 0.9548057059
## 771    1    1 0.0080403469 0.9919596531
## 772    1    1 0.0077242987 0.9922757013
## 773    1    1 0.3562022935 0.6437977065
## 774    1    1 0.0441856369 0.9558143631
## 775    1    1 0.0764935261 0.9235064739
## 776    1    1 0.1611966279 0.8388033721
## 777    1    1 0.0066837746 0.9933162254
## 778    1    1 0.0007760914 0.9992239086
## 779    1    1 0.1865715528 0.8134284472
## 780    1    1 0.0357069696 0.9642930304
## 781    1    0 0.7088392455 0.2911607545
## 782    1    1 0.1472035190 0.8527964810
## 783    1    1 0.0019005420 0.9980994580
## 784    1    1 0.0802923258 0.9197076742
## 785    1    1 0.0105052246 0.9894947754
## 786    1    1 0.0032047321 0.9967952679
## 787    1    1 0.0998849605 0.9001150395
## 788    1    1 0.0012565402 0.9987434598
## 789    1    1 0.1087236682 0.8912763318
## 790    1    1 0.0090638283 0.9909361717
## 791    1    1 0.0072522641 0.9927477359
## 792    1    1 0.0013504378 0.9986495622
## 793    1    1 0.1974348090 0.8025651910
## 794    1    1 0.0859938832 0.9140061168
## 795    1    1 0.1003108120 0.8996891880
## 796    1    1 0.0047672209 0.9952327791
## 797    1    1 0.0788997097 0.9211002903
## 798    1    1 0.0390150922 0.9609849078
## 799    1    1 0.0125367473 0.9874632527
## 800    1    1 0.0011534199 0.9988465801
## 801    1    1 0.0013563441 0.9986436559
## 802    1    1 0.0012370721 0.9987629279
## 803    1    1 0.0019610612 0.9980389388
## 804    1    1 0.2076260330 0.7923739670
## 805    1    1 0.1124129412 0.8875870588
## 806    1    1 0.0581866940 0.9418133060
## 807    1    1 0.1335331273 0.8664668727
## 808    1    1 0.0520569160 0.9479430840
## 809    1    1 0.0161030989 0.9838969011
## 810    1    1 0.0074399298 0.9925600702
## 811    1    1 0.0569639354 0.9430360646
## 812    1    1 0.0028428122 0.9971571878
## 813    1    1 0.2021571936 0.7978428064
## 814    1    1 0.1034009430 0.8965990570
## 815    1    1 0.0068783511 0.9931216489
## 816    1    1 0.0041131582 0.9958868418
## 817    1    0 0.9237205749 0.0762794251
## 818    1    1 0.0835393754 0.9164606246
## 819    1    1 0.0120855180 0.9879144820
## 820    1    1 0.0666739102 0.9333260898
## 821    1    1 0.0045738755 0.9954261245
## 822    1    1 0.0092416890 0.9907583110
## 823    1    1 0.0011832660 0.9988167340
## 824    1    1 0.0016010076 0.9983989924
## 825    1    1 0.0005340792 0.9994659208
## 826    1    1 0.0205229271 0.9794770729
## 827    1    1 0.0026680118 0.9973319882
## 828    1    1 0.0014286906 0.9985713094
## 829    1    0 0.8025951393 0.1974048607
## 830    1    1 0.0572865814 0.9427134186
## 831    1    1 0.0892525363 0.9107474637
## 832    1    1 0.0140292141 0.9859707859
## 833    1    1 0.0026960794 0.9973039206
## 834    1    1 0.0014862044 0.9985137956
## 835    1    1 0.1085174796 0.8914825204
## 836    1    1 0.0056218147 0.9943781853
## 837    1    1 0.0014207318 0.9985792682
## 838    1    1 0.0010038933 0.9989961067
## 839    1    1 0.0038708936 0.9961291064
## 840    1    1 0.0012343749 0.9987656251
## 841    1    1 0.0482986423 0.9517013577
## 842    1    1 0.0038182937 0.9961817063
## 843    1    1 0.0114514108 0.9885485892
## 844    1    1 0.1410913741 0.8589086259
## 845    1    1 0.0008175837 0.9991824163
## 846    1    1 0.0095783454 0.9904216546
## 847    1    1 0.0169823400 0.9830176600
## 848    1    1 0.0424019836 0.9575980164
## 849    1    1 0.2124716669 0.7875283331
## 850    1    1 0.0228151842 0.9771848158
## 851    1    1 0.0218234986 0.9781765014
## 852    1    1 0.1507285772 0.8492714228
## 853    1    1 0.1718700956 0.8281299044
## 854    1    1 0.0971719265 0.9028280735
## 855    1    1 0.0008935728 0.9991064272
## 856    1    1 0.0336560794 0.9663439206
## 857    1    1 0.0005152332 0.9994847668
## 858    1    1 0.0008476609 0.9991523391
## 859    1    1 0.0023512396 0.9976487604
## 860    1    1 0.0261859009 0.9738140991
## 861    1    1 0.2146179265 0.7853820735
## 862    1    1 0.0018523516 0.9981476484
## 863    1    1 0.0024461586 0.9975538414
## 864    1    1 0.0009896803 0.9990103197
## 865    1    1 0.0435191718 0.9564808282
## 866    1    1 0.0043454240 0.9956545760
## 867    1    1 0.0027944271 0.9972055729
## 868    1    1 0.0141828132 0.9858171868
## 869    1    1 0.0050440654 0.9949559346
## 870    1    1 0.0129243426 0.9870756574
## 871    1    0 0.5504252566 0.4495747434
## 872    1    1 0.0076372704 0.9923627296
## 873    1    1 0.0017181241 0.9982818759
## 874    1    1 0.0392254856 0.9607745144
## 875    1    1 0.0020690118 0.9979309882
## 876    1    1 0.0893872318 0.9106127682
## 877    1    1 0.0012980110 0.9987019890
## 878    1    1 0.0188469553 0.9811530447
## 879    1    1 0.0474814666 0.9525185334
## 880    1    1 0.1299746415 0.8700253585
## 881    1    1 0.0593318635 0.9406681365
## 882    1    1 0.0014722196 0.9985277804
## 883    1    1 0.0123717911 0.9876282089
## 884    1    1 0.0545743857 0.9454256143
## 885    1    1 0.0009937127 0.9990062873
## 886    1    1 0.0020533668 0.9979466332
## 887    1    1 0.0072877988 0.9927122012
## 888    1    1 0.2141677608 0.7858322392
## 889    1    1 0.0402383847 0.9597616153
## 890    1    1 0.0423736409 0.9576263591
## 891    1    1 0.0251649274 0.9748350726
## 892    1    1 0.0458120737 0.9541879263
## 893    1    1 0.0436786335 0.9563213665
## 894    1    1 0.0009499488 0.9990500512
## 895    1    1 0.2065587192 0.7934412808
## 896    1    1 0.0008496475 0.9991503525
## 897    1    1 0.0420778286 0.9579221714
## 898    1    1 0.0007984395 0.9992015605
## 899    1    1 0.0579136159 0.9420863841
## 900    1    1 0.0359294603 0.9640705397
## 901    1    1 0.0021207294 0.9978792706
## 902    1    1 0.0021229472 0.9978770528
## 903    1    1 0.0888176049 0.9111823951
## 904    1    1 0.2290673317 0.7709326683
## 905    1    1 0.0806855851 0.9193144149
## 906    1    1 0.0938104308 0.9061895692
## 907    1    1 0.0047530301 0.9952469699
## 908    1    1 0.0005614910 0.9994385090
## 909    1    1 0.2945522448 0.7054477552
## 910    1    1 0.0182393017 0.9817606983
## 911    1    1 0.1352255659 0.8647744341
## 912    1    1 0.0011342217 0.9988657783
## 913    1    1 0.0234433271 0.9765566729
## 914    1    1 0.0146229876 0.9853770124
## 915    1    1 0.0169042064 0.9830957936
## 916    1    1 0.0376117683 0.9623882317
## 917    1    1 0.0432277056 0.9567722944
## 918    1    1 0.0054884450 0.9945115550
## 919    1    1 0.0089317104 0.9910682896
## 920    1    1 0.0126051140 0.9873948860
## 921    1    1 0.1462363398 0.8537636602
## 922    1    1 0.0974610897 0.9025389103
## 923    1    1 0.0013176407 0.9986823593
## 924    1    1 0.1317498713 0.8682501287
## 925    1    1 0.0029786873 0.9970213127
## 926    1    1 0.0087331358 0.9912668642
## 927    1    1 0.1137931667 0.8862068333
## 928    1    1 0.0010079598 0.9989920402
## 929    1    1 0.0045924394 0.9954075606
## 930    1    1 0.0040354868 0.9959645132
## 931    1    1 0.1107744873 0.8892255127
## 932    1    1 0.0004993515 0.9995006485
## 933    1    0 0.9000829129 0.0999170871
## 934    1    1 0.0115393148 0.9884606852
## 935    1    1 0.0936010816 0.9063989184
## 936    1    1 0.0082939615 0.9917060385
## 937    1    1 0.0481367862 0.9518632138
## 938    1    1 0.0579462926 0.9420537074
## 939    1    1 0.2087990115 0.7912009885
## 940    1    1 0.2144608493 0.7855391507
## 941    1    1 0.0199601147 0.9800398853
## 942    1    1 0.0140980677 0.9859019323
## 943    1    1 0.0556855234 0.9443144766
## 944    1    1 0.1118079613 0.8881920387
## 945    1    1 0.0072774146 0.9927225854
## 946    1    1 0.0022432990 0.9977567010
## 947    1    1 0.1964634375 0.8035365625
## 948    1    1 0.0503780970 0.9496219030
## 949    1    1 0.0017794517 0.9982205483
## 950    1    1 0.0399221356 0.9600778644
## 951    1    1 0.0163948237 0.9836051763
## 952    1    1 0.1730599005 0.8269400995
## 953    1    1 0.0038398832 0.9961601168
## 954    1    1 0.0012877606 0.9987122394
## 955    1    1 0.0039076475 0.9960923525
## 956    1    1 0.0438746056 0.9561253944
## 957    1    1 0.0334005446 0.9665994554
## 958    1    1 0.0022558274 0.9977441726
## 959    1    1 0.0614383439 0.9385616561
## 960    1    1 0.0127359525 0.9872640475
## 961    1    1 0.0004484454 0.9995515546
## 962    1    1 0.1592355424 0.8407644576
## 963    1    1 0.1393737783 0.8606262217
## 964    1    1 0.0198621946 0.9801378054
## 965    1    1 0.0696257505 0.9303742495
## 966    1    1 0.0012691339 0.9987308661
## 967    1    1 0.0127878855 0.9872121145
## 968    1    1 0.0017826102 0.9982173898
## 969    1    1 0.0459621071 0.9540378929
## 970    1    1 0.0240388499 0.9759611501
## 971    1    1 0.0027182747 0.9972817253
## 972    1    1 0.0101851350 0.9898148650
## 973    1    1 0.0096069213 0.9903930787
## 974    1    1 0.0118885943 0.9881114057
## 975    1    1 0.0569953323 0.9430046677
## 976    1    1 0.0022623519 0.9977376481
## 977    1    1 0.0153521239 0.9846478761
## 978    1    1 0.0104996392 0.9895003608
## 979    1    1 0.0026011389 0.9973988611
## 980    1    1 0.0014249056 0.9985750944
## 981    1    1 0.0273183670 0.9726816330
## 982    1    1 0.0310535772 0.9689464228
## 983    1    1 0.0313831086 0.9686168914
## 984    1    1 0.3563045656 0.6436954344
## 985    1    1 0.0678452446 0.9321547554
## 986    1    1 0.0639681943 0.9360318057
## 987    1    1 0.0138764529 0.9861235471
## 988    1    1 0.0169888725 0.9830111275
## 989    1    1 0.0010721672 0.9989278328
## 990    1    1 0.0060959541 0.9939040459
## 991    1    1 0.0115905292 0.9884094708
## 992    1    1 0.0304101058 0.9695898942
## 993    1    1 0.1344379457 0.8655620543
## 994    1    1 0.0108187989 0.9891812011
## 995    1    1 0.0167468281 0.9832531719
## 996    1    1 0.0152391168 0.9847608832
## 997    1    1 0.0084177051 0.9915822949
## 998    1    1 0.0589919593 0.9410080407
## 999    1    1 0.0224634769 0.9775365231
## 1000   1    1 0.1187931494 0.8812068506
## 1001   1    1 0.0082380597 0.9917619403
## 1002   1    1 0.0065374494 0.9934625506
## 1003   1    1 0.0022616963 0.9977383037
## 1004   1    1 0.0509876274 0.9490123726
## 1005   1    1 0.0800591000 0.9199409000
## 1006   1    1 0.0008457012 0.9991542988
## 1007   1    1 0.0040954812 0.9959045188
## 1008   1    1 0.0017137903 0.9982862097
## 1009   1    1 0.0014000838 0.9985999162
## 1010   1    1 0.0025875825 0.9974124175
## 1011   1    1 0.0014749738 0.9985250262
## 1012   1    0 0.9306338510 0.0693661490
## 1013   1    1 0.4940895185 0.5059104815
## 1014   1    1 0.0112995014 0.9887004986
## 1015   1    1 0.0067457682 0.9932542318
## 1016   1    1 0.0383311994 0.9616688006
## 1017   1    1 0.0013199714 0.9986800286
## 1018   1    1 0.0115936404 0.9884063596
## 1019   1    1 0.0195620641 0.9804379359
## 1020   1    1 0.1089077306 0.8910922694
## 1021   1    1 0.0207769153 0.9792230847
## 1022   1    1 0.0572142982 0.9427857018
## 1023   1    1 0.0039308104 0.9960691896
## 1024   1    1 0.0302718712 0.9697281288
## 1025   1    1 0.1160623219 0.8839376781
## 1026   1    1 0.0041001486 0.9958998514
## 1027   1    1 0.0049917505 0.9950082495
## 1028   1    1 0.0139220715 0.9860779285
## 1029   1    1 0.0006076925 0.9993923075
## 1030   1    1 0.1108003436 0.8891996564
## 1031   1    1 0.0004281969 0.9995718031
## 1032   1    1 0.1872897319 0.8127102681
## 1033   1    1 0.0071122616 0.9928877384
## 1034   1    1 0.0003615148 0.9996384852
## 1035   1    1 0.0063791129 0.9936208871
## 1036   1    1 0.0171523017 0.9828476983
## 1037   1    1 0.0179404156 0.9820595844
## 1038   1    1 0.0026580664 0.9973419336
## 1039   1    1 0.0351813620 0.9648186380
## 1040   1    1 0.1469428401 0.8530571599
## 1041   1    1 0.0710663967 0.9289336033
## 1042   1    0 0.7233487919 0.2766512081
## 1043   1    0 0.5763790186 0.4236209814
## 1044   1    1 0.0050675078 0.9949324922
## 1045   1    1 0.0686613977 0.9313386023
## 1046   1    1 0.0105824782 0.9894175218
## 1047   1    1 0.1123738760 0.8876261240
## 1048   1    1 0.0201397109 0.9798602891
## 1049   1    1 0.0051371022 0.9948628978
## 1050   1    1 0.0116351557 0.9883648443
## 1051   1    1 0.1209952332 0.8790047668
## 1052   1    1 0.0444971606 0.9555028394
## 1053   1    1 0.0042869262 0.9957130738
## 1054   1    1 0.0691644639 0.9308355361
## 1055   1    1 0.0676420538 0.9323579462
## 1056   1    1 0.0446396165 0.9553603835
## 1057   1    1 0.4330439652 0.5669560348
## 1058   1    1 0.0455867500 0.9544132500
## 1059   1    1 0.0112337648 0.9887662352
## 1060   1    1 0.2486431472 0.7513568528
## 1061   1    1 0.0995742672 0.9004257328
## 1062   1    1 0.0010054183 0.9989945817
## 1063   1    1 0.0015738857 0.9984261143
## 1064   1    1 0.4940298551 0.5059701449
## 1065   1    1 0.0019181550 0.9980818450
## 1066   1    1 0.0027581287 0.9972418713
## 1067   1    1 0.0437325848 0.9562674152
## 1068   1    1 0.1809753061 0.8190246939
## 1069   1    1 0.3505609764 0.6494390236
## 1070   1    1 0.1974475988 0.8025524012
## 1071   1    0 0.5740101913 0.4259898087
## 1072   1    1 0.0217886687 0.9782113313
## 1073   1    1 0.1141494150 0.8858505850
## 1074   1    1 0.0066645705 0.9933354295
## 1075   1    1 0.1645681350 0.8354318650
## 1076   1    1 0.0012960917 0.9987039083
## 1077   1    1 0.0667375751 0.9332624249
## 1078   1    1 0.0027267530 0.9972732470
## 1079   1    1 0.0004228772 0.9995771228
## 1080   1    1 0.0309459693 0.9690540307
## 1081   1    1 0.0030830604 0.9969169396
## 1082   1    1 0.0074834544 0.9925165456
## 1083   1    1 0.0092933479 0.9907066521
## 1084   1    1 0.0123442640 0.9876557360
## 1085   1    1 0.0564710708 0.9435289292
## 1086   1    1 0.0120976772 0.9879023228
## 1087   1    1 0.0533627193 0.9466372807
## 1088   1    1 0.0004519854 0.9995480146
## 1089   1    1 0.0014115912 0.9985884088
## 1090   1    1 0.0158851217 0.9841148783
## 1091   1    1 0.0171338415 0.9828661585
## 1092   1    1 0.0058300764 0.9941699236
## 1093   1    1 0.1503143425 0.8496856575
## 1094   1    1 0.0372624708 0.9627375292
## 1095   1    1 0.0992663041 0.9007336959
## 1096   1    1 0.2206438210 0.7793561790
## 1097   1    1 0.4125648549 0.5874351451
## 1098   1    1 0.2283496323 0.7716503677
## 1099   1    1 0.0092039628 0.9907960372
## 1100   1    1 0.0011107095 0.9988892905
## 1101   1    1 0.0371236277 0.9628763723
## 1102   1    0 0.7425804375 0.2574195625
## 1103   1    1 0.0036889494 0.9963110506
## 1104   1    1 0.1901782676 0.8098217324
## 1105   1    1 0.0599124730 0.9400875270
## 1106   1    1 0.0838683475 0.9161316525
## 1107   1    1 0.0007646590 0.9992353410
## 1108   1    1 0.0004347845 0.9995652155
## 1109   1    1 0.0013884138 0.9986115862
## 1110   1    1 0.0176908897 0.9823091103
## 1111   1    1 0.0013580420 0.9986419580
## 1112   1    1 0.0006054327 0.9993945673
## 1113   1    1 0.0389854252 0.9610145748
## 1114   1    1 0.3493557271 0.6506442729
## 1115   1    1 0.2869531115 0.7130468885
## 1116   1    1 0.0009031592 0.9990968408
## 1117   1    1 0.5137425313 0.4862574687
## 1118   1    1 0.0117836773 0.9882163227
## 1119   1    1 0.0007449133 0.9992550867
## 1120   1    1 0.0109147443 0.9890852557
## 1121   1    1 0.0008638554 0.9991361446
## 1122   1    1 0.1896221457 0.8103778543
## 1123   1    1 0.0025312743 0.9974687257
## 1124   1    1 0.0561152269 0.9438847731
## 1125   1    1 0.1254084523 0.8745915477
## 1126   1    1 0.3042691504 0.6957308496
## 1127   1    1 0.0641007788 0.9358992212
## 1128   1    1 0.0020794651 0.9979205349
## 1129   1    1 0.0357246897 0.9642753103
## 1130   1    1 0.0011458613 0.9988541387

# Plot the ROC curve of the AutoML ensemble (the `automl.auc` ROC object),
# with a dotted red reference segment from (1, 0) to (0, 1).
# The AUC in the title is computed from the plotted ROC object rather than
# typed in by hand, so the title cannot drift from the curve when the model
# is refit.

ggroc(automl.auc) +
  ggtitle(sprintf(
    "automl Ensemble Model's ROC Curve of AUC= %.4f",
    as.numeric(pROC::auc(automl.auc))
  )) +
  geom_segment(
    aes(x = 1, y = 0, xend = 0, yend = 1),
    linetype = "dotted",
    color = "red"
  ) +
  theme_light()

# Swap the auto-generated column names of `temp` for short metric labels
# (R2, MSE, RMSE, AUC, classError). Only the names change, not the values.
temp <- rename(
  temp,
  R2 = "t.R2....R2_Score.y_pred...as.numeric.as.character.automl.summary.pred....",
  MSE = "t.mse....MSE.as.numeric.as.character.automl.summary.pred....as.numeric.as.character.automl.summary.obs...",
  RMSE = "t.RSME....RMSE.as.numeric.as.character.automl.summary.pred....",
  AUC = "t.AUC....automl.auc.auc",
  classError = "t.ClassError....mean.automl.summary.pred....automl.summary.obs."
)

# Store the AutoML ensemble's test metrics in the third row of the results
# data frame, then label the three rows with their model names.

test.results[3, ] <- c(
  R2 = temp[["R2"]],
  MSE = temp[["MSE"]],
  RMSE = temp[["RMSE"]],
  AUC = temp[["AUC"]],
  classError = temp[["classError"]]
)
rownames(test.results) <- c(
  "Random Forest",
  "Gradient Boosting",
  "automled Ensemble"
)

# Record the AutoML confusion-matrix statistics alongside the other models.
# `byClass` is converted to a data frame and transposed so that each
# statistic becomes a column that can be read by name.

temp <- as.data.frame(t(as.data.frame(automl.con$byClass)))

# Append the AutoML statistics as a new row of `confusionM`.
confusionM <- add_row(
  confusionM,
  Sensitivity = temp$Sensitivity,
  Specificity = temp$Specificity,
  Pos_pred_value = temp[["Pos Pred Value"]],
  Neg_Pred_value = temp[["Neg Pred Value"]],
  Precision = temp$Precision,
  Recall = temp$Recall,
  F1 = temp$F1,
  Prevalence = temp$Prevalence,
  Detection_rate = temp[["Detection Rate"]],
  Detection_prevalence = temp[["Detection Prevalence"]],
  Balanced_Accuracy = temp[["Balanced Accuracy"]]
)

# Label the rows with the model names.
rownames(confusionM) <- c(
  "Random Forest",
  "Gradient Boosting",
  "automled Ensemble"
)

# Compare precision and recall across all three models.

# Recall/precision columns from the leader model's H2O performance metrics
# on the test frame, tagged with the model label used for grouping.
automl.perf <- h2o.performance(leader_model, test.h2o) %>%
  h2o.metric() %>%
  as.data.frame() %>%
  select(c(recall, precision))
automl.perf$model <- "automled Ensemble"

# Stack the per-model tables so one plot can draw a line per model.
combine_rpplots <- rbind(rf.perf, gb.perf, automl.perf)

# The legend belongs to the colour aesthetic, so its title must be set with
# `color =`; `labs(legend = ...)` names no aesthetic and is silently ignored.
ggplot(
  combine_rpplots,
  aes(recall, precision, group = model, color = model)
) +
  geom_line() +
  labs(title = "Precision-Recall AUC Curve", color = "current ML models") +
  theme_light()

# Overlay the ROC curves of all three models on a single plot, with a dotted
# red reference segment from (1, 0) to (0, 1).

rocs <- list(
  Gradient_Boost = gb.auc,
  Random_Forest = rf.auc,
  automled_ensemble = automl.auc
)
ggroc(rocs) +
  ggtitle("ROC Performance of all Models") +
  geom_segment(
    aes(x = 1, y = 0, xend = 0, yend = 1),
    linetype = "dotted",
    color = "red"
  ) +
  theme_light()

library(pROC)
library(ggplot2)

# Same ROC overlay restricted to the gradient boosting and random forest
# models; note that this overwrites the `rocs` list.
rocs <- list(
  Gradient_Boost = gb.auc,
  Random_Forest = rf.auc
)
ggroc(rocs) +
  ggtitle("ROC Performance of current models") +
  geom_segment(
    aes(x = 1, y = 0, xend = 0, yend = 1),
    linetype = "dotted",
    color = "red"
  ) +
  theme_light()

###Include NB, DL, and SVM

#Including the Naive Bayes (NB), deep learning (DL), and SVM models in the ROC graph

# Score the test frame with the Naive Bayes model ONCE and reuse the result.
# The original ran h2o.predict() three times on the same model and frame just
# to pull three different columns out of the same prediction table.
nb.scores<-h2o.predict(pros_nb,test.h2o)%>%as.data.frame()

nb.pred<-nb.scores%>%pull(predict)     # predicted class label
nb.precsnprob<-nb.scores%>%pull(p1)    # h2o `p1` column (presumably P(class = 1))
nb.reclprob<-nb.scores%>%pull(p0)      # h2o `p0` column (presumably P(class = 0))

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Collect observed labels, predicted labels and class probabilities for the
# Naive Bayes model.
# NOTE(review): `<-` (not `=`) is used inside data.frame(), so each argument
# also assigns a global variable (obs, pred, N, Y) and the columns receive
# auto-generated names such as "obs....test_data.WFRI_R". A later rename()
# relies on those exact generated names, so this call is left untouched here.
nb.summary<-data.frame(
      obs<-test_data$WFRI_R,
      pred<-nb.pred,
      N<-nb.reclprob,
      Y<-nb.precsnprob
  )

# Score the test frame with the SVM model ONCE and reuse the result.
# The original ran h2o.predict() three times on the same model and frame just
# to pull three different columns out of the same prediction table.
svm.scores<-h2o.predict(svm_model,test.h2o)%>%as.data.frame()

svm.pred<-svm.scores%>%pull(predict)     # predicted class label
svm.precsnprob<-svm.scores%>%pull(p1)    # h2o `p1` column (presumably P(class = 1))
svm.reclprob<-svm.scores%>%pull(p0)      # h2o `p0` column (presumably P(class = 0))

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Collect observed labels, predicted labels and class probabilities for the
# SVM model.
# NOTE(review): `<-` (not `=`) is used inside data.frame(), so each argument
# also assigns a global variable (obs, pred, N, Y) and the columns receive
# auto-generated names such as "obs....test_data.WFRI_R". A later rename()
# relies on those exact generated names, so this call is left untouched here.
svm.summary<-data.frame(
      obs<-test_data$WFRI_R,
      pred<-svm.pred,
      N<-svm.reclprob,
      Y<-svm.precsnprob
  )


# Score the test frame with the deep-learning model ONCE and reuse the result.
# The original ran h2o.predict() three times on the same model and frame just
# to pull three different columns out of the same prediction table.
dl.scores<-h2o.predict(dl,test.h2o)%>%as.data.frame()

dl.pred<-dl.scores%>%pull(predict)     # predicted class label
dl.precsnprob<-dl.scores%>%pull(p1)    # h2o `p1` column (presumably P(class = 1))
dl.reclprob<-dl.scores%>%pull(p0)      # h2o `p0` column (presumably P(class = 0))

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Collect observed labels, predicted labels and class probabilities for the
# deep-learning model.
# NOTE(review): `<-` (not `=`) is used inside data.frame(), so each argument
# also assigns a global variable (obs, pred, N, Y) and the columns receive
# auto-generated names such as "obs....test_data.WFRI_R". A later rename()
# relies on those exact generated names, so this call is left untouched here.
dl.summary<-data.frame(
      obs<-test_data$WFRI_R,
      pred<-dl.pred,
      N<-dl.reclprob,
      Y<-dl.precsnprob
  )

# Give the Naive Bayes summary short column names in place of the ones
# data.frame() generated from the deparsed arguments.
nb.summary<-nb.summary%>%rename(
  obs="obs....test_data.WFRI_R",
  pred="pred....nb.pred",
  N="N....nb.reclprob",
  Y="Y....nb.precsnprob"
)

# ROC for Naive Bayes, built from this model's own `Y` column.
# The bare name `Y` resolved to a global variable that had been overwritten by
# the summary built last, so the curve was computed from another model's
# probabilities.
nb.auc<-roc(nb.summary$obs,nb.summary$Y)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Give the SVM summary short column names in place of the ones data.frame()
# generated from the deparsed arguments.
svm.summary<-svm.summary%>%rename(
  obs="obs....test_data.WFRI_R",
  pred="pred....svm.pred",
  N="N....svm.reclprob",
  Y="Y....svm.precsnprob"
)

# ROC for the SVM, built from this model's own `Y` column.
# The bare name `Y` resolved to a global variable that had been overwritten by
# the summary built last, so the curve was computed from another model's
# probabilities.
svm.auc<-roc(svm.summary$obs,svm.summary$Y)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Give the deep-learning summary short column names in place of the ones
# data.frame() generated from the deparsed arguments.
dl.summary<-dl.summary%>%rename(
  obs="obs....test_data.WFRI_R",
  pred="pred....dl.pred",
  N="N....dl.reclprob",
  Y="Y....dl.precsnprob"
)

# ROC for deep learning, read from the summary's own `Y` column rather than
# from a same-named global variable, so the result no longer depends on which
# summary happened to be built last.
dl.auc<-roc(dl.summary$obs,dl.summary$Y)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# ROC curves of all six models. The list names become the legend labels, so
# they are spelled out correctly: "Naive_Bayes" (was "Navie_Bayes") and
# "Support_Vector_Machine" for svm.auc (was "supervised_learning", which
# describes every model in this list).
rocs<-list(
  Gradient_Boost=gb.auc,
  Random_Forest=rf.auc,
  automl_ensemble=automl.auc,
  Naive_Bayes=nb.auc,
  Support_Vector_Machine=svm.auc,
  Deep_learning=dl.auc
)
ggroc(rocs)+
  ggtitle("ROC Performance of all Models")+
  geom_segment(aes(x=1,y=0,xend=0,yend=1),linetype="dotted",color="red")+
  theme_light()

#Updating test results for model performance for all remaining models

# Naive Bayes metrics.
# Columns are named directly with `=`. The original used `<-` inside
# data.frame(), which leaked t.R2/t.mse/... into the global environment and
# produced deparse-generated (and truncated) column names that then had to be
# matched character-for-character in a rename().
nb.pred.num<-as.numeric(as.character(nb.summary$pred))
nb.obs.num<-as.numeric(as.character(nb.summary$obs))

temp<-data.frame(
  R2=R2_Score(y_pred=nb.pred.num,y_true=nb.obs.num),
  MSE=MSE(nb.pred.num,nb.obs.num),
  RMSE=RMSE(nb.pred.num,nb.obs.num),
  AUC=as.numeric(nb.auc$auc),
  classError=mean(nb.summary$pred!=nb.summary$obs)
)

# Values are assigned by position; keep this order aligned with test.results.
test.results[4,]<-c(R2=temp$R2,MSE=temp$MSE,RMSE=temp$RMSE,AUC=temp$AUC,classError=temp$classError)
rownames(test.results)<-c("Random Forest","Gradient Boosting","automl- stack ensemble","Naive Bayes")

# Support Vector Machine metrics.
# Columns are named directly with `=`. The original used `<-` inside
# data.frame(), which leaked t.R2/t.mse/... into the global environment and
# produced deparse-generated (and truncated) column names that then had to be
# matched character-for-character in a rename().
svm.pred.num<-as.numeric(as.character(svm.summary$pred))
svm.obs.num<-as.numeric(as.character(svm.summary$obs))

temp<-data.frame(
  R2=R2_Score(y_pred=svm.pred.num,y_true=svm.obs.num),
  MSE=MSE(svm.pred.num,svm.obs.num),
  RMSE=RMSE(svm.pred.num,svm.obs.num),
  AUC=as.numeric(svm.auc$auc),
  classError=mean(svm.summary$pred!=svm.summary$obs)
)

# Values are assigned by position; keep this order aligned with test.results.
test.results[5,]<-c(R2=temp$R2,MSE=temp$MSE,RMSE=temp$RMSE,AUC=temp$AUC,classError=temp$classError)
rownames(test.results)<-c("Random Forest","Gradient Boosting","automl- stack ensemble","Naive Bayes","Support Vector Machine")

# Deep-learning metrics.
# Columns are named directly with `=`. The original used `<-` inside
# data.frame(), which leaked t.R2/t.mse/... into the global environment and
# produced deparse-generated (and truncated) column names that then had to be
# matched character-for-character in a rename().
dl.pred.num<-as.numeric(as.character(dl.summary$pred))
dl.obs.num<-as.numeric(as.character(dl.summary$obs))

temp<-data.frame(
  R2=R2_Score(y_pred=dl.pred.num,y_true=dl.obs.num),
  MSE=MSE(dl.pred.num,dl.obs.num),
  RMSE=RMSE(dl.pred.num,dl.obs.num),
  AUC=as.numeric(dl.auc$auc),
  classError=mean(dl.summary$pred!=dl.summary$obs)
)

# Values are assigned by position; keep this order aligned with test.results.
test.results[6,]<-c(R2=temp$R2,MSE=temp$MSE,RMSE=temp$RMSE,AUC=temp$AUC,classError=temp$classError)
rownames(test.results)<-c("Random Forest","Gradient Boosting","automl- stack ensemble","Naive Bayes","Support Vector Machine","Deep Learning")

# Build one ROC object per model from its summary table: observed labels
# (`obs`) against the `Y` column.

rocs2 <- lapply(
  list(
    Gradient_Boost = gb.summary,
    Random_Forest = rf.summary,
    Stacked_ensemble = automl.summary
  ),
  function(model_summary) roc(model_summary$obs, model_summary$Y)
)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Flatten the list of ROC objects into one long data frame (one row per
# threshold per model).
roc_data <- do.call(rbind, lapply(names(rocs2), function(model) {
  data.frame(
    model = model,
    sensitivity = rocs2[[model]]$sensitivities,
    # Despite the column name, this holds 1 - specificity, i.e. the false
    # positive rate. The name is kept so existing references still work.
    specificity = 1 - rocs2[[model]]$specificities
  )
}))

# ROC chart in the conventional orientation: false positive rate on x
# (increasing left to right), true positive rate on y.
# - The axis titles state what is plotted; the default x title "specificity"
#   was wrong because the column contains 1 - specificity.
# - The chance diagonal is drawn with annotate() so it is a single segment
#   instead of one copy per data row.
# - Listing breaks in descending order does not reverse an axis, so they are
#   given in ascending order.
ggplot(roc_data, aes(x = specificity, y = sensitivity, color = model)) +
  geom_line() +
  ggtitle("ROC Performance of all Models") +
  annotate("segment", x = 0, y = 0, xend = 1, yend = 1,
           linetype = "dashed", color = "red") +
  labs(x = "1 - Specificity (False Positive Rate)",
       y = "Sensitivity (True Positive Rate)") +
  theme_light() +
  scale_x_continuous(
    breaks = seq(0, 1, by = 0.1),
    limits = c(0, 1)
  )

Tables - Confusion Matrix

# Inspect which per-class statistics the Naive Bayes confusion-matrix object
# exposes.
names(nb_cm$byClass)

# Inspect which overall statistics it exposes.
names(nb_cm$overall)

## [1] "Accuracy"       "Kappa"          "AccuracyLower"  "AccuracyUpper" 
## [5] "AccuracyNull"   "AccuracyPValue" "McnemarPValue"

# Pull Accuracy, Precision, Recall and F1 (rounded to four decimals) out of one
# confusion-matrix object. Statistics are selected BY NAME rather than by
# position so the table cannot silently pick up the wrong statistic if the
# layout of `byClass` differs.
extract_metrics <- function(cm) {
  round(
    c(cm$overall["Accuracy"], cm$byClass[c("Precision", "Recall", "F1")]),
    4
  )
}

# One confusion matrix per model; the list names become the table row names.
model_cms <- list(
  "Distributed Random Forest" = rf.con,
  "AutoML" = aml_cm,
  "Deep Learning" = dl_cm,
  "Gradient Boosting Machine" = gb.con,
  "Naive Bayes" = nb_cm,
  "Support Vector Machine" = svm_cm
)

# Matrix with one row per model and one column per metric
tab_pref <- do.call(rbind, lapply(model_cms, extract_metrics))

# Order the rows by accuracy, best model first
tab_pref <- tab_pref[order(-tab_pref[, "Accuracy"]), ]

# Data-frame copy of the table; the outer parentheses print it
(tab.pref <- data.frame( tab_pref))

##                           Accuracy Precision Recall     F1
## AutoML                      0.9655    0.9566 0.9752 0.9658
## Gradient Boosting Machine   0.9628    0.9596 0.9664 0.9630
## Deep Learning               0.9558    0.9510 0.9611 0.9560
## Distributed Random Forest   0.9540    0.9355 0.9752 0.9549
## Naive Bayes                 0.9071    0.9049 0.9097 0.9073
## Support Vector Machine      0.5478    0.5254 0.9894 0.6863

Visualization

# Increase the margins so the legend has room
par(mar = c(6, 4, 6, 4))

# One colour per row (model) of tab_pref, in row order
model_colors <- c("#0000EE", "#00FF00", "#FF6EEB", "#EEEE00", "#FF8C69", "#528B8B")

# Grouped bars: one group per metric column, one bar per model row
bp <- barplot(tab_pref, 
              col = model_colors, 
              beside = TRUE)

# Short legend labels, looked up from the CURRENT row order of tab_pref.
# tab_pref has been re-sorted by accuracy, so a hard-coded label vector in the
# original model order paired the wrong model names with the bar colours.
short_labels <- c("Distributed Random Forest" = "Random Forest",
                  "AutoML" = "AutoML",
                  "Deep Learning" = "Deep Learning",
                  "Gradient Boosting Machine" = "Gradient Boosting",
                  "Naive Bayes" = "Naive Bayes",
                  "Support Vector Machine" = "SVM")

# Legend at the bottom. `y` is not passed: legend() ignores it when `x` is a
# position keyword such as "bottom".
legend(x = "bottom", 
       legend = unname(short_labels[rownames(tab_pref)]),
       fill = model_colors,
       cex = 0.53, 
       ncol = 6)

Model Accuracy and Unique Hyperparameters

# Summary table: one row per model with its test accuracy and the
# hyperparameters that distinguish it.
model_names <- c("Distributed Random Forest (DRF)",
                 "AutoML (Automatic Machine Learning)",
                 "Deep Learning",
                 "Gradient Boosting Machine (GBM)",
                 "Naïve-Bayes (NB)",
                 "Support Vector Machine (SVM)")

# Accuracy of each model, in the same order as model_names
model_accuracy <- vapply(
  list(rf.con, aml_cm, dl_cm, gb.con, nb_cm, svm_cm),
  function(cm) cm$overall[[1]],
  numeric(1)
)

model_hyperparameters <- c("balance_classes = FALSE",
                           "max_models = 5",
                           "Hidden = c(1), activation = 'Tanh', epochs = 1000",
                           "learn_rate = 0.1, ntrees = 1000",
                           "",
                           "Gamma = 0.01, rank_ratio = 0.1")

model_data <- data.frame(
  Model = model_names,
  Accuracy = round(model_accuracy, 4),
  Unique_Hyperparameters = model_hyperparameters
)

# Sort the rows alphabetically (ascending) by the "Model" column
model_df2 <- model_data[order(model_data$Model, decreasing = FALSE), ]

# Print the table
(model.df2 <- data.frame(model_df2))

##                                 Model Accuracy
## 2 AutoML (Automatic Machine Learning)   0.9655
## 3                       Deep Learning   0.9558
## 1     Distributed Random Forest (DRF)   0.9540
## 4     Gradient Boosting Machine (GBM)   0.9628
## 5                    Naïve-Bayes (NB)   0.9071
## 6        Support Vector Machine (SVM)   0.5478
##                              Unique_Hyperparameters
## 2                                    max_models = 5
## 3 Hidden = c(1), activation = 'Tanh', epochs = 1000
## 1                           balance_classes = FALSE
## 4                   learn_rate = 0.1, ntrees = 1000
## 5                                                  
## 6                    Gamma = 0.01, rank_ratio = 0.1

Confusion Matrix Model Heatmaps

#install.packages("dplyr")
library(dplyr)

# Draw a single confusion matrix as a coloured 2x2 heatmap.
#
# cm:    a binary confusion-matrix object exposing `$table` (with `Prediction`
#        and `Reference` dimensions) and `$positive` (the positive class
#        label), as produced by caret::confusionMatrix().
# title: plot title.
#
# Returns a ggplot object.
create_confusion_matrix_plot <- function(cm, title) {
  stopifnot(!is.null(cm$positive))

  # Long form: one row per (Prediction, Reference) cell with its count
  cm_data <- as.data.frame(cm$table)

  # Classify every cell. The original referenced a `Category` column that was
  # never created inside this function, so any call failed.
  predicted_pos <- cm_data$Prediction == cm$positive
  actual_pos <- cm_data$Reference == cm$positive
  cm_data$Category <- factor(
    ifelse(predicted_pos,
           ifelse(actual_pos, "True Positive", "False Positive"),
           ifelse(actual_pos, "False Negative", "True Negative")),
    levels = c("False Negative", "False Positive", "True Negative", "True Positive")
  )

  # x carries the reference (actual) class and y the predicted class; the
  # axis titles now say so (they were swapped).
  ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Category, label = Freq)) +
    geom_tile(color = "white") +
    geom_text(vjust = 1, hjust = 0.5) +  # Cell count
    scale_fill_manual(values = c("#3498db", "#2ecc71", "#e74c3c", "#f39c12"), name = "Category") +  # Assign colors
    labs(title = title,
         x = "Actual",
         y = "Predicted") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))  # Center the title
}

# Confusion matrices of all six models, keyed by model name
confusion_matrices <- list(
  Random_Forest = rf.con,
  AutoML = aml_cm,
  Deep_Learning = dl_cm,
  Gradient_Boosting = gb.con,
  Naive_Bayes = nb_cm,
  Support_Vector_Machine = svm_cm
)

# Long-form table of every cell of every confusion matrix, tagged with the
# model name and the cell's category.
# The category is derived from the Prediction/Reference values and the
# matrix's positive class. The original assigned labels by row position and
# swapped the two error types: a cell with a positive prediction and a
# negative reference was labelled "False Negative", and vice versa.
all_cm_data <- do.call(rbind, lapply(names(confusion_matrices), function(name) {
  cm <- confusion_matrices[[name]]
  cm_data <- as.data.frame(cm$table)
  predicted_pos <- cm_data$Prediction == cm$positive
  actual_pos <- cm_data$Reference == cm$positive
  cm_data$Category <- factor(
    ifelse(predicted_pos,
           ifelse(actual_pos, "True Positive", "False Positive"),
           ifelse(actual_pos, "False Negative", "True Negative")),
    levels = c("False Negative", "False Positive", "True Negative", "True Positive")
  )
  cm_data$Model <- name
  cm_data
}))

# One heatmap panel per model. x is the reference (actual) class and y the
# predicted class; the axis titles were swapped in the original.
ggplot(all_cm_data, aes(x = Reference, y = Prediction, fill = Category, label = Freq)) +
  geom_tile(color = "white") +
  geom_text(vjust = 1, hjust = 0.5) +  # Cell count
  scale_fill_manual(values = c("#3498db", "#2ecc71", "#e74c3c", "#f39c12"), name = "Category") +  # Assign colors
  labs(title = "Confusion Matrix Heatmaps",
       x = "Actual",
       y = "Predicted") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +  # Center the title
  facet_wrap(~Model, scales = "free")  # Create a separate panel for each model

Confusion Matrix Tables for Models

# Build one row of TP/TN/FP/FN counts per model.
# cm$table has predictions in rows and reference (actual) classes in columns,
# so for positive class `pos` and negative class `neg`:
#   TP = [pos, pos]   TN = [neg, neg]
#   FP = [pos, neg]   (predicted positive, actually negative)
#   FN = [neg, pos]   (predicted negative, actually positive)
# The original read FP from [1, 2] and FN from [2, 1], i.e. swapped, which
# contradicts the precision/recall values reported for the same models.
confusion_matrix_dataframes <- lapply(names(confusion_matrices), function(name) {
  cm <- confusion_matrices[[name]]
  pos <- cm$positive
  neg <- setdiff(rownames(cm$table), pos)

  data.frame(
    Model = name,
    True_Positive = cm$table[pos, pos],
    True_Negative = cm$table[neg, neg],
    False_Positive = cm$table[pos, neg],
    False_Negative = cm$table[neg, pos]
  )
})

# Combine all dataframes into a single dataframe without row names
all_confusion_matrices_df <- bind_rows(confusion_matrix_dataframes)

# Arrange the dataframe in descending order based on the "True_Positive" column
all_confusion_matrices_df <- arrange(all_confusion_matrices_df, desc(True_Positive))

# Print the resulting dataframe
print(all_confusion_matrices_df)

##                    Model True_Positive True_Negative False_Positive
## 1 Support_Vector_Machine           559            60            505
## 2          Random_Forest           551           527             38
## 3                 AutoML           551           540             25
## 4      Gradient_Boosting           546           542             23
## 5          Deep_Learning           543           537             28
## 6            Naive_Bayes           514           511             54
##   False_Negative
## 1              6
## 2             14
## 3             14
## 4             19
## 5             22
## 6             51

True Positive Rate vs False Positive Rate

# Confusion matrices in the order the models are listed in the table
rate_cms <- list(
  "Random Forest" = rf.con,
  "AutoML" = aml_cm,
  "Deep Learning" = dl_cm,
  "Gradient Boosting" = gb.con,
  "Naive Bayes" = nb_cm,
  "Support Vector Machine" = svm_cm
)

# True Positive Rate = Sensitivity
tpr <- vapply(rate_cms, function(cm) round(cm$byClass[["Sensitivity"]], 4), numeric(1))

# False Positive Rate = 1 - Specificity.
# The original computed 1 - Sensitivity, which is the false NEGATIVE rate, so
# the "FPR" column was just the complement of TPR.
fpr <- vapply(rate_cms, function(cm) round(1 - cm$byClass[["Specificity"]], 4), numeric(1))

# Create a data frame to store the results
results <- data.frame(
  Model = names(rate_cms),
  TPR = unname(tpr),
  FPR = unname(fpr)
)

# Arrange in descending order
results <- results[order(-results$TPR), ]

# Multiply values by 100
#results$TPR <- results$TPR * 100
#results$FPR <- results$FPR * 100

# Print the results
print(results)

##                    Model    TPR    FPR
## 6 Support Vector Machine 0.9894 0.8938
## 1          Random Forest 0.9752 0.0673
## 2                 AutoML 0.9752 0.0442
## 4      Gradient Boosting 0.9664 0.0407
## 3          Deep Learning 0.9611 0.0496
## 5            Naive Bayes 0.9097 0.0956

Visualization of TPR vs FPR

# Sort the models from highest to lowest TPR
tpr_rank <- order(-results$TPR)
results <- results[tpr_rank, ]

# One point per model: FPR on x, TPR on y, coloured by model
ggplot(
  results,
  aes(x = FPR, y = TPR, group = Model, color = Model)
) +
  geom_point(size = 6) +
  labs(
    title = "TPR vs. FPR",
    x = "False Positive Rate (FPR)",
    y = "True Positive Rate (TPR)"
  ) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))

##Variable Importance Plot

These models don’t have variable importances: aml, pros_nb, and svm_model

# Variable-importance plot for the model stored in rf.v2
h2o.varimp_plot(rf.v2)

# Variable-importance plot for the model stored in gb_v2
h2o.varimp_plot(gb_v2)

# Variable-importance plot for the deep-learning model
h2o.varimp_plot(dl)

# Shut down H2O now that the analysis is finished.
# NOTE(review): h2o.shutdown() asks for confirmation by default; consider
# prompt = FALSE if this is ever run non-interactively -- confirm.
h2o.shutdown()

D698 Final Project

Coffy Andrews-Guo, Vyanna Hill

2023-12-07