Prueba de Data Science:
http://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
Adjuntar la implementación del modelo, los resultados de precisión y una justificación de por qué eligió ese modelo y qué tan propenso es a overfitting.
library(data.table)
library(dplyr)
library(ggplot2)
library(funModeling)
library(corrplot)
library(h2o)
library(randomForest)
library(caret)
# Project root: the relative "Data/..." path below is resolved against it.
# NOTE(review): this is a machine-specific absolute path; starting R from the
# project root would make the script portable.
setwd("/Users/bernardo/Dropbox (Personal)/Documentos/Data Science/Test/Rappi")
# Decompress through the shell and parse the resulting stream.
# The command is passed explicitly via `cmd =` instead of letting fread() guess
# that its `input` string is a shell command, and `gzip -dc` is used because
# `gzcat` is not available on every platform.
df <- fread(cmd = "gzip -dc Data/covtype.data.gz")
head(df) # Quick view of the first rows
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17
## 1: 2596 51 3 258 0 510 221 232 148 6279 1 0 0 0 0 0 0
## 2: 2590 56 2 212 -6 390 220 235 151 6225 1 0 0 0 0 0 0
## 3: 2804 139 9 268 65 3180 234 238 135 6121 1 0 0 0 0 0 0
## 4: 2785 155 18 242 118 3090 238 238 122 6211 1 0 0 0 0 0 0
## 5: 2595 45 2 153 -1 391 220 234 150 6172 1 0 0 0 0 0 0
## 6: 2579 132 6 300 -15 67 230 237 140 6031 1 0 0 0 0 0 0
## V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34 V35
## 1: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52 V53
## 1: 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## V54 V55
## 1: 0 5
## 2: 0 5
## 3: 0 2
## 4: 0 2
## 5: 0 5
## 6: 0 2
dim(df) # Table dimensions: 581012 rows and 55 columns (see printed result below)
## [1] 581012 55
str(df) # Per-column structure; the listing below shows that every column is integer
## Classes 'data.table' and 'data.frame': 581012 obs. of 55 variables:
## $ V1 : int 2596 2590 2804 2785 2595 2579 2606 2605 2617 2612 ...
## $ V2 : int 51 56 139 155 45 132 45 49 45 59 ...
## $ V3 : int 3 2 9 18 2 6 7 4 9 10 ...
## $ V4 : int 258 212 268 242 153 300 270 234 240 247 ...
## $ V5 : int 0 -6 65 118 -1 -15 5 7 56 11 ...
## $ V6 : int 510 390 3180 3090 391 67 633 573 666 636 ...
## $ V7 : int 221 220 234 238 220 230 222 222 223 228 ...
## $ V8 : int 232 235 238 238 234 237 225 230 221 219 ...
## $ V9 : int 148 151 135 122 150 140 138 144 133 124 ...
## $ V10: int 6279 6225 6121 6211 6172 6031 6256 6228 6244 6230 ...
## $ V11: int 1 1 1 1 1 1 1 1 1 1 ...
## $ V12: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V13: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V14: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V15: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V16: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V17: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V18: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V19: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V20: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V21: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V22: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V23: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V24: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V25: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V26: int 0 0 1 0 0 0 0 0 0 0 ...
## $ V27: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V28: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V29: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V30: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V31: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V32: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V33: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V34: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V35: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V36: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V37: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V38: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V39: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V40: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V41: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V42: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V43: int 1 1 0 0 1 1 1 1 1 1 ...
## $ V44: int 0 0 0 1 0 0 0 0 0 0 ...
## $ V45: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V46: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V47: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V48: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V49: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V50: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V51: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V52: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V53: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V54: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V55: int 5 5 2 2 5 2 5 5 5 5 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Total number of missing cells in the whole table (0 => nothing to impute)
sum(is.na(df))
## [1] 0
# Column labels in file order, taken from the covtype.info description:
# 10 quantitative variables, 4 Wilderness_Area columns, 40 Soil_Type columns
# and finally the Cover_Type target.
names <- c(
  "Elevation", "Aspect", "Slope",
  "Horizontal_Distance_To_Hydrology",
  "Vertical_Distance_To_Hydrology",
  "Horizontal_Distance_To_Roadways",
  paste0("Hillshade_", c("9am", "Noon", "3pm")),
  "Horizontal_Distance_To_Fire_Points",
  paste0("Wilderness_Area_", 1:4),
  paste0("Soil_Type_", 1:40),
  "Cover_Type"
)
# Replace the default V1..V55 headers with the descriptive labels
colnames(df) <- names
# Per-column summary statistics (mean, spread, percentiles, skewness, kurtosis, ranges);
# the table is printed and also kept in `prof`
prof <- df %>% profiling_num(print_results = TRUE)
## variable mean std_dev variation_coef p_01
## 1 Elevation 3.0e+03 2.8e+02 0.095 2122
## 2 Aspect 1.6e+02 1.1e+02 0.719 1
## 3 Slope 1.4e+01 7.5e+00 0.531 2
## 4 Horizontal_Distance_To_Hydrology 2.7e+02 2.1e+02 0.789 0
## 5 Vertical_Distance_To_Hydrology 4.6e+01 5.8e+01 1.256 -42
## 6 Horizontal_Distance_To_Roadways 2.4e+03 1.6e+03 0.663 150
## 7 Hillshade_9am 2.1e+02 2.7e+01 0.126 127
## 8 Hillshade_Noon 2.2e+02 2.0e+01 0.089 162
## 9 Hillshade_3pm 1.4e+02 3.8e+01 0.269 41
## 10 Horizontal_Distance_To_Fire_Points 2.0e+03 1.3e+03 0.669 182
## 11 Wilderness_Area_1 4.5e-01 5.0e-01 1.108 0
## 12 Wilderness_Area_2 5.1e-02 2.2e-01 4.294 0
## 13 Wilderness_Area_3 4.4e-01 5.0e-01 1.137 0
## 14 Wilderness_Area_4 6.4e-02 2.4e-01 3.836 0
## 15 Soil_Type_1 5.2e-03 7.2e-02 13.809 0
## 16 Soil_Type_2 1.3e-02 1.1e-01 8.730 0
## 17 Soil_Type_3 8.3e-03 9.1e-02 10.930 0
## 18 Soil_Type_4 2.1e-02 1.4e-01 6.773 0
## 19 Soil_Type_5 2.7e-03 5.2e-02 19.048 0
## 20 Soil_Type_6 1.1e-02 1.1e-01 9.347 0
## 21 Soil_Type_7 1.8e-04 1.3e-02 74.380 0
## 22 Soil_Type_8 3.1e-04 1.8e-02 56.964 0
## 23 Soil_Type_9 2.0e-03 4.4e-02 22.484 0
## 24 Soil_Type_10 5.6e-02 2.3e-01 4.099 0
## 25 Soil_Type_11 2.1e-02 1.4e-01 6.769 0
## 26 Soil_Type_12 5.2e-02 2.2e-01 4.288 0
## 27 Soil_Type_13 3.0e-02 1.7e-01 5.686 0
## 28 Soil_Type_14 1.0e-03 3.2e-02 31.128 0
## 29 Soil_Type_15 5.2e-06 2.3e-03 440.080 0
## 30 Soil_Type_16 4.9e-03 7.0e-02 14.256 0
## 31 Soil_Type_17 5.9e-03 7.7e-02 12.992 0
## 32 Soil_Type_18 3.3e-03 5.7e-02 17.463 0
## 33 Soil_Type_19 6.9e-03 8.3e-02 11.979 0
## 34 Soil_Type_20 1.6e-02 1.3e-01 7.858 0
## 35 Soil_Type_21 1.4e-03 3.8e-02 26.312 0
## 36 Soil_Type_22 5.7e-02 2.3e-01 4.051 0
## 37 Soil_Type_23 9.9e-02 3.0e-01 3.010 0
## 38 Soil_Type_24 3.7e-02 1.9e-01 5.129 0
## 39 Soil_Type_25 8.2e-04 2.9e-02 34.997 0
## 40 Soil_Type_26 4.5e-03 6.7e-02 14.947 0
## 41 Soil_Type_27 1.9e-03 4.3e-02 23.108 0
## 42 Soil_Type_28 1.6e-03 4.0e-02 24.762 0
## 43 Soil_Type_29 2.0e-01 4.0e-01 2.010 0
## 44 Soil_Type_30 5.2e-02 2.2e-01 4.273 0
## 45 Soil_Type_31 4.4e-02 2.1e-01 4.652 0
## 46 Soil_Type_32 9.0e-02 2.9e-01 3.172 0
## 47 Soil_Type_33 7.8e-02 2.7e-01 3.445 0
## 48 Soil_Type_34 2.8e-03 5.3e-02 18.965 0
## 49 Soil_Type_35 3.3e-03 5.7e-02 17.500 0
## 50 Soil_Type_36 2.0e-04 1.4e-02 69.867 0
## 51 Soil_Type_37 5.1e-04 2.3e-02 44.144 0
## 52 Soil_Type_38 2.7e-02 1.6e-01 6.026 0
## 53 Soil_Type_39 2.4e-02 1.5e-01 6.410 0
## 54 Soil_Type_40 1.5e-02 1.2e-01 8.087 0
## 55 Cover_Type 2.1e+00 1.4e+00 0.681 1
## p_05 p_25 p_50 p_75 p_95 p_99 skewness kurtosis iqr range_98
## 1 2406 2809 2996 3163 3336 3438 -0.82 3.7e+00 354 [2122, 3438]
## 2 12 58 127 260 344 356 0.40 1.8e+00 202 [1, 356]
## 3 4 9 13 18 28 35 0.79 3.6e+00 9 [2, 35]
## 4 30 108 218 384 684 937 1.14 4.4e+00 276 [0, 937]
## 5 -8 7 30 69 165 251 1.79 8.3e+00 62 [-42, 251]
## 6 379 1106 1997 3328 5483 6112 0.71 2.6e+00 2222 [150, 6112]
## 7 160 198 218 231 246 252 -1.18 4.9e+00 33 [127, 252]
## 8 186 213 226 237 250 254 -1.06 5.1e+00 24 [162, 254]
## 9 78 119 143 168 204 226 -0.28 3.4e+00 49 [41, 226]
## 10 418 1024 1710 2550 4944 6262 1.29 4.6e+00 1526 [182, 6262]
## 11 0 0 0 1 1 1 0.21 1.0e+00 1 [0, 1]
## 12 0 0 0 0 1 1 4.06 1.7e+01 0 [0, 1]
## 13 0 0 0 1 1 1 0.26 1.1e+00 1 [0, 1]
## 14 0 0 0 0 1 1 3.58 1.4e+01 0 [0, 1]
## 15 0 0 0 0 0 0 13.74 1.9e+02 0 [0, 0]
## 16 0 0 0 0 0 1 8.62 7.5e+01 0 [0, 1]
## 17 0 0 0 0 0 0 10.84 1.2e+02 0 [0, 0]
## 18 0 0 0 0 0 1 6.63 4.5e+01 0 [0, 1]
## 19 0 0 0 0 0 0 19.00 3.6e+02 0 [0, 0]
## 20 0 0 0 0 0 1 9.24 8.6e+01 0 [0, 1]
## 21 0 0 0 0 0 0 74.37 5.5e+03 0 [0, 0]
## 22 0 0 0 0 0 0 56.95 3.2e+03 0 [0, 0]
## 23 0 0 0 0 0 0 22.44 5.0e+02 0 [0, 0]
## 24 0 0 0 0 1 1 3.86 1.6e+01 0 [0, 1]
## 25 0 0 0 0 0 1 6.62 4.5e+01 0 [0, 1]
## 26 0 0 0 0 1 1 4.05 1.7e+01 0 [0, 1]
## 27 0 0 0 0 0 1 5.51 3.1e+01 0 [0, 1]
## 28 0 0 0 0 0 0 31.10 9.7e+02 0 [0, 0]
## 29 0 0 0 0 0 0 440.08 1.9e+05 0 [0, 0]
## 30 0 0 0 0 0 0 14.19 2.0e+02 0 [0, 0]
## 31 0 0 0 0 0 0 12.91 1.7e+02 0 [0, 0]
## 32 0 0 0 0 0 0 17.41 3.0e+02 0 [0, 0]
## 33 0 0 0 0 0 0 11.90 1.4e+02 0 [0, 0]
## 34 0 0 0 0 0 1 7.73 6.1e+01 0 [0, 1]
## 35 0 0 0 0 0 0 26.27 6.9e+02 0 [0, 0]
## 36 0 0 0 0 1 1 3.80 1.5e+01 0 [0, 1]
## 37 0 0 0 0 1 1 2.68 8.2e+00 0 [0, 1]
## 38 0 0 0 0 0 1 4.93 2.5e+01 0 [0, 1]
## 39 0 0 0 0 0 0 34.97 1.2e+03 0 [0, 0]
## 40 0 0 0 0 0 0 14.88 2.2e+02 0 [0, 0]
## 41 0 0 0 0 0 0 23.07 5.3e+02 0 [0, 0]
## 42 0 0 0 0 0 0 24.72 6.1e+02 0 [0, 0]
## 43 0 0 0 0 1 1 1.51 3.3e+00 0 [0, 1]
## 44 0 0 0 0 1 1 4.04 1.7e+01 0 [0, 1]
## 45 0 0 0 0 0 1 4.44 2.1e+01 0 [0, 1]
## 46 0 0 0 0 1 1 2.86 9.2e+00 0 [0, 1]
## 47 0 0 0 0 1 1 3.15 1.1e+01 0 [0, 1]
## 48 0 0 0 0 0 0 18.91 3.6e+02 0 [0, 0]
## 49 0 0 0 0 0 0 17.44 3.1e+02 0 [0, 0]
## 50 0 0 0 0 0 0 69.85 4.9e+03 0 [0, 0]
## 51 0 0 0 0 0 0 44.12 1.9e+03 0 [0, 0]
## 52 0 0 0 0 0 1 5.86 3.5e+01 0 [0, 1]
## 53 0 0 0 0 0 1 6.25 4.0e+01 0 [0, 1]
## 54 0 0 0 0 0 1 7.96 6.4e+01 0 [0, 1]
## 55 1 1 2 2 6 7 2.28 7.9e+00 1 [1, 7]
## range_80
## 1 [2581, 3272]
## 2 [24, 329]
## 3 [5, 24]
## 4 [30, 564]
## 5 [0, 121]
## 6 [591, 4793]
## 7 [176, 241]
## 8 [198, 247]
## 9 [95, 191]
## 10 [595, 3747]
## 11 [0, 1]
## 12 [0, 0]
## 13 [0, 1]
## 14 [0, 0]
## 15 [0, 0]
## 16 [0, 0]
## 17 [0, 0]
## 18 [0, 0]
## 19 [0, 0]
## 20 [0, 0]
## 21 [0, 0]
## 22 [0, 0]
## 23 [0, 0]
## 24 [0, 0]
## 25 [0, 0]
## 26 [0, 0]
## 27 [0, 0]
## 28 [0, 0]
## 29 [0, 0]
## 30 [0, 0]
## 31 [0, 0]
## 32 [0, 0]
## 33 [0, 0]
## 34 [0, 0]
## 35 [0, 0]
## 36 [0, 0]
## 37 [0, 0]
## 38 [0, 0]
## 39 [0, 0]
## 40 [0, 0]
## 41 [0, 0]
## 42 [0, 0]
## 43 [0, 1]
## 44 [0, 0]
## 45 [0, 0]
## 46 [0, 0]
## 47 [0, 0]
## 48 [0, 0]
## 49 [0, 0]
## 50 [0, 0]
## 51 [0, 0]
## 52 [0, 0]
## 53 [0, 0]
## 54 [0, 0]
## 55 [1, 3]
# Histograms (20 bins) of the columns of the data itself, to eyeball outliers.
# The data frame is passed here, not `prof`: `prof` holds one row of summary
# statistics per variable, so plotting it would show the distribution of those
# statistics rather than of the observations.
# NOTE(review): re-check the earlier "no outliers" conclusion against these plots.
plot_num(df, bins = 20)
# Correlation matrix of the quantitative columns plus Cover_Type
# (all Wilderness_Area_* and Soil_Type_* columns are left out)
df_cor <- select(df, -matches("Wilderness_Area|Soil_Type"))
# NOTE(review): is.corr = FALSE is kept from the original; it presumably makes
# corrplot scale the colours to the observed range instead of [-1, 1] — confirm
# that this is the intended look.
corrplot(cor(df_cor),
  method = "color", type = "lower", number.cex = 0.7,
  addCoef.col = "black", tl.col = "red", diag = FALSE, is.corr = FALSE
)
# Scatter plots of the interesting / strongly correlated variable pairs,
# with points coloured by cover type

# Morning vs afternoon hillshade
ggplot(df, aes(x = Hillshade_9am, y = Hillshade_3pm,
               color = as.character(Cover_Type))) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Hillshade 3pm vs Hillshade 9am vs Cover Type",
       color = "Cover Type")

# Aspect vs afternoon hillshade
ggplot(df, aes(x = Aspect, y = Hillshade_3pm,
               color = as.character(Cover_Type))) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Hillshade 3pm vs Aspect vs Cover Type",
       color = "Cover Type")

# Elevation vs distance to the nearest roadway
ggplot(df, aes(x = Elevation, y = Horizontal_Distance_To_Roadways,
               color = as.character(Cover_Type))) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Horiz Distance to Roadways vs Elevation vs Cover Type",
       color = "Cover Type")

# Noon hillshade vs slope
ggplot(df, aes(x = Hillshade_Noon, y = Slope,
               color = as.character(Cover_Type))) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Slope vs Hillshade Noon vs Cover Type",
       color = "Cover Type")
# Elevation distribution per cover type, one box per class.
# The colour legend is suppressed because the x axis already names each class;
# "none" is used instead of FALSE, which newer ggplot2 releases deprecate.
ggplot(df) +
  geom_boxplot(
    aes(x = as.character(Cover_Type), y = Elevation,
        color = as.character(Cover_Type)),
    alpha = 0.7
  ) +
  guides(color = "none") +
  theme_minimal() +
  ggtitle("Elevation vs Cover Type") +
  xlab('Cover_Type')
# Class balance: row count and share of the full data set for each Cover_Type
distr <- df %>%
  group_by(Cover_Type) %>%
  tally() %>%
  mutate(p_original = prop.table(n))
print(distr)
## # A tibble: 7 x 3
## Cover_Type n p_original
## <int> <int> <dbl>
## 1 1 211840 0.3646
## 2 2 283301 0.4876
## 3 3 35754 0.0615
## 4 4 2747 0.0047
## 5 5 9493 0.0163
## 6 6 17367 0.0299
## 7 7 20510 0.0353
# Goal: predict Cover_Type
# The data is clean, so no rows need deleting and no values need imputing.
# The only preparation step is dropping Soil_Type_7 and Soil_Type_15, which are
# almost always zero (means of 1.8e-04 and 5.2e-06 in the profile above) and so
# carry too little information about Cover_Type.
df <- df %>% select(-Soil_Type_7, -Soil_Type_15)
# Work on a random subset to keep experiments and model fitting fast
set.seed(1234)
df_model <- sample_n(df, size = 50000) # 8.6% of the whole data
# Check that the sample keeps the class proportions of the full data set
df_model %>%
  group_by(Cover_Type) %>%
  tally() %>%
  mutate(p_sample = prop.table(n)) %>%
  left_join(select(distr, -n), by = "Cover_Type") %>%
  mutate(diff = abs(p_sample - p_original))
## # A tibble: 7 x 5
## Cover_Type n p_sample p_original diff
## <int> <int> <dbl> <dbl> <dbl>
## 1 1 18018 0.3604 0.3646 0.00425
## 2 2 24649 0.4930 0.4876 0.00538
## 3 3 3033 0.0607 0.0615 0.00088
## 4 4 205 0.0041 0.0047 0.00063
## 5 5 809 0.0162 0.0163 0.00016
## 6 6 1522 0.0304 0.0299 0.00055
## 7 7 1764 0.0353 0.0353 0.00002
# Optionally persist the sampled data set as CSV (disabled)
# fwrite(df_model, file="Data/covtype.data.clean.csv")
### h2o ###
# Quick first draft with h2o's AutoML (no manual train/test split needed beforehand).
# Start, or connect to, a local H2O cluster with up to 8 GB of memory;
# nthreads = -1 lets it use every available core (8 of 8 in the log below).
h2o.init(max_mem_size = "8G", nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 hours 15 minutes
## H2O cluster version: 3.16.0.4
## H2O cluster version age: 29 days
## H2O cluster name: H2O_started_from_R_bernardo_rze496
## H2O cluster total nodes: 1
## H2O cluster total memory: 6.83 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.2 (2017-09-28)
# importFiles [ "/Users/bernardo/Dropbox (Personal)/Documentos/Data Science/Test/Rappi/covtype.data.clean.csv" ]
# setupParse source_frames: [ "nfs://Users/bernardo/Dropbox (Personal)/Documentos/Data Science/Test/Rappi/covtype.data.clean.gz" ]
# parseFiles source_frames: ["nfs://Users/bernardo/Dropbox (Personal)/Documentos/Data Science/Test/Rappi/covtype.data.clean.gz"] destination_frame: "covtype_data_clean.hex" parse_type: "CSV" separator: 44 number_columns: 55 single_quotes: false column_names: ["Elevation","Aspect","Slope","Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology","Horizontal_Distance_To_Roadways","Hillshade_9am","Hillshade_Noon","Hillshade_3pm","Horizontal_Distance_To_Fire_Points","Wilderness_Area_1","Wilderness_Area_2","Wilderness_Area_3","Wilderness_Area_4","Soil_Type_1","Soil_Type_2","Soil_Type_3","Soil_Type_4","Soil_Type_5","Soil_Type_6","Soil_Type_7","Soil_Type_8","Soil_Type_9","Soil_Type_10","Soil_Type_11","Soil_Type_12","Soil_Type_13","Soil_Type_14","Soil_Type_15","Soil_Type_16","Soil_Type_17","Soil_Type_18","Soil_Type_19","Soil_Type_20","Soil_Type_21","Soil_Type_22","Soil_Type_23","Soil_Type_24","Soil_Type_25","Soil_Type_26","Soil_Type_27","Soil_Type_28","Soil_Type_29","Soil_Type_30","Soil_Type_31","Soil_Type_32","Soil_Type_33","Soil_Type_34","Soil_Type_35","Soil_Type_36","Soil_Type_37","Soil_Type_38","Soil_Type_39","Soil_Type_40","Cover_Type"] column_types: ["Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum"] delete_on_done: true check_header: 1 chunk_size: 2349568
# runAutoML {"training_frame":"covtype_data_clean.hex","response_column":"Cover_Type","seed":-1,"max_models":0,"max_runtime_secs":2000,"stopping_metric":"AUTO","stopping_rounds":3,"stopping_tolerance":-1,"nfolds":5,"ignored_columns":[]}
# Reload the best non-stacked model produced by the AutoML run (h2o v3.16.0.4);
# its summary and metrics are shown below
h2o.loadModel(path = "h2o/GBM_grid_0_AutoML_20180214_122009_model_3")
## Model Details:
## ==============
##
## H2OMultinomialModel: gbm
## Model ID: GBM_grid_0_AutoML_20180214_122009_model_3
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 103 721 1501509 8
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 10 9.99723 22 355 160.76976
##
##
## H2OMultinomialMetrics: gbm
## ** Reported on training data. **
##
## Training Set Metrics:
## =====================
##
## MSE: (Extract with `h2o.mse`) 0.0083
## RMSE: (Extract with `h2o.rmse`) 0.091
## Logloss: (Extract with `h2o.logloss`) 0.061
## Mean Per-Class Error: 0.00068
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 1 2 3 4 5 6 7 Error Rate
## 1 7126 27 0 0 0 0 0 0.0038 = 27 / 7.153
## 2 10 9903 0 0 0 0 0 0.0010 = 10 / 9.913
## 3 0 0 1218 0 0 0 0 0.0000 = 0 / 1.218
## 4 0 0 0 70 0 0 0 0.0000 = 0 / 70
## 5 0 0 0 0 329 0 0 0.0000 = 0 / 329
## 6 0 0 0 0 0 641 0 0.0000 = 0 / 641
## 7 0 0 0 0 0 0 690 0.0000 = 0 / 690
## Totals 7136 9930 1218 70 329 641 690 0.0018 = 37 / 20.014
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-7 Hit Ratios:
## k hit_ratio
## 1 1 0.998151
## 2 2 1.000000
## 3 3 1.000000
## 4 4 1.000000
## 5 5 1.000000
## 6 6 1.000000
## 7 7 1.000000
##
##
## H2OMultinomialMetrics: gbm
## ** Reported on validation data. **
##
## Validation Set Metrics:
## =====================
##
## MSE: (Extract with `h2o.mse`) 0.11
## RMSE: (Extract with `h2o.rmse`) 0.33
## Logloss: (Extract with `h2o.logloss`) 0.37
## Mean Per-Class Error: 0.27
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 1 2 3 4 5 6 7 Error Rate
## 1 1534 247 1 0 1 1 11 0.1454 = 261 / 1.795
## 2 188 2234 16 0 5 10 2 0.0900 = 221 / 2.455
## 3 1 27 252 0 1 16 0 0.1515 = 45 / 297
## 4 0 0 11 16 0 3 0 0.4667 = 14 / 30
## 5 1 39 1 0 37 0 0 0.5256 = 41 / 78
## 6 0 18 32 1 0 112 0 0.3129 = 51 / 163
## 7 34 3 0 0 0 0 131 0.2202 = 37 / 168
## Totals 1758 2568 313 17 44 142 144 0.1344 = 670 / 4.986
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-7 Hit Ratios:
## k hit_ratio
## 1 1 0.865624
## 2 2 0.984156
## 3 3 0.997393
## 4 4 0.998997
## 5 5 0.999599
## 6 6 1.000000
## 7 7 1.000000
##
##
## H2OMultinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## Cross-Validation Set Metrics:
## =====================
##
## MSE: (Extract with `h2o.mse`) 0.11
## RMSE: (Extract with `h2o.rmse`) 0.33
## Logloss: (Extract with `h2o.logloss`) 0.38
## Mean Per-Class Error: 0.27
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-7 Hit Ratios:
## k hit_ratio
## 1 1 0.855251
## 2 2 0.984261
## 3 3 0.996852
## 4 4 0.998501
## 5 5 0.999750
## 6 6 0.999950
## 7 7 1.000000
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid
## accuracy 0.8552511 0.0026090494 0.8531102 0.85336
## err 0.14474891 0.0026090494 0.14688984 0.14664002
## err_count 579.4 10.417294 588.0 587.0
## logloss 0.38243577 0.008464973 0.38749266 0.37589598
## max_per_class_error 0.57280993 0.048857447 0.49295774 0.7017544
## mean_per_class_accuracy 0.7324064 0.013980391 0.7557022 0.7089864
## mean_per_class_error 0.26759362 0.013980391 0.2442978 0.29101357
## mse 0.11000753 0.0020396472 0.11108888 0.11020539
## r2 0.9435416 0.0013084789 0.9417358 0.94636923
## rmse 0.3316455 0.0030666983 0.3333 0.33197197
## cv_3_valid cv_4_valid cv_5_valid
## accuracy 0.8593555 0.8598551 0.85057473
## err 0.14064452 0.14014488 0.14942528
## err_count 563.0 561.0 598.0
## logloss 0.38014996 0.36648914 0.40215117
## max_per_class_error 0.5625 0.5555556 0.55128205
## mean_per_class_accuracy 0.72775733 0.7144175 0.75516844
## mean_per_class_error 0.2722427 0.28558245 0.24483156
## mse 0.10725212 0.10676373 0.11472755
## r2 0.94508356 0.9425766 0.9419429
## rmse 0.3274937 0.32674718 0.33871454
# - The best algorithm for this dataset was Gradient Boosting (GBM), with 103 trees of depth 8-10
# - Elevation and the Horizontal_Distance_* variables are the most important features
# - We obtain 85.53% accuracy with 5-fold cross-validation, with a standard deviation of only 0.26% across folds
# If I had to choose a quick method before testing others, I'd guess that,
# to classify this data set, Random Forest would be the best option.
# For this task, we could use the xgboost or randomForest library...
### randomForest ###
# Convert Cover_Type to a factor so that randomForest fits a classifier
df_model$Cover_Type <- factor(df_model$Cover_Type)
# Randomly label each row 1 (development, ~80%) or 2 (validation, ~20%).
# The label vector is not called `sample`, to avoid masking base::sample().
split_id <- sample(2, nrow(df_model), replace = TRUE, prob = c(0.8, 0.2))
df_dev <- df_model[split_id == 1, ]
df_val <- df_model[split_id == 2, ]
# Fit a 300-tree random forest for Cover_Type on the development set.
# mtry follows the sqrt(p) rule of thumb, where p counts predictors only
# (hence the "- 1" for the response column) and is floored to a whole number:
# floor(sqrt(52)) = 7, the same value the forest reported before.
# importance = TRUE keeps the permutation importances used by varImpPlot();
# do.trace = 25 prints the OOB error every 25 trees.
rf <- randomForest(Cover_Type ~ .,
  data = df_dev,
  mtry = floor(sqrt(ncol(df_dev) - 1)),
  ntree = 300,
  importance = TRUE,
  do.trace = 25
)
## ntree OOB 1 2 3 4 5 6 7
## 25: 20.77% 20.63% 15.38% 13.16% 58.93% 81.45% 73.23% 32.71%
## 50: 19.78% 19.81% 14.10% 11.72% 54.17% 80.53% 74.21% 33.57%
## 75: 19.36% 19.82% 13.40% 11.48% 51.79% 83.62% 71.60% 32.71%
## 100: 19.22% 20.23% 12.90% 11.07% 51.19% 84.85% 69.98% 33.43%
## 125: 19.19% 20.23% 12.71% 10.94% 54.76% 85.47% 71.60% 33.21%
## 150: 19.02% 20.15% 12.53% 11.11% 52.38% 86.09% 70.22% 32.78%
## 175: 19.04% 20.30% 12.40% 10.86% 52.38% 85.78% 71.93% 32.64%
## 200: 18.99% 20.23% 12.36% 10.98% 51.19% 86.40% 71.60% 32.42%
## 225: 18.90% 20.17% 12.30% 10.98% 50.60% 85.94% 70.79% 32.28%
## 250: 18.93% 20.20% 12.33% 10.86% 50.60% 85.94% 71.36% 32.28%
## 275: 18.94% 20.18% 12.41% 10.86% 50.00% 85.47% 71.28% 31.85%
## 300: 18.99% 20.08% 12.55% 11.23% 48.21% 85.63% 71.52% 31.64%
print(rf) ## OOB accuracy: ~81% (100% minus the 19% OOB error estimate printed below)
##
## Call:
## randomForest(formula = Cover_Type ~ ., data = df_dev, mtry = sqrt(ncol(df_dev)), ntree = 300, importance = T, do.trace = 25)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 19%
## Confusion matrix:
## 1 2 3 4 5 6 7 class.error
## 1 11507 2794 4 0 1 8 84 0.20
## 2 2233 17231 209 1 7 18 5 0.13
## 3 0 222 2166 8 0 44 0 0.11
## 4 0 0 79 87 0 2 0 0.48
## 5 27 506 20 0 93 1 0 0.86
## 6 9 303 558 9 0 350 0 0.72
## 7 425 16 0 0 0 0 953 0.32
# Diagnostic plot of the fitted forest
plot(rf)
# Feature importance: go back to a single-panel layout, then draw the type 1 measure
par(mfrow = c(1, 1))
varImpPlot(rf, main = "Feature Importance", type = 1, pch = 20)
# Predict the cover type of every validation row with the fitted forest
df_val$predictedCoverType <- predict(rf, df_val)
# Confusion matrix and per-class statistics on the validation set.
# No `positive` argument is given: it only applies to two-class problems, and
# "yes" is not one of the Cover_Type levels (1-7).
confusionMatrix(
  data = df_val$predictedCoverType,
  reference = df_val$Cover_Type
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7
## 1 2861 480 0 0 6 0 107
## 2 736 4406 56 0 121 62 1
## 3 1 47 528 25 10 153 0
## 4 0 0 3 11 0 1 0
## 5 0 0 0 0 25 0 0
## 6 1 6 6 1 0 77 0
## 7 21 6 0 0 0 0 262
##
## Overall Statistics
##
## Accuracy : 0.815
## 95% CI : (0.808, 0.823)
## No Information Rate : 0.494
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.695
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.790 0.891 0.8904 0.29730 0.1543 0.26280
## Specificity 0.907 0.808 0.9750 0.99960 1.0000 0.99856
## Pos Pred Value 0.828 0.819 0.6911 0.73333 1.0000 0.84615
## Neg Pred Value 0.884 0.884 0.9930 0.99740 0.9863 0.97825
## Prevalence 0.361 0.494 0.0592 0.00369 0.0162 0.02924
## Detection Rate 0.286 0.440 0.0527 0.00110 0.0025 0.00768
## Detection Prevalence 0.345 0.537 0.0762 0.00150 0.0025 0.00908
## Balanced Accuracy 0.849 0.849 0.9327 0.64845 0.5772 0.63068
## Class: 7
## Sensitivity 0.7081
## Specificity 0.9972
## Pos Pred Value 0.9066
## Neg Pred Value 0.9889
## Prevalence 0.0369
## Detection Rate 0.0261
## Detection Prevalence 0.0288
## Balanced Accuracy 0.8527
## Accuracy: 81.5% on the validation set (a bit better than the 81% OOB estimate from the development set... interesting)
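# Some more notes:
# - To improve it, we could train and test the model with all (or more) of the original data
# - Note that h2o's autoML had a better performance than the raw R randomForest model
# because it auto-tunes the hyperparameters
# - I don't think there is much risk of overfitting for the random forest because of the few trees used,
# the huge amount of data we have, the small standard deviation on cross validation results, and the fact
# that its OOB accuracy (81%) and validation accuracy (81.5%) are almost identical. The h2o GBM fits its
# training data much more tightly (99.8% training vs 85.5% cross-validated accuracy), so it deserves a closer look.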