library(data.table)
# Load the training and test tables with data.table::fread(). String columns
# are read as factors so the categorical fields arrive with levels.
# TRUE is spelled out because T is an ordinary, reassignable variable.
# NOTE(review): the absolute paths are machine-specific; point them at your
# local copies of train.csv / test.csv before running.
train <- fread(
  "C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\train_oSwQCTC\\train.csv",
  stringsAsFactors = TRUE
)
test <- fread(
  "C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\test_HujdGe7\\test.csv",
  stringsAsFactors = TRUE
)
# Row and column counts of the training table.
dim(train)
## [1] 550068     12
# Row and column counts of the test table (one column fewer: no Purchase).
dim(test)
## [1] 233599     11
# Column types and leading values of the training table.
str(train)
## Classes 'data.table' and 'data.frame':   550068 obs. of  12 variables:
##  $ User_ID                   : int  1000001 1000001 1000001 1000001 1000002 1000003 1000004 1000004 1000004 1000005 ...
##  $ Product_ID                : Factor w/ 3631 levels "P00000142","P00000242",..: 673 2377 853 829 2735 1832 1746 3321 3605 2632 ...
##  $ Gender                    : Factor w/ 2 levels "F","M": 1 1 1 1 2 2 2 2 2 2 ...
##  $ Age                       : Factor w/ 7 levels "0-17","18-25",..: 1 1 1 1 7 3 5 5 5 3 ...
##  $ Occupation                : int  10 10 10 10 16 15 7 7 7 20 ...
##  $ City_Category             : Factor w/ 3 levels "A","B","C": 1 1 1 1 3 1 2 2 2 1 ...
##  $ Stay_In_Current_City_Years: Factor w/ 5 levels "0","1","2","3",..: 3 3 3 3 5 4 3 3 3 2 ...
##  $ Marital_Status            : int  0 0 0 0 0 0 1 1 1 1 ...
##  $ Product_Category_1        : int  3 1 12 12 8 1 1 1 1 8 ...
##  $ Product_Category_2        : int  NA 6 NA 14 NA 2 8 15 16 NA ...
##  $ Product_Category_3        : int  NA 14 NA NA NA NA 17 NA NA NA ...
##  $ Purchase                  : int  8370 15200 1422 1057 7969 15227 19215 15854 15686 7871 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Baseline submission: predict the overall mean training Purchase for every
# test row (the scalar mean is recycled across all rows of the data frame).
sub_mean <- data.frame(
  User_ID = test$User_ID,
  Product_ID = test$Product_ID,
  Purchase = mean(train$Purchase)
)
# FALSE is spelled out: F is a reassignable variable, not a constant.
write.csv(sub_mean, file = "first_sub.csv", row.names = FALSE)
# Per-column summary of the training table (quantiles, factor counts, NA counts).
summary(train)
##     User_ID            Product_ID     Gender        Age        
##  Min.   :1000001   P00265242:  1880   F:135809   0-17 : 15102  
##  1st Qu.:1001516   P00025442:  1615   M:414259   18-25: 99660  
##  Median :1003077   P00110742:  1612              26-35:219587  
##  Mean   :1003029   P00112142:  1562              36-45:110013  
##  3rd Qu.:1004478   P00057642:  1470              46-50: 45701  
##  Max.   :1006040   P00184942:  1440              51-55: 38501  
##                    (Other)  :540489              55+  : 21504  
##    Occupation     City_Category Stay_In_Current_City_Years
##  Min.   : 0.000   A:147720      0 : 74398                 
##  1st Qu.: 2.000   B:231173      1 :193821                 
##  Median : 7.000   C:171175      2 :101838                 
##  Mean   : 8.077                 3 : 95285                 
##  3rd Qu.:14.000                 4+: 84726                 
##  Max.   :20.000                                           
##                                                           
##  Marital_Status   Product_Category_1 Product_Category_2 Product_Category_3
##  Min.   :0.0000   Min.   : 1.000     Min.   : 2.00      Min.   : 3.0      
##  1st Qu.:0.0000   1st Qu.: 1.000     1st Qu.: 5.00      1st Qu.: 9.0      
##  Median :0.0000   Median : 5.000     Median : 9.00      Median :14.0      
##  Mean   :0.4097   Mean   : 5.404     Mean   : 9.84      Mean   :12.7      
##  3rd Qu.:1.0000   3rd Qu.: 8.000     3rd Qu.:15.00      3rd Qu.:16.0      
##  Max.   :1.0000   Max.   :20.000     Max.   :18.00      Max.   :18.0      
##                                      NA's   :173638     NA's   :383247    
##     Purchase    
##  Min.   :   12  
##  1st Qu.: 5823  
##  Median : 8047  
##  Mean   : 9264  
##  3rd Qu.:12054  
##  Max.   :23961  
## 
# Per-column summary of the test table, for comparison with the training one.
summary(test)
##     User_ID            Product_ID     Gender        Age       
##  Min.   :1000001   P00265242:   829   F: 57827   0-17 : 6232  
##  1st Qu.:1001527   P00112142:   717   M:175772   18-25:42293  
##  Median :1003070   P00025442:   695              26-35:93428  
##  Mean   :1003029   P00110742:   680              36-45:46711  
##  3rd Qu.:1004477   P00046742:   646              46-50:19577  
##  Max.   :1006040   P00184942:   626              51-55:16283  
##                    (Other)  :229406              55+  : 9075  
##    Occupation     City_Category Stay_In_Current_City_Years
##  Min.   : 0.000   A:62524       0 :31318                  
##  1st Qu.: 2.000   B:98566       1 :82604                  
##  Median : 7.000   C:72509       2 :43589                  
##  Mean   : 8.085                 3 :40143                  
##  3rd Qu.:14.000                 4+:35945                  
##  Max.   :20.000                                           
##                                                           
##  Marital_Status   Product_Category_1 Product_Category_2 Product_Category_3
##  Min.   :0.0000   Min.   : 1.000     Min.   : 2.00      Min.   : 3.00     
##  1st Qu.:0.0000   1st Qu.: 1.000     1st Qu.: 5.00      1st Qu.: 9.00     
##  Median :0.0000   Median : 5.000     Median : 9.00      Median :14.00     
##  Mean   :0.4101   Mean   : 5.277     Mean   : 9.85      Mean   :12.67     
##  3rd Qu.:1.0000   3rd Qu.: 8.000     3rd Qu.:15.00      3rd Qu.:16.00     
##  Max.   :1.0000   Max.   :18.000     Max.   :18.00      Max.   :18.00     
##                                      NA's   :72344      NA's   :162562

Combine the train and test datasets

# Give the test table a placeholder Purchase column (the training mean) so it
# has the same columns as train; `:=` adds the column to `test` by reference.
test[, Purchase := mean(train$Purchase)]

# Stack train on top of test; later steps rely on the training rows coming
# first. The list is passed inline instead of being stored in a variable
# called `c`, which would mask the name of base::c(). use.names = TRUE binds
# columns by name rather than by position.
combin <- rbindlist(list(train, test), use.names = TRUE)
# Share of rows per Gender across the combined (train + test) data.
combin[,prop.table(table(Gender))] 
## Gender
##         F         M 
## 0.2470896 0.7529104
# Share of rows per Age bucket.
combin[,prop.table(table(Age))]
## Age
##       0-17      18-25      26-35      36-45      46-50      51-55 
## 0.02722330 0.18113944 0.39942348 0.19998801 0.08329814 0.06990724 
##        55+ 
## 0.03902040
# Share of rows per City_Category.
combin[,prop.table(table(City_Category))]
## City_Category
##         A         B         C 
## 0.2682823 0.4207642 0.3109535
# Share of rows per Stay_In_Current_City_Years bucket.
combin[,prop.table(table(Stay_In_Current_City_Years))]
## Stay_In_Current_City_Years
##         0         1         2         3        4+ 
## 0.1348991 0.3527327 0.1855724 0.1728132 0.1539825
# Number of distinct products in the combined data.
length(unique(combin$Product_ID))
## [1] 3677
# Number of distinct users in the combined data.
length(unique(combin$User_ID))
## [1] 5891
# Missing-value count for every column.
colSums(is.na(combin))
##                    User_ID                 Product_ID 
##                          0                          0 
##                     Gender                        Age 
##                          0                          0 
##                 Occupation              City_Category 
##                          0                          0 
## Stay_In_Current_City_Years             Marital_Status 
##                          0                          0 
##         Product_Category_1         Product_Category_2 
##                          0                     245982 
##         Product_Category_3                   Purchase 
##                     545809                          0
library(ggplot2)
# Bar chart of row counts per Age bucket, with bars filled by Gender.
ggplot(data = combin, mapping = aes(x = Age, fill = Gender)) +
  geom_bar()

# Same chart, with bars filled by City_Category instead.
ggplot(data = combin, mapping = aes(x = Age, fill = City_Category)) +
  geom_bar()

library(gmodels)
# Cross-tabulate Occupation against City_Category; each cell reports the count,
# chi-square contribution and row/column/table proportions (see legend below).
CrossTable(combin$Occupation, combin$City_Category)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  783667 
## 
##  
##                   | combin$City_Category 
## combin$Occupation |         A |         B |         C | Row Total | 
## ------------------|-----------|-----------|-----------|-----------|
##                 0 |     26874 |     42455 |     29521 |     98850 | 
##                   |     4.733 |    17.884 |    48.165 |           | 
##                   |     0.272 |     0.429 |     0.299 |     0.126 | 
##                   |     0.128 |     0.129 |     0.121 |           | 
##                   |     0.034 |     0.054 |     0.038 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                 1 |     18200 |     28264 |     21223 |     67687 | 
##                   |     0.092 |     1.642 |     1.463 |           | 
##                   |     0.269 |     0.418 |     0.314 |     0.086 | 
##                   |     0.087 |     0.086 |     0.087 |           | 
##                   |     0.023 |     0.036 |     0.027 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                 2 |     13201 |     16276 |      8519 |     37996 | 
##                   |   887.231 |     5.211 |   919.471 |           | 
##                   |     0.347 |     0.428 |     0.224 |     0.048 | 
##                   |     0.063 |     0.049 |     0.035 |           | 
##                   |     0.017 |     0.021 |     0.011 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                 3 |      8040 |      9747 |      7339 |     25126 | 
##                   |   250.378 |    64.398 |    28.759 |           | 
##                   |     0.320 |     0.388 |     0.292 |     0.032 | 
##                   |     0.038 |     0.030 |     0.030 |           | 
##                   |     0.010 |     0.012 |     0.009 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                 4 |     34577 |     42524 |     25985 |    103086 | 
##                   |  1731.917 |    16.692 |  1149.411 |           | 
##                   |     0.335 |     0.413 |     0.252 |     0.132 | 
##                   |     0.164 |     0.129 |     0.107 |           | 
##                   |     0.044 |     0.054 |     0.033 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                 5 |      3380 |      9467 |      4526 |     17373 | 
##                   |   352.000 |   636.521 |   142.112 |           | 
##                   |     0.195 |     0.545 |     0.261 |     0.022 | 
##                   |     0.016 |     0.029 |     0.019 |           | 
##                   |     0.004 |     0.012 |     0.006 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                 6 |      5321 |     15656 |      8125 |     29102 | 
##                   |   791.918 |   950.127 |    94.422 |           | 
##                   |     0.183 |     0.538 |     0.279 |     0.037 | 
##                   |     0.025 |     0.047 |     0.033 |           | 
##                   |     0.007 |     0.020 |     0.010 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                 7 |     22956 |     32859 |     28312 |     84127 | 
##                   |     6.609 |   182.064 |   177.101 |           | 
##                   |     0.273 |     0.391 |     0.337 |     0.107 | 
##                   |     0.109 |     0.100 |     0.116 |           | 
##                   |     0.029 |     0.042 |     0.036 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                 8 |       134 |      1178 |       877 |      2189 | 
##                   |   349.845 |    71.681 |    56.624 |           | 
##                   |     0.061 |     0.538 |     0.401 |     0.003 | 
##                   |     0.001 |     0.004 |     0.004 |           | 
##                   |     0.000 |     0.002 |     0.001 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                 9 |       999 |      4574 |      3356 |      8929 | 
##                   |   814.109 |   177.664 |   120.949 |           | 
##                   |     0.112 |     0.512 |     0.376 |     0.011 | 
##                   |     0.005 |     0.014 |     0.014 |           | 
##                   |     0.001 |     0.006 |     0.004 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                10 |      3138 |      6039 |      9127 |     18304 | 
##                   |   639.886 |   358.943 |  2073.431 |           | 
##                   |     0.171 |     0.330 |     0.499 |     0.023 | 
##                   |     0.015 |     0.018 |     0.037 |           | 
##                   |     0.004 |     0.008 |     0.012 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                11 |      3537 |      8002 |      5054 |     16593 | 
##                   |   187.912 |   149.093 |     2.163 |           | 
##                   |     0.213 |     0.482 |     0.305 |     0.021 | 
##                   |     0.017 |     0.024 |     0.021 |           | 
##                   |     0.005 |     0.010 |     0.006 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                12 |     10057 |     18784 |     15607 |     44448 | 
##                   |   292.502 |     0.358 |   230.722 |           | 
##                   |     0.226 |     0.423 |     0.351 |     0.057 | 
##                   |     0.048 |     0.057 |     0.064 |           | 
##                   |     0.013 |     0.024 |     0.020 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                13 |       561 |      3466 |      7026 |     11053 | 
##                   |  1949.458 |   301.788 |  3747.820 |           | 
##                   |     0.051 |     0.314 |     0.636 |     0.014 | 
##                   |     0.003 |     0.011 |     0.029 |           | 
##                   |     0.001 |     0.004 |     0.009 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                14 |     10975 |     15971 |     11836 |     38782 | 
##                   |    31.279 |     7.382 |     4.138 |           | 
##                   |     0.283 |     0.412 |     0.305 |     0.049 | 
##                   |     0.052 |     0.048 |     0.049 |           | 
##                   |     0.014 |     0.020 |     0.015 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                15 |      4373 |      7479 |      5504 |     17356 | 
##                   |    17.238 |     4.252 |     2.125 |           | 
##                   |     0.252 |     0.431 |     0.317 |     0.022 | 
##                   |     0.021 |     0.023 |     0.023 |           | 
##                   |     0.006 |     0.010 |     0.007 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                16 |      8772 |     15444 |     11906 |     36122 | 
##                   |    87.130 |     3.954 |    40.412 |           | 
##                   |     0.243 |     0.428 |     0.330 |     0.046 | 
##                   |     0.042 |     0.047 |     0.049 |           | 
##                   |     0.011 |     0.020 |     0.015 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                17 |     11668 |     23204 |     22546 |     57418 | 
##                   |   906.208 |    37.785 |  1232.854 |           | 
##                   |     0.203 |     0.404 |     0.393 |     0.073 | 
##                   |     0.055 |     0.070 |     0.093 |           | 
##                   |     0.015 |     0.030 |     0.029 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                18 |      2246 |      3030 |      4091 |      9367 | 
##                   |    28.368 |   210.708 |   476.667 |           | 
##                   |     0.240 |     0.323 |     0.437 |     0.012 | 
##                   |     0.011 |     0.009 |     0.017 |           | 
##                   |     0.003 |     0.004 |     0.005 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                19 |      3165 |      4712 |      4042 |     11919 | 
##                   |     0.334 |    18.317 |    30.415 |           | 
##                   |     0.266 |     0.395 |     0.339 |     0.015 | 
##                   |     0.015 |     0.014 |     0.017 |           | 
##                   |     0.004 |     0.006 |     0.005 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##                20 |     18070 |     20608 |      9162 |     47840 | 
##                   |  2135.562 |    11.381 |  2194.806 |           | 
##                   |     0.378 |     0.431 |     0.192 |     0.061 | 
##                   |     0.086 |     0.062 |     0.038 |           | 
##                   |     0.023 |     0.026 |     0.012 |           | 
## ------------------|-----------|-----------|-----------|-----------|
##      Column Total |    210244 |    329739 |    243684 |    783667 | 
##                   |     0.268 |     0.421 |     0.311 |           | 
## ------------------|-----------|-----------|-----------|-----------|
## 
## 
# Flag rows whose Product_Category_2 / Product_Category_3 is missing.
# is.na() is already vectorised, so no per-element sapply() is needed; the
# flags are stored as numeric 1/0.
combin[, Product_Category_2_NA := as.numeric(is.na(Product_Category_2))]
combin[, Product_Category_3_NA := as.numeric(is.na(Product_Category_3))]

# Replace the missing categories with the sentinel -999. An integer sentinel
# keeps both columns integer; the string "-999" would coerce every value in
# the column to character and force a conversion back to integer later.
combin[is.na(Product_Category_2), Product_Category_2 := -999L]
combin[is.na(Product_Category_3), Product_Category_3 := -999L]
# Collapse the open-ended "4+" bucket to "4" (the column stays a factor).
levels(combin$Stay_In_Current_City_Years)[
  levels(combin$Stay_In_Current_City_Years) == "4+"
] <- "4"

# Map each age bucket to an ordinal code 0..6. The lookup is done by label:
# calling as.numeric() directly on a factor returns the internal level indices
# (1..7), not the relabelled values, so the intended 0-based codes were lost.
age_codes <- c(
  "0-17" = 0, "18-25" = 1, "26-35" = 2, "36-45" = 3,
  "46-50" = 4, "51-55" = 5, "55+" = 6
)
combin[, Age := unname(age_codes[as.character(Age)])]

# Encode Gender as 0/1 from its factor level order (F = 0, M = 1).
combin[, Gender := as.numeric(as.factor(Gender)) - 1]

User and product counts, and mean purchase per product and per user

# Number of rows per user and per product across train + test.
combin[, User_count := .N, by = User_ID]
combin[, Product_count := .N, by = Product_ID]
# combin<-combin[order(User_ID)]
# (Keep the reorder above disabled: the code below and the later train/test
# split both rely on the training rows being the first nrow(train) rows.)

# Mean Purchase per product and per user, computed from TRAINING rows only.
# The test rows carry a placeholder Purchase (the global training mean), so
# averaging over all rows would pull every group mean towards that value.
# `.I` holds the row numbers of the current group, which selects the
# training rows inside each group.
# NOTE(review): training rows still contribute their own target to these
# means; consider out-of-fold encoding if that in-sample leakage matters.
n_train <- nrow(train)
overall_mean <- mean(train$Purchase)
combin[,
  Mean_Purchase_Product := mean(Purchase[.I <= n_train]),
  by = Product_ID
]
combin[,
  Mean_Purchase_User := mean(Purchase[.I <= n_train]),
  by = User_ID
]

# Groups that appear only in the test rows have no training purchases (their
# mean is NaN); fall back to the overall training mean for them.
combin[is.na(Mean_Purchase_Product), Mean_Purchase_Product := overall_mean]
combin[is.na(Mean_Purchase_User), Mean_Purchase_User := overall_mean]
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
# One-hot encode City_Category; the class listing further down shows it
# replaced by City_Category_A / _B / _C integer columns.
combin <- dummy.data.frame(
  combin,
  names = "City_Category",
  sep = "_"
)

check classes of all variables

# List the class of every column to confirm the feature types before modelling.
sapply(combin,class)
##                    User_ID                 Product_ID 
##                  "integer"                   "factor" 
##                     Gender                        Age 
##                  "numeric"                  "numeric" 
##                 Occupation            City_Category_A 
##                  "integer"                  "integer" 
##            City_Category_B            City_Category_C 
##                  "integer"                  "integer" 
## Stay_In_Current_City_Years             Marital_Status 
##                   "factor"                  "integer" 
##         Product_Category_1         Product_Category_2 
##                  "integer"                "character" 
##         Product_Category_3                   Purchase 
##                "character"                  "numeric" 
##      Product_Category_2_NA      Product_Category_3_NA 
##                  "numeric"                  "numeric" 
##                 User_count              Product_count 
##                  "integer"                  "integer" 
##      Mean_Purchase_Product         Mean_Purchase_User 
##                  "numeric"                  "numeric"

Convert Product_Category_2 and Product_Category_3 to integer

# Cast the two secondary product-category columns to integer.
for (category_col in c("Product_Category_2", "Product_Category_3")) {
  combin[[category_col]] <- as.integer(combin[[category_col]])
}

Divide into train and test

# Split the combined table back into its parts: the first nrow(train) rows are
# the training data, the rest are the test data. A logical mask is used instead
# of 1:nrow(train) / -(1:nrow(train)), which misbehave when there are zero
# training rows (1:0 counts down, and a negated empty index selects nothing).
is_train <- seq_len(nrow(combin)) <= nrow(train)
c.train <- combin[is_train, ]
c.test <- combin[!is_train, ]

# Keep only training rows with Product_Category_1 <= 18; per the summaries
# above, the test data tops out at 18 while train goes up to 20.
c.train <- c.train[c.train$Product_Category_1 <= 18, ]
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:data.table':
## 
##     hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Connect to (or start) the local H2O cluster and keep the connection handle.
# nthreads = -1 asks H2O to use all available cores (8 of 8 in the output below).
localH2O<-h2o.init(nthreads=-1)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 minutes 1 seconds 
##     H2O cluster version:        3.14.0.3 
##     H2O cluster version age:    1 month and 28 days  
##     H2O cluster name:           H2O_started_from_R_r631758_zih486 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.45 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.2 (2017-09-28)
# NOTE(review): second h2o.init() call. The output below shows it reconnecting
# to the same already-running cluster, so it looks redundant -- confirm and drop.
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 minutes 2 seconds 
##     H2O cluster version:        3.14.0.3 
##     H2O cluster version age:    1 month and 28 days  
##     H2O cluster name:           H2O_started_from_R_r631758_zih486 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.45 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.2 (2017-09-28)
# Clear all objects from the H2O cluster so the run starts from a clean state.
h2o.removeAll()
## [1] 0

Transfer the data to the H2O cluster

# Upload the prepared training table to the H2O cluster.
train.h2o<-as.h2o(c.train)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Upload the prepared test table to the H2O cluster.
test.h2o<-as.h2o(c.test)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

check column index number

# Print the training frame's column names with their positions; the model
# calls below select columns by these positions.
colnames(train.h2o)
##  [1] "User_ID"                    "Product_ID"                
##  [3] "Gender"                     "Age"                       
##  [5] "Occupation"                 "City_Category_A"           
##  [7] "City_Category_B"            "City_Category_C"           
##  [9] "Stay_In_Current_City_Years" "Marital_Status"            
## [11] "Product_Category_1"         "Product_Category_2"        
## [13] "Product_Category_3"         "Purchase"                  
## [15] "Product_Category_2_NA"      "Product_Category_3_NA"     
## [17] "User_count"                 "Product_count"             
## [19] "Mean_Purchase_Product"      "Mean_Purchase_User"
# Print the test frame's column names to confirm they match the training frame.
colnames(test.h2o)
##  [1] "User_ID"                    "Product_ID"                
##  [3] "Gender"                     "Age"                       
##  [5] "Occupation"                 "City_Category_A"           
##  [7] "City_Category_B"            "City_Category_C"           
##  [9] "Stay_In_Current_City_Years" "Marital_Status"            
## [11] "Product_Category_1"         "Product_Category_2"        
## [13] "Product_Category_3"         "Purchase"                  
## [15] "Product_Category_2_NA"      "Product_Category_3_NA"     
## [17] "User_count"                 "Product_count"             
## [19] "Mean_Purchase_Product"      "Mean_Purchase_User"

dependent variable(Purchase)

# Column index of the response (Purchase) in the H2O frame. It is looked up by
# name so it stays correct if columns are added or reordered upstream; for the
# layout printed above this is 14.
y.dep <- match("Purchase", colnames(train.h2o))

independent variables

# Predictor column indices: every column except the two identifiers and the
# response. Excluding by name avoids hard-coded positions that silently go
# stale when features are added; for the layout printed above this yields
# c(3:13, 15:20).
frame_cols <- colnames(train.h2o)
x.indep <- setdiff(
  seq_along(frame_cols),
  match(c("User_ID", "Product_ID", "Purchase"), frame_cols)
)

regression

# Fit a Gaussian GLM (linear regression) of Purchase on the predictor columns.
regression.model <- h2o.glm(
  x = x.indep,
  y = y.dep,
  training_frame = train.h2o,
  family = "gaussian"
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |=================================================================| 100%
# Print the GLM's metrics; with no new data supplied they are reported on the
# training data (see the header of the output below).
h2o.performance(regression.model)
## H2ORegressionMetrics: glm
## ** Reported on training data. **
## 
## MSE:  16710563
## RMSE:  4087.856
## MAE:  3219.644
## RMSLE:  0.5782911
## Mean Residual Deviance :  16710563
## R^2 :  0.3261543
## Null Deviance :1.353804e+13
## Null D.o.F. :545914
## Residual Deviance :9.122547e+12
## Residual D.o.F. :545898
## AIC :10628689

make predictions

# Score the test frame with the GLM and copy the predictions into a local
# data frame.
predict.reg<-as.data.frame(h2o.predict(regression.model,test.h2o))
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Build the GLM submission: test identifiers paired with the predicted
# Purchase (rows are in the same order as the test table).
sub_reg <- data.frame(
  User_ID = test$User_ID,
  Product_ID = test$Product_ID,
  Purchase = predict.reg$predict
)
# FALSE is spelled out: F is a reassignable variable, not a constant.
write.csv(sub_reg, file = "sub_reg.csv", row.names = FALSE)

Random Forest

# Train a random forest on the same response/predictors and time the fit.
# The seed is fixed so repeated runs use the same random draws.
system.time(
  rforest.model <- h2o.randomForest(
    x = x.indep,
    y = y.dep,
    training_frame = train.h2o,
    ntrees = 1000,
    mtries = 3,
    max_depth = 4,
    seed = 1122
  )
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |==                                                               |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  30%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |============================                                     |  42%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |==============================                                   |  47%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |==================================                               |  53%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |====================================================             |  81%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |================================================================ |  99%
  |                                                                       
  |=================================================================| 100%
##    user  system elapsed 
##    0.58    0.04   49.83
# Show the random forest's regression metrics. No newdata is supplied; the
# output below is reported on out-of-bag training samples.
h2o.performance(rforest.model)
## H2ORegressionMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
## 
## MSE:  10414919
## RMSE:  3227.215
## MAE:  2486.118
## RMSLE:  0.5007453
## Mean Residual Deviance :  10414919

check variable importance

# Print the variable importance table of the random forest model.
h2o.varimp(rforest.model)

making predictions on test data

# Score the test frame with the random forest, pull the predictions back
# into a local data frame, and time the whole round trip.
system.time(
  predict.rforest <- as.data.frame(
    h2o.predict(rforest.model, test.h2o)
  )
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
##    user  system elapsed 
##    0.39    0.03   10.27

writing submission file

# Assemble the random forest submission: the test-set identifiers next to the
# model's predicted Purchase values, then save it as CSV without row names.
sub_rf <- data.frame(
  User_ID = test$User_ID,
  Product_ID = test$Product_ID,
  Purchase = predict.rforest$predict
)
# Spell out FALSE: the shorthand F is an ordinary variable that can be
# reassigned, which would silently change this argument.
write.csv(sub_rf, file = "sub_rf.csv", row.names = FALSE)

GBM

# Fit a gradient boosting model on the training frame (1000 trees, depth 4,
# learning rate 0.01, fixed seed) and report how long the fit takes.
system.time(
  gbm.model <- h2o.gbm(
    y = y.dep,
    x = x.indep,
    training_frame = train.h2o,
    ntrees = 1000,
    max_depth = 4,
    learn_rate = 0.01,
    seed = 1122
  )
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |===============                                                  |  22%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |=====================                                            |  33%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |==========================                                       |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |============================                                     |  44%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |==============================                                   |  45%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |=================================                                |  50%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |==================================                               |  53%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |=======================================                          |  61%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |=========================================                        |  64%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |=====================================================            |  82%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |======================================================           |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  88%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |===========================================================      |  90%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |=============================================================    |  93%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |===============================================================  |  96%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |================================================================ |  99%
  |                                                                       
  |=================================================================| 100%
##    user  system elapsed 
##    0.91    0.22  119.19
# Show the GBM's regression metrics. No newdata is supplied; the output below
# is reported on training data.
# NOTE(review): RMSLE prints as NaN below -- presumably some predictions are
# negative; confirm before relying on these predictions.
h2o.performance(gbm.model)
## H2ORegressionMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  6321280
## RMSE:  2514.216
## MAE:  1859.895
## RMSLE:  NaN
## Mean Residual Deviance :  6321280

Making predictions and writing the submission file

# Score the test frame with the GBM and bring the predictions into a local
# data frame.
predict.gbm <- as.data.frame(
  h2o.predict(gbm.model, test.h2o)
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
 # Assemble the GBM submission: the test-set identifiers next to the model's
 # predicted Purchase values, then save it as CSV without row names.
 sub_gbm <- data.frame(
   User_ID = test$User_ID,
   Product_ID = test$Product_ID,
   Purchase = predict.gbm$predict
 )
 # Spell out FALSE: the shorthand F is an ordinary variable that can be
 # reassigned, which would silently change this argument.
 write.csv(sub_gbm, file = "sub_gbm.csv", row.names = FALSE)

deep learning

# Candidate values for the deep learning grid search: activation function,
# number of epochs, and hidden-layer layouts (one numeric vector per layout).
hyper_params <- list(
  activation = c(
    "Rectifier", "Tanh", "Maxout",
    "RectifierWithDropout", "TanhWithDropout", "MaxoutWithDropout"
  ),
  epochs = c(60, 70, 80, 100),
  hidden = list(
    c(20, 20),
    c(50, 50),
    c(100, 100),
    c(30, 30, 30),
    c(25, 25, 25, 25)
  )
  # Further options, currently disabled:
  # input_dropout_ratio=c(0,0.05),
  # l1=seq(0,1e-4,1e-6),
  # l2=seq(0,1e-4,1e-6)
)

# Print the grid so the candidate values are visible in the output.
hyper_params
## $activation
## [1] "Rectifier"            "Tanh"                 "Maxout"              
## [4] "RectifierWithDropout" "TanhWithDropout"      "MaxoutWithDropout"   
## 
## $epochs
## [1]  60  70  80 100
## 
## $hidden
## $hidden[[1]]
## [1] 20 20
## 
## $hidden[[2]]
## [1] 50 50
## 
## $hidden[[3]]
## [1] 100 100
## 
## $hidden[[4]]
## [1] 30 30 30
## 
## $hidden[[5]]
## [1] 25 25 25 25
# Settings for a random discrete grid search: run-time and model-count caps,
# a fixed seed, and early-stopping rounds/tolerance.
search_criteria <- list(
  strategy = "RandomDiscrete",
  max_runtime_secs = 1000,
  max_models = 100,
  seed = 1122,
  stopping_rounds = 5,
  stopping_tolerance = 1e-2
)

# dl_random_grid <- h2o.grid(
#   algorithm="deeplearning",
#   grid_id = "dl_grid_random",
#   training_frame=train.h2o,
#     x= x.indep, 
#   y= y.dep,
#   
#   stopping_metric="RMSE",
#   # stopping_tolerance=1e-2,        ## stop when the stopping metric (RMSE) does not improve by >=1% for 2 scoring events
#   # stopping_rounds=2,
#   #  
#   # score_duty_cycle=0.025,         ## don't score more than 2.5% of the wall time
#   # max_w2=10,                      ## can help improve stability for Rectifier
#   hyper_params = hyper_params,
#   search_criteria = search_criteria
# )          
# 
# 
# grid<-h2o.getGrid("dl_grid_random",sort_by="MSE",decreasing=FALSE)
# grid
# 
# best_model<-h2o.getModel(grid@model_ids[[1]])
# best_model

#use automl;

# md<-h2o.automl(y = y.dep,x = x.indep,training_frame = train.h2o, stopping_metric="RMSE",leaderboard_frame=train.h2o)

# 
# system.time(
#              dlearning.model <- h2o.deeplearning(y = y.dep,
#              x = x.indep,
#              training_frame = train.h2o,
#              epoch = 60,
#               hidden = c(100,100),
#              #hidden=c(50,50),
#              activation = "Rectifier",
#              seed = 1122
#              ))
# h2o.performance(dlearning.model)
# h2o.r2(dlearning.model)
# h2o.performance(dlearning.model,newdata=test.h2o)
# h2o.performance(md@leader,newdata=test.h2o)
# predict.dl2 <- as.data.frame(h2o.predict(dlearning.model, test.h2o))
# predict.dl2 <- as.data.frame(h2o.predict(md@leaderboard[1,1], test.h2o))

Creating a data frame and writing the submission file

# sub_dlearning <- data.frame(User_ID = test$User_ID, Product_ID = test$Product_ID, Purchase = predict.dl2$predict)
# write.csv(sub_dlearning, file = "sub_dlearning_new.csv", row.names = F)
# write.csv(sub_dlearning, file = "sub_drf_new.csv", row.names = F)
# h2o.shutdown()