library(data.table)
train<-fread("C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\train_oSwQCTC\\train.csv", stringsAsFactors = T)
test<-fread("C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\test_HujdGe7\\test.csv", stringsAsFactors = T)
dim(train)
## [1] 550068 12
dim(test)
## [1] 233599 11
str(train)
## Classes 'data.table' and 'data.frame': 550068 obs. of 12 variables:
## $ User_ID : int 1000001 1000001 1000001 1000001 1000002 1000003 1000004 1000004 1000004 1000005 ...
## $ Product_ID : Factor w/ 3631 levels "P00000142","P00000242",..: 673 2377 853 829 2735 1832 1746 3321 3605 2632 ...
## $ Gender : Factor w/ 2 levels "F","M": 1 1 1 1 2 2 2 2 2 2 ...
## $ Age : Factor w/ 7 levels "0-17","18-25",..: 1 1 1 1 7 3 5 5 5 3 ...
## $ Occupation : int 10 10 10 10 16 15 7 7 7 20 ...
## $ City_Category : Factor w/ 3 levels "A","B","C": 1 1 1 1 3 1 2 2 2 1 ...
## $ Stay_In_Current_City_Years: Factor w/ 5 levels "0","1","2","3",..: 3 3 3 3 5 4 3 3 3 2 ...
## $ Marital_Status : int 0 0 0 0 0 0 1 1 1 1 ...
## $ Product_Category_1 : int 3 1 12 12 8 1 1 1 1 8 ...
## $ Product_Category_2 : int NA 6 NA 14 NA 2 8 15 16 NA ...
## $ Product_Category_3 : int NA 14 NA NA NA NA 17 NA NA NA ...
## $ Purchase : int 8370 15200 1422 1057 7969 15227 19215 15854 15686 7871 ...
## - attr(*, ".internal.selfref")=<externalptr>
sub_mean<-data.frame(User_ID=test$User_ID,Product_ID=test$Product_ID,Purchase=mean(train$Purchase))
write.csv(sub_mean, file="first_sub.csv",row.names=F)
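A quick sanity check (not in the original script): the training-set RMSE of this constant prediction is the baseline any model below should beat.
sqrt(mean((train$Purchase - mean(train$Purchase))^2))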
summary(train)
## User_ID Product_ID Gender Age
## Min. :1000001 P00265242: 1880 F:135809 0-17 : 15102
## 1st Qu.:1001516 P00025442: 1615 M:414259 18-25: 99660
## Median :1003077 P00110742: 1612 26-35:219587
## Mean :1003029 P00112142: 1562 36-45:110013
## 3rd Qu.:1004478 P00057642: 1470 46-50: 45701
## Max. :1006040 P00184942: 1440 51-55: 38501
## (Other) :540489 55+ : 21504
## Occupation City_Category Stay_In_Current_City_Years
## Min. : 0.000 A:147720 0 : 74398
## 1st Qu.: 2.000 B:231173 1 :193821
## Median : 7.000 C:171175 2 :101838
## Mean : 8.077 3 : 95285
## 3rd Qu.:14.000 4+: 84726
## Max. :20.000
##
## Marital_Status Product_Category_1 Product_Category_2 Product_Category_3
## Min. :0.0000 Min. : 1.000 Min. : 2.00 Min. : 3.0
## 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.: 5.00 1st Qu.: 9.0
## Median :0.0000 Median : 5.000 Median : 9.00 Median :14.0
## Mean :0.4097 Mean : 5.404 Mean : 9.84 Mean :12.7
## 3rd Qu.:1.0000 3rd Qu.: 8.000 3rd Qu.:15.00 3rd Qu.:16.0
## Max. :1.0000 Max. :20.000 Max. :18.00 Max. :18.0
## NA's :173638 NA's :383247
## Purchase
## Min. : 12
## 1st Qu.: 5823
## Median : 8047
## Mean : 9264
## 3rd Qu.:12054
## Max. :23961
##
summary(test)
## User_ID Product_ID Gender Age
## Min. :1000001 P00265242: 829 F: 57827 0-17 : 6232
## 1st Qu.:1001527 P00112142: 717 M:175772 18-25:42293
## Median :1003070 P00025442: 695 26-35:93428
## Mean :1003029 P00110742: 680 36-45:46711
## 3rd Qu.:1004477 P00046742: 646 46-50:19577
## Max. :1006040 P00184942: 626 51-55:16283
## (Other) :229406 55+ : 9075
## Occupation City_Category Stay_In_Current_City_Years
## Min. : 0.000 A:62524 0 :31318
## 1st Qu.: 2.000 B:98566 1 :82604
## Median : 7.000 C:72509 2 :43589
## Mean : 8.085 3 :40143
## 3rd Qu.:14.000 4+:35945
## Max. :20.000
##
## Marital_Status Product_Category_1 Product_Category_2 Product_Category_3
## Min. :0.0000 Min. : 1.000 Min. : 2.00 Min. : 3.00
## 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.: 5.00 1st Qu.: 9.00
## Median :0.0000 Median : 5.000 Median : 9.00 Median :14.00
## Mean :0.4101 Mean : 5.277 Mean : 9.85 Mean :12.67
## 3rd Qu.:1.0000 3rd Qu.: 8.000 3rd Qu.:15.00 3rd Qu.:16.00
## Max. :1.0000 Max. :18.000 Max. :18.00 Max. :18.00
## NA's :72344 NA's :162562
Combine the train and test sets. The test set has no Purchase column, so fill it with the mean of the training Purchase before stacking.
test[,Purchase:=mean(train$Purchase)]
combin<-rbindlist(list(train, test))
combin[,prop.table(table(Gender))]
## Gender
## F M
## 0.2470896 0.7529104
combin[,prop.table(table(Age))]
## Age
## 0-17 18-25 26-35 36-45 46-50 51-55
## 0.02722330 0.18113944 0.39942348 0.19998801 0.08329814 0.06990724
## 55+
## 0.03902040
combin[,prop.table(table(City_Category))]
## City_Category
## A B C
## 0.2682823 0.4207642 0.3109535
combin[,prop.table(table(Stay_In_Current_City_Years))]
## Stay_In_Current_City_Years
## 0 1 2 3 4+
## 0.1348991 0.3527327 0.1855724 0.1728132 0.1539825
length(unique(combin$Product_ID))
## [1] 3677
length(unique(combin$User_ID))
## [1] 5891
colSums(is.na(combin))
## User_ID Product_ID
## 0 0
## Gender Age
## 0 0
## Occupation City_Category
## 0 0
## Stay_In_Current_City_Years Marital_Status
## 0 0
## Product_Category_1 Product_Category_2
## 0 245982
## Product_Category_3 Purchase
## 545809 0
library(ggplot2)
ggplot(combin, aes(Age, fill = Gender)) + geom_bar()

ggplot(combin, aes(Age, fill = City_Category)) + geom_bar()

library(gmodels)
CrossTable(combin$Occupation, combin$City_Category)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 783667
##
##
## | combin$City_Category
## combin$Occupation | A | B | C | Row Total |
## ------------------|-----------|-----------|-----------|-----------|
## 0 | 26874 | 42455 | 29521 | 98850 |
## | 4.733 | 17.884 | 48.165 | |
## | 0.272 | 0.429 | 0.299 | 0.126 |
## | 0.128 | 0.129 | 0.121 | |
## | 0.034 | 0.054 | 0.038 | |
## ------------------|-----------|-----------|-----------|-----------|
## 1 | 18200 | 28264 | 21223 | 67687 |
## | 0.092 | 1.642 | 1.463 | |
## | 0.269 | 0.418 | 0.314 | 0.086 |
## | 0.087 | 0.086 | 0.087 | |
## | 0.023 | 0.036 | 0.027 | |
## ------------------|-----------|-----------|-----------|-----------|
## 2 | 13201 | 16276 | 8519 | 37996 |
## | 887.231 | 5.211 | 919.471 | |
## | 0.347 | 0.428 | 0.224 | 0.048 |
## | 0.063 | 0.049 | 0.035 | |
## | 0.017 | 0.021 | 0.011 | |
## ------------------|-----------|-----------|-----------|-----------|
## 3 | 8040 | 9747 | 7339 | 25126 |
## | 250.378 | 64.398 | 28.759 | |
## | 0.320 | 0.388 | 0.292 | 0.032 |
## | 0.038 | 0.030 | 0.030 | |
## | 0.010 | 0.012 | 0.009 | |
## ------------------|-----------|-----------|-----------|-----------|
## 4 | 34577 | 42524 | 25985 | 103086 |
## | 1731.917 | 16.692 | 1149.411 | |
## | 0.335 | 0.413 | 0.252 | 0.132 |
## | 0.164 | 0.129 | 0.107 | |
## | 0.044 | 0.054 | 0.033 | |
## ------------------|-----------|-----------|-----------|-----------|
## 5 | 3380 | 9467 | 4526 | 17373 |
## | 352.000 | 636.521 | 142.112 | |
## | 0.195 | 0.545 | 0.261 | 0.022 |
## | 0.016 | 0.029 | 0.019 | |
## | 0.004 | 0.012 | 0.006 | |
## ------------------|-----------|-----------|-----------|-----------|
## 6 | 5321 | 15656 | 8125 | 29102 |
## | 791.918 | 950.127 | 94.422 | |
## | 0.183 | 0.538 | 0.279 | 0.037 |
## | 0.025 | 0.047 | 0.033 | |
## | 0.007 | 0.020 | 0.010 | |
## ------------------|-----------|-----------|-----------|-----------|
## 7 | 22956 | 32859 | 28312 | 84127 |
## | 6.609 | 182.064 | 177.101 | |
## | 0.273 | 0.391 | 0.337 | 0.107 |
## | 0.109 | 0.100 | 0.116 | |
## | 0.029 | 0.042 | 0.036 | |
## ------------------|-----------|-----------|-----------|-----------|
## 8 | 134 | 1178 | 877 | 2189 |
## | 349.845 | 71.681 | 56.624 | |
## | 0.061 | 0.538 | 0.401 | 0.003 |
## | 0.001 | 0.004 | 0.004 | |
## | 0.000 | 0.002 | 0.001 | |
## ------------------|-----------|-----------|-----------|-----------|
## 9 | 999 | 4574 | 3356 | 8929 |
## | 814.109 | 177.664 | 120.949 | |
## | 0.112 | 0.512 | 0.376 | 0.011 |
## | 0.005 | 0.014 | 0.014 | |
## | 0.001 | 0.006 | 0.004 | |
## ------------------|-----------|-----------|-----------|-----------|
## 10 | 3138 | 6039 | 9127 | 18304 |
## | 639.886 | 358.943 | 2073.431 | |
## | 0.171 | 0.330 | 0.499 | 0.023 |
## | 0.015 | 0.018 | 0.037 | |
## | 0.004 | 0.008 | 0.012 | |
## ------------------|-----------|-----------|-----------|-----------|
## 11 | 3537 | 8002 | 5054 | 16593 |
## | 187.912 | 149.093 | 2.163 | |
## | 0.213 | 0.482 | 0.305 | 0.021 |
## | 0.017 | 0.024 | 0.021 | |
## | 0.005 | 0.010 | 0.006 | |
## ------------------|-----------|-----------|-----------|-----------|
## 12 | 10057 | 18784 | 15607 | 44448 |
## | 292.502 | 0.358 | 230.722 | |
## | 0.226 | 0.423 | 0.351 | 0.057 |
## | 0.048 | 0.057 | 0.064 | |
## | 0.013 | 0.024 | 0.020 | |
## ------------------|-----------|-----------|-----------|-----------|
## 13 | 561 | 3466 | 7026 | 11053 |
## | 1949.458 | 301.788 | 3747.820 | |
## | 0.051 | 0.314 | 0.636 | 0.014 |
## | 0.003 | 0.011 | 0.029 | |
## | 0.001 | 0.004 | 0.009 | |
## ------------------|-----------|-----------|-----------|-----------|
## 14 | 10975 | 15971 | 11836 | 38782 |
## | 31.279 | 7.382 | 4.138 | |
## | 0.283 | 0.412 | 0.305 | 0.049 |
## | 0.052 | 0.048 | 0.049 | |
## | 0.014 | 0.020 | 0.015 | |
## ------------------|-----------|-----------|-----------|-----------|
## 15 | 4373 | 7479 | 5504 | 17356 |
## | 17.238 | 4.252 | 2.125 | |
## | 0.252 | 0.431 | 0.317 | 0.022 |
## | 0.021 | 0.023 | 0.023 | |
## | 0.006 | 0.010 | 0.007 | |
## ------------------|-----------|-----------|-----------|-----------|
## 16 | 8772 | 15444 | 11906 | 36122 |
## | 87.130 | 3.954 | 40.412 | |
## | 0.243 | 0.428 | 0.330 | 0.046 |
## | 0.042 | 0.047 | 0.049 | |
## | 0.011 | 0.020 | 0.015 | |
## ------------------|-----------|-----------|-----------|-----------|
## 17 | 11668 | 23204 | 22546 | 57418 |
## | 906.208 | 37.785 | 1232.854 | |
## | 0.203 | 0.404 | 0.393 | 0.073 |
## | 0.055 | 0.070 | 0.093 | |
## | 0.015 | 0.030 | 0.029 | |
## ------------------|-----------|-----------|-----------|-----------|
## 18 | 2246 | 3030 | 4091 | 9367 |
## | 28.368 | 210.708 | 476.667 | |
## | 0.240 | 0.323 | 0.437 | 0.012 |
## | 0.011 | 0.009 | 0.017 | |
## | 0.003 | 0.004 | 0.005 | |
## ------------------|-----------|-----------|-----------|-----------|
## 19 | 3165 | 4712 | 4042 | 11919 |
## | 0.334 | 18.317 | 30.415 | |
## | 0.266 | 0.395 | 0.339 | 0.015 |
## | 0.015 | 0.014 | 0.017 | |
## | 0.004 | 0.006 | 0.005 | |
## ------------------|-----------|-----------|-----------|-----------|
## 20 | 18070 | 20608 | 9162 | 47840 |
## | 2135.562 | 11.381 | 2194.806 | |
## | 0.378 | 0.431 | 0.192 | 0.061 |
## | 0.086 | 0.062 | 0.038 | |
## | 0.023 | 0.026 | 0.012 | |
## ------------------|-----------|-----------|-----------|-----------|
## Column Total | 210244 | 329739 | 243684 | 783667 |
## | 0.268 | 0.421 | 0.311 | |
## ------------------|-----------|-----------|-----------|-----------|
##
##
# Flag the missing values, then replace NA with the sentinel -999 (note that
# ifelse() coerces both product-category columns to character here)
combin[,Product_Category_2_NA := ifelse(is.na(Product_Category_2), 1, 0)]
combin[,Product_Category_3_NA := ifelse(is.na(Product_Category_3), 1, 0)]
combin[,Product_Category_2 := ifelse(is.na(Product_Category_2), "-999", Product_Category_2)]
combin[,Product_Category_3 := ifelse(is.na(Product_Category_3), "-999", Product_Category_3)]
levels(combin$Stay_In_Current_City_Years)[levels(combin$Stay_In_Current_City_Years) == "4+"] <- "4"
levels(combin$Age)[levels(combin$Age) == "0-17"] <- 0
levels(combin$Age)[levels(combin$Age) == "18-25"] <- 1
levels(combin$Age)[levels(combin$Age) == "26-35"] <- 2
levels(combin$Age)[levels(combin$Age) == "36-45"] <- 3
levels(combin$Age)[levels(combin$Age) == "46-50"] <- 4
levels(combin$Age)[levels(combin$Age) == "51-55"] <- 5
levels(combin$Age)[levels(combin$Age) == "55+"] <- 6
#convert Age to numeric; as.numeric() on a factor returns the level index (1-7 here)
#rather than the relabelled 0-6 values, which is harmless for an ordinal feature
combin$Age <- as.numeric(combin$Age)
#convert Gender into numeric (F = 0, M = 1)
combin[, Gender := as.numeric(as.factor(Gender)) - 1]
Add user- and product-level features: purchase counts per user and per product, and the mean Purchase per product and per user.
combin[,User_count:=.N, by=User_ID]
combin[,Product_count:=.N, by =Product_ID]
# combin<-combin[order(User_ID)]
combin[,Mean_Purchase_Product:=mean(Purchase), by=Product_ID]
combin[, Mean_Purchase_User := mean(Purchase), by = User_ID]
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
combin <- dummy.data.frame(combin, names = c("City_Category"), sep = "_")
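The dummies package has since been archived on CRAN; if it is unavailable, a rough equivalent with base model.matrix (a sketch, run in place of the dummy.data.frame call above while combin is still a data.table) is:
# One-hot encode City_Category without the dummies package
cc <- model.matrix(~ City_Category - 1, data = combin)
colnames(cc) <- sub("City_Category", "City_Category_", colnames(cc))
combin <- cbind(combin[, setdiff(names(combin), "City_Category"), with = FALSE], cc)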
Check the classes of all variables.
sapply(combin,class)
## User_ID Product_ID
## "integer" "factor"
## Gender Age
## "numeric" "numeric"
## Occupation City_Category_A
## "integer" "integer"
## City_Category_B City_Category_C
## "integer" "integer"
## Stay_In_Current_City_Years Marital_Status
## "factor" "integer"
## Product_Category_1 Product_Category_2
## "integer" "character"
## Product_Category_3 Purchase
## "character" "numeric"
## Product_Category_2_NA Product_Category_3_NA
## "numeric" "numeric"
## User_count Product_count
## "integer" "integer"
## Mean_Purchase_Product Mean_Purchase_User
## "numeric" "numeric"
Convert Product_Category_2 and Product_Category_3 back to integers (the -999 imputation above turned them into character columns).
combin$Product_Category_2 <- as.integer(combin$Product_Category_2)
combin$Product_Category_3 <- as.integer(combin$Product_Category_3)
Divide the combined data back into train and test sets.
c.train<-combin[1:nrow(train),]
c.test<-combin[-(1:nrow(train)),]
# product categories 19 and 20 do not occur in the test set (its maximum is 18), so drop those rows
c.train <- c.train[c.train$Product_Category_1 <= 18,]
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:data.table':
##
## hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
localH2O<-h2o.init(nthreads=-1) # nthreads = -1 uses all available cores
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 minutes 1 seconds
## H2O cluster version: 3.14.0.3
## H2O cluster version age: 1 month and 28 days
## H2O cluster name: H2O_started_from_R_r631758_zih486
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.45 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.2 (2017-09-28)
h2o.init() # calling h2o.init() again just reconnects to the running cluster and reprints its status
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 minutes 2 seconds
## H2O cluster version: 3.14.0.3
## H2O cluster version age: 1 month and 28 days
## H2O cluster name: H2O_started_from_R_r631758_zih486
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.45 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.2 (2017-09-28)
h2o.removeAll()
## [1] 0
Send the data to the H2O cluster.
train.h2o<-as.h2o(c.train)
test.h2o<-as.h2o(c.test)
Check the column names and their index numbers.
colnames(train.h2o)
## [1] "User_ID" "Product_ID"
## [3] "Gender" "Age"
## [5] "Occupation" "City_Category_A"
## [7] "City_Category_B" "City_Category_C"
## [9] "Stay_In_Current_City_Years" "Marital_Status"
## [11] "Product_Category_1" "Product_Category_2"
## [13] "Product_Category_3" "Purchase"
## [15] "Product_Category_2_NA" "Product_Category_3_NA"
## [17] "User_count" "Product_count"
## [19] "Mean_Purchase_Product" "Mean_Purchase_User"
colnames(test.h2o)
## [1] "User_ID" "Product_ID"
## [3] "Gender" "Age"
## [5] "Occupation" "City_Category_A"
## [7] "City_Category_B" "City_Category_C"
## [9] "Stay_In_Current_City_Years" "Marital_Status"
## [11] "Product_Category_1" "Product_Category_2"
## [13] "Product_Category_3" "Purchase"
## [15] "Product_Category_2_NA" "Product_Category_3_NA"
## [17] "User_count" "Product_count"
## [19] "Mean_Purchase_Product" "Mean_Purchase_User"
The dependent variable, Purchase, is column 14:
y.dep<-14
The independent variables are everything except the ID columns (User_ID, Product_ID) and the target:
x.indep<-c(3:13,15:20)
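A name-based alternative (an assumption, not used in this run) avoids hard-coded positions; the h2o learners accept column names as well as indices for x and y:
x.names <- setdiff(colnames(train.h2o), c("User_ID", "Product_ID", "Purchase"))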
Fit a linear regression (Gaussian GLM):
regression.model<-h2o.glm(y=y.dep,x=x.indep, training_frame=train.h2o, family="gaussian")
h2o.performance(regression.model)
## H2ORegressionMetrics: glm
## ** Reported on training data. **
##
## MSE: 16710563
## RMSE: 4087.856
## MAE: 3219.644
## RMSLE: 0.5782911
## Mean Residual Deviance : 16710563
## R^2 : 0.3261543
## Null Deviance :1.353804e+13
## Null D.o.F. :545914
## Residual Deviance :9.122547e+12
## Residual D.o.F. :545898
## AIC :10628689
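All of the metrics above are training-set metrics. A held-out estimate can be obtained by splitting train.h2o first (a sketch, not part of the original run):
splits <- h2o.splitFrame(train.h2o, ratios = 0.8, seed = 1122)
glm.val <- h2o.glm(y = y.dep, x = x.indep, training_frame = splits[[1]], validation_frame = splits[[2]], family = "gaussian")
h2o.performance(glm.val, valid = TRUE)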
Make predictions on the test set and write the submission file.
predict.reg<-as.data.frame(h2o.predict(regression.model,test.h2o))
sub_reg<-data.frame(User_ID=test$User_ID, Product_ID=test$Product_ID,Purchase=predict.reg$predict)
write.csv(sub_reg,file="sub_reg.csv", row.names=F)
Random forest (1,000 trees, mtries = 3, max depth 4):
system.time(rforest.model<-h2o.randomForest(y=y.dep, x=x.indep, training_frame=train.h2o,ntrees=1000,mtries=3,max_depth=4,seed=1122))
## user system elapsed
## 0.58 0.04 49.83
h2o.performance(rforest.model)
## H2ORegressionMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
##
## MSE: 10414919
## RMSE: 3227.215
## MAE: 2486.118
## RMSLE: 0.5007453
## Mean Residual Deviance : 10414919
Check variable importance.
h2o.varimp(rforest.model)
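The importance table itself was not captured here; h2o.varimp_plot() from the same package draws it as a bar chart:
h2o.varimp_plot(rforest.model)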
Make predictions on the test data.
system.time(predict.rforest <- as.data.frame(h2o.predict(rforest.model, test.h2o)))
## user system elapsed
## 0.39 0.03 10.27
Write the submission file.
sub_rf <- data.frame(User_ID = test$User_ID, Product_ID = test$Product_ID, Purchase = predict.rforest$predict)
write.csv(sub_rf, file = "sub_rf.csv", row.names = F)
Gradient boosting (GBM: 1,000 trees, max depth 4, learning rate 0.01):
system.time(gbm.model<-h2o.gbm(y=y.dep,x=x.indep, training_frame=train.h2o,ntrees=1000, max_depth=4, learn_rate=0.01,seed=1122))
## user system elapsed
## 0.91 0.22 119.19
h2o.performance(gbm.model)
## H2ORegressionMetrics: gbm
## ** Reported on training data. **
##
## MSE: 6321280
## RMSE: 2514.216
## MAE: 1859.895
## RMSLE: NaN
## Mean Residual Deviance : 6321280
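An RMSLE of NaN usually means some fitted values are negative (the log term is undefined there); a quick check on the training predictions (a sketch, not in the original run):
min(as.data.frame(h2o.predict(gbm.model, train.h2o))$predict)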
Make predictions and write the submission file.
predict.gbm <- as.data.frame(h2o.predict(gbm.model, test.h2o))
sub_gbm <- data.frame(User_ID = test$User_ID, Product_ID = test$Product_ID, Purchase = predict.gbm$predict)
write.csv(sub_gbm, file = "sub_gbm.csv", row.names = F)
Deep learning: define a hyperparameter grid for a random search.
hyper_params <- list(
activation=c("Rectifier","Tanh","Maxout","RectifierWithDropout","TanhWithDropout","MaxoutWithDropout"),
epochs=c(60,70,80,100),
hidden=list(c(20,20),c(50,50),c(100,100),c(30,30,30),c(25,25,25,25))
# input_dropout_ratio=c(0,0.05),
# l1=seq(0,1e-4,1e-6),
# l2=seq(0,1e-4,1e-6)
)
hyper_params
## $activation
## [1] "Rectifier" "Tanh" "Maxout"
## [4] "RectifierWithDropout" "TanhWithDropout" "MaxoutWithDropout"
##
## $epochs
## [1] 60 70 80 100
##
## $hidden
## $hidden[[1]]
## [1] 20 20
##
## $hidden[[2]]
## [1] 50 50
##
## $hidden[[3]]
## [1] 100 100
##
## $hidden[[4]]
## [1] 30 30 30
##
## $hidden[[5]]
## [1] 25 25 25 25
search_criteria = list(strategy = "RandomDiscrete", max_runtime_secs = 1000, max_models = 100, seed=1122, stopping_rounds=5, stopping_tolerance=1e-2)
# dl_random_grid <- h2o.grid(
# algorithm="deeplearning",
# grid_id = "dl_grid_random",
# training_frame=train.h2o,
# x= x.indep,
# y= y.dep,
#
# stopping_metric="RMSE",
# # stopping_tolerance=1e-2, ## stop when logloss does not improve by >=1% for 2 scoring events
# # stopping_rounds=2,
# #
# # score_duty_cycle=0.025, ## don't score more than 2.5% of the wall time
# # max_w2=10, ## can help improve stability for Rectifier
# hyper_params = hyper_params,
# search_criteria = search_criteria
# )
#
#
# grid<-h2o.getGrid("dl_grid_random",sort_by="MSE",decreasing=FALSE)
# grid
#
# best_model<-h2o.getModel(grid@model_ids[[1]])
# best_model
# Alternatively, use AutoML:
# md<-h2o.automl(y = y.dep,x = x.indep,training_frame = train.h2o, stopping_metric="RMSE",leaderboard_frame=train.h2o)
#
# system.time(
# dlearning.model <- h2o.deeplearning(y = y.dep,
# x = x.indep,
# training_frame = train.h2o,
# epoch = 60,
# hidden = c(100,100),
# #hidden=c(50,50),
# activation = "Rectifier",
# seed = 1122
# ))
# h2o.performance(dlearning.model)
# h2o.r2(dlearning.model)
# h2o.performance(dlearning.model,newdata=test.h2o)
# h2o.performance(md@leader,newdata=test.h2o)
# predict.dl2 <- as.data.frame(h2o.predict(dlearning.model, test.h2o))
# predict.dl2 <- as.data.frame(h2o.predict(md@leaderboard[1,1], test.h2o))
Create the submission data frame and write the file (left commented out, since the deep learning models above were not run).
# sub_dlearning <- data.frame(User_ID = test$User_ID, Product_ID = test$Product_ID, Purchase = predict.dl2$predict)
# write.csv(sub_dlearning, file = "sub_dlearning_new.csv", row.names = F)
# write.csv(sub_dlearning, file = "sub_drf_new.csv", row.names = F)
# h2o.shutdown()