# Switch the working directory to the project data location so that the
# relative file name below resolves against it.
# NOTE(review): the path ends in "train_users_2.csv"; confirm it really is a
# directory, because setwd() fails on anything else.
setwd(
  "C:/Users/linye/Desktop/Tandon2016Fall/MachineLearning/project1/train_users_2.csv"
)
# Load the data set; the first CSV column supplies the row names.
airbnb.simple <- read.csv(
  file = "cleaned_simple.csv",
  header = TRUE,
  row.names = 1
)
# Randomly select rows as training data ----
# Split the data 50/50 into training and test sets and recode the target.
# The seed is fixed so the same rows are drawn on every run.
set.seed(3)
n.obs <- nrow(airbnb.simple)
# At least two rows are needed: with fewer, `train` would be empty and the
# negative index `-train` below would select no rows instead of all of them.
stopifnot(n.obs >= 2)
# seq_len() avoids the 1:0 pitfall of `1:nrow(...)`, and %/% hands sample() a
# whole-number size even when the row count is odd.
train <- sample(seq_len(n.obs), n.obs %/% 2)
# Binary target: 0 when the destination is "US", 1 for any other value
# (missing destinations stay NA).
airbnb.simple$country_destination <- ifelse(
  airbnb.simple$country_destination == "US", 0, 1
)
# Every column except the target is used as a predictor.
predictor.cols <- setdiff(names(airbnb.simple), "country_destination")
# Training half: predictors and target for the sampled rows.
airbnb.simple.train <- airbnb.simple[train, predictor.cols, drop = FALSE]
destination.train <- airbnb.simple$country_destination[train]
# Test half: all rows that were not sampled.
airbnb.simple.test <- airbnb.simple[-train, predictor.cols, drop = FALSE]
destination.test <- airbnb.simple$country_destination[-train]
# Boosting ----
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
# Fit a gbm model with Bernoulli loss on the training predictors and the
# 0/1 training target: 500 trees, interaction depth 5, shrinkage 0.01.
# gbm.fit() prints its iteration log by default (verbose is not overridden).
boost.airbnb <- gbm.fit(
  x = airbnb.simple.train,
  y = destination.train,
  distribution = "bernoulli",
  interaction.depth = 5,
  shrinkage = 0.01,
  n.trees = 500
)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2072 nan 0.0100 0.0001
## 2 1.2070 nan 0.0100 0.0001
## 3 1.2068 nan 0.0100 0.0001
## 4 1.2066 nan 0.0100 0.0001
## 5 1.2065 nan 0.0100 0.0001
## 6 1.2062 nan 0.0100 0.0001
## 7 1.2061 nan 0.0100 0.0001
## 8 1.2059 nan 0.0100 0.0001
## 9 1.2057 nan 0.0100 0.0001
## 10 1.2056 nan 0.0100 0.0001
## 20 1.2039 nan 0.0100 0.0001
## 40 1.2011 nan 0.0100 0.0000
## 60 1.1989 nan 0.0100 0.0000
## 80 1.1971 nan 0.0100 0.0000
## 100 1.1955 nan 0.0100 0.0000
## 120 1.1943 nan 0.0100 0.0000
## 140 1.1932 nan 0.0100 0.0000
## 160 1.1923 nan 0.0100 0.0000
## 180 1.1914 nan 0.0100 0.0000
## 200 1.1906 nan 0.0100 0.0000
## 220 1.1899 nan 0.0100 -0.0000
## 240 1.1892 nan 0.0100 0.0000
## 260 1.1886 nan 0.0100 0.0000
## 280 1.1880 nan 0.0100 -0.0000
## 300 1.1874 nan 0.0100 0.0000
## 320 1.1869 nan 0.0100 -0.0000
## 340 1.1865 nan 0.0100 -0.0000
## 360 1.1860 nan 0.0100 0.0000
## 380 1.1856 nan 0.0100 -0.0000
## 400 1.1851 nan 0.0100 -0.0000
## 420 1.1848 nan 0.0100 -0.0000
## 440 1.1844 nan 0.0100 0.0000
## 460 1.1839 nan 0.0100 -0.0000
## 480 1.1836 nan 0.0100 -0.0000
## 500 1.1832 nan 0.0100 -0.0000
# Score the held-out rows with all 500 trees; type = "response" requests
# predictions on the response scale rather than the link scale.
boost.pred <- predict(
  object = boost.airbnb,
  newdata = airbnb.simple.test,
  type = "response",
  n.trees = 500
)
# Confusion matrix: rows are the predictions rounded to 0/1, columns are the
# observed test labels.
table(round(boost.pred, digits = 0), destination.test)
## destination.test
## 0 1
## 0 19789 8180
## 1 13 14