# Resolve the relative file names used below against the project data folder.
# NOTE(review): hard-coded absolute path whose last component ends in ".csv";
# presumably a directory of that name - confirm before running elsewhere.
setwd(
  "C:/Users/linye/Desktop/Tandon2016Fall/MachineLearning/project1/train_users_2.csv"
)
# Load the cleaned data; the first CSV column supplies the row names.
airbnb <- read.csv(file = "cleaned_data.csv", row.names = 1)
convert date_account_created, timestamp_first_active and date_first_booking to class Date
# Coerce the three date columns to class Date in a single pass. local() keeps
# the helper vector out of the global environment.
airbnb <- local({
  date_cols <- c(
    "date_account_created",
    "timestamp_first_active",
    "date_first_booking"
  )
  airbnb[date_cols] <- lapply(airbnb[date_cols], as.Date)
  airbnb
})
calculate the time difference in days between date_account_created and timestamp_first_active (date_account_created minus timestamp_first_active)
# dift1: account-creation date minus first-active date, as a plain number.
airbnb$dift1 <- with(
  airbnb,
  as.numeric(date_account_created - timestamp_first_active)
)
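calculate the time difference in days between date_account_created and date_first_booking (date_account_created minus date_first_booking, so it is negative when the booking came after the account was created)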
# dift2: account-creation date minus first-booking date, as a plain number.
airbnb$dift2 <- with(
  airbnb,
  as.numeric(date_account_created - date_first_booking)
)
simplify data: set language to ‘en’ and ‘non_en’, and country_destination to ‘US’ and ‘non_US’
# Collapse language to a two-level factor: "en" versus everything else.
airbnb$language <- factor(
  ifelse(airbnb$language != "en", "non_en", "en")
)
# Collapse the destination to a two-level factor: "US" versus everything else.
airbnb$country_destination <- factor(
  ifelse(airbnb$country_destination != "US", "non_US", "US")
)
keep the mainly used browsers in first_browser and relabel every other browser as "others"
# Browsers that keep their own factor level.
mainbrowser <- c(
  "-unknown-", "Android Browser", "Chrome", "Chrome Mobile",
  "Firefox", "IE", "Mobile Safari", "Safari"
)
# TRUE for rows whose browser is on that list (%in% never yields NA).
inmainbrowser <- as.character(airbnb$first_browser) %in% mainbrowser
# Relabel every other browser as "others", then rebuild the factor so its
# levels reflect the regrouped values.
airbnb$first_browser <- factor(
  replace(as.character(airbnb$first_browser), !inmainbrowser, "others")
)
drop columns date_account_created, timestamp_first_active and date_first_booking
# Keep every column except the three date columns. drop = FALSE guarantees the
# result stays a data frame.
airbnb.simple <- airbnb[
  ,
  !(names(airbnb) %in% c(
    "date_account_created",
    "timestamp_first_active",
    "date_first_booking"
  )),
  drop = FALSE
]
display the cleaned data
# Show the class and first values of every column in the simplified data.
str(airbnb.simple)
## 'data.frame': 55991 obs. of 14 variables:
## $ gender : Factor w/ 2 levels "FEMALE","MALE": 1 1 1 1 1 1 1 1 2 1 ...
## $ age : int 56 42 46 47 50 36 33 29 30 26 ...
## $ signup_method : Factor w/ 3 levels "basic","facebook",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ signup_flow : int 3 0 0 0 0 0 0 0 0 0 ...
## $ language : Factor w/ 2 levels "en","non_en": 1 1 1 1 1 1 1 1 1 1 ...
## $ affiliate_channel : Factor w/ 8 levels "api","content",..: 3 3 4 3 4 4 3 3 3 4 ...
## $ affiliate_provider : Factor w/ 16 levels "baidu","bing",..: 4 4 3 4 3 3 4 4 4 3 ...
## $ first_affiliate_tracked: Factor w/ 8 levels "","linked","local ops",..: 8 8 8 5 8 8 8 8 2 8 ...
## $ signup_app : Factor w/ 4 levels "Android","iOS",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ first_device_type : Factor w/ 9 levels "Android Phone",..: 9 6 6 6 6 6 9 6 6 6 ...
## $ first_browser : Factor w/ 9 levels "-unknown-","Android Browser",..: 6 5 9 9 9 5 3 3 3 3 ...
## $ country_destination : Factor w/ 2 levels "non_US","US": 2 1 2 2 2 2 1 1 2 1 ...
## $ dift1 : num 476 765 0 0 0 0 0 0 0 0 ...
## $ dift2 : num 57 -278 -3 -10 -206 -2 -1 0 -1 -3 ...
# Per-column summary: level counts for factors, quantiles and mean for
# numeric columns.
summary(airbnb.simple)
## gender age signup_method signup_flow
## FEMALE:29798 Min. : 2.0 basic :33157 Min. : 0.000
## MALE :26193 1st Qu.: 28.0 facebook:22783 1st Qu.: 0.000
## Median : 33.0 google : 51 Median : 0.000
## Mean : 36.6 Mean : 2.361
## 3rd Qu.: 41.0 3rd Qu.: 0.000
## Max. :115.0 Max. :25.000
##
## language affiliate_channel affiliate_provider
## en :54491 direct :37254 direct :37176
## non_en: 1500 sem-brand : 6935 google :12819
## sem-non-brand: 4242 other : 3374
## seo : 2540 craigslist: 919
## other : 2266 facebook : 572
## api : 2167 bing : 488
## (Other) : 587 (Other) : 643
## first_affiliate_tracked signup_app first_device_type
## untracked :30476 Android: 784 Mac Desktop :28021
## linked :13002 iOS : 3823 Windows Desktop:18406
## omg :10277 Moweb : 1183 iPhone : 4190
## tracked-other: 1566 Web :50201 iPad : 3303
## product : 310 Other/Unknown : 1026
## : 300 Android Phone : 433
## (Other) : 60 (Other) : 612
## first_browser country_destination dift1
## Chrome :20028 non_US:16362 Min. : 0.000
## Safari :12606 US :39629 1st Qu.: 0.000
## Firefox : 9858 Median : 0.000
## Mobile Safari: 4423 Mean : 0.392
## IE : 4309 3rd Qu.: 0.000
## -unknown- : 4138 Max. :1148.000
## (Other) : 629
## dift2
## Min. :-365.00
## 1st Qu.: -50.00
## Median : -4.00
## Mean : -52.11
## 3rd Qu.: -1.00
## Max. : 349.00
##
# Save the simplified data to the working directory. Row names are written as
# the first column (write.csv default).
write.csv(x = airbnb.simple, file = "cleaned_simple.csv")
decision tree
library(tree)
# Randomly select half of the rows as training data. The fixed seed makes the
# split reproducible.
set.seed(3)
# seq_len() is the safe form of 1:nrow(), and %/% makes explicit the integer
# truncation that sample() previously applied to the fractional size nrow / 2
# (the row count is odd). The rows drawn are unchanged.
train <- sample(seq_len(nrow(airbnb.simple)), nrow(airbnb.simple) %/% 2)
# Held-out rows and their true destination labels.
airbnb.simple.test <- airbnb.simple[-train, ]
destination.test <- airbnb.simple$country_destination[-train]
# Fit a tree for country_destination on all remaining columns, using only the
# training rows.
tree.airbnb.simple <- tree(
  formula = country_destination ~ .,
  data = airbnb.simple,
  subset = train
)
# Print the fitted tree. The output recorded below shows only a terminal root
# node, i.e. no split was made.
tree.airbnb.simple
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 27995 33800 US ( 0.2918 0.7082 ) *
# Predict the destination class for the held-out rows.
tree.pred <- predict(
  object = tree.airbnb.simple,
  newdata = airbnb.simple.test,
  type = "class"
)
# Confusion matrix: predicted class in rows, actual class in columns.
table(tree.pred = tree.pred, destination.test = destination.test)
## destination.test
## tree.pred non_US US
## non_US 0 0
## US 8194 19802