# Directory that holds the input and output CSV files of this analysis.
# The working directory is changed (rather than building full paths) because
# the write.csv() call further down also uses a relative file name.
# NOTE(review): the path ends in ".csv"; setwd() needs it to be a directory --
# confirm.
setwd("C:/Users/linye/Desktop/Tandon2016Fall/MachineLearning/project1/train_users_2.csv")

# Load the cleaned data set; the first CSV column supplies the row names.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 changed the
# read.csv() default to FALSE. Without it the text columns are read as
# character instead of the factor columns shown in the str() output below.
airbnb <- read.csv("cleaned_data.csv",
                   row.names = 1,
                   stringsAsFactors = TRUE)

convert date_account_created, timestamp_first_active and date_first_booking to class Date

# Parse the three date-like columns into objects of class Date.
date_columns <- c("date_account_created",
                  "timestamp_first_active",
                  "date_first_booking")
airbnb[date_columns] <- lapply(airbnb[date_columns], as.Date)

calculate the time difference in days between date_account_created and timestamp_first_active (date_account_created minus timestamp_first_active)

# Number of days from first activity to account creation
# (date_account_created minus timestamp_first_active), stored as plain numeric.
airbnb$dift1 <- as.numeric(difftime(airbnb$date_account_created,
                                    airbnb$timestamp_first_active,
                                    units = "days"))

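calculate the time difference in days between date_account_created and date_first_booking (date_account_created minus date_first_booking, so it is negative when the booking came after the account was created)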

# Number of days from first booking to account creation
# (date_account_created minus date_first_booking), stored as plain numeric.
airbnb$dift2 <- as.numeric(difftime(airbnb$date_account_created,
                                    airbnb$date_first_booking,
                                    units = "days"))

simplify data: recode language to ‘en’ and ‘non_en’, and country_destination to ‘US’ and ‘non_US’

# Collapse language to a two-level factor: "en" versus every other value.
is_english <- airbnb$language == "en"
airbnb$language <- as.factor(ifelse(is_english, "en", "non_en"))

# Collapse country_destination to a two-level factor: "US" versus every
# other value.
is_us <- airbnb$country_destination == "US"
airbnb$country_destination <- as.factor(ifelse(is_us, "US", "non_US"))

restrict first_browser to the mainly used browsers; every other browser is grouped as "others"

# Browser labels that keep their own category; any other value of
# first_browser is relabelled "others".
mainbrowser <- c("-unknown-", "Android Browser", "Chrome", "Chrome Mobile",
                 "Firefox", "IE", "Mobile Safari", "Safari")

# Work on the character values so that a new label can be assigned freely,
# then turn the result back into a factor.
browser_names <- as.character(airbnb$first_browser)
inmainbrowser <- browser_names %in% mainbrowser
airbnb$first_browser <- as.factor(
  replace(browser_names, !inmainbrowser, "others")
)

drop columns date_account_created, timestamp_first_active and date_first_booking

# Keep every column except the three date columns; they are no longer needed
# once dift1 and dift2 have been derived from them.
is_date_column <- names(airbnb) %in% c("date_account_created",
                                       "timestamp_first_active",
                                       "date_first_booking")
airbnb.simple <- airbnb[, !is_date_column, drop = FALSE]

display the cleaned data

# Print the compact structure of the simplified data frame: one line per
# column with its type and first few values.
str(airbnb.simple)
## 'data.frame':    55991 obs. of  14 variables:
##  $ gender                 : Factor w/ 2 levels "FEMALE","MALE": 1 1 1 1 1 1 1 1 2 1 ...
##  $ age                    : int  56 42 46 47 50 36 33 29 30 26 ...
##  $ signup_method          : Factor w/ 3 levels "basic","facebook",..: 1 2 1 1 1 1 1 1 1 1 ...
##  $ signup_flow            : int  3 0 0 0 0 0 0 0 0 0 ...
##  $ language               : Factor w/ 2 levels "en","non_en": 1 1 1 1 1 1 1 1 1 1 ...
##  $ affiliate_channel      : Factor w/ 8 levels "api","content",..: 3 3 4 3 4 4 3 3 3 4 ...
##  $ affiliate_provider     : Factor w/ 16 levels "baidu","bing",..: 4 4 3 4 3 3 4 4 4 3 ...
##  $ first_affiliate_tracked: Factor w/ 8 levels "","linked","local ops",..: 8 8 8 5 8 8 8 8 2 8 ...
##  $ signup_app             : Factor w/ 4 levels "Android","iOS",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ first_device_type      : Factor w/ 9 levels "Android Phone",..: 9 6 6 6 6 6 9 6 6 6 ...
##  $ first_browser          : Factor w/ 9 levels "-unknown-","Android Browser",..: 6 5 9 9 9 5 3 3 3 3 ...
##  $ country_destination    : Factor w/ 2 levels "non_US","US": 2 1 2 2 2 2 1 1 2 1 ...
##  $ dift1                  : num  476 765 0 0 0 0 0 0 0 0 ...
##  $ dift2                  : num  57 -278 -3 -10 -206 -2 -1 0 -1 -3 ...
# Print a per-column summary: level counts for factor columns and
# min / quartiles / mean / max for numeric columns.
summary(airbnb.simple)
##     gender           age         signup_method    signup_flow    
##  FEMALE:29798   Min.   :  2.0   basic   :33157   Min.   : 0.000  
##  MALE  :26193   1st Qu.: 28.0   facebook:22783   1st Qu.: 0.000  
##                 Median : 33.0   google  :   51   Median : 0.000  
##                 Mean   : 36.6                    Mean   : 2.361  
##                 3rd Qu.: 41.0                    3rd Qu.: 0.000  
##                 Max.   :115.0                    Max.   :25.000  
##                                                                  
##    language         affiliate_channel  affiliate_provider
##  en    :54491   direct       :37254   direct    :37176   
##  non_en: 1500   sem-brand    : 6935   google    :12819   
##                 sem-non-brand: 4242   other     : 3374   
##                 seo          : 2540   craigslist:  919   
##                 other        : 2266   facebook  :  572   
##                 api          : 2167   bing      :  488   
##                 (Other)      :  587   (Other)   :  643   
##   first_affiliate_tracked   signup_app          first_device_type
##  untracked    :30476      Android:  784   Mac Desktop    :28021  
##  linked       :13002      iOS    : 3823   Windows Desktop:18406  
##  omg          :10277      Moweb  : 1183   iPhone         : 4190  
##  tracked-other: 1566      Web    :50201   iPad           : 3303  
##  product      :  310                      Other/Unknown  : 1026  
##               :  300                      Android Phone  :  433  
##  (Other)      :   60                      (Other)        :  612  
##        first_browser   country_destination     dift1         
##  Chrome       :20028   non_US:16362        Min.   :   0.000  
##  Safari       :12606   US    :39629        1st Qu.:   0.000  
##  Firefox      : 9858                       Median :   0.000  
##  Mobile Safari: 4423                       Mean   :   0.392  
##  IE           : 4309                       3rd Qu.:   0.000  
##  -unknown-    : 4138                       Max.   :1148.000  
##  (Other)      :  629                                         
##      dift2        
##  Min.   :-365.00  
##  1st Qu.: -50.00  
##  Median :  -4.00  
##  Mean   : -52.11  
##  3rd Qu.:  -1.00  
##  Max.   : 349.00  
## 
# Save the simplified data to the current working directory. Row names are
# written as the first column (the write.csv() default).
write.csv(x = airbnb.simple, file = "cleaned_simple.csv")

decision tree

library(tree)

# Randomly select half of the rows as training data.
set.seed(3)  # fixed seed so the split is reproducible
n.obs <- nrow(airbnb.simple)
# seq_len() stays correct for an empty data frame (1:0 would give c(1, 0)),
# and %/% passes an explicit whole number as the sample size instead of the
# fractional nrow / 2.
train <- sample(seq_len(n.obs), n.obs %/% 2)
airbnb.simple.test <- airbnb.simple[-train, ]

# True destination labels of the held-out rows.
destination.test <- airbnb.simple$country_destination[-train]

# Fit a classification tree on the training rows, using every other column
# of airbnb.simple as a predictor of country_destination.
tree.airbnb.simple <- tree(country_destination ~ .,
                           data = airbnb.simple,
                           subset = train)
tree.airbnb.simple  # prints the fitted tree: only the root node, no split
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 27995 33800 US ( 0.2918 0.7082 ) *
# Predict the destination class for the held-out rows.
tree.pred <- predict(object = tree.airbnb.simple,
                     newdata = airbnb.simple.test,
                     type = "class")

# Confusion matrix: predicted class (rows) against true class (columns).
table(tree.pred,
      destination.test,
      dnn = c("tree.pred", "destination.test"))
##          destination.test
## tree.pred non_US    US
##    non_US      0     0
##    US       8194 19802