read data

# Switch the working directory to the location the CSV is read from.
# NOTE(review): the path itself ends in "train_users_2.csv", so it has to be a
# directory of that name -- confirm. The hard-coded absolute path also ties the
# script to one machine; the export at the end of the script writes to this
# same working directory.
setwd("C:/Users/linye/Desktop/Tandon2016Fall/MachineLearning/project1/train_users_2.csv")

# Load the training users. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned; strings stay as character
# columns rather than being converted to factors.
data <- read.csv("train_users_2.csv", header = TRUE, stringsAsFactors = FALSE)
head(data)
##           id date_account_created timestamp_first_active
## 1 gxn3p5htnn           2010-06-28           2.009032e+13
## 2 820tgsjxq7           2011-05-25           2.009052e+13
## 3 4ft3gnwmtx           2010-09-28           2.009061e+13
## 4 bjjt8pjhuk           2011-12-05           2.009103e+13
## 5 87mebub9p4           2010-09-14           2.009121e+13
## 6 osr2jwljor           2010-01-01           2.010010e+13
##   date_first_booking    gender age signup_method signup_flow language
## 1                    -unknown-  NA      facebook           0       en
## 2                         MALE  38      facebook           0       en
## 3         2010-08-02    FEMALE  56         basic           3       en
## 4         2012-09-08    FEMALE  42      facebook           0       en
## 5         2010-02-18 -unknown-  41         basic           0       en
## 6         2010-01-02 -unknown-  NA         basic           0       en
##   affiliate_channel affiliate_provider first_affiliate_tracked signup_app
## 1            direct             direct               untracked        Web
## 2               seo             google               untracked        Web
## 3            direct             direct               untracked        Web
## 4            direct             direct               untracked        Web
## 5            direct             direct               untracked        Web
## 6             other              other                     omg        Web
##   first_device_type first_browser country_destination
## 1       Mac Desktop        Chrome                 NDF
## 2       Mac Desktop        Chrome                 NDF
## 3   Windows Desktop            IE                  US
## 4       Mac Desktop       Firefox               other
## 5       Mac Desktop        Chrome                  US
## 6       Mac Desktop        Chrome                  US
# Print the type and the first few values of every column in `data`.
str(data)
## 'data.frame':    213451 obs. of  16 variables:
##  $ id                     : chr  "gxn3p5htnn" "820tgsjxq7" "4ft3gnwmtx" "bjjt8pjhuk" ...
##  $ date_account_created   : chr  "2010-06-28" "2011-05-25" "2010-09-28" "2011-12-05" ...
##  $ timestamp_first_active : num  2.01e+13 2.01e+13 2.01e+13 2.01e+13 2.01e+13 ...
##  $ date_first_booking     : chr  "" "" "2010-08-02" "2012-09-08" ...
##  $ gender                 : chr  "-unknown-" "MALE" "FEMALE" "FEMALE" ...
##  $ age                    : num  NA 38 56 42 41 NA 46 47 50 46 ...
##  $ signup_method          : chr  "facebook" "facebook" "basic" "facebook" ...
##  $ signup_flow            : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ language               : chr  "en" "en" "en" "en" ...
##  $ affiliate_channel      : chr  "direct" "seo" "direct" "direct" ...
##  $ affiliate_provider     : chr  "direct" "google" "direct" "direct" ...
##  $ first_affiliate_tracked: chr  "untracked" "untracked" "untracked" "untracked" ...
##  $ signup_app             : chr  "Web" "Web" "Web" "Web" ...
##  $ first_device_type      : chr  "Mac Desktop" "Mac Desktop" "Windows Desktop" "Mac Desktop" ...
##  $ first_browser          : chr  "Chrome" "Chrome" "IE" "Firefox" ...
##  $ country_destination    : chr  "NDF" "NDF" "US" "other" ...

statistics of every predictor

  1. date_account_created plot sorted histogram
# Parse the account-creation dates, then draw their distribution at three bin
# widths (yearly, monthly, weekly) stacked in a single column of panels.
date_account_created <- as.Date(data[["date_account_created"]])
par(mfrow = c(3, 1))

hist(
  date_account_created,
  breaks = "years",
  xlab = "date_account_created(years)",
  main = "histogram"
)
hist(
  date_account_created,
  breaks = "months",
  xlab = "date_account_created(months)",
  main = "histogram"
)
hist(
  date_account_created,
  breaks = "weeks",
  xlab = "date_account_created(weeks)",
  main = "histogram"
)

The plots show the trend in date_account_created: the number of new accounts increases over time.

  1. timestamp_first_active plot sorted histogram
# Convert a 'YYYYMMDDhhmmss' timestamp such as 20090319043255 into the string
# '2009-03-19'. The time-of-day part (04:32:55) is dropped on purpose, because
# date_account_created is only precise to the day.
#
# s: numeric or character vector of 14-digit timestamps.
# Returns a character vector of the same length in 'YYYY-MM-DD' form; missing
# inputs stay NA.
converter <- function(s) {
  if (is.numeric(s)) {
    # as.character() switches to scientific notation when that is shorter
    # (e.g. 20100101000000 -> "2.0100101e+13"), which would make substr()
    # pick the wrong characters. sprintf() always writes out plain digits.
    digits <- sprintf("%.0f", s)
  } else {
    digits <- as.character(s)
  }

  year <- substr(digits, 1, 4)
  month <- substr(digits, 5, 6)
  day <- substr(digits, 7, 8)

  out <- paste(year, month, day, sep = "-")
  # Keep missing timestamps missing instead of producing a pasted "NA" string.
  out[is.na(s)] <- NA_character_
  out
}

# Reduce each first-activity timestamp to its calendar day and store it as a
# Date, then plot the distribution by year, month and week.
first_active_day <- converter(data[["timestamp_first_active"]])
timestamp_first_active <- as.Date(first_active_day)

par(mfrow = c(3, 1))
hist(
  timestamp_first_active,
  breaks = "years",
  xlab = "timestamp_first_active(years)",
  main = "histogram"
)
hist(
  timestamp_first_active,
  breaks = "months",
  xlab = "timestamp_first_active(months)",
  main = "histogram"
)
hist(
  timestamp_first_active,
  breaks = "weeks",
  xlab = "timestamp_first_active(weeks)",
  main = "histogram"
)

The plots show the trend in timestamp_first_active: the number of first-active users increases over time. This is reasonable, because I think timestamp_first_active and date_account_created are strongly related.

Therefore, I calculate the difference between the two.

# Gap between account creation and first activity, as a plain number of days
# (positive when the account was created after the first activity).
dif_fa_ac <- date_account_created - timestamp_first_active
dif <- as.numeric(dif_fa_ac)

par(mfrow = c(2, 1))

# Upper panel: every user.
hist(
  dif,
  breaks = 200,
  xlab = "dif between date_account_created and timestamp_first_active(in days)",
  main = "histogram"
)

# Lower panel: same-day sign-ups removed so the remaining gaps are visible.
dif_nonzero <- dif[dif != 0]
hist(
  dif_nonzero,
  breaks = 200,
  xlab = "dif between date_account_created and timestamp_first_active(in days without counting 0 dif)",
  main = "histogram"
)

median(dif) # median gap between date_account_created and timestamp_first_active
## [1] 0

The distribution shows that nearly all users create an account on or after the day they first use the website (the median difference is 0 days).

  1. date_first_booking
# Keep only the users that have a booking date ("" marks a missing one) and
# parse those values as Date.
date_first_booking <- as.Date(data$date_first_booking[data$date_first_booking != ""])

# One panel per bin width. Each x-axis label names the bin width actually
# used; previously the monthly and weekly panels were both labelled "(years)".
par(mfrow = c(3, 1))
hist(date_first_booking, breaks = "years",
     main = "histogram", xlab = "date_first_booking(years)")
hist(date_first_booking, breaks = "months",
     main = "histogram", xlab = "date_first_booking(months)")
hist(date_first_booking, breaks = "weeks",
     main = "histogram", xlab = "date_first_booking(weeks)")

l <- length(date_first_booking) # number of non-missing booking dates
t <- nrow(data) # total number of users
l / t # share of users that have a booking date
## [1] 0.4165265

I think this data is somewhat odd: date_first_booking drops suddenly around 2014-06-30, and the number of bookings keeps decreasing afterwards. My first expectation was that it would show an increasing trend, like date_account_created. However, more than half of the date_first_booking values are missing, so the histogram cannot represent the true trend of date_first_booking. Instead of analysing this trend directly, I subtract date_first_booking from date_account_created and from timestamp_first_active, respectively.

# date_first_booking only holds the users that actually booked, whereas
# date_account_created and timestamp_first_active hold every user. Subtracting
# them directly recycles the shorter vector and pairs dates from unrelated
# users (R reports "longer object length is not a multiple of shorter object
# length"). Restrict both full-length vectors to the same rows first.
has_booking <- data$date_first_booking != ""

dif_fb_ac <- date_account_created[has_booking] - date_first_booking
dif_fb_fa <- timestamp_first_active[has_booking] - date_first_booking

# Plain day counts; negative means the booking came after the other date.
# NOTE: the medians printed further down were produced before the rows were
# aligned -- re-run the script to refresh them.
dif1 <- as.numeric(dif_fb_ac)
dif2 <- as.numeric(dif_fb_fa)

# Side-by-side histograms of the two booking gaps, followed by their medians.
par(mfrow = c(1, 2))

hist(
  dif1,
  breaks = 200,
  xlab = "dif between date_account_created and date_first_booking(in days)",
  main = "histogram"
)

hist(
  dif2,
  breaks = 200,
  xlab = "dif between timestamp_first_active and date_first_booking(in days)",
  main = "histogram"
)

median(dif1) # median gap, date_account_created minus date_first_booking (days)
## [1] -49
median(dif2) # median gap, timestamp_first_active minus date_first_booking (days)
## [1] -50

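The difference distributions are negatively skewed (long tail to the left). The medians of the two are about -50 days, which would mean that the first booking usually comes about 50 days after the account is created. Note that date_first_booking was filtered to non-missing rows while the other two date vectors were not, so these medians should be re-checked on row-aligned data before relying on them.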

  1. gender
# Gender distribution in three panels: raw levels, OTHER folded into
# '-unknown-', and known genders only.
gender <- data[["gender"]]

par(mfrow = c(1, 3))

raw_share <- prop.table(table(factor(gender)))
barplot(raw_share, main = "gender")

# Treat OTHER as unknown from here on.
gender[gender == "OTHER"] <- "-unknown-"
merged_share <- prop.table(table(factor(gender)))
barplot(merged_share, main = "gender")

known_gender <- gender[gender != "-unknown-"]
known_share <- prop.table(table(factor(known_gender)))
barplot(known_share, main = "gender")

t <- length(gender)
m_f <- length(known_gender)
m_f / t # share of users whose gender is known
## [1] 0.5503886
m <- length(gender[gender == "MALE"])
f <- length(gender[gender == "FEMALE"])
m / (m + f) # share of males among users whose gender is known
## [1] 0.4633941

Gender has 4 levels. Besides ‘-unknown-’, I think ‘OTHER’ is not a usable level and should be merged into ‘-unknown-’. After merging, about 45% of the values are missing; the remaining data contains more women than men (about 46% are male).

  1. age
# Age distribution as box plot + histogram, shown for all values and then with
# the upper bound tightened to 120 and to 90.
age <- data[["age"]]

par(mfrow = c(3, 2))

# Row 1: every recorded age.
boxplot(age, main = "age")
hist(age, breaks = 150, main = "age")

# Row 2: ages up to 120 (missing ages are dropped by the filter).
age_1 <- age[which(age <= 120)]
boxplot(age_1, main = "age(<= 120)")
hist(age_1, main = "age(<= 120)", breaks = 120)

# Row 3: ages up to 90.
age_2 <- age[which(age <= 90)]
boxplot(age_2, main = "age(<= 90)")
hist(age_2, main = "age(<= 90)", breaks = 90)

t <- length(age)
a <- sum(!is.na(age))
a / t # share of non-missing ages
## [1] 0.5877742

The age data contains some clearly wrong values, because (to my knowledge) a human cannot be older than 130. Personally, I think age should even be below 90, which is more plausible for someone booking a house or travelling. About 41% of the age values are also missing.

  1. signup method
# Share of users per sign-up method, drawn as a bar chart.
signup_method <- data[["signup_method"]]
signup_method_share <- prop.table(table(factor(signup_method)))
barplot(signup_method_share, main = "signup_method")

Most users signed up for Airbnb through the website.

  1. signup_flow: the page a user came to signup up from
# Histogram of the sign-up flow codes, with the default title and axis labels.
signup_flow <- data[["signup_flow"]]
hist(signup_flow)

  1. language: international language preference
# Share of users per language preference (bar chart without a title).
language <- data[["language"]]
language_share <- prop.table(table(factor(language)))
barplot(language_share)

This data shows that the majority of users use English (the users are all from the US). However, I think this predictor is important for predicting the destination, because the language distribution and the destination distribution are strongly related.

  1. affiliate_channel: what kind of paid marketing
# Share of users per affiliate channel, drawn as a bar chart.
affiliate_channel <- data[["affiliate_channel"]]
affiliate_channel_share <- prop.table(table(factor(affiliate_channel)))
barplot(affiliate_channel_share, main = "affiliate_channel")

  1. affiliate_provider: where the marketing is e.g. google, craigslist, other
# Share of users per affiliate provider, drawn as a bar chart.
affiliate_provider <- data[["affiliate_provider"]]
affiliate_provider_share <- prop.table(table(factor(affiliate_provider)))
barplot(affiliate_provider_share, main = "affiliate_provider")

  1. first_affiliate_tracked: whats the first marketing the user interacted with before the signing up
# Share of users per first tracked affiliate, drawn as a bar chart.
first_affiliate_tracked <- data[["first_affiliate_tracked"]]
first_affiliate_tracked_share <- prop.table(table(factor(first_affiliate_tracked)))
barplot(first_affiliate_tracked_share, main = "first_affiliate_tracked")

I think “” means that the user did not interact with any marketing before signing up.

  1. signup_app
# Share of users per sign-up app, drawn as a bar chart.
signup_app <- data[["signup_app"]]
signup_app_share <- prop.table(table(factor(signup_app)))
barplot(signup_app_share, main = "signup_app")

  1. first_device_type
# Share of users per first device type, drawn as a bar chart.
first_device_type <- data[["first_device_type"]]
first_device_type_share <- prop.table(table(factor(first_device_type)))
barplot(first_device_type_share, main = "first_device_type")

  1. first_browser
# Share of users per first browser, drawn as a bar chart.
first_browser <- data[["first_browser"]]
first_browser_share <- prop.table(table(factor(first_browser)))
barplot(first_browser_share, main = "first_browser")

  1. country_destination: the response need to predict
# Distribution of the response variable, plus the share of "NDF" values.
country_destination <- data[["country_destination"]]
country_destination_share <- prop.table(table(factor(country_destination)))
barplot(country_destination_share, main = "country_destination")

t <- length(country_destination)
ndf_rows <- country_destination[country_destination == "NDF"]
ndf <- length(ndf_rows)
ndf / t
## [1] 0.5834735

Here NDF means no booking: about 58% of users only created an account and never made a booking.

clean data

Assume all predictors are useful, and remove the rows that contain NA or empty values.

# Build one row mask covering every cleaning rule, keep the matching rows,
# drop the id column and export the result.
booked <- data$date_first_booking != ""
gender_known <- data$gender != "-unknown-" & data$gender != "OTHER"
age_usable <- !is.na(data$age) & data$age < 120

# which() keeps only rows where the combined condition is TRUE, so rows where
# it evaluates to NA are discarded as well.
d <- data[which(booked & gender_known & age_usable), ]
d <- d[names(d) != "id"]
write.csv(d, "cleaned_data.csv")