library(dplyr)
library(broom)
library(forecast)
library("readr")
library("tidyr")
library(ggplot2)
library(lubridate)
library(RColorBrewer)
library(randomForest)
library(class)
Importing data and changing data types to the desired types.
# Point the session at the folder that holds the intermediate Kaggle CSVs.
setwd("C:\\Swapnil\\Kaggle\\Airbnb New Cust Destination\\Intermediate Datasets")

# Load every raw table with readr; paths are relative to the working
# directory set above.
age_gender_bkts <- read_csv(file = "age_gender_bkts.csv")
countries       <- read_csv(file = "countries.csv")
sessions        <- read_csv(file = "sessions.csv")
test_users      <- read_csv(file = "test_users.csv")
train_users     <- read_csv(file = "train_users_2.csv")
# Checking table structure and changing data types accordingly
# glimpse(age_gender_bkts)
# summary(age_gender_bkts)
# glimpse(countries)

# Shorter names for the destination attributes.
countries <- countries %>%
  rename(
    dest_area = destination_km2,
    lang_diff_score = language_levenshtein_distance,
    dest_distance = distance_km,
    dest_language = destination_language
  )

# Categorical user attributes shared by the train and test files.
user_factor_cols <- c(
  "gender", "signup_method", "signup_flow", "language", "affiliate_channel",
  "affiliate_provider", "first_affiliate_tracked", "signup_app",
  "first_device_type", "first_browser"
)

# Test users: convert categoricals to factors, parse the first-active
# timestamp, and treat ages above 200 as birth years (age = year active - value).
# The conversion function is passed directly instead of through the deprecated
# funs() wrapper, matching the train and sessions pipelines below.
test_users_fctr <- test_users %>%
  mutate_at(user_factor_cols, as.factor) %>%
  mutate(
    timestamp_first_active = ymd_hms(timestamp_first_active),
    age = ifelse(age > 200, year(timestamp_first_active) - age, age)
  )

# Train users: same treatment, plus the target column as a factor and a floor
# of 5 on age.
# NOTE(review): the age floor is applied to train only, not to test - confirm
# whether the test set should receive it as well.
train_users_fctr <- train_users %>%
  mutate_at(c(user_factor_cols, "country_destination"), as.factor) %>%
  mutate(
    timestamp_first_active = ymd_hms(timestamp_first_active),
    age = ifelse(age > 200, year(timestamp_first_active) - age, age)
  ) %>%
  mutate(age = ifelse(age < 5, 5, age))

# Sessions: factorise the categorical columns and rename the key so it joins
# to the user tables on `id`.
sessions_fctr <- sessions %>%
  mutate_at(c("action", "action_type", "device_type"), as.factor) %>%
  rename(id = user_id)
Basic Count 1. How many users in train & test data? 2. How many users’ sessions data available? 3. How many users have age information missing? 4. How many users have gender information missing?
# 1. Distinct users in the train and test data, shown side by side
cnt_train_users <- train_users_fctr %>%
  summarize(cnt_train = n_distinct(id))

test_users_fctr %>%
  summarize(cnt_test = n_distinct(id)) %>%
  cbind(cnt_train_users)
## cnt_test cnt_train
## 1 62096 213451
# 2. Users with session data: overall, and as a share of train / test users
cnt_session_train_users <- train_users_fctr %>%
  inner_join(sessions_fctr, by = "id") %>%
  summarize(
    sessions_train_users = n_distinct(id),
    prcnt_train_users = sessions_train_users / n_distinct(train_users_fctr$id)
  )

cnt_session_test_users <- test_users_fctr %>%
  inner_join(sessions_fctr, by = "id") %>%
  summarize(
    sessions_test_users = n_distinct(id),
    prcnt_test_users = sessions_test_users / n_distinct(test_users_fctr$id)
  )

sessions_fctr %>%
  summarize(tot_users = n_distinct(id)) %>%
  cbind(cnt_session_train_users, cnt_session_test_users)
## tot_users sessions_train_users prcnt_train_users sessions_test_users
## 1 135484 73815 0.3458171 61668
## prcnt_test_users
## 1 0.9931074
# 3. Users with missing age: count and share of rows, test next to train
cnt_train_noage <- train_users_fctr %>%
  summarize(
    cnt_train = sum(is.na(age)),
    prcnt_train = cnt_train / n()
  )

test_users_fctr %>%
  summarize(
    cnt_test = sum(is.na(age)),
    prcnt_test = cnt_test / n()
  ) %>%
  cbind(cnt_train_noage)
## cnt_test prcnt_test cnt_train prcnt_train
## 1 28876 0.4650219 87990 0.4122258
# 4. Users with missing gender: NA or the "-unknown-" placeholder both count
cnt_train_nogender <- train_users_fctr %>%
  summarize(
    cnt_train = sum(is.na(gender) | gender == "-unknown-"),
    prcnt_train = cnt_train / n()
  )

test_users_fctr %>%
  summarize(
    cnt_test = sum(is.na(gender) | gender == "-unknown-"),
    prcnt_test = cnt_test / n()
  ) %>%
  cbind(cnt_train_nogender)
## cnt_test prcnt_test cnt_train prcnt_train
## 1 33792 0.5441896 95688 0.4482902
Most preferred country US turns out to be by far the most preferred country among users.
Country Preference by Gender The plot reveals no major difference in preference by gender.
Country Preference by Age and Gender US is less preferred by people aged between 25 and 50 and between 65 and 95. In these buckets DE sees an increase in preference. Also, the trend of preference across age is similar for both genders.
Destination Preference by Age US, AU, CA and FR seem to be preferred mostly by the 0-25 and 55-69 age groups, while DE, IT and PE are preferred mostly by the 45+ age groups.
Is destination preference affected by language difference? Destination preference is not affected by how different the destination language is from English, as destinations with a high difference score (~92) have more users than those with a lower difference score (~63). Gender and Age also show a uniform trend across the various languages.
Distribution of users across age in train data
Distribution of users across gender in train data There are quite a lot of users with missing gender information.
Signup Trend over time Signups are steeply increasing year over year.
Booking Pattern by Month of the year The bookings seem to peak around summer around May and June and then slowly decrease during winters.
The overseas destinations are preferred by US customers more during summer than winter.
Preference of US dropped in 2011 and has been increasing since then. Preference of PT has increased over time while FR has decreased over time.
Is tenure between first activity and first booking related to destination? Most destinations have median tenure of 4-5 days between first activity and first booking date.
Signup Method preference and destination relation Most preferred signup method is the direct one followed by facebook.
The preference of destination varies slightly across basic and facebook signup methods.
International Language preference and relation with destination Most preferred international languages after en are zh, fr, es and de.
The international language preference of user does affect the selection of destination.
Most commonly used Affiliate Channel and relation with destination Direct is the most used channel followed by sem brand and sem non brand.
The affiliate channel used by customers does indicate the preference of destination with US being dominant in all channels but PT and FR being dominant in sem related channels.
Most commonly used platform for signups and relation of signup platform with destination Most commonly used platform for signup is web followed by ios.
The US and FR are preferred more highly by users signing up through web than through any other platform. Users signing up through android have higher chances of not booking than those on any other platform.
Does first device used by customer indicate his preference? Most commonly used device to visit site for first time is Desktop showcasing that Mobile is used significantly more for browsing than booking.
The preference of country varies slightly with the first device used by user to come to website with US being more preferred by users whose first device is Mac Desktop.
Most common first browser used by users and relation with destination Chrome and Safari are by far the most used first browser.
The preference distribution across major browsers i.e. Chrome, Safari and Firefox is almost similar.
Website activity and influence on destination The messaging and wishlist are most common actions by user on website.
# Per-action and per-action_detail usage: distinct users, total seconds
# elapsed and number of session rows.
# sum() is called without na.rm, so `time` is NA for any group that has a
# missing secs_elapsed value.
dis_action <- sessions_fctr %>%
  group_by(action) %>%
  summarize(
    users = n_distinct(id),
    time = sum(secs_elapsed),
    freq = n()
  )

dis_actiondetail <- sessions_fctr %>%
  group_by(action_detail) %>%
  summarize(
    users = n_distinct(id),
    time = sum(secs_elapsed),
    freq = n()
  )

# Distinct (action, action_detail) pairs.
dis_action_actiondetail <- sessions_fctr %>%
  distinct(action, action_detail)

# Top 10 actions by users
# Each ranking sorts descending, drops the action details whose total time is
# NA, and keeps the ten largest by the ranking variable (top_n() keeps ties).
top20_users <- dis_actiondetail %>%
  arrange(desc(users)) %>%
  filter(!is.na(time)) %>%
  top_n(n = 10, wt = users) %>%
  select(action_detail) %>%
  rename(`Top 10 by number of users` = action_detail)

top20_freq <- dis_actiondetail %>%
  arrange(desc(freq)) %>%
  filter(!is.na(time)) %>%
  top_n(n = 10, wt = freq) %>%
  select(action_detail) %>%
  rename(`Top 10 by frequency of usage` = action_detail)

top20_time <- dis_actiondetail %>%
  arrange(desc(time)) %>%
  filter(!is.na(time)) %>%
  top_n(n = 10, wt = time) %>%
  select(action_detail) %>%
  rename(`Top 10 by Time elapsed in usage` = action_detail)

# Show the three rankings side by side.
cbind(top20_users, top20_freq, top20_time)
## Top 10 by number of users Top 10 by frequency of usage
## 1 p5 p5
## 2 message_to_host_change listing_recommendations
## 3 post_checkout_action update_listing_description
## 4 send_message message_inbox
## 5 message_inbox message_to_host_change
## 6 profile_references wishlist
## 7 update_listing_description profile_references
## 8 listing_recommendations post_checkout_action
## 9 wishlist send_message
## 10 listing_reviews_page apply_coupon_error
## Top 10 by Time elapsed in usage
## 1 update_listing_description
## 2 message_to_host_change
## 3 listing_reviews_page
## 4 post_checkout_action
## 5 p5
## 6 wishlist
## 7 message_inbox
## 8 profile_references
## 9 guest_itinerary
## 10 terms_and_privacy
The frequency of usage of the top 6 actions varies slightly with the ultimate destination choice of user.
Random Forest First cleaning the data and preparing modelling dataset with required features and then using random forest to predict the first destination country for new users.
# Treating gender to replace other as unknown

# Recode gender "OTHER" to "-unknown-" and add the account-creation month;
# both columns are returned as factors.
treat_gender_month <- function(users) {
  users %>%
    mutate(
      gender = ifelse(
        as.character(gender) == "OTHER",
        "-unknown-",
        as.character(gender)
      ),
      mnth_acct = month(date_account_created)
    ) %>%
    mutate_at(c("gender", "mnth_acct"), as.factor)
}

# Keep the values listed in `keep`; everything else (including NA) becomes
# "others". Returns a character vector.
lump_to_others <- function(x, keep) {
  x <- as.character(x)
  ifelse(x %in% keep, x, "others")
}

# Collapse the infrequent levels of the high-cardinality categoricals so the
# train and test sets are treated with exactly the same rules.
lump_rare_levels <- function(users) {
  users %>%
    mutate(
      signup_method = lump_to_others(signup_method, c("facebook", "basic")),
      language = lump_to_others(
        language, c("en", "de", "es", "fr", "ko", "zh")
      ),
      # "sem-non-brand" was previously misspelled as "sem-non-brnad", which
      # silently pushed that channel into "others".
      affiliate_channel = lump_to_others(
        affiliate_channel,
        c("direct", "sem-brand", "sem-non-brand", "seo", "api")
      ),
      first_browser = lump_to_others(
        first_browser,
        c("Chrome", "Firefox", "IE", "Mobile Safari", "Safari")
      )
    ) %>%
    mutate_at(
      c("signup_method", "language", "affiliate_channel", "first_browser"),
      as.factor
    )
}

train_users_trtd <- treat_gender_month(train_users_fctr)
test_users_trtd <- treat_gender_month(test_users_fctr)

train_users_trtd1 <- lump_rare_levels(train_users_trtd)
test_users_trtd1 <- lump_rare_levels(test_users_trtd)
# User level session summary
# `user_action` is created outside the part of the script shown here; it is
# presumably one row per (id, action_detail) with `time_spent` and `freq`
# columns - confirm against its definition.

# Wide table of time spent per action_detail. `freq` is dropped BEFORE the
# spread: if it is left in, spread() treats it as an identifier column and
# yields several mostly-NA rows per id, which then fan out in the join below.
user_action_time <- user_action %>%
  select(-freq) %>%
  spread(action_detail, time_spent)

# Wide table of usage frequency per action_detail, joined to the time table.
# Both sides carry one column per action_detail, so the join disambiguates
# them with dplyr's default suffixes (.x = freq, .y = time_spent).
user_action_summary <- user_action %>%
  select(-time_spent) %>%
  spread(action_detail, freq) %>%
  full_join(user_action_time, by = "id")
# Modelling data set
# Attach the per-user session summary to the treated train users.
train_data <- left_join(train_users_trtd1, user_action_summary, by = "id")

# sampling into train and test
# 80% of the rows are drawn for training; the remainder is held out.
# No seed is set in this block, so the split differs between runs unless one
# was set earlier in the script.
sample <- sample(nrow(train_data), nrow(train_data) * 0.8)

# Column positions excluded from the modelling frames.
drop_cols <- c(2:4, 8, 11, 12, 18:29)
train_w <- train_data[sample, -drop_cols]
test_w <- train_data[-sample, -drop_cols]

# Complete-case versions of both frames.
train_w1 <- na.omit(train_w)
test_w1 <- na.omit(test_w)
# Random Forest
# Fit on the complete-case training frame; `id` is excluded from the
# predictors and 4 variables are tried at each split.
model_rf <- randomForest(
  country_destination ~ . - id,
  data = train_w1,
  mtry = 4,
  importance = TRUE
)

# Predicting train
# predict() without `newdata` returns the out-of-bag predictions, so this is
# an out-of-bag accuracy rather than a resubstitution accuracy.
pred.train <- predict(model_rf, type = "response")
prob.pred.train <- predict(model_rf, type = "prob")
Accuracy_train <- mean(pred.train == train_w1$country_destination)

# Predicting test
# Class and probability predictions both use the complete-case hold-out frame
# so their rows line up (the probabilities were previously taken from
# `test_w`, which still contains rows with missing values).
pred.test <- predict(model_rf, newdata = test_w1)
prob.pred.test <- predict(model_rf, newdata = test_w1, type = "prob")
Accuracy_test <- mean(pred.test == test_w1$country_destination, na.rm = TRUE)

cbind(Accuracy_train, Accuracy_test)
## Accuracy_train Accuracy_test
## [1,] 0.5193514 0.5222244
Variable importance plot shows signup method, first browser and device as important to reduce error while age, month of signup and signup method as important for node purity.
KNN Model K-Nearest Neighbour approach will be used to predict the first destination after standardizing the predictors as KNN finds neighbours based on euclidean/hamming distance.
# Build numeric design matrices (dummy-coded factors, intercept column
# removed) after dropping the first column of each frame.
train_w1_mat <- train_w1 %>% select(-1)
x_train <- model.matrix(country_destination ~ ., train_w1_mat)[, -1]

# Use the complete-case hold-out frame. model.matrix() drops rows with missing
# values, so building from `test_w` made the predictions shorter than the
# label vector and the accuracy was computed on a recycled comparison.
test_w1_mat <- test_w1 %>% select(-1)
x_test <- model.matrix(country_destination ~ ., test_w1_mat)[, -1]

#Standardizing
# Train and test rows are scaled together, then split back by position.
# NOTE(review): this uses hold-out rows when computing the centring and
# scaling constants - consider scaling with train-only statistics.
std_data <- rbind(x_train, x_test) %>% scale()
n_train <- nrow(x_train)
x_train_std <- std_data[seq_len(n_train), , drop = FALSE]
x_test_std <- std_data[-seq_len(n_train), , drop = FALSE]

# Class labels for the training rows.
cl <- train_w1_mat$country_destination
knn.pred <- knn(x_train_std, x_test_std, cl, k = 40)

# Share of hold-out users whose destination was predicted correctly.
Accuracy_test <- mean(knn.pred == test_w1_mat$country_destination)
Accuracy_test
## [1] 0.4420207