Analysis - Airbnb의 데이터를 바탕으로 전환율과 유입 경로, 예약하는 요일 등을 분석
Analytics - Airbnb 데이터를 바탕으로 머신러닝 모델링을 통해 어느 나라의 유저가 처음 어느곳을 예약할 지 예측 - 새로운 가입자가 최초로 예약할 장소를 예측하는 것
suppressPackageStartupMessages({
library(data.table)
library(forecast)
library(lubridate)
library(zoo)
library(ggplot2)
library(DescTools)
library(sqldf)
library(dplyr)
library(knitr)
library(VIM)
library(mice)
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(ggthemes)
library(DT)
library(corrplot)
library(psych)
library(caret)
library(gmodels)
library(C50)
library(xgboost)
library(readr)
library(stringr)
library(caret)
library(car)
})
train <- read.csv('./input/train_users_2.csv', stringsAsFactors = FALSE)
test <- read.csv('./input/test_users.csv', stringsAsFactors = FALSE)
train_unlabel <- train[-16] # EDA를 위해 label 제거(test data에는 label이 없기 때문에)
Method 1 : R (rbind)
users_r <- rbind(train_unlabel, test)
Method2 : SQL (union all)
users_sql <- sqldf("
select *
from train_unlabel a
union all
select *
from test b
;")
datatable(head(train), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
Reshape data : label table
train$date_first_booking <- as.Date(train$date_first_booking, "%Y-%m-%d")
train$date_account_created <- as.Date(train$date_account_created, "%Y-%m-%d")
train$timestamp_first_active <- as.character(train$timestamp_first_active)
train$timestamp_first_active <- as.Date(train$timestamp_first_active, "%Y%m%d")
train$date_first_booking_year <- year(train$date_first_booking)
train$date_first_booking_month <- month(train$date_first_booking)
train$date_first_booking_quarter <- quarter(train$date_first_booking)
train$date_first_booking_weekdays <- weekdays(train$date_first_booking)
train$date_account_created_year <- year(train$date_account_created)
train$date_account_created_month <- month(train$date_account_created)
train$date_account_created_quarter <- quarter(train$date_account_created)
train$date_account_created_weekdays <- weekdays(train$date_account_created)
train$timestamp_first_active_year <- year(train$timestamp_first_active)
train$timestamp_first_active_month <- month(train$timestamp_first_active)
train$timestamp_first_active_quarter <- quarter(train$timestamp_first_active)
train$timestamp_first_active_weekdays <- weekdays(train$timestamp_first_active)
Reshape data : merge table
users_r$date_first_booking <- as.Date(users_r$date_first_booking, "%Y-%m-%d")
users_r$date_account_created <- as.Date(users_r$date_account_created, "%Y-%m-%d")
users_r$timestamp_first_active <- as.character(users_r$timestamp_first_active)
users_r$timestamp_first_active <- as.Date(users_r$timestamp_first_active, "%Y%m%d")
users_r$date_first_booking_year <- year(users_r$date_first_booking)
users_r$date_first_booking_month <- month(users_r$date_first_booking)
users_r$date_first_booking_quarter <- quarter(users_r$date_first_booking)
users_r$date_first_booking_weekdays <- weekdays(users_r$date_first_booking)
users_r$date_account_created_year <- year(users_r$date_account_created)
users_r$date_account_created_month <- month(users_r$date_account_created)
users_r$date_account_created_quarter <- quarter(users_r$date_account_created)
users_r$date_account_created_weekdays <- weekdays(users_r$date_account_created)
users_r$timestamp_first_active_year <- year(users_r$timestamp_first_active)
users_r$timestamp_first_active_month <- month(users_r$timestamp_first_active)
users_r$timestamp_first_active_quarter <- quarter(users_r$timestamp_first_active)
users_r$timestamp_first_active_weekdays <- weekdays(users_r$timestamp_first_active)
sort(colSums(is.na(users_r)), decreasing = TRUE)
## date_first_booking date_first_booking_year
## 186639 186639
## date_first_booking_month date_first_booking_quarter
## 186639 186639
## date_first_booking_weekdays age
## 186639 116866
## id date_account_created
## 0 0
## timestamp_first_active gender
## 0 0
## signup_method signup_flow
## 0 0
## language affiliate_channel
## 0 0
## affiliate_provider first_affiliate_tracked
## 0 0
## signup_app first_device_type
## 0 0
## first_browser date_account_created_year
## 0 0
## date_account_created_month date_account_created_quarter
## 0 0
## date_account_created_weekdays timestamp_first_active_year
## 0 0
## timestamp_first_active_month timestamp_first_active_quarter
## 0 0
## timestamp_first_active_weekdays
## 0
aggr(users_r, col = c('gray', 'red'), sortVars = T)
##
## Variables sorted by number of missings:
## Variable Count
## date_first_booking 0.6773400
## date_first_booking_year 0.6773400
## date_first_booking_month 0.6773400
## date_first_booking_quarter 0.6773400
## date_first_booking_weekdays 0.6773400
## age 0.4241237
## id 0.0000000
## date_account_created 0.0000000
## timestamp_first_active 0.0000000
## gender 0.0000000
## signup_method 0.0000000
## signup_flow 0.0000000
## language 0.0000000
## affiliate_channel 0.0000000
## affiliate_provider 0.0000000
## first_affiliate_tracked 0.0000000
## signup_app 0.0000000
## first_device_type 0.0000000
## first_browser 0.0000000
## date_account_created_year 0.0000000
## date_account_created_month 0.0000000
## date_account_created_quarter 0.0000000
## date_account_created_weekdays 0.0000000
## timestamp_first_active_year 0.0000000
## timestamp_first_active_month 0.0000000
## timestamp_first_active_quarter 0.0000000
## timestamp_first_active_weekdays 0.0000000
- 결측치는 아래와 같음
- date_first_booking : 186,639 rows
- date_first_booking_year : 186,639 rows
- date_first_booking_month : 186,639 rows
- date_first_booking_quarter : 186,639 rows
- date_first_booking_weekdays : 186,639 rows
- age : 116,866 rows
- 해결 방안
- 방법 1 : 해당 row 제거
- 방법 2 : 해당 variable 제거
- 방법 3 : 결측치 대체 (평균, 중앙값, knn)
- 고민
- 전체(275,547) - 결측치(186639) = 88,908 건
- 상당한 양의 (결측)데이터를 날려버리면 표본이 줄어들기 때문에 대체하는 방향으로 결정
# number of numeric & factor & date variables
numeric_var <- sapply(users_r, is.numeric)
char_var <- sapply(users_r, is.character)
date_var <- sapply(users_r, is.Date)
users_r_num <- users_r[, numeric_var]
users_r_char <- users_r[, char_var]
users_r_date <- users_r[, date_var]
#str(users_r_num)
#str(users_r_char)
#str(users_r_date)
# 11 numeric variables
# 13 character variables
# 3 date variables
age 변수를 살펴보니 2000이 넘어가는 이상치를 발견, 존재할 수 없는 (가입시 실수로 기입한 걸로 예상되는)값이므로 제거하기로 결정
처리 기준
| 종류 | 예시 | 해결 방법 |
|---|---|---|
| 존재할 수 없는 값 | 성별 변수에 3 | 결측처리 |
| 극단적인 값 | 몸무게 변수에 200 | 정상 범위 기준 정해서 결측 처리 |
# merge numeric type data (unlable & test)
b_total <- boxplot(users_r_num)
# original 'age' data (train)
b_age_origin <- boxplot(train$age)
# merge 'age' data (unlabel & test)
b_age_modi <- boxplot(users_r$age)
# original data (train)
summary(train$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 28.00 34.00 49.67 43.00 2014.00 87990
hist(train$age, 5)
# merge data (unlabel & test)
#summary(users_r$age)
#hist(users_r$age, 5)
# original data (train)
#str(which(train$age < b_age_origin$stats[1])) # 1분위수 - 1.5IQR : 57개
#str(which(train$age > b_age_origin$stats[5])) # 3분위수 + 1.5IQR : 5,337개
# merge data (unlabel & test)
#str(which(users_r$age < b_age_modi$stats[1])) # 1분위수 - 1.5IQR : 59개
#str(which(users_r$age > b_age_modi$stats[5])) # 3분위수 + 1.5IQR : 7,772개
# original data (train)
out1 <- which(train$age < b_age_origin$stats[1])
out2 <- which(train$age > b_age_origin$stats[5])
# merge data (unlabel & test)
out3 <- which(users_r$age < b_age_modi$stats[1])
out4 <- which(users_r$age > b_age_modi$stats[5])
train <- train[-c(out1, out2), ]
users_r <- users_r[-c(out3, out4), ]
# original data (train)
summary(train$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.0 28.0 33.0 35.6 41.0 65.0 87990
hist(train$age)
boxplot(train$age)
# merge data (unlabel & test)
#summary(users_r$age)
#hist(users_r$age)
#boxplot(users_r$age)
users_r$ages <- trunc(users_r$age / 10) * 10
train$ages <- trunc(train$age / 10) * 10
# table(users_r$ages)
Desc(train$ages)
## -------------------------------------------------------------------------
## train$ages (numeric)
##
## length n NAs unique 0s mean meanCI
## 207'857 119'867 87'990 6 0 31.03 30.97
## 57.7% 42.3% 0.0% 31.09
##
## .05 .10 .25 median .75 .90 .95
## 20.00 20.00 20.00 30.00 40.00 50.00 50.00
##
## range sd vcoef mad IQR skew kurt
## 50.00 10.70 0.34 14.83 20.00 0.74 0.15
##
##
## level freq perc cumfreq cumperc
## 1 10 1'872 1.6% 1'872 1.6%
## 2 20 36'049 30.1% 37'921 31.6%
## 3 30 47'570 39.7% 85'491 71.3%
## 4 40 20'210 16.9% 105'701 88.2%
## 5 50 10'521 8.8% 116'222 97.0%
## 6 60 3'645 3.0% 119'867 100.0%
#Desc(users_r$ages)
task0 <- sqldf("
with base as(
select
a.*
from (
select
ages
, sum(case when country_destination = 'AU' then 1 else 0 end) as AU
, sum(case when country_destination = 'CA' then 1 else 0 end) as CA
, sum(case when country_destination = 'DE' then 1 else 0 end) as DE
, sum(case when country_destination = 'ES' then 1 else 0 end) as ES
, sum(case when country_destination = 'FR' then 1 else 0 end) as FR
, sum(case when country_destination = 'GB' then 1 else 0 end) as GB
, sum(case when country_destination = 'IT' then 1 else 0 end) as IT
, sum(case when country_destination = 'NDF' then 1 else 0 end) as NDF
, sum(case when country_destination = 'NL' then 1 else 0 end) as NL
, sum(case when country_destination = 'PT' then 1 else 0 end) as PT
, sum(case when country_destination = 'US' then 1 else 0 end) as US
, sum(case when country_destination = 'other' then 1 else 0 end) as other
, count(*) as total
from
train
group by 1
) as a
)
select
*
from
base;
")
| ages | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NA | 103 | 351 | 210 | 543 | 1310 | 550 | 799 | 67614 | 160 | 59 | 13773 | 2518 | 87990 |
| 10 | 7 | 32 | 18 | 30 | 74 | 31 | 41 | 924 | 11 | 4 | 584 | 116 | 1872 |
| 20 | 106 | 245 | 243 | 569 | 940 | 460 | 566 | 16347 | 191 | 45 | 14262 | 2075 | 36049 |
| 30 | 181 | 440 | 310 | 669 | 1425 | 643 | 763 | 20059 | 225 | 65 | 19669 | 3121 | 47570 |
| 40 | 70 | 182 | 138 | 209 | 606 | 274 | 291 | 9894 | 83 | 18 | 7313 | 1132 | 20210 |
| 50 | 44 | 101 | 70 | 114 | 373 | 189 | 190 | 5253 | 39 | 16 | 3561 | 571 | 10521 |
| 60 | 10 | 32 | 32 | 45 | 117 | 76 | 65 | 1756 | 20 | 4 | 1286 | 202 | 3645 |
- 총 7,831(59 + 7,772)개의 이상치 제거 완료
- 267,716건 잔존
Visualization
# View age chart
ggplot(users_r, aes(x=age)) +
geom_histogram(aes(y=..density..),
# Histogram with density instead of count on y-axis
binwidth=.5,
colour="black", fill="white") +
geom_density(alpha=.2, fill="pink") + # Overlay with transparent density plot
ggtitle("Users age distribution")
# View ages chart
ggplot(data = train, aes(x = ages)) +
xlim(10, 100) +
geom_bar(fill = "#FF6666") +
ggtitle("Users age distribution")
# View age by date_account_created
users_r %>%
# sample_n(2000) %>%
# filter(age<150) %>%
ggplot(aes(age, date_first_booking)) +
geom_point(alpha=.5) +
geom_smooth() +
ggtitle("Age by date_account_created")
Summary
- 20대(32.3%) ~ 30대(38.8%)의 고객(71.2%)이 주로 Airbnb 서비스를 사용하고 있는 것을 알 수 있음.
- (이상치 처리하기 전에도) 70세 이상의 고객이 거의 없음을 알 수 있어서 이를 반영하였음.
“주 사용고객은 전체 고객의 71.2%를 차지하는 2030이다.”
Report
task_gender_ratio <- sqldf("
select
gender
, round(100. * g_tot/tot, 2) as prob
from (
select
gender
, count(*) as g_tot
from users_r
group by 1
) as a, (
select
count(*) as tot
from users_r
) as b
")
| gender | prob |
|---|---|
| -unknown- | 47.75 |
| FEMALE | 27.69 |
| MALE | 24.43 |
| OTHER | 0.12 |
Visualization
Desc(train$gender)
## -------------------------------------------------------------------------
## train$gender (character)
##
## length n NAs unique levels dupes
## 207'857 207'857 0 4 4 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 -unknown- 94'507 45.5% 94'507 45.5%
## 2 FEMALE 60'630 29.2% 155'137 74.6%
## 3 MALE 52'444 25.2% 207'581 99.9%
## 4 OTHER 276 0.1% 207'857 100.0%
ggplot(data = task_gender_ratio, aes(x = reorder(prob, gender), y = prob, fill = gender)) +
geom_bar(stat = "identity") +
coord_flip()
Summary
- 여성 이용자(27.69%) 남성 이용자(24.43%) 보다 조금 더 많은데 실제 매출에 기여하는 성별은 누구인지는 파악되지 않음
- 추후에 성별에 따른 예약 전환율을 분석할 예정
Visualization
Desc(users_r$signup_app)
## -------------------------------------------------------------------------
## users_r$signup_app (character)
##
## length n NAs unique levels dupes
## 267'716 267'716 0 4 4 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 Web 212'705 79.5% 212'705 79.5%
## 2 iOS 34'258 12.8% 246'963 92.2%
## 3 Android 10'384 3.9% 257'347 96.1%
## 4 Moweb 10'369 3.9% 267'716 100.0%
users_r %>%
ggplot(aes(x = signup_app)) +
geom_bar() +
ggtitle("signup_app")
Summary
- Web의 비율이 79.5%로 압도적으로 많고 그 다음은 iOS 사용자(12.8%)가 많다.
“Web과 iOS 사용자의 구매 전환율이 높으므로 추후 마케팅할 때 그들에게 집중하는 것이 좋다.”
Visualization
Desc(train$country_destination)
## -------------------------------------------------------------------------
## train$country_destination (character)
##
## length n NAs unique levels dupes
## 207'857 207'857 0 12 12 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 NDF 121'847 58.6% 121'847 58.6%
## 2 US 60'448 29.1% 182'295 87.7%
## 3 other 9'735 4.7% 192'030 92.4%
## 4 FR 4'845 2.3% 196'875 94.7%
## 5 IT 2'715 1.3% 199'590 96.0%
## 6 GB 2'223 1.1% 201'813 97.1%
## 7 ES 2'179 1.0% 203'992 98.1%
## 8 CA 1'383 0.7% 205'375 98.8%
## 9 DE 1'021 0.5% 206'396 99.3%
## 10 NL 729 0.4% 207'125 99.6%
## 11 AU 521 0.3% 207'646 99.9%
## 12 PT 211 0.1% 207'857 100.0%
Summary
- NDF가 가장 많은 비율(58.3%)을 차지하고 있다. No Destination Found의 약자로 예약을 하지 않은 사용자를 뜻한다.
- 많은(전체 사용자의 2/3 이상) 사용자가 예약하지 않았고, 예약한 사용자 중에서는 미국 29.2%로 가장 많은 비율을 차지했음을 알 수 있다.
- 조금 더 구체적인 분석을 실시할 예정
Visualization
Desc(users_r$first_device_type)
## -------------------------------------------------------------------------
## users_r$first_device_type (character)
##
## length n NAs unique levels dupes
## 267'716 267'716 0 9 9 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 Mac Desktop 103'224 38.6% 103'224 38.6%
## 2 Windows Desktop 83'782 31.3% 187'006 69.9%
## 3 iPhone 39'432 14.7% 226'438 84.6%
## 4 iPad 17'354 6.5% 243'792 91.1%
## 5 Other/Unknown 10'933 4.1% 254'725 95.1%
## 6 Android Phone 9'313 3.5% 264'038 98.6%
## 7 Android Tablet 2'015 0.8% 266'053 99.4%
## 8 Desktop (Other) 1'475 0.6% 267'528 99.9%
## 9 SmartPhone (Other) 188 0.1% 267'716 100.0%
Summary
- Web의 비율이 높지만, Apple 제품을 쓰는 사용자의 비율이 59.8%로 압도적으로 높다.
Visualization
users_r %>%
sample_n(2000) %>%
# filter(age < 150) %>% # 제거하지 않았을 경우 주석 해제하고 진행
ggplot(aes(date_account_created, date_first_booking)) +
geom_point(alpha = 0.5) + geom_smooth() +
ggtitle("date_first_booking by date_account_created")
Summary
- 두 변수(date_account_created, date_first_booking)는 선형 관계에 있음
즉, 가입한지 오래된 사용자 보다 가입한 지 얼마 안 된 사용자를 대상으로 마케팅을 진행해야함.
Visualization
users_r %>%
sample_n(2000) %>%
# filter(age < 150) %>%
ggplot(aes(date_account_created, date_first_booking)) +
geom_point(aes(size=age, col=signup_app),alpha = 0.5) + geom_smooth() +
ggtitle("user analysis")
Summary
예약을 완료한 대부분의 고객들은 Web을 통해 가입한 고객임을 알 수 있음
숙소를 예약하는 것이기 때문에 (화면이 작은) 스마트폰 보다는, 상대적으로 큰 모니터를 통해 천천히 살펴보면서 예약하는 것이라는 가설을 세울 수 있음
점의 크기로 age를 나타냈지만 너무 많아서 의미를 찾기 어려움, 추가적인 분석이 필요함(분석할 예정).
“최근에 회원을 가입한 사용자 중에 Web을 통해 가입한 고객을 대상으로 마케팅을 해야 한다.”
Report 1 - 가입기기별, 예약 국가별 예약자 수
table(train$signup_app)
##
## Android iOS Moweb Web
## 5406 18832 6167 177452
# conversion_rate_by_country_signup_app
task1 <- sqldf("
with base as(
select
a.*
from (
select
signup_app
, sum(case when country_destination = 'AU' then 1 else 0 end) as AU
, sum(case when country_destination = 'CA' then 1 else 0 end) as CA
, sum(case when country_destination = 'DE' then 1 else 0 end) as DE
, sum(case when country_destination = 'ES' then 1 else 0 end) as ES
, sum(case when country_destination = 'FR' then 1 else 0 end) as FR
, sum(case when country_destination = 'GB' then 1 else 0 end) as GB
, sum(case when country_destination = 'IT' then 1 else 0 end) as IT
, sum(case when country_destination = 'NDF' then 1 else 0 end) as NDF
, sum(case when country_destination = 'NL' then 1 else 0 end) as NL
, sum(case when country_destination = 'PT' then 1 else 0 end) as PT
, sum(case when country_destination = 'US' then 1 else 0 end) as US
, sum(case when country_destination = 'other' then 1 else 0 end) as other
, count(*) as total
from train
group by 1
) as a
)
select *
from base
union all
select
'Total'
, sum(AU)
, sum(CA)
, sum(DE)
, sum(ES)
, sum(FR)
, sum(GB)
, sum(IT)
, sum(NDF)
, sum(NL)
, sum(PT)
, sum(US)
, sum(other)
, sum(total)
from base;
")
# conversion_rate_by_country_signup_app
| signup_app | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Android | 3 | 15 | 9 | 25 | 24 | 16 | 21 | 4197 | 7 | 2 | 921 | 166 | 5406 |
| Moweb | 14 | 28 | 15 | 34 | 55 | 33 | 30 | 4078 | 14 | 3 | 1673 | 190 | 6167 |
| Web | 473 | 1282 | 957 | 2016 | 4548 | 2059 | 2524 | 100172 | 670 | 194 | 53831 | 8726 | 177452 |
| iOS | 31 | 58 | 40 | 104 | 218 | 115 | 140 | 13400 | 38 | 12 | 4023 | 653 | 18832 |
| Total | 521 | 1383 | 1021 | 2179 | 4845 | 2223 | 2715 | 121847 | 729 | 211 | 60448 | 9735 | 207857 |
Report 2 - 가입기기별, 예약 국가별 예약 비율
task2 <- sqldf("
with base as(
select
b.*
, AU + CA + DE + ES + FR + GB + IT + NDF + NL + PT + US + other as total
from (
select
signup_app
, round( 100. * AU / total, 2) as AU
, round( 100. * CA / total, 2) as CA
, round( 100. * DE / total, 2) as DE
, round( 100. * ES / total, 2) as ES
, round( 100. * FR / total, 2) as FR
, round( 100. * GB / total, 2) as GB
, round( 100. * IT / total, 2) as IT
, round( 100. * NDF / total, 2) as NDF
, round( 100. * NL / total, 2) as NL
, round( 100. * PT / total, 2) as PT
, round( 100. * US / total, 2) as US
, round( 100. * other / total, 2) as other
from (
select
signup_app
, sum(case when country_destination = 'AU' then 1 else 0 end) as AU
, sum(case when country_destination = 'CA' then 1 else 0 end) as CA
, sum(case when country_destination = 'DE' then 1 else 0 end) as DE
, sum(case when country_destination = 'ES' then 1 else 0 end) as ES
, sum(case when country_destination = 'FR' then 1 else 0 end) as FR
, sum(case when country_destination = 'GB' then 1 else 0 end) as GB
, sum(case when country_destination = 'IT' then 1 else 0 end) as IT
, sum(case when country_destination = 'NDF' then 1 else 0 end) as NDF
, sum(case when country_destination = 'NL' then 1 else 0 end) as NL
, sum(case when country_destination = 'PT' then 1 else 0 end) as PT
, sum(case when country_destination = 'US' then 1 else 0 end) as US
, sum(case when country_destination = 'other' then 1 else 0 end) as other
, count(*) as total
from train
group by 1
)
) as b
)
select
*
from
base
union all
select
'Total'
, round(avg(AU), 2)
, round(avg(CA), 2)
, round(avg(DE), 2)
, round(avg(ES), 2)
, round(avg(FR), 2)
, round(avg(GB), 2)
, round(avg(IT), 2)
, round(avg(NDF), 2)
, round(avg(NL), 2)
, round(avg(PT), 2)
, round(avg(US), 2)
, round(avg(other), 2)
, null
from
base
;")
| signup_app | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Android | 0.06 | 0.28 | 0.17 | 0.46 | 0.44 | 0.30 | 0.39 | 77.64 | 0.13 | 0.04 | 17.04 | 3.07 | 100.02 |
| Moweb | 0.23 | 0.45 | 0.24 | 0.55 | 0.89 | 0.54 | 0.49 | 66.13 | 0.23 | 0.05 | 27.13 | 3.08 | 100.01 |
| Web | 0.27 | 0.72 | 0.54 | 1.14 | 2.56 | 1.16 | 1.42 | 56.45 | 0.38 | 0.11 | 30.34 | 4.92 | 100.01 |
| iOS | 0.16 | 0.31 | 0.21 | 0.55 | 1.16 | 0.61 | 0.74 | 71.16 | 0.20 | 0.06 | 21.36 | 3.47 | 99.99 |
| Total | 0.18 | 0.44 | 0.29 | 0.68 | 1.26 | 0.65 | 0.76 | 67.84 | 0.23 | 0.07 | 23.97 | 3.64 | NA |
Report 3 for Visualization
- ggplot 시각화를 위한 dataframe 생성
task3 <- sqldf("
select
a.signup_app
, country_destination
, round(100. * total / sa_total, 2) as prob
from (
select
signup_app
, country_destination
, count(*) as total
from
train
group by 1, 2
) as a, (
select
signup_app
, count(*) as sa_total
from
train
group by 1
) b
where a.signup_app = b.signup_app;
")
Visualization
ggplot(data = task3, aes(x = signup_app, y = prob, fill = country_destination)) +
geom_bar(stat = "identity")
Report 4
- 국가별, 기기별 예약자 수
table(train$country_destination)
##
## AU CA DE ES FR GB IT NDF NL other
## 521 1383 1021 2179 4845 2223 2715 121847 729 9735
## PT US
## 211 60448
task4 <- sqldf("
with base as(
select
a.*
from (
select
country_destination
, sum(case when signup_app = 'Android' then 1 else 0 end) as Android
, sum(case when signup_app = 'Moweb' then 1 else 0 end) as Moweb
, sum(case when signup_app = 'Web' then 1 else 0 end) as Web
, sum(case when signup_app = 'iOS' then 1 else 0 end) as iOS
, count(*) as total
from train
group by 1
) as a
)
select *
from base
union all
select
'Total'
, sum(Android)
, sum(Moweb)
, sum(Web)
, sum(iOS)
, sum(total)
from base;
")
| country_destination | Android | Moweb | Web | iOS | total |
|---|---|---|---|---|---|
| AU | 3 | 14 | 473 | 31 | 521 |
| CA | 15 | 28 | 1282 | 58 | 1383 |
| DE | 9 | 15 | 957 | 40 | 1021 |
| ES | 25 | 34 | 2016 | 104 | 2179 |
| FR | 24 | 55 | 4548 | 218 | 4845 |
| GB | 16 | 33 | 2059 | 115 | 2223 |
| IT | 21 | 30 | 2524 | 140 | 2715 |
| NDF | 4197 | 4078 | 100172 | 13400 | 121847 |
| NL | 7 | 14 | 670 | 38 | 729 |
| PT | 2 | 3 | 194 | 12 | 211 |
| US | 921 | 1673 | 53831 | 4023 | 60448 |
| other | 166 | 190 | 8726 | 653 | 9735 |
| Total | 5406 | 6167 | 177452 | 18832 | 207857 |
Report 5
- 국가별, 기기별 예약 비율
task5 <- sqldf("
with base as(
select
b.*
, Android + Moweb + Web + iOS as total
from (
select
country_destination
, round( 100. * Android / total, 2) as Android
, round( 100. * Moweb / total, 2) as Moweb
, round( 100. * Web / total, 2) as Web
, round( 100. * iOS / total, 2) as iOS
from (
select
country_destination
, sum(case when signup_app = 'Android' then 1 else 0 end) as Android
, sum(case when signup_app = 'Moweb' then 1 else 0 end) as Moweb
, sum(case when signup_app = 'Web' then 1 else 0 end) as Web
, sum(case when signup_app = 'iOS' then 1 else 0 end) as iOS
, count(*) as total
from train
group by 1
)
) as b
)
select
*
from
base
union all
select
'Total'
, sum(Android)
, sum(Moweb)
, sum(Web)
, sum(iOS)
, null
from base;
")
| country_destination | Android | Moweb | Web | iOS | total |
|---|---|---|---|---|---|
| AU | 0.58 | 2.69 | 90.79 | 5.95 | 100.01 |
| CA | 1.08 | 2.02 | 92.70 | 4.19 | 99.99 |
| DE | 0.88 | 1.47 | 93.73 | 3.92 | 100.00 |
| ES | 1.15 | 1.56 | 92.52 | 4.77 | 100.00 |
| FR | 0.50 | 1.14 | 93.87 | 4.50 | 100.01 |
| GB | 0.72 | 1.48 | 92.62 | 5.17 | 99.99 |
| IT | 0.77 | 1.10 | 92.97 | 5.16 | 100.00 |
| NDF | 3.44 | 3.35 | 82.21 | 11.00 | 100.00 |
| NL | 0.96 | 1.92 | 91.91 | 5.21 | 100.00 |
| PT | 0.95 | 1.42 | 91.94 | 5.69 | 100.00 |
| US | 1.52 | 2.77 | 89.05 | 6.66 | 100.00 |
| other | 1.71 | 1.95 | 89.64 | 6.71 | 100.01 |
| Total | 14.26 | 22.87 | 1093.95 | 68.93 | NA |
Report 6 for visualization
- ggplot 시각화를 위한 국가별, 기기별 예약 비율
task6 <- sqldf("
select
a.country_destination
, a.signup_app
, round(100. * total / sa_total, 2) as prob
from (
select
country_destination
, signup_app
, count(*) as total
from
train
group by 1, 2
) as a, (
select
country_destination
, count(*) as sa_total
from
train
group by 1
) b
where a.country_destination = b.country_destination;
")
Visualization
ggplot(data = task6, aes(x = country_destination, y = prob, fill = signup_app)) +
geom_bar(stat = "identity")
Summary
Web 전환률은 평균 80~90%로 App을 모두 합친 것보다 훨씬 높다.
(미국의 경우 Wep이 89.2%이고, App이 10.7%인데, 다른 나라도 상황이 비슷하다)
Web 가입자의 전환률이 그들을 대상으로 대상으로 더 높으므로 마케팅을 진행하는 게 좋다.
예약일(분기, 요일)과 destination의 상관관계 분석 (월별, 나라별 방문자 수 지표)
Report
date_first_booking_quarter <- sqldf("
select
date_first_booking_quarter
, count(*) as freq
from
train
where
date_first_booking_quarter not in ('NA')
group by 1
order by 2 desc
")
date_first_booking_month <- sqldf("
select
date_first_booking_month
, count(*) as freq
from
train
where
date_first_booking_month not in ('NA')
group by 1
order by 2 desc
")
date_first_booking_weekdays <- sqldf("
select
date_first_booking_weekdays
, count(*) as freq
from
train
where
date_first_booking_weekdays not in ('NA')
group by 1
order by 2 desc
")
task7 <- sqldf("
with base as(
select
a.*
from (
select
date_first_booking_month
, sum(case when country_destination = 'AU' then 1 else 0 end) as AU
, sum(case when country_destination = 'CA' then 1 else 0 end) as CA
, sum(case when country_destination = 'DE' then 1 else 0 end) as DE
, sum(case when country_destination = 'ES' then 1 else 0 end) as ES
, sum(case when country_destination = 'FR' then 1 else 0 end) as FR
, sum(case when country_destination = 'GB' then 1 else 0 end) as GB
, sum(case when country_destination = 'IT' then 1 else 0 end) as IT
, sum(case when country_destination = 'NDF' then 1 else 0 end) as NDF
, sum(case when country_destination = 'NL' then 1 else 0 end) as NL
, sum(case when country_destination = 'PT' then 1 else 0 end) as PT
, sum(case when country_destination = 'US' then 1 else 0 end) as US
, sum(case when country_destination = 'other' then 1 else 0 end) as other
, count(*) as total
from
train
group by 1
) as a
)
select
*
from
base
union all
select
'Total'
, round((sum(AU) / 12), 2)
, round((sum(CA) / 12), 2)
, round((sum(DE) / 12), 2)
, round((sum(ES) / 12), 2)
, round((sum(FR) / 12), 2)
, round((sum(GB) / 12), 2)
, round((sum(IT) / 12), 2)
, null
, round((sum(NL) / 12), 2)
, round((sum(PT) / 12), 2)
, round((sum(US) / 12), 2)
, round((sum(other) / 12), 2)
, null
from base;
")
task8 <- sqldf("
with base as(
select
b.*
, AU + CA + DE + ES + FR + GB + IT + NDF + NL + PT + US + other as total
from (
select
date_first_booking_month
, round( 100. * AU / total, 2) as AU
, round( 100. * CA / total, 2) as CA
, round( 100. * DE / total, 2) as DE
, round( 100. * ES / total, 2) as ES
, round( 100. * FR / total, 2) as FR
, round( 100. * GB / total, 2) as GB
, round( 100. * IT / total, 2) as IT
, round( 100. * NDF / total, 2) as NDF
, round( 100. * NL / total, 2) as NL
, round( 100. * PT / total, 2) as PT
, round( 100. * US / total, 2) as US
, round( 100. * other / total, 2) as other
from (
select
date_first_booking_month
, sum(case when country_destination = 'AU' then 1 else 0 end) as AU
, sum(case when country_destination = 'CA' then 1 else 0 end) as CA
, sum(case when country_destination = 'DE' then 1 else 0 end) as DE
, sum(case when country_destination = 'ES' then 1 else 0 end) as ES
, sum(case when country_destination = 'FR' then 1 else 0 end) as FR
, sum(case when country_destination = 'GB' then 1 else 0 end) as GB
, sum(case when country_destination = 'IT' then 1 else 0 end) as IT
, sum(case when country_destination = 'NDF' then 1 else 0 end) as NDF
, sum(case when country_destination = 'NL' then 1 else 0 end) as NL
, sum(case when country_destination = 'PT' then 1 else 0 end) as PT
, sum(case when country_destination = 'US' then 1 else 0 end) as US
, sum(case when country_destination = 'other' then 1 else 0 end) as other
, count(*) as total
from train
group by 1
)
) as b
)
select
*
from
base
union all
select
'Avg_booking'
, round((sum(AU) / 12), 2)
, round((sum(CA) / 12), 2)
, round((sum(DE) / 12), 2)
, round((sum(ES) / 12), 2)
, round((sum(FR) / 12), 2)
, round((sum(GB) / 12), 2)
, round((sum(IT) / 12), 2)
, null
, round((sum(NL) / 12), 2)
, round((sum(PT) / 12), 2)
, round((sum(US) / 12), 2)
, round((sum(other) / 12), 2)
, null
from
base
;")
task9 <- sqldf("
select
a.date_first_booking_month
, a.country_destination
, round(100. * total / sa_total, 2) as prob
from (
select
date_first_booking_month
, country_destination
, count(*) as total
from
train
group by 1, 2
) as a, (
select
date_first_booking_month
, count(*) as sa_total
from
train
group by 1
) b
where a.date_first_booking_month = b.date_first_booking_month
;")
| date_first_booking_quarter | freq |
|---|---|
| 2 | 28885 |
| 1 | 20875 |
| 3 | 20209 |
| 4 | 16041 |
| date_first_booking_month | freq |
|---|---|
| 6 | 10217 |
| 5 | 10153 |
| 4 | 8515 |
| 3 | 8087 |
| 7 | 7027 |
| 8 | 6816 |
| 2 | 6539 |
| 9 | 6366 |
| 1 | 6249 |
| 10 | 5987 |
| 11 | 5107 |
| 12 | 4947 |
| date_first_booking_weekdays | freq |
|---|---|
| Wednesday | 13951 |
| Tuesday | 13832 |
| Thursday | 13491 |
| Friday | 12840 |
| Monday | 12341 |
| Saturday | 10043 |
| Sunday | 9512 |
| date_first_booking_month | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 121847 | 0 | 0 | 0 | 0 | 121847 |
| 1 | 57 | 53 | 57 | 112 | 320 | 140 | 157 | 0 | 29 | 9 | 4645 | 670 | 6249 |
| 2 | 46 | 72 | 66 | 163 | 376 | 168 | 194 | 0 | 49 | 18 | 4604 | 783 | 6539 |
| 3 | 47 | 102 | 79 | 206 | 488 | 217 | 280 | 0 | 64 | 22 | 5622 | 960 | 8087 |
| 4 | 27 | 156 | 99 | 256 | 544 | 229 | 341 | 0 | 67 | 23 | 5822 | 951 | 8515 |
| 5 | 53 | 208 | 144 | 326 | 705 | 312 | 418 | 0 | 99 | 27 | 6715 | 1146 | 10153 |
| 6 | 36 | 214 | 130 | 332 | 595 | 295 | 351 | 0 | 116 | 40 | 6859 | 1249 | 10217 |
| 7 | 31 | 136 | 94 | 229 | 348 | 182 | 247 | 0 | 68 | 14 | 4913 | 765 | 7027 |
| 8 | 30 | 139 | 102 | 181 | 399 | 151 | 210 | 0 | 81 | 15 | 4806 | 702 | 6816 |
| 9 | 41 | 101 | 110 | 144 | 332 | 169 | 221 | 0 | 49 | 12 | 4545 | 642 | 6366 |
| 10 | 45 | 67 | 54 | 114 | 291 | 141 | 126 | 0 | 50 | 13 | 4423 | 663 | 5987 |
| 11 | 60 | 73 | 45 | 60 | 238 | 128 | 103 | 0 | 29 | 9 | 3754 | 608 | 5107 |
| 12 | 48 | 62 | 41 | 56 | 209 | 91 | 67 | 0 | 28 | 9 | 3740 | 596 | 4947 |
| 0 | 43 | 115 | 85 | 181 | 403 | 185 | 226 | NA | 60 | 17 | 5037 | 811 | NA |
| date_first_booking_month | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NA | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 100 | 0.00 | 0.00 | 0.00 | 0.00 | 100.00 |
| 1 | 0.91 | 0.85 | 0.91 | 1.79 | 5.12 | 2.24 | 2.51 | 0 | 0.46 | 0.14 | 74.33 | 10.72 | 99.98 |
| 2 | 0.70 | 1.10 | 1.01 | 2.49 | 5.75 | 2.57 | 2.97 | 0 | 0.75 | 0.28 | 70.41 | 11.97 | 100.00 |
| 3 | 0.58 | 1.26 | 0.98 | 2.55 | 6.03 | 2.68 | 3.46 | 0 | 0.79 | 0.27 | 69.52 | 11.87 | 99.99 |
| 4 | 0.32 | 1.83 | 1.16 | 3.01 | 6.39 | 2.69 | 4.00 | 0 | 0.79 | 0.27 | 68.37 | 11.17 | 100.00 |
| 5 | 0.52 | 2.05 | 1.42 | 3.21 | 6.94 | 3.07 | 4.12 | 0 | 0.98 | 0.27 | 66.14 | 11.29 | 100.01 |
| 6 | 0.35 | 2.09 | 1.27 | 3.25 | 5.82 | 2.89 | 3.44 | 0 | 1.14 | 0.39 | 67.13 | 12.22 | 99.99 |
| 7 | 0.44 | 1.94 | 1.34 | 3.26 | 4.95 | 2.59 | 3.52 | 0 | 0.97 | 0.20 | 69.92 | 10.89 | 100.02 |
| 8 | 0.44 | 2.04 | 1.50 | 2.66 | 5.85 | 2.22 | 3.08 | 0 | 1.19 | 0.22 | 70.51 | 10.30 | 100.01 |
| 9 | 0.64 | 1.59 | 1.73 | 2.26 | 5.22 | 2.65 | 3.47 | 0 | 0.77 | 0.19 | 71.39 | 10.08 | 99.99 |
| 10 | 0.75 | 1.12 | 0.90 | 1.90 | 4.86 | 2.36 | 2.10 | 0 | 0.84 | 0.22 | 73.88 | 11.07 | 100.00 |
| 11 | 1.17 | 1.43 | 0.88 | 1.17 | 4.66 | 2.51 | 2.02 | 0 | 0.57 | 0.18 | 73.51 | 11.91 | 100.01 |
| 12 | 0.97 | 1.25 | 0.83 | 1.13 | 4.22 | 1.84 | 1.35 | 0 | 0.57 | 0.18 | 75.60 | 12.05 | 99.99 |
| 0 | 0.65 | 1.55 | 1.16 | 2.39 | 5.48 | 2.53 | 3.00 | NA | 0.82 | 0.23 | 70.89 | 11.30 | NA |
**Visualization
#View count of each country_destination by data_first_booking_month
fig.dac_mon <- ggplot(data = train, aes(x = country_destination, fill=country_destination))
fig.dac_mon <- fig.dac_mon + geom_bar(position="identity", alpha=1.0)
fig.dac_mon <- fig.dac_mon + scale_y_log10()
# fig.dac_mon + facet_wrap(~date_first_booking_month)
fig.dac_mon
#View count of each country_destination by data_first_booking_month
fig.dac_mon <- ggplot(train, aes(date_first_booking_month, fill=country_destination))
fig.dac_mon <- fig.dac_mon + geom_bar(position="identity", alpha=1.0)
fig.dac_mon <- fig.dac_mon + scale_y_log10()
fig.dac_mon + facet_wrap(~country_destination)
ggplot(data = task9, aes(x = date_first_booking_month, y = prob, fill = country_destination)) +
geom_bar(stat = "identity")
Summary
- alpah를 1.0으로 설정하여 색을 진하게, log를 사용하여 한 눈에 보기 쉽게 scale 수정 - 나라마다 조금의 차이가 있지만 대체적으로 5~6월에 예약을 많이 한다. 그 이후로는 예약 수가 줄어든다. - 다른 나라들은 거의 유사하지만 호주(AU)의 경우 조금 다른 양상을 보인다. - 추측 : 다른 나라들과 계절이 반대이기 때문에 반대의 양상이 보이지 않을까 추측해볼 수 있음.
Report 1
- 연령대에 따른 예약자 수
task10 <- sqldf("
with base as(
select
a.*
from (
select
ages
, sum(case when country_destination = 'AU' then 1 else 0 end) as AU
, sum(case when country_destination = 'CA' then 1 else 0 end) as CA
, sum(case when country_destination = 'DE' then 1 else 0 end) as DE
, sum(case when country_destination = 'ES' then 1 else 0 end) as ES
, sum(case when country_destination = 'FR' then 1 else 0 end) as FR
, sum(case when country_destination = 'GB' then 1 else 0 end) as GB
, sum(case when country_destination = 'IT' then 1 else 0 end) as IT
, sum(case when country_destination = 'NDF' then 1 else 0 end) as NDF
, sum(case when country_destination = 'NL' then 1 else 0 end) as NL
, sum(case when country_destination = 'PT' then 1 else 0 end) as PT
, sum(case when country_destination = 'US' then 1 else 0 end) as US
, sum(case when country_destination = 'other' then 1 else 0 end) as other
, count(*) as total
from
train
group by 1
) as a
)
select
*
from
base
where ages > 0 and ages < 70
union all
select
'Avg'
, round((sum(AU) / 12), 2)
, round((sum(CA) / 12), 2)
, round((sum(DE) / 12), 2)
, round((sum(ES) / 12), 2)
, round((sum(FR) / 12), 2)
, round((sum(GB) / 12), 2)
, round((sum(IT) / 12), 2)
, null
, round((sum(NL) / 12), 2)
, round((sum(PT) / 12), 2)
, round((sum(US) / 12), 2)
, round((sum(other) / 12), 2)
, null
from base
union all
select
'Total'
, sum(AU)
, sum(CA)
, sum(DE)
, sum(ES)
, sum(FR)
, sum(GB)
, sum(IT)
, null
, sum(NL)
, sum(PT)
, sum(US)
, sum(other)
, null
from base;
")
| ages | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10 | 7 | 32 | 18 | 30 | 74 | 31 | 41 | 924 | 11 | 4 | 584 | 116 | 1872 |
| 20 | 106 | 245 | 243 | 569 | 940 | 460 | 566 | 16347 | 191 | 45 | 14262 | 2075 | 36049 |
| 30 | 181 | 440 | 310 | 669 | 1425 | 643 | 763 | 20059 | 225 | 65 | 19669 | 3121 | 47570 |
| 40 | 70 | 182 | 138 | 209 | 606 | 274 | 291 | 9894 | 83 | 18 | 7313 | 1132 | 20210 |
| 50 | 44 | 101 | 70 | 114 | 373 | 189 | 190 | 5253 | 39 | 16 | 3561 | 571 | 10521 |
| 60 | 10 | 32 | 32 | 45 | 117 | 76 | 65 | 1756 | 20 | 4 | 1286 | 202 | 3645 |
| 0 | 43 | 115 | 85 | 181 | 403 | 185 | 226 | NA | 60 | 17 | 5037 | 811 | NA |
| 0 | 521 | 1383 | 1021 | 2179 | 4845 | 2223 | 2715 | NA | 729 | 211 | 60448 | 9735 | NA |
Report 2
- 연령대에 따른 예약 비율
task11 <- sqldf("
with base as (
select
b.*
, AU + CA + DE + ES + FR + GB + IT + NDF + NL + PT + US + other as total
from (
select
ages
, round( 100. * AU / total, 2) as AU
, round( 100. * CA / total, 2) as CA
, round( 100. * DE / total, 2) as DE
, round( 100. * ES / total, 2) as ES
, round( 100. * FR / total, 2) as FR
, round( 100. * GB / total, 2) as GB
, round( 100. * IT / total, 2) as IT
, round( 100. * NDF / total, 2) as NDF
, round( 100. * NL / total, 2) as NL
, round( 100. * PT / total, 2) as PT
, round( 100. * US / total, 2) as US
, round( 100. * other / total, 2) as other
from (
select
ages
, sum(case when country_destination = 'AU' then 1 else 0 end) as AU
, sum(case when country_destination = 'CA' then 1 else 0 end) as CA
, sum(case when country_destination = 'DE' then 1 else 0 end) as DE
, sum(case when country_destination = 'ES' then 1 else 0 end) as ES
, sum(case when country_destination = 'FR' then 1 else 0 end) as FR
, sum(case when country_destination = 'GB' then 1 else 0 end) as GB
, sum(case when country_destination = 'IT' then 1 else 0 end) as IT
, sum(case when country_destination = 'NDF' then 1 else 0 end) as NDF
, sum(case when country_destination = 'NL' then 1 else 0 end) as NL
, sum(case when country_destination = 'PT' then 1 else 0 end) as PT
, sum(case when country_destination = 'US' then 1 else 0 end) as US
, sum(case when country_destination = 'other' then 1 else 0 end) as other
, count(*) as total
from (
select *
from train
where ages > 0 and ages < 70
)
group by 1
)
) as b
)
select
*
from
base
union all
select
'Avg_ages'
, round(avg(AU), 2)
, round(avg(CA), 2)
, round(avg(DE), 2)
, round(avg(ES), 2)
, round(avg(FR), 2)
, round(avg(GB), 2)
, round(avg(IT), 2)
, null
, round(avg(NL), 2)
, round(avg(PT), 2)
, round(avg(US), 2)
, round(avg(other), 2)
, null
from
base
;")
| ages | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10 | 0.37 | 1.71 | 0.96 | 1.60 | 3.95 | 1.66 | 2.19 | 49.36 | 0.59 | 0.21 | 31.20 | 6.20 | 100.00 |
| 20 | 0.29 | 0.68 | 0.67 | 1.58 | 2.61 | 1.28 | 1.57 | 45.35 | 0.53 | 0.12 | 39.56 | 5.76 | 100.00 |
| 30 | 0.38 | 0.92 | 0.65 | 1.41 | 3.00 | 1.35 | 1.60 | 42.17 | 0.47 | 0.14 | 41.35 | 6.56 | 100.00 |
| 40 | 0.35 | 0.90 | 0.68 | 1.03 | 3.00 | 1.36 | 1.44 | 48.96 | 0.41 | 0.09 | 36.19 | 5.60 | 100.01 |
| 50 | 0.42 | 0.96 | 0.67 | 1.08 | 3.55 | 1.80 | 1.81 | 49.93 | 0.37 | 0.15 | 33.85 | 5.43 | 100.02 |
| 60 | 0.27 | 0.88 | 0.88 | 1.23 | 3.21 | 2.09 | 1.78 | 48.18 | 0.55 | 0.11 | 35.28 | 5.54 | 100.00 |
| 0 | 0.35 | 1.01 | 0.75 | 1.32 | 3.22 | 1.59 | 1.73 | NA | 0.49 | 0.14 | 36.24 | 5.85 | NA |
Report 3 for visualization
- ggplot 시각화를 위한 dataframe
task12 <- sqldf("
select
a.ages
, a.country_destination
, round(100. * ac_total / ages_total, 2) as prob
from (
select
ages
, country_destination
, count(*) as ac_total
from
train
group by 1, 2
) as a, (
select
ages
, count(*) as ages_total
from
train
group by 1
) as b
where a.ages = b.ages
")
Visualization 1
not.na <- !is.na(train$ages)
ggplot(data = task12[not.na, ], aes(x = ages, y = prob, fill = country_destination)) +
geom_bar(stat = "identity", alpha = 1) +
xlim(14, 70)
# View count of age
not.na <- !is.na(train$age)
fig.age <- ggplot(train[not.na,], aes(age, fill=country_destination))
fig.age <- fig.age + geom_bar(position="identity", alpha=0.5)
fig.age <- fig.age + xlim(14, 70)
fig.age <- fig.age + scale_y_log10()
fig.age + facet_wrap(~country_destination)
# View count of ages
not.na <- !is.na(train$ages)
fig.ages <- ggplot(train[not.na,], aes(ages, fill=country_destination))
fig.ages <- fig.ages + geom_bar(position="identity", alpha=0.5)
fig.ages <- fig.ages + xlim(14, 70)
fig.ages <- fig.ages + scale_y_log10()
fig.ages + facet_wrap(~country_destination)
Summary 1
- 대부분의 나라가 전체 인구 분포와 유사함 - Age와 Destination의 관계를 Boxplot을 통해 표현 - Boxplot을 이용하면 전체 데이터 분포를 더 잘 파악할 수 있고, 어디에 데이터가 분포되어 있는지 Outlier의 존재 유무 등을 파악할 수 있음
Visualization 2
# Boxplot for age (between 10 and 70)
fig.age.bx <- ggplot(train, aes(x=country_destination, y=age))
fig.age.bx <- fig.age.bx + geom_boxplot(aes(colour=country_destination))
fig.age.bx <- fig.age.bx + ylim(10, 70)
fig.age.bx
# boxplot for ages
fig.ages.bx <- ggplot(train, aes(x=country_destination, y=ages))
fig.ages.bx <- fig.ages.bx + geom_boxplot(aes(colour=country_destination))
fig.ages.bx <- fig.ages.bx + ylim(10, 70)
fig.ages.bx
Summary 2
- 모든 나라의 20~30대(중반)에 많이 집중되어 있음을 알 수 있음. - 영국(GB)가 연령대가 조금 높은 편이고, 스페인(ES)가 조금 낮은 편. - 중위값은 모든 나라에서 박스의 아래쪽에 있는 것으로 보아 젊은 연령대가 Airbnb를 더 많이 사용하고 있음을 알 수 있음.
Report - 국가별, 성별의 구매 전환 수 (Conversion rate by country destination & gender)
task_a <- sqldf("
with base as(
select a.*
from (
select
country_destination
, sum(case when gender = '-unknown-' then 1
when gender = 'OTHER' then 1 else 0 end) as unknown
, sum(case when gender = 'FEMALE' then 1 else 0 end) as female
, sum(case when gender = 'MALE' then 1 else 0 end) as male
, count(*) as total
from
train
group by 1) as a
)
select *
from base
union all
select
'Total'
, sum(unknown)
, sum(female)
, sum(male)
, sum(total)
from base
union all
select
'Avg'
, avg(unknown)
, avg(female)
, avg(male)
, avg(total)
from base
")
task_b <- sqldf("
with base as(
select
a.*
, male_ratio + female_ratio + unknown_ratio as tot_ratio
from(
select
country_destination
, round(100. * male / total, 2) as male_ratio
, round(100. * female / total, 2) as female_ratio
, round(100. * unknown / total, 2) as unknown_ratio
from (
select
country_destination
, sum(case when gender = '-unknown-' then 1
when gender = 'OTHER' then 1 else 0 end) as unknown
, sum(case when gender = 'FEMALE' then 1 else 0 end) as female
, sum(case when gender = 'MALE' then 1 else 0 end) as male
, count(*) as total
from
train
group by 1
)
) as a
)
select *
from base
union all
select
'Total_avg'
, round(avg(male_ratio), 2)
, round(avg(female_ratio), 2)
, round(avg(unknown_ratio), 2)
, round(avg(tot_ratio), 2)
from base
")
task_c <- sqldf("
select
a.gender
, a.country_destination
, round(100. * freq / freq_total, 2) as prob
from (
select
gender
, country_destination
, count(*) as freq
from train
group by 1, 2
) as a, (
select
gender
, count(*) freq_total
from train
) as b
")
| country_destination | unknown | female | male | total |
|---|---|---|---|---|
| AU | 140.000 | 200.0 | 181.000 | 521.00 |
| CA | 478.000 | 442.0 | 463.000 | 1383.00 |
| DE | 281.000 | 344.0 | 396.000 | 1021.00 |
| ES | 701.000 | 830.0 | 648.000 | 2179.00 |
| FR | 1678.000 | 1893.0 | 1274.000 | 4845.00 |
| GB | 739.000 | 838.0 | 646.000 | 2223.00 |
| IT | 1016.000 | 1042.0 | 657.000 | 2715.00 |
| NDF | 66394.000 | 29769.0 | 25684.000 | 121847.00 |
| NL | 225.000 | 241.0 | 263.000 | 729.00 |
| PT | 69.000 | 75.0 | 67.000 | 211.00 |
| US | 19668.000 | 21915.0 | 18865.000 | 60448.00 |
| other | 3394.000 | 3041.0 | 3300.000 | 9735.00 |
| Total | 94783.000 | 60630.0 | 52444.000 | 207857.00 |
| Avg | 7898.583 | 5052.5 | 4370.333 | 17321.42 |
| country_destination | male_ratio | female_ratio | unknown_ratio | tot_ratio |
|---|---|---|---|---|
| AU | 34.74 | 38.39 | 26.87 | 100 |
| CA | 33.48 | 31.96 | 34.56 | 100 |
| DE | 38.79 | 33.69 | 27.52 | 100 |
| ES | 29.74 | 38.09 | 32.17 | 100 |
| FR | 26.30 | 39.07 | 34.63 | 100 |
| GB | 29.06 | 37.70 | 33.24 | 100 |
| IT | 24.20 | 38.38 | 37.42 | 100 |
| NDF | 21.08 | 24.43 | 54.49 | 100 |
| NL | 36.08 | 33.06 | 30.86 | 100 |
| PT | 31.75 | 35.55 | 32.70 | 100 |
| US | 31.21 | 36.25 | 32.54 | 100 |
| other | 33.90 | 31.24 | 34.86 | 100 |
| Total_avg | 30.86 | 34.82 | 34.32 | 100 |
Visualization
- 국가별, 성별 비중 시각화
ggplot(data = task_b, aes(x = country_destination, y = task_b$male_ratio, fill = country_destination)) +
geom_bar(stat = 'identity') +
ggtitle("male ratio") +
coord_flip()
ggplot(data = task_b, aes(x = country_destination, y = task_b$female_ratio, fill = country_destination)) +
geom_bar(stat = 'identity') +
ggtitle("female ratio") +
coord_flip()
ggplot(data = task_b, aes(x = country_destination, y = task_b$unknown_ratio, fill = country_destination)) +
geom_bar(stat = 'identity') +
ggtitle("unknown ratio") +
coord_flip()
ggplot(data = task_c, aes(x = gender, y = prob, fill = country_destination)) +
geom_bar(stat = "identity") +
ggtitle("country destination ratio by gender") +
coord_flip()
Report
- 국가별, 성별 전환율 차이 비교 리포트 1) 국가별로 어떠한 성별이 얼만큼의 차이로 전환율이 우세한지 확인하는 지표 만들기 2) 성별로 우세한 국가 뽑기
task_d <- sqldf("
select
country_destination
, male_ratio
, female_ratio
, unknown_ratio
, tot_ratio
, case when male_ratio > female_ratio then 'M'
when female_ratio > male_ratio then 'F' end as male_vs_female
, abs(male_ratio - female_ratio) as male_female_diff
from
task_b
")
| country_destination | male_ratio | female_ratio | unknown_ratio | tot_ratio | male_vs_female | male_female_diff |
|---|---|---|---|---|---|---|
| AU | 34.74 | 38.39 | 26.87 | 100 | F | 3.65 |
| CA | 33.48 | 31.96 | 34.56 | 100 | M | 1.52 |
| DE | 38.79 | 33.69 | 27.52 | 100 | M | 5.10 |
| ES | 29.74 | 38.09 | 32.17 | 100 | F | 8.35 |
| FR | 26.30 | 39.07 | 34.63 | 100 | F | 12.77 |
| GB | 29.06 | 37.70 | 33.24 | 100 | F | 8.64 |
| IT | 24.20 | 38.38 | 37.42 | 100 | F | 14.18 |
| NDF | 21.08 | 24.43 | 54.49 | 100 | F | 3.35 |
| NL | 36.08 | 33.06 | 30.86 | 100 | M | 3.02 |
| PT | 31.75 | 35.55 | 32.70 | 100 | F | 3.80 |
| US | 31.21 | 36.25 | 32.54 | 100 | F | 5.04 |
| other | 33.90 | 31.24 | 34.86 | 100 | M | 2.66 |
| Total_avg | 30.86 | 34.82 | 34.32 | 100 | F | 3.96 |
summary(task_d)
## country_destination male_ratio female_ratio unknown_ratio
## Length:13 Min. :21.08 Min. :24.43 Min. :26.87
## Class :character 1st Qu.:29.06 1st Qu.:33.06 1st Qu.:32.17
## Mode :character Median :31.21 Median :35.55 Median :33.24
## Mean :30.86 Mean :34.82 Mean :34.32
## 3rd Qu.:33.90 3rd Qu.:38.09 3rd Qu.:34.63
## Max. :38.79 Max. :39.07 Max. :54.49
## tot_ratio male_vs_female male_female_diff
## Min. :100 Length:13 Min. : 1.520
## 1st Qu.:100 Class :character 1st Qu.: 3.350
## Median :100 Mode :character Median : 3.960
## Mean :100 Mean : 5.849
## 3rd Qu.:100 3rd Qu.: 8.350
## Max. :100 Max. :14.180
table(task_d$male_vs_female)
##
## F M
## 9 4
Summary
- 평균적으로 여성의 전환율이 남성보다 3.87 %의 차이로 우세하다. - 남성의 전환율이 우세한 국가는 4개국으로 [‘CA’ ‘DE’ ‘NL’ ‘other’] 이 있다. - 여성의 전환율이 우세한 국가는 8개국으로 [‘AU’ ‘ES’ ‘FR’ ‘GB’ ‘IT’ ‘PT’ ‘US’ ‘total_avg_gender’] 이 있다.
Outliers treatment
train$date_diff <- train$date_first_booking - train$timestamp_first_active
train$date_diff <- as.numeric(train$date_diff)
train %>%
select(timestamp_first_active, date_first_booking, date_diff) %>%
head()
## timestamp_first_active date_first_booking date_diff
## 1 2009-03-19 <NA> NA
## 2 2009-05-23 <NA> NA
## 3 2009-06-09 2010-08-02 419
## 4 2009-10-31 2012-09-08 1043
## 5 2009-12-08 2010-02-18 72
## 6 2010-01-01 2010-01-02 1
summary(train$date_diff)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 1.00 3.00 44.89 30.00 1369.00 121847
hist(train$date_diff)
b_diff <- boxplot(train$date_diff)
# 이상치로 분류된 값을 보면 365일 이상의 관측치의 수는 평균적으로 1~2개 이므로 그 이상은 제거(365일의 관측치는 50개)
#a <- train[train$date_diff > b_diff$stats[5],]
#str(a)
#table(a$date_diff)
under_diff_365 <- train %>%
filter(date_diff < 365)
hist(under_diff_365$date_diff)
task_e <- sqldf("
select
a.dd as periods
, count(*) freq
from (
select
date_diff
, case when date_diff < 1 then 0
when date_diff >= 1 and date_diff < 2 then '1 days'
when date_diff >= 2 and date_diff < 3 then '2 days'
when date_diff >= 3 and date_diff < 4 then '3 days'
when date_diff >= 4 and date_diff < 5 then '4 days'
when date_diff >= 5 and date_diff < 6 then '5 days'
when date_diff >= 6 and date_diff < 7 then '6 days'
when date_diff >= 7 and date_diff < 8 then '7 days'
when date_diff >= 8 and date_diff <= 14 then '2 weeks'
when date_diff >= 15 and date_diff <= 21 then '3 weeks'
when date_diff >= 22 and date_diff <= 28 then '4 weeks'
when date_diff >= 31 and date_diff <= 60 then '2 months'
when date_diff >= 61 and date_diff <= 90 then '3 months'
when date_diff >= 91 and date_diff <= 120 then '4 months'
when date_diff >= 121 and date_diff <= 150 then '5 months'
when date_diff >= 151 and date_diff <= 180 then '6 months'
when date_diff >= 181 and date_diff <= 210 then '7 months'
when date_diff >= 211 and date_diff <= 240 then '8 months'
when date_diff >= 241 and date_diff <= 270 then '9 months'
when date_diff >= 271 and date_diff <= 300 then '10 months'
when date_diff >= 301 and date_diff <= 330 then '11 months'
when date_diff >= 331 and date_diff <= 365 then '12 months'
when date_diff is NULL then 'NA'
else 'others' end as dd
from
train
) as a
group by 1
order by freq desc
")
Report
task_ee <- sqldf("select *
from task_e
where periods not in ('NA')")
task_eee <- sqldf("
select
a.periods
, round(100. * p_tot / b.tot, 2) as prob
from(
select
periods
, freq as p_tot
from task_ee
) as a, (
select
sum(freq) as tot
from task_ee
) as b
")
| periods | freq |
|---|---|
| 0 | 20710 |
| 1 days | 14248 |
| 2 days | 6271 |
| 2 weeks | 5823 |
| 2 months | 4248 |
| 3 days | 3863 |
| 3 weeks | 2974 |
| 4 days | 2836 |
| 3 months | 2615 |
| 5 days | 2174 |
| 4 months | 2002 |
| 4 weeks | 1917 |
| 12 months | 1794 |
| 5 months | 1769 |
| 6 days | 1726 |
| 7 days | 1616 |
| 6 months | 1576 |
| 7 months | 1514 |
| 11 months | 1504 |
| 8 months | 1480 |
| 9 months | 1418 |
| 10 months | 1389 |
| others | 543 |
| periods | prob |
|---|---|
| 0 | 24.08 |
| 1 days | 16.57 |
| 2 days | 7.29 |
| 2 weeks | 6.77 |
| 2 months | 4.94 |
| 3 days | 4.49 |
| 3 weeks | 3.46 |
| 4 days | 3.30 |
| 3 months | 3.04 |
| 5 days | 2.53 |
| 4 months | 2.33 |
| 4 weeks | 2.23 |
| 12 months | 2.09 |
| 5 months | 2.06 |
| 6 days | 2.01 |
| 7 days | 1.88 |
| 6 months | 1.83 |
| 7 months | 1.76 |
| 11 months | 1.75 |
| 8 months | 1.72 |
| 9 months | 1.65 |
| 10 months | 1.61 |
| others | 0.63 |
Visualization
ggplot(data = task_ee, aes(x = reorder(periods, freq), y = freq)) +
geom_bar(stat = "identity", fill = "#FF6666") +
ggtitle("") +
coord_flip()
Summary
- 23개의 계급값으로 구분(당일~others) - 총 예약 건수는 88,908건 - 대부분의 예약은 2주 내에 이루어졌음 (54.81%)
weekly_created <- group_by(training, date = week_created) %>%
summarise(created_count = n())
weekly_stats <- merge(weekly_booked, weekly_created, by="date")
weekly_stats <- melt(weekly_stats, id.vars = "date")
weekly_plot <- ggplot(weekly_stats, aes(date, value, col = variable)) +
geom_line()
weekly_plot
사람들은 크리스마스(연휴)가 있는 주에 크리스마스 휴가를 예약하지 않는다. 보통 사람들은 몇 달 혹은 몇 주전에 예약을 할 것이다.
그리고 사람들은 주말보다 평일(수요일)에 가장 이용을 많이 한다. 아마 사무실에서 업무에 치이면서 무료하고 반복된 패턴을 벗어나고 싶은 욕구가 클 때 여행을 이용하는 것 같다. 상대적으로 여유로운 주말이나 연휴에는 이미 휴식을 취하거나 여가를 보내고 있기 때문에 이용을 덜 하는 것 같다.
Summary
1) 서비스의 주요 고객의 연령대는 전체 고객의 71.2%를 차지하는 20(32.3%)~30(38.8%)이다. 2) 대부분의 나라가 전체 인구 분포와 유사하다. - 중위 값은 모든 나라에서 박스의 아래쪽에 있는 것으로 보아 젊은 연령대가 airbnb를 더 많이 이용하고 있음을 알 수 있음 - 영국(GB)의 경우 연령대가 조금 높은 편이고, 스페인(ES)가 조금 낮은 편
Report
| ages | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10 | 7 | 32 | 18 | 30 | 74 | 31 | 41 | 924 | 11 | 4 | 584 | 116 | 1872 |
| 20 | 106 | 245 | 243 | 569 | 940 | 460 | 566 | 16347 | 191 | 45 | 14262 | 2075 | 36049 |
| 30 | 181 | 440 | 310 | 669 | 1425 | 643 | 763 | 20059 | 225 | 65 | 19669 | 3121 | 47570 |
| 40 | 70 | 182 | 138 | 209 | 606 | 274 | 291 | 9894 | 83 | 18 | 7313 | 1132 | 20210 |
| 50 | 44 | 101 | 70 | 114 | 373 | 189 | 190 | 5253 | 39 | 16 | 3561 | 571 | 10521 |
| 60 | 10 | 32 | 32 | 45 | 117 | 76 | 65 | 1756 | 20 | 4 | 1286 | 202 | 3645 |
| 0 | 43 | 115 | 85 | 181 | 403 | 185 | 226 | NA | 60 | 17 | 5037 | 811 | NA |
| 0 | 521 | 1383 | 1021 | 2179 | 4845 | 2223 | 2715 | NA | 729 | 211 | 60448 | 9735 | NA |
| ages | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10 | 0.37 | 1.71 | 0.96 | 1.60 | 3.95 | 1.66 | 2.19 | 49.36 | 0.59 | 0.21 | 31.20 | 6.20 | 100.00 |
| 20 | 0.29 | 0.68 | 0.67 | 1.58 | 2.61 | 1.28 | 1.57 | 45.35 | 0.53 | 0.12 | 39.56 | 5.76 | 100.00 |
| 30 | 0.38 | 0.92 | 0.65 | 1.41 | 3.00 | 1.35 | 1.60 | 42.17 | 0.47 | 0.14 | 41.35 | 6.56 | 100.00 |
| 40 | 0.35 | 0.90 | 0.68 | 1.03 | 3.00 | 1.36 | 1.44 | 48.96 | 0.41 | 0.09 | 36.19 | 5.60 | 100.01 |
| 50 | 0.42 | 0.96 | 0.67 | 1.08 | 3.55 | 1.80 | 1.81 | 49.93 | 0.37 | 0.15 | 33.85 | 5.43 | 100.02 |
| 60 | 0.27 | 0.88 | 0.88 | 1.23 | 3.21 | 2.09 | 1.78 | 48.18 | 0.55 | 0.11 | 35.28 | 5.54 | 100.00 |
| 0 | 0.35 | 1.01 | 0.75 | 1.32 | 3.22 | 1.59 | 1.73 | NA | 0.49 | 0.14 | 36.24 | 5.85 | NA |
Visualization
# View age chart
ggplot(users_r, aes(x=age)) +
geom_histogram(aes(y=..density..),
binwidth=.5,
colour="black", fill="white") +
geom_density(alpha=.2, fill="pink") + # Overlay with transparent density plot
ggtitle("Users age distribution")
# View ages chart
ggplot(data = train, aes(x = ages)) +
xlim(10, 100) +
geom_bar(fill = "#FF6666") +
ggtitle("Users age distribution")
# View count of age
not.na <- !is.na(train$age)
fig.age <- ggplot(train[not.na,], aes(age, fill=country_destination))
fig.age <- fig.age + geom_bar(position="identity", alpha=0.5)
fig.age <- fig.age + xlim(14, 70)
fig.age <- fig.age + scale_y_log10()
fig.age + facet_wrap(~country_destination)
# View count of ages
not.na <- !is.na(train$ages)
fig.ages <- ggplot(train[not.na,], aes(ages, fill=country_destination))
fig.ages <- fig.ages + geom_bar(position="identity", alpha=0.5)
fig.ages <- fig.ages + xlim(14, 70)
fig.ages <- fig.ages + scale_y_log10()
fig.ages + facet_wrap(~country_destination)
# Boxplot for age (between 10 and 70)
fig.age.bx <- ggplot(train, aes(x=country_destination, y=age))
fig.age.bx <- fig.age.bx + geom_boxplot(aes(colour=country_destination))
fig.age.bx <- fig.age.bx + ylim(10, 70)
fig.age.bx
Summary
1) 여성 이용자(27.69%) 남성 이용자(24.43%) 보다 조금 더 많은데 실제 매출에 기여하는 성별은 누구인지는 파악되지 않으므로 전환율(Conversion Rate) 분석을 진행하였음 2) 평균적으로 여성의 전환율이 남성보다 3.87 %의 차이로 우세함을 알 수 있었다. 3) 남성의 전환율이 우세한 국가는 4개국으로 [‘CA’ ‘DE’ ‘NL’ ‘other’]가 있다. 4) 여성의 전환율이 우세한 국가는 8개국으로 [‘AU’ ‘ES’ ‘FR’ ‘GB’ ‘IT’ ‘PT’ ‘US’]가 있다.
Report
| gender | prob |
|---|---|
| -unknown- | 47.75 |
| FEMALE | 27.69 |
| MALE | 24.43 |
| OTHER | 0.12 |
| country_destination | unknown | female | male | total |
|---|---|---|---|---|
| AU | 140.000 | 200.0 | 181.000 | 521.00 |
| CA | 478.000 | 442.0 | 463.000 | 1383.00 |
| DE | 281.000 | 344.0 | 396.000 | 1021.00 |
| ES | 701.000 | 830.0 | 648.000 | 2179.00 |
| FR | 1678.000 | 1893.0 | 1274.000 | 4845.00 |
| GB | 739.000 | 838.0 | 646.000 | 2223.00 |
| IT | 1016.000 | 1042.0 | 657.000 | 2715.00 |
| NDF | 66394.000 | 29769.0 | 25684.000 | 121847.00 |
| NL | 225.000 | 241.0 | 263.000 | 729.00 |
| PT | 69.000 | 75.0 | 67.000 | 211.00 |
| US | 19668.000 | 21915.0 | 18865.000 | 60448.00 |
| other | 3394.000 | 3041.0 | 3300.000 | 9735.00 |
| Total | 94783.000 | 60630.0 | 52444.000 | 207857.00 |
| Avg | 7898.583 | 5052.5 | 4370.333 | 17321.42 |
| country_destination | male_ratio | female_ratio | unknown_ratio | tot_ratio |
|---|---|---|---|---|
| AU | 34.74 | 38.39 | 26.87 | 100 |
| CA | 33.48 | 31.96 | 34.56 | 100 |
| DE | 38.79 | 33.69 | 27.52 | 100 |
| ES | 29.74 | 38.09 | 32.17 | 100 |
| FR | 26.30 | 39.07 | 34.63 | 100 |
| GB | 29.06 | 37.70 | 33.24 | 100 |
| IT | 24.20 | 38.38 | 37.42 | 100 |
| NDF | 21.08 | 24.43 | 54.49 | 100 |
| NL | 36.08 | 33.06 | 30.86 | 100 |
| PT | 31.75 | 35.55 | 32.70 | 100 |
| US | 31.21 | 36.25 | 32.54 | 100 |
| other | 33.90 | 31.24 | 34.86 | 100 |
| Total_avg | 30.86 | 34.82 | 34.32 | 100 |
Visualization
Desc(train$gender)
## -------------------------------------------------------------------------
## train$gender (character)
##
## length n NAs unique levels dupes
## 207'857 207'857 0 4 4 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 -unknown- 94'507 45.5% 94'507 45.5%
## 2 FEMALE 60'630 29.2% 155'137 74.6%
## 3 MALE 52'444 25.2% 207'581 99.9%
## 4 OTHER 276 0.1% 207'857 100.0%
ggplot(data = task_gender_ratio, aes(x = reorder(prob, gender), y = prob, fill = gender)) +
geom_bar(stat = "identity") +
coord_flip()
ggplot(data = task_b, aes(x = reorder(country_destination, male_ratio), y = task_b$male_ratio, fill = country_destination)) +
geom_bar(stat = 'identity') +
ggtitle("male ratio") +
coord_flip()
ggplot(data = task_b, aes(x = reorder(country_destination, female_ratio), y = task_b$female_ratio, fill = country_destination)) +
geom_bar(stat = 'identity') +
ggtitle("female ratio") +
coord_flip()
ggplot(data = task_b, aes(x = reorder(country_destination, unknown_ratio), y = task_b$unknown_ratio, fill = country_destination)) +
geom_bar(stat = 'identity') +
ggtitle("unknown ratio") +
coord_flip()
ggplot(data = task_c, aes(x = reorder(gender, prob), y = prob, fill = country_destination)) +
geom_bar(stat = "identity") +
ggtitle("country destination ratio by gender") +
coord_flip()
| country_destination | male_ratio | female_ratio | unknown_ratio | tot_ratio | male_vs_female | male_female_diff |
|---|---|---|---|---|---|---|
| AU | 34.74 | 38.39 | 26.87 | 100 | F | 3.65 |
| CA | 33.48 | 31.96 | 34.56 | 100 | M | 1.52 |
| DE | 38.79 | 33.69 | 27.52 | 100 | M | 5.10 |
| ES | 29.74 | 38.09 | 32.17 | 100 | F | 8.35 |
| FR | 26.30 | 39.07 | 34.63 | 100 | F | 12.77 |
| GB | 29.06 | 37.70 | 33.24 | 100 | F | 8.64 |
| IT | 24.20 | 38.38 | 37.42 | 100 | F | 14.18 |
| NDF | 21.08 | 24.43 | 54.49 | 100 | F | 3.35 |
| NL | 36.08 | 33.06 | 30.86 | 100 | M | 3.02 |
| PT | 31.75 | 35.55 | 32.70 | 100 | F | 3.80 |
| US | 31.21 | 36.25 | 32.54 | 100 | F | 5.04 |
| other | 33.90 | 31.24 | 34.86 | 100 | M | 2.66 |
| Total_avg | 30.86 | 34.82 | 34.32 | 100 | F | 3.96 |
Summary
1) Web의 비율이 79.5%로 압도적으로 많고 그 다음은 iOS 사용자(12.8%)가 많다. “Web과 iOS 사용자의 구매 전환율이 높으므로 추후 마케팅할 때 그들에게 집중하는 것이 좋다.”
2) Web의 비율이 높지만, Apple 제품을 쓰는 사용자의 비율이 59.8%로 압도적으로 높다.
3) Web 전환률은 평균 80~90%로 App을 모두 합친 것보다 훨씬 높다.
(미국의 경우 Wep이 89.2%이고, App이 10.7%인데, 다른 나라도 상황이 비슷하다)
4) Web 가입자의 전환률이 그들을 대상으로 대상으로 더 높으므로 마케팅을 진행하는 게 좋다.
5) Device type 분석을 진행한 결과, Apple 유저의 경우 61.61 %의 전환율(Mac desktop 48.50 % + iPad 6.81 % + iPhone 6.30 %)을 보여주고 있다. 이는 Web으로 들어갈 수 있는 모든 Desktop의 34.74% 전환율(Windows desktop 33.56 % + Desktop other 0.62 % + Android tablet 0.56 %) 보다 압도적으로 높다.
Report
z <- sqldf("
with base as(
select
b.*
, AU + CA + DE + ES + FR + GB + IT + NDF + NL + PT + US + other as total
from (
select
first_device_type
, round( 100. * AU / total, 2) as AU
, round( 100. * CA / total, 2) as CA
, round( 100. * DE / total, 2) as DE
, round( 100. * ES / total, 2) as ES
, round( 100. * FR / total, 2) as FR
, round( 100. * GB / total, 2) as GB
, round( 100. * IT / total, 2) as IT
, round( 100. * NDF / total, 2) as NDF
, round( 100. * NL / total, 2) as NL
, round( 100. * PT / total, 2) as PT
, round( 100. * US / total, 2) as US
, round( 100. * other / total, 2) as other
from (
select
first_device_type
, sum(case when country_destination = 'AU' then 1 else 0 end) as AU
, sum(case when country_destination = 'CA' then 1 else 0 end) as CA
, sum(case when country_destination = 'DE' then 1 else 0 end) as DE
, sum(case when country_destination = 'ES' then 1 else 0 end) as ES
, sum(case when country_destination = 'FR' then 1 else 0 end) as FR
, sum(case when country_destination = 'GB' then 1 else 0 end) as GB
, sum(case when country_destination = 'IT' then 1 else 0 end) as IT
, sum(case when country_destination = 'NDF' then 1 else 0 end) as NDF
, sum(case when country_destination = 'NL' then 1 else 0 end) as NL
, sum(case when country_destination = 'PT' then 1 else 0 end) as PT
, sum(case when country_destination = 'US' then 1 else 0 end) as US
, sum(case when country_destination = 'other' then 1 else 0 end) as other
, count(*) as total
from train
group by 1
)
) as b
)
select
*
from
base
union all
select
'Total'
, round(avg(AU), 2)
, round(avg(CA), 2)
, round(avg(DE), 2)
, round(avg(ES), 2)
, round(avg(FR), 2)
, round(avg(GB), 2)
, round(avg(IT), 2)
, round(avg(NDF), 2)
, round(avg(NL), 2)
, round(avg(PT), 2)
, round(avg(US), 2)
, round(avg(other), 2)
, null
from
base
;")
zz <- sqldf("
select
a.first_device_type
, country_destination
, round(100. * total / sa_total, 2) as prob
from (
select
first_device_type
, country_destination
, count(*) as total
from
train
group by 1, 2
) as a, (
select
first_device_type
, count(*) as sa_total
from
train
group by 1
) b
where a.first_device_type = b.first_device_type;
")
zzz <- sqldf("
with base as(
select
b.*
, Android_Phone + Android_Tablet + Desktop_Other + Mac_Desktop + Other_Unknown + SmartPhone_Other + Windows_Desktop + iPad + iPhone as total
from (
select
country_destination
, round( 100. * ap / total, 2) as 'Android_Phone'
, round( 100. * at / total, 2) as 'Android_Tablet'
, round( 100. * do / total, 2) as 'Desktop_Other'
, round( 100. * md / total, 2) as 'Mac_Desktop'
, round( 100. * ou / total, 2) as 'Other_Unknown'
, round( 100. * so / total, 2) as 'SmartPhone_Other'
, round( 100. * wd / total, 2) as 'Windows_Desktop'
, round( 100. * ipa / total, 2) as 'iPad'
, round( 100. * iph / total, 2) as 'iPhone'
from (
select
country_destination
, sum(case when first_device_type = 'Android Phone' then 1 else 0 end) as ap
, sum(case when first_device_type = 'Android Tablet' then 1 else 0 end) as at
, sum(case when first_device_type = 'Desktop (Other)' then 1 else 0 end) as do
, sum(case when first_device_type = 'Mac Desktop' then 1 else 0 end) as md
, sum(case when first_device_type = 'Other/Unknown' then 1 else 0 end) as ou
, sum(case when first_device_type = 'SmartPhone (Other)' then 1 else 0 end) as so
, sum(case when first_device_type = 'Windows Desktop' then 1 else 0 end) as wd
, sum(case when first_device_type = 'iPad' then 1 else 0 end) as ipa
, sum(case when first_device_type = 'iPhone' then 1 else 0 end) as iph
, count(*) as total
from train
group by 1
)
) as b
)
select
*
from
base
union all
select
'AVG'
, round(avg(Android_Phone), 2)
, round(avg(Android_Tablet), 2)
, round(avg(Desktop_Other), 2)
, round(avg(Mac_Desktop), 2)
, round(avg(Other_Unknown), 2)
, round(avg(SmartPhone_Other), 2)
, round(avg(Windows_Desktop), 2)
, round(avg(iPad), 2)
, round(avg(iPhone), 2)
, null
from base
;
")
Desc(users_r$signup_app)
## -------------------------------------------------------------------------
## users_r$signup_app (character)
##
## length n NAs unique levels dupes
## 267'716 267'716 0 4 4 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 Web 212'705 79.5% 212'705 79.5%
## 2 iOS 34'258 12.8% 246'963 92.2%
## 3 Android 10'384 3.9% 257'347 96.1%
## 4 Moweb 10'369 3.9% 267'716 100.0%
Desc(users_r$first_device_type)
## -------------------------------------------------------------------------
## users_r$first_device_type (character)
##
## length n NAs unique levels dupes
## 267'716 267'716 0 9 9 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 Mac Desktop 103'224 38.6% 103'224 38.6%
## 2 Windows Desktop 83'782 31.3% 187'006 69.9%
## 3 iPhone 39'432 14.7% 226'438 84.6%
## 4 iPad 17'354 6.5% 243'792 91.1%
## 5 Other/Unknown 10'933 4.1% 254'725 95.1%
## 6 Android Phone 9'313 3.5% 264'038 98.6%
## 7 Android Tablet 2'015 0.8% 266'053 99.4%
## 8 Desktop (Other) 1'475 0.6% 267'528 99.9%
## 9 SmartPhone (Other) 188 0.1% 267'716 100.0%
| signup_app | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Android | 3 | 15 | 9 | 25 | 24 | 16 | 21 | 4197 | 7 | 2 | 921 | 166 | 5406 |
| Moweb | 14 | 28 | 15 | 34 | 55 | 33 | 30 | 4078 | 14 | 3 | 1673 | 190 | 6167 |
| Web | 473 | 1282 | 957 | 2016 | 4548 | 2059 | 2524 | 100172 | 670 | 194 | 53831 | 8726 | 177452 |
| iOS | 31 | 58 | 40 | 104 | 218 | 115 | 140 | 13400 | 38 | 12 | 4023 | 653 | 18832 |
| Total | 521 | 1383 | 1021 | 2179 | 4845 | 2223 | 2715 | 121847 | 729 | 211 | 60448 | 9735 | 207857 |
| signup_app | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Android | 0.06 | 0.28 | 0.17 | 0.46 | 0.44 | 0.30 | 0.39 | 77.64 | 0.13 | 0.04 | 17.04 | 3.07 | 100.02 |
| Moweb | 0.23 | 0.45 | 0.24 | 0.55 | 0.89 | 0.54 | 0.49 | 66.13 | 0.23 | 0.05 | 27.13 | 3.08 | 100.01 |
| Web | 0.27 | 0.72 | 0.54 | 1.14 | 2.56 | 1.16 | 1.42 | 56.45 | 0.38 | 0.11 | 30.34 | 4.92 | 100.01 |
| iOS | 0.16 | 0.31 | 0.21 | 0.55 | 1.16 | 0.61 | 0.74 | 71.16 | 0.20 | 0.06 | 21.36 | 3.47 | 99.99 |
| Total | 0.18 | 0.44 | 0.29 | 0.68 | 1.26 | 0.65 | 0.76 | 67.84 | 0.23 | 0.07 | 23.97 | 3.64 | NA |
| country_destination | Android | Moweb | Web | iOS | total |
|---|---|---|---|---|---|
| AU | 0.58 | 2.69 | 90.79 | 5.95 | 100.01 |
| CA | 1.08 | 2.02 | 92.70 | 4.19 | 99.99 |
| DE | 0.88 | 1.47 | 93.73 | 3.92 | 100.00 |
| ES | 1.15 | 1.56 | 92.52 | 4.77 | 100.00 |
| FR | 0.50 | 1.14 | 93.87 | 4.50 | 100.01 |
| GB | 0.72 | 1.48 | 92.62 | 5.17 | 99.99 |
| IT | 0.77 | 1.10 | 92.97 | 5.16 | 100.00 |
| NDF | 3.44 | 3.35 | 82.21 | 11.00 | 100.00 |
| NL | 0.96 | 1.92 | 91.91 | 5.21 | 100.00 |
| PT | 0.95 | 1.42 | 91.94 | 5.69 | 100.00 |
| US | 1.52 | 2.77 | 89.05 | 6.66 | 100.00 |
| other | 1.71 | 1.95 | 89.64 | 6.71 | 100.01 |
| Total | 14.26 | 22.87 | 1093.95 | 68.93 | NA |
| first_device_type | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Android Phone | 0.00 | 0.47 | 0.14 | 0.47 | 0.62 | 0.11 | 0.43 | 74.59 | 0.14 | 0.00 | 20.09 | 2.93 | 99.99 |
| Android Tablet | 0.16 | 0.88 | 0.40 | 0.88 | 1.76 | 0.40 | 1.36 | 64.82 | 0.40 | 0.16 | 23.80 | 4.97 | 99.99 |
| Desktop (Other) | 0.26 | 1.54 | 1.11 | 0.94 | 1.89 | 0.69 | 1.11 | 54.50 | 0.43 | 0.00 | 32.90 | 4.63 | 100.00 |
| Mac Desktop | 0.29 | 0.75 | 0.61 | 1.22 | 2.88 | 1.32 | 1.53 | 53.07 | 0.44 | 0.12 | 32.89 | 4.87 | 99.99 |
| Other/Unknown | 0.17 | 0.45 | 0.24 | 0.58 | 1.53 | 0.48 | 0.54 | 73.02 | 0.17 | 0.05 | 20.11 | 2.65 | 99.99 |
| SmartPhone (Other) | 0.00 | 0.00 | 0.00 | 0.00 | 2.70 | 0.00 | 1.35 | 66.22 | 0.00 | 0.00 | 27.03 | 2.70 | 100.00 |
| Windows Desktop | 0.23 | 0.71 | 0.46 | 1.06 | 2.14 | 1.03 | 1.30 | 58.88 | 0.30 | 0.11 | 28.57 | 5.22 | 100.01 |
| iPad | 0.30 | 0.55 | 0.42 | 1.05 | 2.43 | 1.09 | 1.53 | 62.21 | 0.47 | 0.09 | 25.26 | 4.59 | 99.99 |
| iPhone | 0.17 | 0.29 | 0.29 | 0.61 | 1.28 | 0.66 | 0.76 | 69.22 | 0.19 | 0.05 | 23.10 | 3.37 | 99.99 |
| Total | 0.18 | 0.63 | 0.41 | 0.76 | 1.91 | 0.64 | 1.10 | 64.06 | 0.28 | 0.06 | 25.97 | 3.99 | NA |
| country_destination | Android_Phone | Android_Tablet | Desktop_Other | Mac_Desktop | Other_Unknown | SmartPhone_Other | Windows_Desktop | iPad | iPhone | total |
|---|---|---|---|---|---|---|---|---|---|---|
| AU | 0.00 | 0.38 | 0.58 | 49.33 | 3.45 | 0.00 | 31.48 | 8.06 | 6.72 | 100.00 |
| CA | 0.94 | 0.80 | 1.30 | 47.36 | 3.40 | 0.00 | 36.37 | 5.50 | 4.34 | 100.01 |
| DE | 0.39 | 0.49 | 1.27 | 52.20 | 2.45 | 0.00 | 31.73 | 5.68 | 5.78 | 99.99 |
| ES | 0.60 | 0.50 | 0.50 | 48.97 | 2.80 | 0.00 | 34.14 | 6.70 | 5.78 | 99.99 |
| FR | 0.35 | 0.45 | 0.45 | 51.95 | 3.30 | 0.04 | 31.04 | 6.98 | 5.43 | 99.99 |
| GB | 0.13 | 0.22 | 0.36 | 51.69 | 2.25 | 0.00 | 32.48 | 6.79 | 6.07 | 99.99 |
| IT | 0.44 | 0.63 | 0.48 | 49.06 | 2.10 | 0.04 | 33.63 | 7.85 | 5.78 | 100.01 |
| NDF | 1.69 | 0.66 | 0.52 | 38.01 | 6.28 | 0.04 | 34.03 | 7.10 | 11.67 | 100.00 |
| NL | 0.55 | 0.69 | 0.69 | 52.40 | 2.47 | 0.00 | 28.81 | 9.05 | 5.35 | 100.01 |
| PT | 0.00 | 0.95 | 0.00 | 50.71 | 2.37 | 0.00 | 35.07 | 5.69 | 5.21 | 100.00 |
| US | 0.92 | 0.49 | 0.64 | 47.49 | 3.49 | 0.03 | 33.29 | 5.81 | 7.85 | 100.01 |
| other | 0.83 | 0.64 | 0.55 | 43.68 | 2.86 | 0.02 | 37.76 | 6.55 | 7.11 | 100.00 |
| AVG | 0.57 | 0.58 | 0.61 | 48.57 | 3.10 | 0.01 | 33.32 | 6.81 | 6.42 | NA |
Visualization
ggplot(data = task3, aes(x = signup_app, y = prob, fill = country_destination)) +
geom_bar(stat = "identity")
ggplot(data = task6, aes(x = country_destination, y = prob, fill = signup_app)) +
geom_bar(stat = "identity")
ggplot(data = zz, aes(x = first_device_type, y = prob, fill = country_destination)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Summary
나라마다 조금의 차이가 있지만 대체적으로 5~6월에 예약을 많이 한다. 그 이후로는 예약 수가 줄어드는 경향을 보인다. 다른 나라들은 거의 유사하지만 호주(AU)의 경우 조금 다른 양상을 보인다. 추측 : 다른 나라들과 계절이 반대이기 때문에 반대의 양상이 보이지 않을까 추측해볼 수 있음.
Report
| date_first_booking_quarter | freq |
|---|---|
| 2 | 28885 |
| 1 | 20875 |
| 3 | 20209 |
| 4 | 16041 |
| date_first_booking_month | freq |
|---|---|
| 6 | 10217 |
| 5 | 10153 |
| 4 | 8515 |
| 3 | 8087 |
| 7 | 7027 |
| 8 | 6816 |
| 2 | 6539 |
| 9 | 6366 |
| 1 | 6249 |
| 10 | 5987 |
| 11 | 5107 |
| 12 | 4947 |
| date_first_booking_weekdays | freq |
|---|---|
| Wednesday | 13951 |
| Tuesday | 13832 |
| Thursday | 13491 |
| Friday | 12840 |
| Monday | 12341 |
| Saturday | 10043 |
| Sunday | 9512 |
| date_first_booking_month | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 121847 | 0 | 0 | 0 | 0 | 121847 |
| 1 | 57 | 53 | 57 | 112 | 320 | 140 | 157 | 0 | 29 | 9 | 4645 | 670 | 6249 |
| 2 | 46 | 72 | 66 | 163 | 376 | 168 | 194 | 0 | 49 | 18 | 4604 | 783 | 6539 |
| 3 | 47 | 102 | 79 | 206 | 488 | 217 | 280 | 0 | 64 | 22 | 5622 | 960 | 8087 |
| 4 | 27 | 156 | 99 | 256 | 544 | 229 | 341 | 0 | 67 | 23 | 5822 | 951 | 8515 |
| 5 | 53 | 208 | 144 | 326 | 705 | 312 | 418 | 0 | 99 | 27 | 6715 | 1146 | 10153 |
| 6 | 36 | 214 | 130 | 332 | 595 | 295 | 351 | 0 | 116 | 40 | 6859 | 1249 | 10217 |
| 7 | 31 | 136 | 94 | 229 | 348 | 182 | 247 | 0 | 68 | 14 | 4913 | 765 | 7027 |
| 8 | 30 | 139 | 102 | 181 | 399 | 151 | 210 | 0 | 81 | 15 | 4806 | 702 | 6816 |
| 9 | 41 | 101 | 110 | 144 | 332 | 169 | 221 | 0 | 49 | 12 | 4545 | 642 | 6366 |
| 10 | 45 | 67 | 54 | 114 | 291 | 141 | 126 | 0 | 50 | 13 | 4423 | 663 | 5987 |
| 11 | 60 | 73 | 45 | 60 | 238 | 128 | 103 | 0 | 29 | 9 | 3754 | 608 | 5107 |
| 12 | 48 | 62 | 41 | 56 | 209 | 91 | 67 | 0 | 28 | 9 | 3740 | 596 | 4947 |
| 0 | 43 | 115 | 85 | 181 | 403 | 185 | 226 | NA | 60 | 17 | 5037 | 811 | NA |
| date_first_booking_month | AU | CA | DE | ES | FR | GB | IT | NDF | NL | PT | US | other | total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NA | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 100 | 0.00 | 0.00 | 0.00 | 0.00 | 100.00 |
| 1 | 0.91 | 0.85 | 0.91 | 1.79 | 5.12 | 2.24 | 2.51 | 0 | 0.46 | 0.14 | 74.33 | 10.72 | 99.98 |
| 2 | 0.70 | 1.10 | 1.01 | 2.49 | 5.75 | 2.57 | 2.97 | 0 | 0.75 | 0.28 | 70.41 | 11.97 | 100.00 |
| 3 | 0.58 | 1.26 | 0.98 | 2.55 | 6.03 | 2.68 | 3.46 | 0 | 0.79 | 0.27 | 69.52 | 11.87 | 99.99 |
| 4 | 0.32 | 1.83 | 1.16 | 3.01 | 6.39 | 2.69 | 4.00 | 0 | 0.79 | 0.27 | 68.37 | 11.17 | 100.00 |
| 5 | 0.52 | 2.05 | 1.42 | 3.21 | 6.94 | 3.07 | 4.12 | 0 | 0.98 | 0.27 | 66.14 | 11.29 | 100.01 |
| 6 | 0.35 | 2.09 | 1.27 | 3.25 | 5.82 | 2.89 | 3.44 | 0 | 1.14 | 0.39 | 67.13 | 12.22 | 99.99 |
| 7 | 0.44 | 1.94 | 1.34 | 3.26 | 4.95 | 2.59 | 3.52 | 0 | 0.97 | 0.20 | 69.92 | 10.89 | 100.02 |
| 8 | 0.44 | 2.04 | 1.50 | 2.66 | 5.85 | 2.22 | 3.08 | 0 | 1.19 | 0.22 | 70.51 | 10.30 | 100.01 |
| 9 | 0.64 | 1.59 | 1.73 | 2.26 | 5.22 | 2.65 | 3.47 | 0 | 0.77 | 0.19 | 71.39 | 10.08 | 99.99 |
| 10 | 0.75 | 1.12 | 0.90 | 1.90 | 4.86 | 2.36 | 2.10 | 0 | 0.84 | 0.22 | 73.88 | 11.07 | 100.00 |
| 11 | 1.17 | 1.43 | 0.88 | 1.17 | 4.66 | 2.51 | 2.02 | 0 | 0.57 | 0.18 | 73.51 | 11.91 | 100.01 |
| 12 | 0.97 | 1.25 | 0.83 | 1.13 | 4.22 | 1.84 | 1.35 | 0 | 0.57 | 0.18 | 75.60 | 12.05 | 99.99 |
| 0 | 0.65 | 1.55 | 1.16 | 2.39 | 5.48 | 2.53 | 3.00 | NA | 0.82 | 0.23 | 70.89 | 11.30 | NA |
Visualization
#View count of each country_destination by data_first_booking_month
fig.dac_mon <- ggplot(train, aes(date_first_booking_month, fill=country_destination))
fig.dac_mon <- fig.dac_mon + geom_bar(position="identity", alpha=1.0)
fig.dac_mon <- fig.dac_mon + scale_y_log10()
fig.dac_mon + facet_wrap(~country_destination)
ggplot(data = task9, aes(x = date_first_booking_month, y = prob, fill = country_destination)) +
geom_bar(stat = "identity")
Summary
23개의 계급값으로 구분(당일~others) 총 예약 건수는 88,908건 예약의 상당수는 2주 내에 이루어졌다는 것을 알 수 있다.(54.81%)
Report
| periods | freq |
|---|---|
| 0 | 20710 |
| 1 days | 14248 |
| 2 days | 6271 |
| 2 weeks | 5823 |
| 2 months | 4248 |
| 3 days | 3863 |
| 3 weeks | 2974 |
| 4 days | 2836 |
| 3 months | 2615 |
| 5 days | 2174 |
| 4 months | 2002 |
| 4 weeks | 1917 |
| 12 months | 1794 |
| 5 months | 1769 |
| 6 days | 1726 |
| 7 days | 1616 |
| 6 months | 1576 |
| 7 months | 1514 |
| 11 months | 1504 |
| 8 months | 1480 |
| 9 months | 1418 |
| 10 months | 1389 |
| others | 543 |
| periods | prob |
|---|---|
| 0 | 24.08 |
| 1 days | 16.57 |
| 2 days | 7.29 |
| 2 weeks | 6.77 |
| 2 months | 4.94 |
| 3 days | 4.49 |
| 3 weeks | 3.46 |
| 4 days | 3.30 |
| 3 months | 3.04 |
| 5 days | 2.53 |
| 4 months | 2.33 |
| 4 weeks | 2.23 |
| 12 months | 2.09 |
| 5 months | 2.06 |
| 6 days | 2.01 |
| 7 days | 1.88 |
| 6 months | 1.83 |
| 7 months | 1.76 |
| 11 months | 1.75 |
| 8 months | 1.72 |
| 9 months | 1.65 |
| 10 months | 1.61 |
| others | 0.63 |
Visualization
1) Apple을 사용하는 2030의 젊은 층을 타겟으로 마케팅을 진행
2) 나라에 따라 다르지만 5~6월 성수기 외에 프로모션을 진행해서 전환율은 높이는 방법도 고려해볼 수 있음
3) 가입한 지 2주 내에 첫 예약이 이뤄지는 경우가 많기 때문에 처음 가입하고 난 다음의 UX를 잘 설계함으로써 지속적으로 이용할 수 있도록 관계를 맺는 것이 중요함
4) 사람들은 크리스마스(연휴)가 있는 주에 크리스마스 휴가를 예약하지 않는다. 보통 사람들은 몇 달 혹은 몇 주전에 예약을 할 것이다.
5) 주말보다 평일(수요일)에 가장 이용을 많이 한다. 아마 사무실에서 업무에 치이면서 무료하고 반복된 패턴을 벗어나고 싶은 욕구가 클 때 여행을 이용하는 것 같다. 상대적으로 여유로운 주말이나 연휴에는 이미 휴식을 취하거나 여가를 보내고 있기 때문에 이용을 덜 하는 것 같다.
- 이와 같이 여러 변수들간의 관계를 시각화 하면서 어떤 관계가 있는지 유추해 볼 수 있다.
- 이것으로 EDA를 마치고 다음 장부터는 Machine Learning Algorithm을 통해 위 데이터를 학습하고 예측하는 과정을 진행할 예정이다.
- Split the data
# This R script is based on Sandro's python script, which produces a LB score of 0.8655
# This script should produce a LB score of 0.86547
set.seed(1)
# load data
df_train = read_csv("./input/train_users_2.csv")
df_test = read_csv("./input/test_users.csv")
labels = df_train['country_destination']
df_train = df_train[-grep('country_destination', colnames(df_train))]
# combine train and test data
df_all = rbind(df_train,df_test)
# remove date_first_booking
df_all = df_all[-c(which(colnames(df_all) %in% c('date_first_booking')))]
# replace missing values
df_all[is.na(df_all)] <- -1
# split date_account_created in year, month and day
dac = as.data.frame(str_split_fixed(df_all$date_account_created, '-', 3))
df_all['dac_year'] = dac[,1]
df_all['dac_month'] = dac[,2]
df_all['dac_day'] = dac[,3]
df_all = df_all[,-c(which(colnames(df_all) %in% c('date_account_created')))]
# split timestamp_first_active in year, month and day
df_all[,'tfa_year'] = substring(as.character(df_all[,'timestamp_first_active']), 1, 4)
df_all['tfa_month'] = substring(as.character(df_all['timestamp_first_active']), 5, 6)
df_all['tfa_day'] = substring(as.character(df_all['timestamp_first_active']), 7, 8)
df_all = df_all[,-c(which(colnames(df_all) %in% c('timestamp_first_active')))]
# clean Age by removing values
df_all[df_all$age < 14 | df_all$age > 100,'age'] <- -1
# one-hot-encoding features
ohe_feats = c('gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser')
dummies <- dummyVars(~ gender + signup_method + signup_flow + language + affiliate_channel + affiliate_provider + first_affiliate_tracked + signup_app + first_device_type + first_browser, data = df_all)
df_all_ohe <- as.data.frame(predict(dummies, newdata = df_all))
df_all_combined <- cbind(df_all[,-c(which(colnames(df_all) %in% ohe_feats))],df_all_ohe)
# split train and test
X = df_all_combined[df_all_combined$id %in% df_train$id,]
y <- recode(labels$country_destination,"'NDF'=0; 'US'=1; 'other'=2; 'FR'=3; 'CA'=4; 'GB'=5; 'ES'=6; 'IT'=7; 'PT'=8; 'NL'=9; 'DE'=10; 'AU'=11")
X_test = df_all_combined[df_all_combined$id %in% df_test$id,]
# train xgboost
xgb <- xgboost(data = data.matrix(X[,-1]),
label = y,
eta = 0.1,
max_depth = 9,
nround=25,
subsample = 0.5,
colsample_bytree = 0.5,
seed = 1,
eval_metric = "merror",
objective = "multi:softprob",
num_class = 12,
nthread = 3
)
## [1] train-merror:0.364903
## [2] train-merror:0.364337
## [3] train-merror:0.363540
## [4] train-merror:0.363189
## [5] train-merror:0.362781
## [6] train-merror:0.362449
## [7] train-merror:0.362341
## [8] train-merror:0.362247
## [9] train-merror:0.362027
## [10] train-merror:0.361741
## [11] train-merror:0.361488
## [12] train-merror:0.361273
## [13] train-merror:0.361198
## [14] train-merror:0.361020
## [15] train-merror:0.360818
## [16] train-merror:0.360757
## [17] train-merror:0.360687
## [18] train-merror:0.360486
## [19] train-merror:0.360350
## [20] train-merror:0.360050
## [21] train-merror:0.359628
## [22] train-merror:0.359474
## [23] train-merror:0.359221
## [24] train-merror:0.358963
## [25] train-merror:0.358902
# predict values in test set
y_pred <- predict(xgb, data.matrix(X_test[,-1]))
# extract the 5 classes with highest probabilities
predictions <- as.data.frame(matrix(y_pred, nrow=12))
rownames(predictions) <- c('NDF','US','other','FR','CA','GB','ES','IT','PT','NL','DE','AU')
predictions_top5 <- as.vector(apply(predictions, 2, function(x) names(sort(x)[12:8])))
# create submission
ids <- NULL
for (i in 1:NROW(X_test)) {
idx <- X_test$id[i]
ids <- append(ids, rep(idx,5))
}
submission <- NULL
submission$id <- ids
submission$country <- predictions_top5
# generate submission file
submission <- as.data.frame(submission)
write.csv(submission, "submission.csv", quote=FALSE, row.names = FALSE)