library('dplyr')
library('ggplot2')
This script is to cross-analyze the transaction and browsing datasets of Air Travel Category in 2017. There 28239 unique machine id in the browsing dataset, while only 5438 had purchased tickets.
Note: Previsouly, I construct a new dataset,called ‘Air_Browsing_df’, which is derived from the browsing dataset. It includes 313,107 records of users browsing data of 12 air travel websites, where some users had ever purchased tickets according to the ‘transaction’ dataset.
# import transaction dataset of Air Travel
Air_tran_df <- read.csv('air_tran.csv',na=NA,stringsAsFactors = F)
Air_travel_domain_id <- unique(Air_tran_df$domain_id)
Air_travel_domain_name <- unique(Air_tran_df$domain_name)
Tran_machine_id <- unique(Air_tran_df$machine_id) # 5438 unique machine id
# aa_customers & expedia customers
aa_cus <- unique(Air_tran_df$machine_id[which(Air_tran_df$domain_name=='aa.com')]) #940 customers of aa
exp_cus <- unique(Air_tran_df$machine_id[which(Air_tran_df$domain_name=='expedia.com')]) #801 customers of expedia
# Group Air Travel into 'agents' & 'airline'
Agents <- c("expedia.com" ,"travelocity.com","cheaptickets.com","priceline.com","orbitz.com","hotwire.com","kayak.com")
Airline <- c("jetblue.com" ,"southwest.com","united.com","alaskaair.com","aa.com" ,"delta.com","spirit.com")
# SELECT AGENTs
Agents_df <- Air_tran_df %>%
filter(domain_name %in% Agents)
Airline_df <- Air_tran_df %>%
filter(domain_name %in% Airline)
Airtravel_gp <- Air_tran_df %>%
group_by(domain_id,domain_name,machine_id) %>%
count() %>%
arrange(desc(n))
ggplot(data = Airtravel_gp) +
geom_histogram(aes(x=n),binwidth = 5) +
facet_wrap(~domain_name)
Agent_machine_id <- unique(Agents_df$machine_id)
Airline_machine_id <- unique(Airline_df$machine_id)
Airtravel_gp$tra_type <- 'airline'
Airtravel_gp$tra_type[which(Airtravel_gp$domain_name %in% Agents)] <- 'agents'
qplot(x=n,data = Airtravel_gp)+facet_wrap(~tra_type)
# import Browsing data of Air Travel
Air_Browsing_df <- read.csv('air_brows_df.csv',na=NA,stringsAsFactors = F)
Air_Browsing_df <- Air_Browsing_df[,-1]
name_air <- names(Air_Browsing_df[,-1])
names(Air_Browsing_df) <- c('machine_id',name_air) # Change column names
# machine_id
Air_Cus <- unique(Air_Browsing_df$machine_id)
Air_Cus_Type <- rep('Both',length(Air_Cus))
Air_Cus_Type[which(!Air_Cus %in%Tran_machine_id)] <- 'BrowsOnly'
Air_Cus_Type[which(Air_Cus%in%Agent_machine_id &!Air_Cus%in%Airline_machine_id)] <- 'AgentOnly'
Air_Cus_Type[which(!Air_Cus%in%Agent_machine_id &Air_Cus%in%Airline_machine_id)] <- 'AirlineOnly'
Air_df_num <- as.data.frame(table(Air_Cus_Type))
ggplot(Air_df_num) + geom_bar(aes(x= Air_Cus_Type,y=Freq),stat='identity') +geom_text(aes(x= Air_Cus_Type,y=Freq,label=Freq,vjust=-0.7))
Air_Cus01 <- data.frame(Air_Cus,Air_Cus_Type)
Air_Cus01 <- rename(Air_Cus01,machine_id=Air_Cus)
# indicate the type of browsing website
Air_Browsing_df$brows_type <- 'Airline'
Air_Browsing_df$brows_type[which(Air_Browsing_df$domain_name %in% Agents) ] <- 'Agent'
# indicate users that had purchase records
Brows_machine_id <- unique(Air_Browsing_df$machine_id) # 28239 unique machine id in the browsing dataset
Purchased_machine_index <- which(Air_Browsing_df$machine_id %in% Tran_machine_id)
Air_Browsing_df$purchased_machine_id <- 0
Air_Browsing_df$purchased_machine_id[Purchased_machine_index] <- 1
Air_Browsing_df <- Air_Browsing_df %>% left_join(Air_Cus01,by='machine_id')
# Create dummy variables indicating whethter this machine_id ever purchased tickets from air agents or airlines.
# add a column 'Agent', which equals to 1, if it has purchased tickets from air agents.
Air_Browsing_df$AG_cus <- 0
Air_Browsing_df$AG_cus[which(Air_Browsing_df$machine_id %in% Agent_machine_id)] <-1
# add a column 'Airline', which equals to 1, if it has purchased tickets from airlines.
Air_Browsing_df$AL_cus <- 0
Air_Browsing_df$AL_cus[which(Air_Browsing_df$machine_id %in% Airline_machine_id)] <-1
Browsing_gp <- Air_Browsing_df %>% group_by(machine_id,domain_name,brows_type,Air_Cus_Type) %>% count()
ggplot(Browsing_gp) + geom_histogram(aes(x=n,fill=domain_name)) + facet_wrap(~Air_Cus_Type)
# __ Descriptive analysis__
AG_df <- Air_Browsing_df %>% filter(AG_cus==1)
AG_gp <- AG_df %>% group_by(machine_id,domain_name,brows_type) %>% count()
ggplot(AG_gp) + geom_histogram(aes(x=n)) + facet_wrap(~brows_type)
ggplot(AG_gp) + geom_histogram(aes(x=n,fill=domain_name))
# __ Descriptive analysis__
AL_df <- Air_Browsing_df %>% filter(AL_cus==1)
AL_gp <- AL_df %>% group_by(machine_id,domain_name,brows_type) %>% count()
ggplot(AL_gp) + geom_histogram(aes(x=n)) + facet_wrap(~brows_type)
ggplot(AL_gp) + geom_histogram(aes(x=n,fill=domain_name))
# subset 'aa.com' customters browsing data
AA_B_DF <- Air_Browsing_df %>% filter(machine_id %in% aa_cus)
barplot(sort(table(AA_B_DF$brows_type),decreasing = T),main = 'Brows type')
barplot(table(AA_B_DF$domain_name,AA_B_DF$brows_type))
table(AA_B_DF$domain_name,AA_B_DF$brows_type)
##
## Agent Airline
## aa.com 0 8293
## alaskaair.com 0 430
## cheaptickets.com 306 0
## delta.com 0 2193
## expedia.com 4140 0
## hotwire.com 979 0
## jetblue.com 0 1266
## kayak.com 2509 0
## orbitz.com 1204 0
## priceline.com 4703 0
## southwest.com 0 3650
## spirit.com 0 1021
## travelocity.com 1644 0
## united.com 0 2301
barplot(sort(table(AA_B_DF$domain_name),decreasing = T),main = 'Brows sites',lwd=0.1,cex.names = 0.7,las=2)
# GROUP AA.COM
AA_B_DF_gp <- AA_B_DF %>% group_by(machine_id,domain_name,brows_type) %>% count() %>% arrange(desc(n))
AA_B_DF_gp$brows_type02 <- AA_B_DF_gp$brows_type
AA_B_DF_gp$brows_type02[which(AA_B_DF_gp$brows_type=='Airline'&AA_B_DF_gp$domain_name!='aa.com')] <- 'Other Airlines'
AA_B_DF_gp$brows_type02[which(AA_B_DF_gp$domain_name=='aa.com')] <- 'aa.com'
table(AA_B_DF_gp$brows_type02)[c(1,3,2)]
##
## aa.com Other Airlines Agent
## 898 1722 2840