library('dplyr')
library('ggplot2')

Introduction

This script is to cross-analyze the transaction and browsing datasets of Air Travel Category in 2017. There 28239 unique machine id in the browsing dataset, while only 5438 had purchased tickets.

Note: Previsouly, I construct a new dataset,called ‘Air_Browsing_df’, which is derived from the browsing dataset. It includes 313,107 records of users browsing data of 12 air travel websites, where some users had ever purchased tickets according to the ‘transaction’ dataset.

# import transaction dataset of Air Travel
  Air_tran_df <- read.csv('air_tran.csv',na=NA,stringsAsFactors = F)
  Air_travel_domain_id <- unique(Air_tran_df$domain_id)
  Air_travel_domain_name <- unique(Air_tran_df$domain_name)
  Tran_machine_id <- unique(Air_tran_df$machine_id) # 5438 unique machine id

  # aa_customers & expedia customers
  aa_cus <- unique(Air_tran_df$machine_id[which(Air_tran_df$domain_name=='aa.com')]) #940 customers of aa
  exp_cus <- unique(Air_tran_df$machine_id[which(Air_tran_df$domain_name=='expedia.com')]) #801 customers of expedia
  
  # Group Air Travel into 'agents' & 'airline'
    Agents <- c("expedia.com" ,"travelocity.com","cheaptickets.com","priceline.com","orbitz.com","hotwire.com","kayak.com")
    Airline <- c("jetblue.com" ,"southwest.com","united.com","alaskaair.com","aa.com" ,"delta.com","spirit.com")
  
    # SELECT AGENTs
  Agents_df <- Air_tran_df %>%
    filter(domain_name %in% Agents)
  
  Airline_df <- Air_tran_df %>%
    filter(domain_name %in% Airline)
  
  Airtravel_gp <- Air_tran_df %>%
    group_by(domain_id,domain_name,machine_id) %>%
    count() %>%
    arrange(desc(n))

histogram of transaction by websites

  ggplot(data = Airtravel_gp) +
    geom_histogram(aes(x=n),binwidth = 5) +
    facet_wrap(~domain_name)

  Agent_machine_id <- unique(Agents_df$machine_id)
  Airline_machine_id <- unique(Airline_df$machine_id)
  
  Airtravel_gp$tra_type <- 'airline'
  Airtravel_gp$tra_type[which(Airtravel_gp$domain_name %in% Agents)] <- 'agents'
  
  qplot(x=n,data = Airtravel_gp)+facet_wrap(~tra_type)  

# import Browsing data of Air Travel
  Air_Browsing_df <- read.csv('air_brows_df.csv',na=NA,stringsAsFactors = F)
  Air_Browsing_df <- Air_Browsing_df[,-1]
  name_air <- names(Air_Browsing_df[,-1])
  names(Air_Browsing_df) <- c('machine_id',name_air) # Change column names
 # machine_id
  Air_Cus <- unique(Air_Browsing_df$machine_id)
  Air_Cus_Type <- rep('Both',length(Air_Cus))
  Air_Cus_Type[which(!Air_Cus %in%Tran_machine_id)] <- 'BrowsOnly'
  Air_Cus_Type[which(Air_Cus%in%Agent_machine_id &!Air_Cus%in%Airline_machine_id)] <- 'AgentOnly'
  Air_Cus_Type[which(!Air_Cus%in%Agent_machine_id &Air_Cus%in%Airline_machine_id)] <- 'AirlineOnly'
  
  Air_df_num <- as.data.frame(table(Air_Cus_Type))
  ggplot(Air_df_num) + geom_bar(aes(x= Air_Cus_Type,y=Freq),stat='identity') +geom_text(aes(x= Air_Cus_Type,y=Freq,label=Freq,vjust=-0.7))

  Air_Cus01 <- data.frame(Air_Cus,Air_Cus_Type)
  Air_Cus01 <- rename(Air_Cus01,machine_id=Air_Cus)

Create new columns

 # indicate the type of browsing website
    Air_Browsing_df$brows_type <- 'Airline'
    Air_Browsing_df$brows_type[which(Air_Browsing_df$domain_name %in% Agents) ] <- 'Agent'

  # indicate users that had purchase records  
    Brows_machine_id <- unique(Air_Browsing_df$machine_id) # 28239 unique machine id in the browsing dataset 
    Purchased_machine_index <- which(Air_Browsing_df$machine_id %in% Tran_machine_id)
    Air_Browsing_df$purchased_machine_id <- 0
    Air_Browsing_df$purchased_machine_id[Purchased_machine_index] <- 1
    Air_Browsing_df <- Air_Browsing_df %>% left_join(Air_Cus01,by='machine_id')
    
 # Create dummy variables indicating whethter this machine_id ever purchased tickets from air agents or airlines.
    # add a column 'Agent', which equals to 1, if it has purchased tickets from air agents.
    Air_Browsing_df$AG_cus <- 0
    Air_Browsing_df$AG_cus[which(Air_Browsing_df$machine_id %in% Agent_machine_id)] <-1
    
    # add a column 'Airline', which equals to 1, if it has purchased tickets from airlines.
    Air_Browsing_df$AL_cus <- 0
    Air_Browsing_df$AL_cus[which(Air_Browsing_df$machine_id %in% Airline_machine_id)] <-1
    Browsing_gp <- Air_Browsing_df %>% group_by(machine_id,domain_name,brows_type,Air_Cus_Type) %>% count()
    ggplot(Browsing_gp) + geom_histogram(aes(x=n,fill=domain_name)) + facet_wrap(~Air_Cus_Type)

analysis into online Travel Agent customers

  # __ Descriptive analysis__
    AG_df <- Air_Browsing_df %>% filter(AG_cus==1) 
    AG_gp <- AG_df %>% group_by(machine_id,domain_name,brows_type) %>% count()
    ggplot(AG_gp) + geom_histogram(aes(x=n)) + facet_wrap(~brows_type)

    ggplot(AG_gp) + geom_histogram(aes(x=n,fill=domain_name)) 

analysis into Airline customers

  # __ Descriptive analysis__
    AL_df <- Air_Browsing_df %>% filter(AL_cus==1) 
    AL_gp <- AL_df %>% group_by(machine_id,domain_name,brows_type) %>% count()
    ggplot(AL_gp) + geom_histogram(aes(x=n)) + facet_wrap(~brows_type)

    ggplot(AL_gp) + geom_histogram(aes(x=n,fill=domain_name)) 

    # subset 'aa.com' customters browsing data
      AA_B_DF <- Air_Browsing_df %>% filter(machine_id %in% aa_cus)
      barplot(sort(table(AA_B_DF$brows_type),decreasing = T),main = 'Brows type')               

      barplot(table(AA_B_DF$domain_name,AA_B_DF$brows_type))

      table(AA_B_DF$domain_name,AA_B_DF$brows_type)
##                   
##                    Agent Airline
##   aa.com               0    8293
##   alaskaair.com        0     430
##   cheaptickets.com   306       0
##   delta.com            0    2193
##   expedia.com       4140       0
##   hotwire.com        979       0
##   jetblue.com          0    1266
##   kayak.com         2509       0
##   orbitz.com        1204       0
##   priceline.com     4703       0
##   southwest.com        0    3650
##   spirit.com           0    1021
##   travelocity.com   1644       0
##   united.com           0    2301
      barplot(sort(table(AA_B_DF$domain_name),decreasing = T),main = 'Brows sites',lwd=0.1,cex.names = 0.7,las=2)

          # GROUP AA.COM
          AA_B_DF_gp <- AA_B_DF %>% group_by(machine_id,domain_name,brows_type) %>% count() %>% arrange(desc(n))
          AA_B_DF_gp$brows_type02 <- AA_B_DF_gp$brows_type 
          AA_B_DF_gp$brows_type02[which(AA_B_DF_gp$brows_type=='Airline'&AA_B_DF_gp$domain_name!='aa.com')] <- 'Other Airlines'
          AA_B_DF_gp$brows_type02[which(AA_B_DF_gp$domain_name=='aa.com')] <- 'aa.com'
          table(AA_B_DF_gp$brows_type02)[c(1,3,2)]
## 
##         aa.com Other Airlines          Agent 
##            898           1722           2840