df_markt <-read.csv("~/Downloads/marketing_campaigns.csv",sep = ";")
# Graph 1:
require(ggplot2)
## Loading required package: ggplot2
ggplot(df_markt, aes(x = Cost, y = Revenue, color = factor(Campaign), size = Visits)) + geom_point(alpha = 0.6) + ggtitle("Campaing_Cost_Revenue_Evolution_Overview")

# Aim: To visualize the relationship between revenue,visits,and cost per campaign
# Why: To visualize how the number of visits influenced revenue per campaign,and to see if number of visits change as the cost of the campaign changes .
# Observations:Aldebaran, a low cost campaign relative to the other two, attracts more (600 -range) visitors as more is more is spent and the revenue begins to exceed the cost as number of visitors grow. Conttington is a high cost campaign that also generates a hight revenue; revenue is also positively related to cost. However the number of visits is seemingly unchanging throughout the 31-weeks. The more money spent on Bartlendan, it seems, the faster the growth in revenue, however visit do not seem to be increasing past the 250EUR cost mark.
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(plyr)
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
require(grDevices)
# Graph 2: Visits_per_Week
ggplot(subset(df_markt,df_markt$Campaign %in% c("Aldebaran","Cottington","Bartledan")), aes(x = Week, y = Visits ,color= Campaign)) + geom_line(alpha = 0.6) + ggtitle("Visits_Per_Week")

# Aim: To visualize evolution of visits per campaign per week.
# Why: Compare camplaign on the basis of growth in visits, in order to optimize for visits if #Visits is a KPI for each campaign.
# Observations: Aldebaran is the fastest growing campaign in terms of visists so it would be a focus if the goal is to increase visits(traffic to the site.) Conttington visits have remained flat from week 1 and are beginning to drop towards week_30.
# Graph 3 visits v. revenue
ggplot(subset(df_markt,df_markt$Campaign %in% c("Aldebaran","Cottington","Bartledan")), aes(x = Visits, y = Revenue, size = Week, color= Campaign)) + geom_point(alpha = 0.6)+ggtitle("Revenue vs. Visits ")

# Aim: To visualize in more detail the relashionship between revenue and visits per campaign.
# Why: To understand to what exent visits are drivers of revenue for each campaign.
# Observations: Cottington is a high revenue campaign with mostly flat number of visits week_on_week. Bartledan revenues increase fast with only a small growth in the number of visits. Aldebaran visits have grown quite fast week on week resulting in a steady rise in revenue.
#Graph 4 visits vs. costs
ggplot(subset(df_markt,df_markt$Campaign %in% c("Aldebaran","Cottington","Bartledan")), aes(x = Visits, y = Cost, size =Week,color= Campaign)) + geom_point(alpha = 0.6) +ggtitle("Cost vs. Visits ")

# Aim: To visualize in more detail the relashionship between cost and visits per campaign.
# Why: To understand to what exent the cost of a campaign influences the number of visits.
# Observations: Cottington is becomes more expensive week_on_week with the relatively the same number of visits per week. The change in the number of visits is smaller than the cost change to get those visits for Bartledan. Aldebaran visits increase fast with a small cost increment.
#Take_away: If visits are a priority Aldebaran is a low cost campaign that has fast growing visits with small incremental spent. However if revenue is a priority and cost is not an issue, Cottington is a good campaing to focus on. If revenue is a priority but there is still a desired increase in visits, Bartledan is a good option as visits will grow to a small extent and revenue will increase by a lot more, however it is very costly.
# b. Quality of visits per campaign
require(dplyr)
require(plyr)
require(grDevices)
# add profitability per visit and revenue per visit to data
df_markt = mutate(df_markt, Profit = Revenue - Cost, Sale_Value = Revenue/Visits,Visit_Profitability = Profit/Visits )
# Revenue per visit over time graph
ggplot(subset(df_markt,df_markt$Campaign %in% c("Aldebaran","Cottington","Bartledan")), aes(x = Week, y = Sale_Value, size = Cost, color= Campaign)) + geom_point(alpha = 0.6)+ggtitle("Revenue Per Visit ")

# Profitability per visit over time graph
ggplot(subset(df_markt,df_markt$Campaign %in% c("Aldebaran","Cottington","Bartledan")), aes(x = Week, y = Visit_Profitability, size= Cost, color= Campaign)) + geom_point(alpha = 0.8)+ggtitle("Profit per Visit")

#Take_Away:
#Cottington has superior revenue per visit compared to the other two, so visitors book more expensive hotels. However, it is the most expensive campaign and we see a slight drop in revenue per visit on week 30.The number of visits in this campaign remain constant and ther is a falling return on money spent as profitability per visit has continues to drop from week 1 to 30. A good reason to keep investing money into the campaign would be if the goal is to high revenue (luxury) bookings.
#Bartledan cost has increased from 100 to around 400 in the later weeks, almost as much as Cottington but revenue per visit is still ~1Eur lower than that of Cottington in week 31. The sales value and profitability of this campaign are relatively poor despite increasing cost.
#Alderberan visists result in lower value sales as revenue per visit remains small but the data indicate the with more money spent on this campaign revenue per visit has a potential for an upward tren even though it may not be a fast moving upward trend. However profitability also justifies increase spent (of cost) as this is the one campaign with the most growing profit per visit.
# 250 EUR to invest for week 31 onwards, what advise would you give?
#Detailed unnerstanding of the relatinship between spent and the other variables in each campaign
df2 <- subset(df_markt,df_markt$Campaign %in% c("Aldebaran"))
#Check to see how the amount spent on the campaign correlates with visits and revenue
cor(df2$Cost,df2$Visits)
## [1] 0.9624518
cor(df2$Cost,df2$Revenue)
## [1] 0.9905441
#Since visits and revenue are highly correlated,run a regression model on just revenue to see how an increase in cost will affect revenue. A multiple regression with both visits and revenue as dependant variables will lead to multicolinearity making it difficult to measure the true impact of increased spent on the dependent variables of revenue and visits.
cor(df2$Visits,df2$Revenue)
## [1] 0.964278
#Linear regression model
reg_Ald <- lm(df2$Revenue~df2$Cost)
summary(reg_Ald)
##
## Call:
## lm(formula = df2$Revenue ~ df2$Cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.4958 -7.5846 -0.1837 7.2847 22.3882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -16.80930 3.57492 -4.702 6.26e-05 ***
## df2$Cost 1.18076 0.03091 38.205 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.31 on 28 degrees of freedom
## Multiple R-squared: 0.9812, Adjusted R-squared: 0.9805
## F-statistic: 1460 on 1 and 28 DF, p-value: < 2.2e-16
revenue_Ald <- -16.81 + 1.181*250 #=278.44 EUR
profit_Ald <- 278.44-250 # 28.44 EUR
visits_reg_Ald <- lm(df2$Visits~df2$Cost)
summary(visits_reg_Ald)
##
## Call:
## lm(formula = df2$Visits ~ df2$Cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -76.694 -29.022 -3.457 21.562 168.250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.7112 16.8026 3.137 0.00399 **
## df2$Cost 2.7254 0.1453 18.761 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48.48 on 28 degrees of freedom
## Multiple R-squared: 0.9263, Adjusted R-squared: 0.9237
## F-statistic: 352 on 1 and 28 DF, p-value: < 2.2e-16
visits_Ald <- 52.71 + 2.73*(250) #735
df3 <- subset(df_markt,df_markt$Campaign %in% c("Bartledan"))
#Check to see how the amount spent on the campaign correlates with visists and revenue
cor(df3$Cost,df3$Visits)
## [1] 0.9853698
cor(df3$Cost,df3$Revenue)
## [1] 0.9969146
cor(df3$Visits,df3$Revenue)
## [1] 0.9891884
#Linear regression model
reg_Bart <- lm(df3$Revenue~df3$Cost)
summary(reg_Bart)
##
## Call:
## lm(formula = df3$Revenue ~ df3$Cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.4475 -5.6622 0.3115 3.8375 17.9685
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8688 3.7112 0.773 0.446
## df3$Cost 0.8871 0.0132 67.205 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.506 on 28 degrees of freedom
## Multiple R-squared: 0.9938, Adjusted R-squared: 0.9936
## F-statistic: 4516 on 1 and 28 DF, p-value: < 2.2e-16
reg_Bart_visit <- lm(df3$Visits~df3$Cost)
summary(reg_Bart_visit)
##
## Call:
## lm(formula = df3$Visits ~ df3$Cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0253 -5.0148 -0.6857 5.4750 16.4755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.0451 3.9354 15.51 2.81e-15 ***
## df3$Cost 0.4282 0.0140 30.59 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.96 on 28 degrees of freedom
## Multiple R-squared: 0.971, Adjusted R-squared: 0.9699
## F-statistic: 936 on 1 and 28 DF, p-value: < 2.2e-16
revenue_bart <- 2.87 + 0.8871*250 # = 224.65 EUR
profit_bart <- 250-224.65 # = 25.35 EUR
visits_bart <- 61.045 + 0.4282*(250) #168
df4 <- subset(df_markt,df_markt$Campaign %in% c("Cottington"))
#Check to see how the amount spent on the campaign correlates with visits and revenue
cor(df4$Cost,df4$Visits)
## [1] 0.3417286
cor(df4$Cost,df4$Revenue)
## [1] 0.9722772
cor(df4$Visits,df4$Revenue)
## [1] 0.5156817
#Linear regression model
reg_Cot <- lm(df4$Revenue~df4$Cost)
summary(reg_Cot)
##
## Call:
## lm(formula = df4$Revenue ~ df4$Cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.756 -2.804 1.338 6.908 13.501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 141.71377 10.47783 13.53 4.7e-14 ***
## df4$Cost 0.63728 0.02846 22.39 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.42 on 29 degrees of freedom
## Multiple R-squared: 0.9453, Adjusted R-squared: 0.9434
## F-statistic: 501.4 on 1 and 29 DF, p-value: < 2.2e-16
reg_cort_visit <- lm(df4$Visits~df4$Cost)
summary(reg_cort_visit)
##
## Call:
## lm(formula = df4$Visits ~ df4$Cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.237 -3.204 1.675 5.604 14.083
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 133.63692 8.48767 15.745 9.56e-16 ***
## df4$Cost 0.04514 0.02305 1.958 0.0599 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.251 on 29 degrees of freedom
## Multiple R-squared: 0.1168, Adjusted R-squared: 0.08632
## F-statistic: 3.834 on 1 and 29 DF, p-value: 0.05989
revenue_Cot <- 141.714 + 0.63728*250 # = 301.03
profit_Cot <- 301.03 - 250 # = 51.03
# with a 95% confidence level that we do not commit a type one error of rejecting the null hypothesis (Cost affect visits), increasing costs by will not have any effect on the number of visist from the Cottington Campaign.
# Advice
# With 250 EUR to invest, it would be profitable to invest in Aldebaran. The other two campaigns require a lot more investment to see a significant movement in visits whereas Aldebaran will have a higher increase in visits and generates more profit relative to the other two campaigns.
# Part 2: Sessions
require(dplyr)
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Goal: Determine if/how the available data influences booking information
# Hypothesis
# 1. Shorter sessions will have fewer bookings (as this may indicate a change of mind or just a bounce off the site)
# 2. A higher number of clickouts will result in fewer or no bookings (undecided bookers will test out several other options, opening them up to making bookings elsewhere)
# 3. More bookings will occur at night (people are more relaxed (probably drinking a glass of wine after 5pm) and less critical of their actions and threfore quicker to make decisions)
#sessions data
df_sess<-read.csv("~/Downloads/session_data.csv",sep = ";")
# Add session length in minutes (to test H1)
df_sess =mutate(df_sess,time_len = strptime(df_sess$session_end_text,"%H:%M:%S" )-strptime(df_sess$session_start_text,"%H:%M:%S" ))
# Add new column with time format and determine AM or PM to test (H3)
df_sess =mutate(df_sess,end_time= as.ITime(strptime(df_sess$session_end_text,"%H:%M:%S")))
x<-as.ITime(strptime("18:00:00","%H:%M:%S")) # set 6pm as reference point for night or day
df_sess =mutate(df_sess,Night_Day = ifelse(df_sess$end_time>=x,"Night","Day"))
#Test Hypothesis by using logistic regression model because we have a binary dependent variable (booking) and we want to understand the relationship between this variable and the other variables in the data-set.
#But first, we need to understand the nature of the binary variable in the dataset.To do this, check for class bias, which occurs when the propotion of events (1) is smaller than the propotion of non-events(0).
table(df_sess$booking)
##
## 0 1
## 9033 967
# Our data set has more non-events than events (by a large margin) so we must take this into account by selecting a sample from the data that equally represents bookings and no-bookings. This way, we will can create an effective model to help understand how the other variable influence booking or non-booking.
#Creating the model:
#1. Sample an equal propotion of the 0 and 1 from booking column and make this our training_data (or development data)
#2. The remaining data, after we sample, will then be used as a testing_data set for our model (good thing we have a large enough number of observations to allow us to do this)
#1. Create Training
book_ones <-df_sess[which(df_sess$booking == 1),] # get all the 1's
book_zeros <- df_sess[which(df_sess$booking==0),] # get all the 0's
set.seed(100) #generating our random sample and making sure it remains the same random sample so that our training set is consistent and repeatable across many test sets.
book_ones_train <- sample(1:nrow(book_ones),0.7*nrow(book_ones)) # selecting the training 1's
book_zeros_train <- sample(1:nrow(book_zeros),0.7*nrow(book_ones)) # selecting the training 0's equal to the training 1's
training_ones <- book_ones[book_ones_train,] # make table of the sample of ones
training_zeros <- book_zeros[book_zeros_train,] # make taple of the sample of zeros
training_data <- rbind(training_ones,training_zeros) # bind the rows of the two (0's and 1's) together to form a training data set
# Create Test Data
testing_ones <- book_ones[-book_ones_train,] #all the 1's that are not in the training sample
testing_zeros <- book_zeros[-book_zeros_train,] #all the 0's that are not in the training sample
testing_data <- rbind(testing_ones,testing_zeros) #bind together the training set
# Logit Regression Model
logitMod <- glm(booking~ clickouts + Night_Day + time_len,data = training_data,family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
predicted <- plogis(predict(logitMod,testing_data))
summary(logitMod)
##
## Call:
## glm(formula = booking ~ clickouts + Night_Day + time_len, family = binomial(link = "logit"),
## data = training_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3413 -1.1758 0.5055 1.1694 1.3322
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1244305 0.1911263 0.651 0.5150
## clickouts -0.1138377 0.0500391 -2.275 0.0229 *
## Night_DayNight -0.0176838 0.1221500 -0.145 0.8849
## time_len 0.0008545 0.0008528 1.002 0.3164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1874.3 on 1351 degrees of freedom
## Residual deviance: 1867.3 on 1348 degrees of freedom
## AIC: 1875.3
##
## Number of Fisher Scoring iterations: 7
#Intepreting Significant Results
# Of the variables tested, clickouts have a relationhip to whether a booking is made at a 99% confidnece level (given that the p_value is only 2.2% implying that it is very unlikely that clickout are not related to bookings ).
# Model for clickouts (continous variable)
# logit(p) = b0 + b1*clickouts, where p is the probability of a booking
# let's pick a value for clickouts at 3 then the conditional logit of a booking is:
CO_3 <- 0.1244305 -0.1138377*3 #-0.2170826
#To examin the effect of a 1 unit increase in clickouts,pick clickouts of 4
CO_4 <- 0.1244305 -0.1138377*4 #-0.3309203
# The diffrence
diff_CO <- -0.3309203-(-0.2170826) # -0.1138377 a change in the log odds created by a one unit increase in the clickouts. We want to get to a probability of the odds.
# exponentiate the log odds to get the odds
odds <- exp(0.1138377) # 1.12057
probability <- 1/(1.12057024)
# 89.2%, given that the sign of the coefficient is negative, a one unit increase in the number of clickouts can be expected to result in an 89.2% decrease in the odds of a booking.
#This confirms the hypothesis (H2) that a high number of clickout results in high likelihood of not booking.
# To test the predictive power of the model:
library(InformationValue)
optCutOff <- optimalCutoff(testing_data$booking, predicted)[1]
#0.6045
misClassError(testing_data$booking, predicted, threshold = optCutOff)
## [1] 0.0338
# 0.0338 the % mismatch of predicted values vs. actual values is low, only 3% so the model is effective in explaining the impact of the other variables on booking.