Reading the dataset

# Load the advertising data. stringsAsFactors = TRUE keeps adType and
# restaurantType as factors on R >= 4.0.0, where read.csv() no longer converts
# strings by default; the grouping tests below expect factor columns.
restaurants <- read.csv("AdvertisingDataV2.csv", stringsAsFactors = TRUE)
# Put the columns on the search path; later chunks refer to them by bare name
attach(restaurants)
# Number of rows and columns in the data frame
dim(restaurants)
## [1] 30000     6

Frequency table for the adType variable

# Count of rows per ad type, with a grand total appended
with(restaurants, addmargins(table(adType)))
## adType
## Curr Ads  New Ads   No Ads      Sum 
##    10000    10000    10000    30000

Summarising the variables

library(psych)
# Summary statistics for every column; reservations is the key
# dependent variable in the analyses that follow
overall_summary <- describe(restaurants)
overall_summary
##                 vars     n     mean      sd  median  trimmed      mad min
## adType*            1 30000     2.00    0.82     2.0     2.00     1.48   1
## pageViews          2 30000   468.06  168.16   391.0   458.18   149.74 145
## phoneCalls         3 30000    37.71    7.97    37.0    37.20     7.41  17
## reservations       4 30000    36.55    7.99    36.0    35.97     7.41  15
## businessID         5 30000 15000.50 8660.40 15000.5 15000.50 11119.50   1
## restaurantType*    6 30000     1.60    0.49     2.0     1.62     0.00   1
##                   max range  skew kurtosis    se
## adType*             3     2  0.00    -1.50  0.00
## pageViews         929   784  0.45    -1.29  0.97
## phoneCalls         77    60  0.65     0.57  0.05
## reservations       79    64  0.78     0.88  0.05
## businessID      30000 29999  0.00    -1.20 50.00
## restaurantType*     2     1 -0.41    -1.83  0.00

Descriptive statistics by adType

library(data.table)
dt <- data.table(restaurants)
# One row per ad type: group size plus centre, spread and range of reservations
dt[,
   .(count  = .N,
     mean   = round(mean(reservations), 3),
     sd     = round(sd(reservations), 3),
     median = round(median(reservations), 3),
     min    = min(reservations),
     max    = max(reservations)),
   by = .(adType)]
##      adType count   mean    sd median min max
## 1:   No Ads 10000 33.960 6.591     33  15  58
## 2: Curr Ads 10000 34.021 6.504     33  18  59
## 3:  New Ads 10000 41.681 8.153     40  18  79

Boxplot of reservations by adType

# Distribution of reservations within each ad type
boxplot(reservations ~ adType,
        data = restaurants,
        xlab = "Ad Type",
        ylab = "Reservations",
        main = "Boxplot of Ad Type",
        digits = 2)

# Mean Plot of Reservations by Ad Type

# mean plot by ad type
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Group means of reservations per ad type, with interval bars and the
# mean values printed on the plot (group sizes are not shown)
# NOTE(review): `legends` is normally given the group labels; passing TRUE
# may produce numeric tick labels instead of ad-type names - confirm.
plotmeans(reservations ~ adType,
          data = restaurants,
          xlab = "Ad Type",
          ylab = "Number of Reservations",
          col = "black",
          ccol = "blue",
          barwidth = 2,
          digits = 2,
          mean.labels = TRUE,
          n.label = FALSE,
          legends = TRUE,
          frame = TRUE)

One-way ANOVA test

# Fit the one-way ANOVA of reservations on ad type
oneWayfit <- aov(formula = reservations ~ adType,
                 data = restaurants)
# Print the ANOVA table for the fitted model
summary(object = oneWayfit)
##                Df  Sum Sq Mean Sq F value Pr(>F)    
## adType          2  394228  197114    3885 <2e-16 ***
## Residuals   29997 1522018      51                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As p-value <0.05, we can say that there are significant differences in reservations for different adTypes

ANOVA Assumption - Normality

# Anderson-Darling normality test on the pooled reservations column
# (all ad types together; this is the raw response, not the model residuals)
library(nortest)
with(restaurants, ad.test(reservations))
## 
##  Anderson-Darling normality test
## 
## data:  reservations
## A = 222.08, p-value < 2.2e-16

As p-value < 0.05, we can’t assume normality of data.

ANOVA Assumption - Homogeneity of Variance

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Levene's test: are the reservation variances equal across ad types?
levene_one_way <- leveneTest(reservations ~ adType, data = restaurants)
levene_one_way
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     2  134.29 < 2.2e-16 ***
##       29997                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can’t assume the homogeneity of variances of groups.

Pairwise Comparison test

# Tukey HSD: all pairwise differences between ad-type means
tukey_one_way <- TukeyHSD(x = oneWayfit)
tukey_one_way
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = reservations ~ adType, data = restaurants)
## 
## $adType
##                     diff        lwr        upr     p adj
## New Ads-Curr Ads  7.6593  7.4232043  7.8953957 0.0000000
## No Ads-Curr Ads  -0.0608 -0.2968957  0.1752957 0.8181708
## No Ads-New Ads   -7.7201 -7.9561957 -7.4840043 0.0000000
# Draw the family-wise confidence intervals of the pairwise differences
plot(TukeyHSD(x = oneWayfit))

Box-cox transformation

# Box-Cox transformation
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Estimate the Box-Cox lambda for the reservations column and show the result
restaurants_trans <- BoxCoxTrans(y = restaurants[["reservations"]])
print(restaurants_trans)
## Box-Cox Transformation
## 
## 30000 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   31.00   36.00   36.55   41.00   79.00 
## 
## Largest/Smallest: 5.27 
## Sample Skewness: 0.777 
## 
## Estimated Lambda: -0.2 
## With fudge factor, Lambda = 0 will be used for transformations

Given lambda = 0, we will use log transformation

# Log transformation of variable reservations
restaurants$log_reservations <- log(restaurants$reservations)
# Checking for Normality. with() reads the new column straight from the data
# frame, so the data is not attach()ed a second time and no masked duplicate
# copies of the other columns are stacked on the search path.
with(restaurants, ad.test(log_reservations))
## 
##  Anderson-Darling normality test
## 
## data:  log_reservations
## A = 36.301, p-value < 2.2e-16
# Levene's test on the log scale: equal variances across ad types?
library(car)
levene_log_one_way <- leveneTest(log_reservations ~ adType, data = restaurants)
levene_log_one_way
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     2  24.465 2.419e-11 ***
##       29997                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

One-way anova after transformation

# Refit the one-way ANOVA with log-transformed reservations as the response
oneWayfit_transformed <- aov(formula = log_reservations ~ adType,
                             data = restaurants)
# Print the ANOVA table for the refitted model
summary(object = oneWayfit_transformed)
##                Df Sum Sq Mean Sq F value Pr(>F)    
## adType          2  278.1  139.05    3845 <2e-16 ***
## Residuals   29997 1084.8    0.04                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Even after transformation, we couldn’t assume normality and homogeneity of variances

Kruskal-Wallis Rank Sum Test since Normality is not satisfied

# Rank-based comparison of reservations across ad types
kruskal_one_way <- kruskal.test(reservations ~ adType, data = restaurants)
kruskal_one_way
## 
##  Kruskal-Wallis rank sum test
## 
## data:  reservations by adType
## Kruskal-Wallis chi-squared = 5856.4, df = 2, p-value < 2.2e-16

Welch one-way test since variances are not same

# One-way comparison of log reservations across ad types that does not
# assume equal group variances
welch_one_way <- oneway.test(log_reservations ~ adType, data = restaurants)
welch_one_way
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  log_reservations and adType
## F = 3892.4, num df = 2, denom df = 19993, p-value < 2.2e-16

Two-way ANOVA

Frequency table for adType and restaurantType

# Cross-tabulate ad type against restaurant type, with row and column totals.
# with() evaluates the column names inside the data frame, so the data is not
# attach()ed again and no further masked copies land on the search path.
with(restaurants, addmargins(table(adType, restaurantType)))
##           restaurantType
## adType     chain independent   Sum
##   Curr Ads  4000        6000 10000
##   New Ads   4000        6000 10000
##   No Ads    4000        6000 10000
##   Sum      12000       18000 30000

Summarising the variables

library(psych)
# Summary statistics for every column, now including log_reservations;
# reservations remains the key dependent variable
overall_summary <- describe(restaurants)
overall_summary
##                  vars     n     mean      sd   median  trimmed      mad
## adType*             1 30000     2.00    0.82     2.00     2.00     1.48
## pageViews           2 30000   468.06  168.16   391.00   458.18   149.74
## phoneCalls          3 30000    37.71    7.97    37.00    37.20     7.41
## reservations        4 30000    36.55    7.99    36.00    35.97     7.41
## businessID          5 30000 15000.50 8660.40 15000.50 15000.50 11119.50
## restaurantType*     6 30000     1.60    0.49     2.00     1.62     0.00
## log_reservations    7 30000     3.58    0.21     3.58     3.57     0.22
##                     min      max    range  skew kurtosis    se
## adType*            1.00     3.00     2.00  0.00    -1.50  0.00
## pageViews        145.00   929.00   784.00  0.45    -1.29  0.97
## phoneCalls        17.00    77.00    60.00  0.65     0.57  0.05
## reservations      15.00    79.00    64.00  0.78     0.88  0.05
## businessID         1.00 30000.00 29999.00  0.00    -1.20 50.00
## restaurantType*    1.00     2.00     1.00 -0.41    -1.83  0.00
## log_reservations   2.71     4.37     1.66  0.14    -0.07  0.00

Descriptive statistics by adType

library(data.table)
dt <- data.table(restaurants)
# One row per ad type: group size plus centre, spread and range of reservations
dt[,
   .(count  = .N,
     mean   = round(mean(reservations), 3),
     sd     = round(sd(reservations), 3),
     median = round(median(reservations), 3),
     min    = min(reservations),
     max    = max(reservations)),
   by = .(adType)]
##      adType count   mean    sd median min max
## 1:   No Ads 10000 33.960 6.591     33  15  58
## 2: Curr Ads 10000 34.021 6.504     33  18  59
## 3:  New Ads 10000 41.681 8.153     40  18  79

Descriptive statistics by restaurantType

# One row per restaurant type: group size plus centre, spread and range
# of reservations
dt[,
   .(count  = .N,
     mean   = round(mean(reservations), 3),
     sd     = round(sd(reservations), 3),
     median = round(median(reservations), 3),
     min    = min(reservations),
     max    = max(reservations)),
   by = .(restaurantType)]
##    restaurantType count   mean    sd median min max
## 1:          chain 12000 42.676 7.464     42  18  79
## 2:    independent 18000 32.473 5.261     32  15  51

Boxplot of reservations by adType

# Distribution of reservations within each ad type
boxplot(reservations ~ adType,
        data = restaurants,
        xlab = "Ad Type",
        ylab = "Reservations",
        main = "Boxplot of Ad Type",
        digits = 2)

Boxplot of reservations by RestaurantType

# Box plot of reservations by restaurant type. The title now names the
# grouping variable actually plotted (it previously read "Ad Type").
boxplot(reservations ~ restaurantType, data = restaurants,
        main = "Boxplot of Restaurant Type",
        xlab = "Restaurant Type", ylab = "Reservations",
        digits = 2)

# Lattice box plots of reservations by ad type, one panel per restaurant type
library(lattice)
bwplot(reservations ~ adType | restaurantType,
       data = restaurants,
       col = "black",
       ylab = "Number of Reservations",
       main = "Boxplot of Reservations by Ad Type and Restaurant Type")

Mean Plot of Reservations by Restaurant Type

# Group means of reservations per restaurant type, with interval bars and
# the mean values printed on the plot (group sizes are not shown)
library(gplots)
plotmeans(reservations ~ restaurantType,
          data = restaurants,
          xlab = "Restaurant Type",
          ylab = "Number of Reservations",
          col = "black",
          ccol = "blue",
          barwidth = 2,
          digits = 2,
          mean.labels = TRUE,
          n.label = FALSE,
          legends = TRUE,
          frame = TRUE)

Interaction Plot of Ad Type and Restaurant Type

# Mean reservations for each ad type (x axis), one trace per restaurant type.
# The x-axis label now matches the x factor (it previously read
# "Restaurant type"), and with() takes the columns from the data frame itself
# rather than from attach()ed copies on the search path.
with(restaurants,
     interaction.plot(x.factor = adType,
                      trace.factor = restaurantType,
                      response = reservations,
                      type = "b", col = c(1:3),
                      leg.bty = "o", leg.bg = "beige",
                      lwd = 2, pch = c(18, 24, 22),
                      xlab = "Ad Type",
                      ylab = "Number of Reservations",
                      main = "Interaction Plot of Ad Type and Restaurant Type"))

Two-way ANOVA test

# Fit the two-way ANOVA: both main effects plus their interaction
twoWayfit <- aov(formula = reservations ~ adType * restaurantType,
                 data = restaurants)
# Print the ANOVA table for the fitted model
summary(object = twoWayfit)
##                          Df Sum Sq Mean Sq   F value   Pr(>F)    
## adType                    2 394228  197114  7658.285  < 2e-16 ***
## restaurantType            1 749570  749570 29122.292  < 2e-16 ***
## adType:restaurantType     2    442     221     8.594 0.000186 ***
## Residuals             29994 772006      26                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

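As all three p-values are < 0.05, we can say that there are significant differences in reservations across adTypes and across restaurantTypes, and that the adType:restaurantType interaction is also significant (p = 0.000186), i.e. the effect of adType differs between restaurant types.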

ANOVA Assumption - Normality

# Diagnostic plot number 2 of the fitted two-way model (normal Q-Q)
plot(twoWayfit, which = 2)

Anderson-Darling test

# Anderson-Darling normality test on the pooled reservations column
# (this is the raw response, not the residuals of twoWayfit)
library(nortest)
with(restaurants, ad.test(reservations))
## 
##  Anderson-Darling normality test
## 
## data:  reservations
## A = 222.08, p-value < 2.2e-16

As p-value < 0.05, we can’t assume normality of data.

ANOVA Assumption - Homogeneity of Variance

# Diagnostic plot number 1 of the fitted two-way model (residuals vs fitted)
plot(twoWayfit, which = 1)

Levene test

library(car)
# Levene's test across the crossed adType x restaurantType groups
levene_two_way <- leveneTest(reservations ~ adType * restaurantType,
                             data = restaurants)
levene_two_way
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     5  974.04 < 2.2e-16 ***
##       29994                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can’t assume the homogeneity of variances of groups.

Two-way anova after transformation

# Refit the two-way ANOVA with log-transformed reservations as the response
twoWayfit_transformed <- aov(formula = log_reservations ~ adType * restaurantType,
                             data = restaurants)
# Print the ANOVA table for the refitted model
summary(object = twoWayfit_transformed)
##                          Df Sum Sq Mean Sq F value Pr(>F)    
## adType                    2  278.1   139.1  7588.0 <2e-16 ***
## restaurantType            1  530.9   530.9 28967.7 <2e-16 ***
## adType:restaurantType     2    4.3     2.2   118.2 <2e-16 ***
## Residuals             29994  549.7     0.0                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Even after transformation, we couldn’t assume normality and homogeneity of variances

Multiple Pairwise Comparison Test

# Tukey HSD on the log-scale two-way model: pairwise differences for each
# main effect and for the adType:restaurantType cells
tukey_two_way <- TukeyHSD(x = twoWayfit_transformed)
tukey_two_way
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = log_reservations ~ adType * restaurantType, data = restaurants)
## 
## $adType
##                          diff          lwr          upr     p adj
## New Ads-Curr Ads  0.202877282  0.198390382  0.207364182 0.0000000
## No Ads-Curr Ads  -0.002709695 -0.007196595  0.001777205 0.3328927
## No Ads-New Ads   -0.205586978 -0.210073878 -0.201100078 0.0000000
## 
## $restaurantType
##                         diff        lwr        upr p adj
## independent-chain -0.2715314 -0.2746583 -0.2684046     0
## 
## $`adType:restaurantType`
##                                                  diff          lwr
## New Ads:chain-Curr Ads:chain              0.170776091  0.162149996
## No Ads:chain-Curr Ads:chain              -0.004558874 -0.013184969
## Curr Ads:independent-Curr Ads:chain      -0.290392755 -0.298267266
## New Ads:independent-Curr Ads:chain       -0.066114678 -0.073989190
## No Ads:independent-Curr Ads:chain        -0.291869665 -0.299744176
## No Ads:chain-New Ads:chain               -0.175334964 -0.183961060
## Curr Ads:independent-New Ads:chain       -0.461168846 -0.469043357
## New Ads:independent-New Ads:chain        -0.236890769 -0.244765280
## No Ads:independent-New Ads:chain         -0.462645755 -0.470520267
## Curr Ads:independent-No Ads:chain        -0.285833881 -0.293708393
## New Ads:independent-No Ads:chain         -0.061555804 -0.069430316
## No Ads:independent-No Ads:chain          -0.287310791 -0.295185302
## New Ads:independent-Curr Ads:independent  0.224278077  0.217234900
## No Ads:independent-Curr Ads:independent  -0.001476910 -0.008520087
## No Ads:independent-New Ads:independent   -0.225754986 -0.232798164
##                                                   upr     p adj
## New Ads:chain-Curr Ads:chain              0.179402186 0.0000000
## No Ads:chain-Curr Ads:chain               0.004067221 0.6603760
## Curr Ads:independent-Curr Ads:chain      -0.282518243 0.0000000
## New Ads:independent-Curr Ads:chain       -0.058240167 0.0000000
## No Ads:independent-Curr Ads:chain        -0.283995153 0.0000000
## No Ads:chain-New Ads:chain               -0.166708869 0.0000000
## Curr Ads:independent-New Ads:chain       -0.453294334 0.0000000
## New Ads:independent-New Ads:chain        -0.229016257 0.0000000
## No Ads:independent-New Ads:chain         -0.454771244 0.0000000
## Curr Ads:independent-No Ads:chain        -0.277959370 0.0000000
## New Ads:independent-No Ads:chain         -0.053681293 0.0000000
## No Ads:independent-No Ads:chain          -0.279436279 0.0000000
## New Ads:independent-Curr Ads:independent  0.231321254 0.0000000
## No Ads:independent-Curr Ads:independent   0.005566268 0.9912292
## No Ads:independent-New Ads:independent   -0.218711809 0.0000000
# Tukey interval plot for the first model term (adType)
plot(TukeyHSD(x = twoWayfit_transformed, which = 1))

# Tukey interval plot for the second model term (restaurantType)
plot(TukeyHSD(x = twoWayfit_transformed, which = 2))

# Tukey interval plot for the third model term (adType:restaurantType)
plot(TukeyHSD(x = twoWayfit_transformed, which = 3))

Kruskal-Wallis rank sum test as Normality is not satisfied

# Rank-based comparison of reservations across every adType x restaurantType
# combination, treated as a single grouping factor
kruskal_two_way <- kruskal.test(reservations ~ interaction(adType, restaurantType),
                                data = restaurants)
kruskal_two_way
## 
##  Kruskal-Wallis rank sum test
## 
## data:  reservations by interaction(adType, restaurantType)
## Kruskal-Wallis chi-squared = 18667, df = 5, p-value < 2.2e-16

Welch one-way test across the six adType × restaurantType groups, as variances are not the same

# One-way comparison of log reservations across the adType x restaurantType
# combinations (one grouping factor), not assuming equal variances
welch_cells <- oneway.test(log_reservations ~ interaction(adType, restaurantType),
                           data = restaurants)
welch_cells
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  log_reservations and interaction(adType, restaurantType)
## F = 7794, num df = 5, denom df = 12999, p-value < 2.2e-16