Reading the dataset

# Load the advertising data. stringsAsFactors = TRUE keeps the text columns
# (adType, restaurantType) as factors on R >= 4.0, where the default became
# FALSE; TukeyHSD() further down only compares terms that are factors.
restaurants <- read.csv("AdvertisingDataV2.csv", stringsAsFactors = TRUE)
# attaching data columns of the dataframe
# NOTE(review): attach() places a snapshot copy of the columns on the search
# path; later bare references such as table(adType) depend on this call.
attach(restaurants)
# dimension of the dataframe (rows, columns)
dim(restaurants)

## [1] 30000     6

Frequency table for adtype variable

# Counts per ad type plus a grand total. with() looks adType up inside the
# data frame itself instead of depending on an attach()ed copy.
with(restaurants, addmargins(table(adType)))

## adType
## Curr Ads  New Ads   No Ads      Sum 
##    10000    10000    10000    30000

Summarising the variables

# psych provides describe(), a one-row-per-column summary table
library(psych)
# reservations is the outcome variable this analysis focuses on
describe(x = restaurants)

##                 vars     n     mean      sd  median  trimmed      mad min
## adType*            1 30000     2.00    0.82     2.0     2.00     1.48   1
## pageViews          2 30000   468.06  168.16   391.0   458.18   149.74 145
## phoneCalls         3 30000    37.71    7.97    37.0    37.20     7.41  17
## reservations       4 30000    36.55    7.99    36.0    35.97     7.41  15
## businessID         5 30000 15000.50 8660.40 15000.5 15000.50 11119.50   1
## restaurantType*    6 30000     1.60    0.49     2.0     1.62     0.00   1
##                   max range  skew kurtosis    se
## adType*             3     2  0.00    -1.50  0.00
## pageViews         929   784  0.45    -1.29  0.97
## phoneCalls         77    60  0.65     0.57  0.05
## reservations       79    64  0.78     0.88  0.05
## businessID      30000 29999  0.00    -1.20 50.00
## restaurantType*     2     1 -0.41    -1.83  0.00

Descriptive statistics by adType

library(data.table)
# data.table copy of the data so the grouped summary can use [i, j, by]
dt <- as.data.table(restaurants)
# One row per ad type: group size, then centre and spread of reservations
dt[,
   .(
     count  = .N,
     mean   = round(mean(reservations), 3),
     sd     = round(sd(reservations), 3),
     median = round(median(reservations), 3),
     min    = min(reservations),
     max    = max(reservations)
   ),
   by = adType]

##      adType count   mean    sd median min max
## 1:   No Ads 10000 33.960 6.591     33  15  58
## 2: Curr Ads 10000 34.021 6.504     33  18  59
## 3:  New Ads 10000 41.681 8.153     40  18  79

Boxplot of reservations by adType

# Distribution of reservations within each ad type
boxplot(
  reservations ~ adType,
  data = restaurants,
  xlab = "Ad Type",
  ylab = "Reservations",
  main = "Boxplot of Ad Type",
  digits = 2
)

# Mean Plot of Reservations by Ad Type

# mean plot by ad type
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

# Group means of reservations per ad type, drawn with confidence bars and
# the mean value printed next to each point
plotmeans(
  formula = reservations ~ adType,
  data = restaurants,
  xlab = "Ad Type",
  ylab = "Number of Reservations",
  digits = 2,
  col = "black",
  ccol = "blue",
  barwidth = 2,
  legends = TRUE,
  mean.labels = TRUE,
  n.label = FALSE,
  frame = TRUE
)

One-way ANOVA test

# One-way ANOVA: models reservations with adType as the single factor.
# The fitted object is reused below by TukeyHSD(), which echoes this call.
oneWayfit <- aov(reservations ~ adType, data = restaurants)
# Print the ANOVA table (Df, sums of squares, F value, p-value)
summary(oneWayfit)

##                Df  Sum Sq Mean Sq F value Pr(>F)    
## adType          2  394228  197114    3885 <2e-16 ***
## Residuals   29997 1522018      51                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As p-value <0.05, we can say that there are significant differences in reservations for different adTypes

ANOVA Assumption - Normality

# Anderson-Darling normality test
library(nortest)
# with() reads reservations from the data frame itself rather than from the
# attach()ed copy; the "data:" label in the printed result is unchanged.
# NOTE(review): this tests the pooled response. The ANOVA assumption concerns
# the model residuals, e.g. ad.test(residuals(oneWayfit)) - confirm intent.
with(restaurants, ad.test(reservations))

## 
##  Anderson-Darling normality test
## 
## data:  reservations
## A = 222.08, p-value < 2.2e-16

As p-value < 0.05, we can’t assume normality of data.

ANOVA Assumption - Homogeneity of Variance

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

# Levene's test: are the reservation variances equal across ad types?
leveneTest(y = reservations ~ adType, data = restaurants)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     2  134.29 < 2.2e-16 ***
##       29997                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can’t assume the homogeneity of variances of groups.

Pairwise Comparison test

# Tukey HSD: pairwise differences in mean reservations between ad types,
# with the (default) 95% family-wise confidence level spelled out
TukeyHSD(oneWayfit, which = "adType", conf.level = 0.95)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = reservations ~ adType, data = restaurants)
## 
## $adType
##                     diff        lwr        upr     p adj
## New Ads-Curr Ads  7.6593  7.4232043  7.8953957 0.0000000
## No Ads-Curr Ads  -0.0608 -0.2968957  0.1752957 0.8181708
## No Ads-New Ads   -7.7201 -7.9561957 -7.4840043 0.0000000

# Draw the confidence interval of every pairwise difference
tukey_one_way <- TukeyHSD(oneWayfit)
plot(tukey_one_way)

Box-cox transformation

# Box-Cox transformation
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

# Estimate the Box-Cox lambda for the raw reservation counts and show it
reservations_raw <- restaurants[["reservations"]]
restaurants_trans <- BoxCoxTrans(reservations_raw)
print(restaurants_trans)

## Box-Cox Transformation
## 
## 30000 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   31.00   36.00   36.55   41.00   79.00 
## 
## Largest/Smallest: 5.27 
## Sample Skewness: 0.777 
## 
## Estimated Lambda: -0.2 
## With fudge factor, Lambda = 0 will be used for transformations

Given lambda = 0, we will use log transformation

# Log transformation of variable reservations
restaurants$log_reservations <- log(restaurants$reservations)

# Checking for Normality
# with() evaluates against the current data frame, so the new column is
# visible without attaching a second copy of restaurants to the search path
# (which only produced masking messages for the columns already attached).
with(restaurants, ad.test(log_reservations))

## 
##  Anderson-Darling normality test
## 
## data:  log_reservations
## A = 36.301, p-value < 2.2e-16

# Re-check equality of variances across ad types on the log scale
library(car)
leveneTest(y = log_reservations ~ adType, data = restaurants)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     2  24.465 2.419e-11 ***
##       29997                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

One-way anova after transformation

# One-way ANOVA on the log-transformed response, with adType as the factor
oneWayfit_transformed <- aov(log_reservations ~ adType, data = restaurants)
# Print the ANOVA table for the transformed model
summary(oneWayfit_transformed)

##                Df Sum Sq Mean Sq F value Pr(>F)    
## adType          2  278.1  139.05    3845 <2e-16 ***
## Residuals   29997 1084.8    0.04                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Even after transformation, we couldn’t assume normality and homogeneity of variances

Kruskal-Wallis Rank Sum Test since Normality is not satisfied

# Kruskal-Wallis rank sum test: rank-based comparison of reservations across
# ad types, run on the untransformed response
kruskal.test(reservations ~ adType, data = restaurants)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  reservations by adType
## Kruskal-Wallis chi-squared = 5856.4, df = 2, p-value < 2.2e-16

Welch one-way test since variances are not same

# Welch's one-way test on the log scale; var.equal = FALSE (the default) is
# spelled out because it is what drops the equal-variance assumption
oneway.test(
  formula = log_reservations ~ adType,
  data = restaurants,
  var.equal = FALSE
)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  log_reservations and adType
## F = 3892.4, num df = 2, denom df = 19993, p-value < 2.2e-16

Two-way ANOVA

Frequency table for adtype and restaurantType

# Cross-tabulate ad type by restaurant type, with row and column totals.
# with() resolves both columns inside the data frame, so a further
# attach(restaurants) (one more masked copy on the search path) is not needed.
with(restaurants, addmargins(table(adType, restaurantType)))

##           restaurantType
## adType     chain independent   Sum
##   Curr Ads  4000        6000 10000
##   New Ads   4000        6000 10000
##   No Ads    4000        6000 10000
##   Sum      12000       18000 30000

Summarising the variables

library(psych)
# Per-column summary; the table now also covers the log_reservations column
describe(x = restaurants)

##                  vars     n     mean      sd   median  trimmed      mad
## adType*             1 30000     2.00    0.82     2.00     2.00     1.48
## pageViews           2 30000   468.06  168.16   391.00   458.18   149.74
## phoneCalls          3 30000    37.71    7.97    37.00    37.20     7.41
## reservations        4 30000    36.55    7.99    36.00    35.97     7.41
## businessID          5 30000 15000.50 8660.40 15000.50 15000.50 11119.50
## restaurantType*     6 30000     1.60    0.49     2.00     1.62     0.00
## log_reservations    7 30000     3.58    0.21     3.58     3.57     0.22
##                     min      max    range  skew kurtosis    se
## adType*            1.00     3.00     2.00  0.00    -1.50  0.00
## pageViews        145.00   929.00   784.00  0.45    -1.29  0.97
## phoneCalls        17.00    77.00    60.00  0.65     0.57  0.05
## reservations      15.00    79.00    64.00  0.78     0.88  0.05
## businessID         1.00 30000.00 29999.00  0.00    -1.20 50.00
## restaurantType*    1.00     2.00     1.00 -0.41    -1.83  0.00
## log_reservations   2.71     4.37     1.66  0.14    -0.07  0.00

Descriptive statistics by adType

library(data.table)
# Rebuild the data.table copy from the current data frame
dt <- as.data.table(restaurants)
# One row per ad type: group size, then centre and spread of reservations
dt[,
   .(
     count  = .N,
     mean   = round(mean(reservations), 3),
     sd     = round(sd(reservations), 3),
     median = round(median(reservations), 3),
     min    = min(reservations),
     max    = max(reservations)
   ),
   by = adType]

##      adType count   mean    sd median min max
## 1:   No Ads 10000 33.960 6.591     33  15  58
## 2: Curr Ads 10000 34.021 6.504     33  18  59
## 3:  New Ads 10000 41.681 8.153     40  18  79

Descriptive statistics by restaurantType

# One row per restaurant type: group size, then centre and spread of
# reservations
dt[,
   .(
     count  = .N,
     mean   = round(mean(reservations), 3),
     sd     = round(sd(reservations), 3),
     median = round(median(reservations), 3),
     min    = min(reservations),
     max    = max(reservations)
   ),
   by = restaurantType]

##    restaurantType count   mean    sd median min max
## 1:          chain 12000 42.676 7.464     42  18  79
## 2:    independent 18000 32.473 5.261     32  15  51

Boxplot of reservations by adType

# Distribution of reservations within each ad type
boxplot(
  reservations ~ adType,
  data = restaurants,
  xlab = "Ad Type",
  ylab = "Reservations",
  main = "Boxplot of Ad Type",
  digits = 2
)

Boxplot of reservations by RestaurantType

# box plot of reservations by restaurant type
# (the title previously read "Boxplot of Ad Type", copied from the ad-type
# plot, although the grouping variable here is restaurantType)
boxplot(reservations ~ restaurantType, data = restaurants,
        main = "Boxplot of Restaurant Type",
        xlab = "Restaurant Type", ylab = "Reservations",
        digits = 2)

# Lattice box plot: reservations by ad type, one panel per restaurant type
library(lattice)
bwplot(
  reservations ~ adType | restaurantType,
  data = restaurants,
  main = "Boxplot of Reservations by Ad Type and Restaurant Type",
  ylab = "Number of Reservations",
  col = "black"
)

Mean Plot of Reservations by Restaurant Type

# mean plot of reservations by restaurant type
library(gplots)
plotmeans(
  formula = reservations ~ restaurantType,
  data = restaurants,
  xlab = "Restaurant Type",
  ylab = "Number of Reservations",
  digits = 2,
  col = "black",
  ccol = "blue",
  barwidth = 2,
  legends = TRUE,
  mean.labels = TRUE,
  n.label = FALSE,
  frame = TRUE
)

Interaction Plot of Ad Type and Restaurant Type

# Mean reservations per ad type (x-axis), one trace per restaurant type.
# The x-axis label previously read "Restaurant type" although adType is the
# factor on that axis. with() supplies the columns from the data frame
# instead of depending on the attach()ed copy.
with(
  restaurants,
  interaction.plot(adType, restaurantType, reservations,
                   type = "b", col = c(1:3),
                   leg.bty = "o", leg.bg = "beige",
                   lwd = 2, pch = c(18, 24, 22),
                   xlab = "Ad Type",
                   ylab = "Number of Reservations",
                   main = "Interaction Plot of Ad Type and Restaurant Type")
)

Two-way ANOVA test

# Two-way ANOVA: adType * restaurantType expands to both main effects plus
# their interaction (see the three term rows in the summary table)
twoWayfit <- aov(reservations ~ adType * restaurantType, data = restaurants)
# Print the ANOVA table for the two-way model
summary(twoWayfit)

##                          Df Sum Sq Mean Sq   F value   Pr(>F)    
## adType                    2 394228  197114  7658.285  < 2e-16 ***
## restaurantType            1 749570  749570 29122.292  < 2e-16 ***
## adType:restaurantType     2    442     221     8.594 0.000186 ***
## Residuals             29994 772006      26                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As all three p-values are < 0.05, reservations differ significantly across adTypes and across restaurantTypes, and the adType × restaurantType interaction is also significant.

ANOVA Assumption - Normality

# Diagnostic plot number 2 of the fitted two-way model (normality check)
plot(twoWayfit, which = 2)

Anderson-Darling test

# Anderson-Darling normality test
library(nortest)
# with() reads reservations from the data frame itself rather than from the
# attach()ed copy; the "data:" label in the printed result is unchanged.
# NOTE(review): this repeats the test on the pooled response. The ANOVA
# assumption concerns the model residuals, e.g.
# ad.test(residuals(twoWayfit)) - confirm intent.
with(restaurants, ad.test(reservations))

## 
##  Anderson-Darling normality test
## 
## data:  reservations
## A = 222.08, p-value < 2.2e-16

As p-value < 0.05, we can’t assume normality of data.

ANOVA Assumption - Homogeneity of Variance

# Diagnostic plot number 1 of the fitted two-way model (variance check)
plot(twoWayfit, which = 1)

Levene test

library(car)
# Levene's test across the adType x restaurantType groups
leveneTest(y = reservations ~ adType * restaurantType, data = restaurants)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     5  974.04 < 2.2e-16 ***
##       29994                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can’t assume the homogeneity of variances of groups.

Two-way anova after transformation

# Two-way ANOVA on the log-transformed response: main effects of adType and
# restaurantType plus their interaction. Reused by TukeyHSD() below, which
# echoes this call.
twoWayfit_transformed <- aov(log_reservations ~ adType * restaurantType, data = restaurants)
# Print the ANOVA table for the transformed two-way model
summary(twoWayfit_transformed)

##                          Df Sum Sq Mean Sq F value Pr(>F)    
## adType                    2  278.1   139.1  7588.0 <2e-16 ***
## restaurantType            1  530.9   530.9 28967.7 <2e-16 ***
## adType:restaurantType     2    4.3     2.2   118.2 <2e-16 ***
## Residuals             29994  549.7     0.0                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Even after transformation, we couldn’t assume normality and homogeneity of variances

Multiple Pairwise Comparison Test

# Tukey HSD for every term of the transformed two-way model, with the
# (default) 95% family-wise confidence level spelled out
TukeyHSD(twoWayfit_transformed, conf.level = 0.95)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = log_reservations ~ adType * restaurantType, data = restaurants)
## 
## $adType
##                          diff          lwr          upr     p adj
## New Ads-Curr Ads  0.202877282  0.198390382  0.207364182 0.0000000
## No Ads-Curr Ads  -0.002709695 -0.007196595  0.001777205 0.3328927
## No Ads-New Ads   -0.205586978 -0.210073878 -0.201100078 0.0000000
## 
## $restaurantType
##                         diff        lwr        upr p adj
## independent-chain -0.2715314 -0.2746583 -0.2684046     0
## 
## $`adType:restaurantType`
##                                                  diff          lwr
## New Ads:chain-Curr Ads:chain              0.170776091  0.162149996
## No Ads:chain-Curr Ads:chain              -0.004558874 -0.013184969
## Curr Ads:independent-Curr Ads:chain      -0.290392755 -0.298267266
## New Ads:independent-Curr Ads:chain       -0.066114678 -0.073989190
## No Ads:independent-Curr Ads:chain        -0.291869665 -0.299744176
## No Ads:chain-New Ads:chain               -0.175334964 -0.183961060
## Curr Ads:independent-New Ads:chain       -0.461168846 -0.469043357
## New Ads:independent-New Ads:chain        -0.236890769 -0.244765280
## No Ads:independent-New Ads:chain         -0.462645755 -0.470520267
## Curr Ads:independent-No Ads:chain        -0.285833881 -0.293708393
## New Ads:independent-No Ads:chain         -0.061555804 -0.069430316
## No Ads:independent-No Ads:chain          -0.287310791 -0.295185302
## New Ads:independent-Curr Ads:independent  0.224278077  0.217234900
## No Ads:independent-Curr Ads:independent  -0.001476910 -0.008520087
## No Ads:independent-New Ads:independent   -0.225754986 -0.232798164
##                                                   upr     p adj
## New Ads:chain-Curr Ads:chain              0.179402186 0.0000000
## No Ads:chain-Curr Ads:chain               0.004067221 0.6603760
## Curr Ads:independent-Curr Ads:chain      -0.282518243 0.0000000
## New Ads:independent-Curr Ads:chain       -0.058240167 0.0000000
## No Ads:independent-Curr Ads:chain        -0.283995153 0.0000000
## No Ads:chain-New Ads:chain               -0.166708869 0.0000000
## Curr Ads:independent-New Ads:chain       -0.453294334 0.0000000
## New Ads:independent-New Ads:chain        -0.229016257 0.0000000
## No Ads:independent-New Ads:chain         -0.454771244 0.0000000
## Curr Ads:independent-No Ads:chain        -0.277959370 0.0000000
## New Ads:independent-No Ads:chain         -0.053681293 0.0000000
## No Ads:independent-No Ads:chain          -0.279436279 0.0000000
## New Ads:independent-Curr Ads:independent  0.231321254 0.0000000
## No Ads:independent-Curr Ads:independent   0.005566268 0.9912292
## No Ads:independent-New Ads:independent   -0.218711809 0.0000000

# Tukey pair-wise comparison plots, one per model term, in model order:
#   1 = adType, 2 = restaurantType, 3 = adType:restaurantType
for (term_index in 1:3) {
  plot(TukeyHSD(twoWayfit_transformed, term_index))
}

Kruskal-Wallis rank sum test as Normality is not satisfied

# Kruskal-Wallis rank sum test over the groups formed by crossing adType with
# restaurantType (interaction() builds one combined grouping factor)
kruskal.test(reservations ~ interaction(adType, restaurantType), data = restaurants)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  reservations by interaction(adType, restaurantType)
## Kruskal-Wallis chi-squared = 18667, df = 5, p-value < 2.2e-16

Welch one-way test across the adType × restaurantType groups, as variances are not the same

# Welch's one-way test over the adType x restaurantType groups on the log
# scale; var.equal = FALSE (the default) drops the equal-variance assumption
oneway.test(
  formula = log_reservations ~ interaction(adType, restaurantType),
  data = restaurants,
  var.equal = FALSE
)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  log_reservations and interaction(adType, restaurantType)
## F = 7794, num df = 5, denom df = 12999, p-value < 2.2e-16

RestaurantAnalysis

Rezul

24/11/2018

Reading the dataset

Frequency table for adtype variable

Summarising the variables

Descriptive statistics by adType

Boxplot of reservations by adType

One-way ANOVA test

As p-value <0.05, we can say that there are significant differences in reservations for different adTypes

ANOVA Assumption - Normality

As p-value < 0.05, we can’t assume normality of data.

ANOVA Assumption - Homogeneity of Variance

We can’t assume the homogeneity of variances of groups.

Pairwise Comparison test

Box-cox transformation

Given lambda = 0, we will use log transformation

One-way anova after transformation

Even after transformation, we couldn’t assume normality and homogeneity of variances

Kruskal-Wallis Rank Sum Test since Normality is not satisfied

Welch one-way test since variances are not same

Two-way ANOVA

Frequency table for adtype and restaurantType

Summarising the variables

Descriptive statistics by adType

Descriptive statistics by restaurantType

Boxplot of reservations by adType

Boxplot of reservations by RestaurantType

Mean Plot of Reservations by Restaurant Type

Interaction Plot of Ad Type and Restaurant Type

Two-way ANOVA test

As p-value <0.05, we can say that there are significant differences in reservations for different adTypes and restaurantTypes

ANOVA Assumption - Normality

Anderson-Darling test

As p-value < 0.05, we can’t assume normality of data.

ANOVA Assumption - Homogeneity of Variance

Levene test

We can’t assume the homogeneity of variances of groups.

Two-way anova after transformation

Even after transformation, we couldn’t assume normality and homogeneity of variances

Multiple Pairwise Comparison Test

Kruskal-Wallis rank sum test as Normality is not satisfied

Welch one-way test across the adType × restaurantType groups, as variances are not the same