ONE-WAY ANOVA

Create a frequency table using your crucial dependent variable (x)

data.df = read.csv(paste("AdvertisingDataV2.csv", sep=""))
addmargins(table(data.df$adType))
## 
## Curr Ads  New Ads   No Ads      Sum 
##    10000    10000    10000    30000

Create a table showing mean, sd, median, min, max, count of your crucial dependent variable.

library(data.table)
dt <- data.table(data.df)
dt[, list(Count = .N,
        mean = round(mean(pageViews), 3), 
        sd = round(mean(pageViews), 3),
        median = round(median(pageViews), 3),
        min = min(pageViews),
        max = max(pageViews)), 
   by = list(adType)]
##      adType Count    mean      sd median min max
## 1:   No Ads 10000 419.779 419.779    339 145 766
## 2: Curr Ads 10000 501.191 501.191    419 209 929
## 3:  New Ads 10000 483.211 483.211    384 188 918

Draw box plots, side-by-side, showing the distribution of the dependent variable (y) w.r.t. Independent variable (x).

# box plot of organ
boxplot(pageViews ~ adType, data = data.df,
        main = "Boxplot of adType",
        xlab = "adType", ylab = "pageViews")

Draw mean plots, showing the average value of the dependent variable (y) w.r.t. Independent variable (x).

# mean plot
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
plotmeans(pageViews ~ adType, data = data.df,
          xlab = "adType", ylab = "pageViews",
          digits=2, col = "black", ccol = "blue", barwidth = 2,
          legends = TRUE, mean.labels = TRUE, frame = TRUE)
## Warning in text.default(x, y, label = labels, col = col, ...): "frame" is
## not a graphical parameter
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

Run one-way ANOVA of dependent variable (y) with respect to independent variable (x)

oneWayfit <- aov(pageViews ~ adType, data = data.df)
# summary of the ANOVA model
summary(oneWayfit)
##                Df    Sum Sq  Mean Sq F value Pr(>F)    
## adType          2  36582190 18291095   675.9 <2e-16 ***
## Residuals   29997 811743988    27061                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Check ANOVA assumptions for your analysis.

Normality of the Dependent Variable

# check for normality in each group
library(nortest)
ad.test(data.df$pageViews)
## 
##  Anderson-Darling normality test
## 
## data:  data.df$pageViews
## A = 1432.3, p-value < 2.2e-16
# We use Anderson-Darling normality test since, sample size is >5000
# with(data.df, tapply(pageViews , adType, shapiro.test))

Homogeneity of Variance

# Check for homogeneity of variance
library(car)
## Loading required package: carData
leveneTest(pageViews ~ adType, data = data.df)
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     2  57.346 < 2.2e-16 ***
##       29997                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We reject the null hypothesis. There is heterogeneity in variance of pageViews

Check ANOVA assumptions for your analysis.

Homogeneity of Variance

# Check for homogeneity of variance
library(car)
leveneTest(data.df$pageViews , data.df$adType, data = data.df)
## Levene's Test for Homogeneity of Variance (center = median: data.df)
##          Df F value    Pr(>F)    
## group     2  57.346 < 2.2e-16 ***
##       29997                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Generate pairwise comparison test values and plot, for one-way ANOVA.

# Tukey comparison test
TukeyHSD(oneWayfit)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pageViews ~ adType, data = data.df)
## 
## $adType
##                      diff      lwr      upr p adj
## New Ads-Curr Ads -17.9798 -23.4322 -12.5274     0
## No Ads-Curr Ads  -81.4114 -86.8638 -75.9590     0
## No Ads-New Ads   -63.4316 -68.8840 -57.9792     0
# Tukey pair-wise comparisons plot
plot(TukeyHSD(oneWayfit))

Run ANOVA test if the normality is not satisfied.

# Kruskal-Wallis rank sum test
kruskal.test(pageViews ~ adType, data = data.df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  pageViews by adType
## Kruskal-Wallis chi-squared = 2865.7, df = 2, p-value < 2.2e-16

We use, Kruskal-Wallis Rank Sum Test. This test can be used when the normality assumption is violated.

Run ANOVA test if variances are not equal.

# anova test when variances are not same
oneway.test(pageViews ~ adType, data = data.df)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  pageViews and adType
## F = 734.5, num df = 2, denom df = 19934, p-value < 2.2e-16

We use Welch One-way Test. This test is used when variances are heterogeneous.

ONE-WAY ANOVA

Create a frequency table using your crucial dependent variables (x1, x2, x3,…)

addmargins(table(data.df$adType, data.df$restaurantType))
##           
##            chain independent   Sum
##   Curr Ads  4000        6000 10000
##   New Ads   4000        6000 10000
##   No Ads    4000        6000 10000
##   Sum      12000       18000 30000

Create a frequency table (s) showing mean, sd, median, min, max, count of your crucial dependent variables (x1, x2, x3,…).

library(data.table)
dt <- data.table(data.df)
dt[, list(Count = .N,
        mean = round(mean(pageViews), 3), 
        sd = round(mean(pageViews), 3),
        median = round(median(pageViews), 3),
        min = min(pageViews),
        max = max(pageViews)), 
   by = list(adType)]
##      adType Count    mean      sd median min max
## 1:   No Ads 10000 419.779 419.779    339 145 766
## 2: Curr Ads 10000 501.191 501.191    419 209 929
## 3:  New Ads 10000 483.211 483.211    384 188 918
dt <- data.table(data.df)
dt[, list(Count = .N,
        mean = round(mean(pageViews), 3), 
        sd = round(mean(pageViews), 3),
        median = round(median(pageViews), 3),
        min = min(pageViews),
        max = max(pageViews)), 
   by = list(restaurantType)]
##    restaurantType Count    mean      sd median min max
## 1:          chain 12000 660.180 660.180    660 437 929
## 2:    independent 18000 339.981 339.981    339 145 530

Draw box plots, side-by-side, showing the distribution of the dependent variable (y) w.r.t. Independent variables (x1, x2, x3,…).

# box plot of adType and restaurantType
library(lattice)
bwplot(pageViews ~ adType | restaurantType, data = data.df,
        main = "Boxplot of adType and restaurantType",
        ylab = "Number of pageViews",
        col = "black")

Draw mean plots, showing the average value of the dependent variable (y) w.r.t. Independent variables (x1, x2, x3,…

library(gplots)
plotmeans(pageViews ~ adType, data = data.df,
          xlab = "adType", ylab = "pageViews",
          digits=2, col = "black", ccol = "blue", barwidth = 2,
          legends = TRUE, mean.labels = TRUE, frame = TRUE)
## Warning in text.default(x, y, label = labels, col = col, ...): "frame" is
## not a graphical parameter
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

plotmeans(pageViews ~ restaurantType, data = data.df,
          xlab = "adType", ylab = "pageViews",
          digits=2, col = "black", ccol = "blue", barwidth = 2,
          legends = TRUE, mean.labels = TRUE, frame = TRUE)
## Warning in text.default(x, y, label = labels, col = col, ...): "frame" is
## not a graphical parameter
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

Run two-way ANOVA of dependent variable (y) with respect to independent variables with interaction (x1, x2, x3,…)

# two-way ANOVA
twoWayfit <- aov(pageViews ~ adType * restaurantType, data = data.df)
# summary of the ANOVA model
summary(twoWayfit)
##                          Df    Sum Sq   Mean Sq  F value Pr(>F)    
## adType                    2  36582190  18291095   7734.8 <2e-16 ***
## restaurantType            1 738196252 738196252 312161.4 <2e-16 ***
## adType:restaurantType     2   2618217   1309109    553.6 <2e-16 ***
## Residuals             29994  70929518      2365                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Write test inference for Q14.

  • the p-value of adType is <2e-16 (significant), which indicates that the adTypes are associated with significant different pageViews. the p-value of restaurantType is < 2e-16 (significant), which indicates that the restaurantTypes are associated with significant different pageViews. the p-value for the interaction between adType*restaurantType is < 2e-16 (significant), which indicates that the relationships between adType and pageViews depends on the restaurantType

Check ANOVA assumptions for your analysis.

Normality of Residuals

# normal Q-Q plot
plot(twoWayfit, 2)

# Anderson-Darling normality test
library(nortest)
ad.test(data.df$pageViews)
## 
##  Anderson-Darling normality test
## 
## data:  data.df$pageViews
## A = 1432.3, p-value < 2.2e-16

Check ANOVA assumptions for your analysis.

Homogeneity of Variance

# residual versus fitted plot
plot(twoWayfit, 1)

# Levene test for homogeneity of variance
library(car)
leveneTest(pageViews ~ adType * restaurantType, data = data.df)
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     5  315.24 < 2.2e-16 ***
##       29994                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Run ANOVA test if the normality is not satisfied.

# Kruskal-Wallis rank sum test
kruskal.test(pageViews ~ interaction(adType, restaurantType), data = data.df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  pageViews by interaction(adType, restaurantType)
## Kruskal-Wallis chi-squared = 24655, df = 5, p-value < 2.2e-16