Class 14 - ANOVA

# Frequency table - Descriptive Statistics
# 1. Number of observations by adtype, restaurant type
addmargins(table(adType))

## adType
## Curr Ads  New Ads   No Ads      Sum 
##    10000    10000    10000    30000

addmargins(table(restaurantType))

## restaurantType
##       chain independent         Sum 
##       12000       18000       30000

# 2. Dependent Vatiable (x) = Page Views
dt <- data.table(restGrades)
dt[, list(Count = .N, 
          mean = round(mean(pageViews), 3),
          sd = round(mean(pageViews), 3),
          median = round(median(pageViews), 3),
          min = min(pageViews),
          max = max(pageViews)),
   by = list(adType)]

##      adType Count    mean      sd median min max
## 1:   No Ads 10000 419.779 419.779    339 145 766
## 2: Curr Ads 10000 501.191 501.191    419 209 929
## 3:  New Ads 10000 483.211 483.211    384 188 918

# In ascending order of page views
# Curr Ads > New Ads > No Ads

# Visualization
# 3. Box plot of ads vs page views
# Dependent Variable (y) = Page Views, Independent Variable (x) = ad type
boxplot(pageViews ~ adType, data = restGrades,
        main = "Boxplot for Page Views",
        xlab = "adType", ylab = "page views")

# In ascending order of page views
# Curr Ads > New Ads > No Ads

# Visualization
# 4. Mean Plots by ad Type
suppressWarnings(plotmeans(pageViews ~ adType, data = restGrades,
          xlab = "adType", ylab = "page views",
          digits=2, col = "black", ccol = "blue", barwidth = 2,
          legends = TRUE, mean.labels = TRUE))

# 5. One way ANOVA
onewayFit <- aov(pageViews ~ adType, data = restGrades)
summary(onewayFit)

##                Df    Sum Sq  Mean Sq F value Pr(>F)    
## adType          2  36582190 18291095   675.9 <2e-16 ***
## Residuals   29997 811743988    27061                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# 6. Check for ANOVA Assumptions
# a. Normality in each group
# Anderson Darling test 
# Null hypothesis (H0): The data is normally distributed
with(restGrades, tapply(pageViews, adType, ad.test))

## $`Curr Ads`
## 
##  Anderson-Darling normality test
## 
## data:  X[[i]]
## A = 589.56, p-value < 2.2e-16
## 
## 
## $`New Ads`
## 
##  Anderson-Darling normality test
## 
## data:  X[[i]]
## A = 739.4, p-value < 2.2e-16
## 
## 
## $`No Ads`
## 
##  Anderson-Darling normality test
## 
## data:  X[[i]]
## A = 671.15, p-value < 2.2e-16

# We reject the null hypothesis since p-value is less than 0.05. Hence, the data is not normally distributed.

# b. Homogeniety of variances
# Null hypothesis (H0): The variance of page views is homogienenous across ad types
leveneTest(pageViews ~ adType, data = restGrades)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     2  57.346 < 2.2e-16 ***
##       29997                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# We reject null hypothesis (p-value < 0.05) 
# Heterogeneity in page views exists among ad types

# 7. Pairwise comparison test values and plot,
TukeyHSD(onewayFit)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pageViews ~ adType, data = restGrades)
## 
## $adType
##                      diff      lwr      upr p adj
## New Ads-Curr Ads -17.9798 -23.4322 -12.5274     0
## No Ads-Curr Ads  -81.4114 -86.8638 -75.9590     0
## No Ads-New Ads   -63.4316 -68.8840 -57.9792     0

plot(TukeyHSD(onewayFit))

pairwise.t.test(pageViews, adType, data = restGrades, pool.sd = FALSE)

## 
##  Pairwise comparisons using t tests with non-pooled SD 
## 
## data:  pageViews and adType 
## 
##         Curr Ads New Ads
## New Ads 7.3e-14  -      
## No Ads  < 2e-16  < 2e-16
## 
## P value adjustment method: holm

# 8. Run ANOVA test if the normality is not satisfied
# Kruskal-Wallis Rank Sum Test
kruskal.test(pageViews ~ adType, data = restGrades)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  pageViews by adType
## Kruskal-Wallis chi-squared = 2865.7, df = 2, p-value < 2.2e-16

pageViewsTrans <- BoxCoxTrans(pageViews)
pageViewsTrans

## Box-Cox Transformation
## 
## 30000 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   145.0   328.0   391.0   468.1   636.0   929.0 
## 
## Largest/Smallest: 6.41 
## Sample Skewness: 0.451 
## 
## Estimated Lambda: -0.3

# append the transformed variable to dataset
restGrades <- cbind(restGrades, pageViewsNew = predict(pageViewsTrans, pageViews))
attach(restGrades)

## The following objects are masked from restGrades (pos = 3):
## 
##     adType, businessID, pageViews, phoneCalls, reservations,
##     restaurantType

# Change pageViews to transformed pageViews using BoxCox
oneWaytransfit <- aov(pageViewsNew ~ adType, data = restGrades)
# summary of the ANOVA model
summary(oneWaytransfit)

##                Df Sum Sq Mean Sq F value Pr(>F)    
## adType          2   5.44  2.7183   879.2 <2e-16 ***
## Residuals   29997  92.74  0.0031                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# NORMALITY ASSUMTIONS: Comparing Normality of Residuals Before-After Boxcox-transformation
# Before Boxcox-transformation
# residual versus fitted plot
plot(onewayFit, 2)

# After Boxcox-transformation
# residual versus fitted plot
plot(oneWaytransfit, 2)

# 9. Run ANOVA test if variances are not equal
# Welch One-way Test
# anova test when variances are not same
oneway.test(pageViews ~ adType, data = restGrades)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  pageViews and adType
## F = 734.5, num df = 2, denom df = 19934, p-value < 2.2e-16

# EQALITY OF VARIANCES: Comparing Plots for the Homogeneity of Variance Before After Boxcox-transformation
# Before Boxcox-transformations
# residual versus fitted plot before Boxcox-transformation
plot(onewayFit, 1)

# After Boxcox-transformations
# residual versus fitted plot after Boxcox-transformation
plot(oneWaytransfit, 1)

# Two Way ANOVA
# 10. Frequency table - Descriptive Statistics
# by adtype (New, Curr, No) and restaurant type (Chain, Independent)
addmargins(table(adType, restaurantType))

##           restaurantType
## adType     chain independent   Sum
##   Curr Ads  4000        6000 10000
##   New Ads   4000        6000 10000
##   No Ads    4000        6000 10000
##   Sum      12000       18000 30000

# 11. Descriptive Statistics - Frequency table
# a. Page Views
dt[, list(Count = .N,
        mean = round(mean(pageViews), 3),
        sd = round(mean(pageViews), 3),
        median = round(median(pageViews), 3),
        min = min(pageViews),
        max = max(pageViews)),
   by = list(restaurantType, adType)]

##    restaurantType   adType Count    mean      sd median min max
## 1:          chain   No Ads  4000 599.571 599.571    599 437 766
## 2:          chain Curr Ads  4000 690.397 690.397    690 444 929
## 3:          chain  New Ads  4000 690.571 690.571    690 440 918
## 4:    independent   No Ads  6000 299.918 299.918    300 145 450
## 5:    independent Curr Ads  6000 375.053 375.053    375 209 530
## 6:    independent  New Ads  6000 344.971 344.971    345 188 483

# b. Phone Calls
dt[, list(Count = .N,
        mean = round(mean(phoneCalls), 3),
        sd = round(mean(phoneCalls), 3),
        median = round(median(phoneCalls), 3),
        min = min(phoneCalls),
        max = max(phoneCalls)),
   by = list(restaurantType, adType)]

##    restaurantType   adType Count   mean     sd median min max
## 1:          chain   No Ads  4000 40.069 40.069     40  22  58
## 2:          chain Curr Ads  4000 44.021 44.021     44  25  63
## 3:          chain  New Ads  4000 48.079 48.079     48  19  77
## 4:    independent   No Ads  6000 29.986 29.986     30  17  45
## 5:    independent Curr Ads  6000 32.967 32.967     33  20  50
## 6:    independent  New Ads  6000 37.472 37.472     37  22  53

# c. Reservations
dt[, list(Count = .N,
        mean = round(mean(reservations), 3),
        sd = round(mean(reservations), 3),
        median = round(median(reservations), 3),
        min = min(reservations),
        max = max(reservations)),
   by = list(restaurantType, adType)]

##    restaurantType   adType Count   mean     sd median min max
## 1:          chain   No Ads  4000 39.925 39.925     40  20  58
## 2:          chain Curr Ads  4000 40.101 40.101     40  23  59
## 3:          chain  New Ads  4000 48.002 48.002     48  18  79
## 4:    independent   No Ads  6000 29.984 29.984     30  15  48
## 5:    independent Curr Ads  6000 29.968 29.968     30  18  43
## 6:    independent  New Ads  6000 37.466 37.466     37  23  51

# 12. Box plot of page views by ad type and resteraunt type
bwplot(pageViews ~ adType | restaurantType, data = restGrades,
        main = "Boxplot of Page Views vs Ad Type and Resteraunt Type",
        ylab = "Page Views",
        col = "black")

# 13. Interaction plot of page views by ad type and resteraunt type
interaction.plot(adType, restaurantType, pageViews,
                 type = "b", col = c(1:3), leg.bty = "o",
                 leg.bg = "beige", lwd = 2, pch = c(18, 24, 22),
                 xlab = "Ad Type", ylab = "Page Views",
                 main = "Interaction Plot of page views by ad type and resteraunt type")

# 14. two-way ANOVA
twoWayfit <- aov(pageViews ~ adType * restaurantType, data = restGrades)
# summary of the ANOVA model
summary(twoWayfit)

##                          Df    Sum Sq   Mean Sq  F value Pr(>F)    
## adType                    2  36582190  18291095   7734.8 <2e-16 ***
## restaurantType            1 738196252 738196252 312161.4 <2e-16 ***
## adType:restaurantType     2   2618217   1309109    553.6 <2e-16 ***
## Residuals             29994  70929518      2365                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# 15. Inferences from Two-way ANOVA Model
# All the p-values are less than 0.05. Hence, we reject the null hypotheses. We can conclude that there are significant differences in page views between the ad type as well as restaurant type.

# Kruskal-Wallis rank sum test
kruskal.test(pageViewsNew ~ interaction(adType, restaurantType), data = restGrades)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  pageViewsNew by interaction(adType, restaurantType)
## Kruskal-Wallis chi-squared = 24655, df = 5, p-value < 2.2e-16

# 16. ANOVA Assumptions
# a. Normality
# normal Q-Q plot
plot(twoWayfit, 2)

# H0: Data is normally distributed
# Anderson Darling Test
ad.test(pageViews)

## 
##  Anderson-Darling normality test
## 
## data:  pageViews
## A = 1432.3, p-value < 2.2e-16

# Conclusion: Reject Null Hypothesis

# b. Homogeneity
plot(twoWayfit, 1)

# H0: Data has homogeneous variance in page views
# Levene Test 
leveneTest(pageViews ~ adType * restaurantType, data = restGrades)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     5  315.24 < 2.2e-16 ***
##       29994                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Conclusion: Reject Null Hypothesis

# 17. Pairwise comparison test values and plot,
TukeyHSD(twoWayfit)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pageViews ~ adType * restaurantType, data = restGrades)
## 
## $adType
##                      diff       lwr       upr p adj
## New Ads-Curr Ads -17.9798 -19.59161 -16.36799     0
## No Ads-Curr Ads  -81.4114 -83.02321 -79.79959     0
## No Ads-New Ads   -63.4316 -65.04341 -61.81979     0
## 
## $restaurantType
##                        diff      lwr       upr p adj
## independent-chain -320.1988 -321.322 -319.0755     0
## 
## $`adType:restaurantType`
##                                                diff         lwr
## New Ads:chain-Curr Ads:chain                0.17425   -2.924463
## No Ads:chain-Curr Ads:chain               -90.82550  -93.924213
## Curr Ads:independent-Curr Ads:chain      -315.34325 -318.171975
## New Ads:independent-Curr Ads:chain       -345.42575 -348.254475
## No Ads:independent-Curr Ads:chain        -390.47858 -393.307308
## No Ads:chain-New Ads:chain                -90.99975  -94.098463
## Curr Ads:independent-New Ads:chain       -315.51750 -318.346225
## New Ads:independent-New Ads:chain        -345.60000 -348.428725
## No Ads:independent-New Ads:chain         -390.65283 -393.481558
## Curr Ads:independent-No Ads:chain        -224.51775 -227.346475
## New Ads:independent-No Ads:chain         -254.60025 -257.428975
## No Ads:independent-No Ads:chain          -299.65308 -302.481808
## New Ads:independent-Curr Ads:independent  -30.08250  -32.612588
## No Ads:independent-Curr Ads:independent   -75.13533  -77.665422
## No Ads:independent-New Ads:independent    -45.05283  -47.582922
##                                                  upr     p adj
## New Ads:chain-Curr Ads:chain                3.272963 0.9999854
## No Ads:chain-Curr Ads:chain               -87.726787 0.0000000
## Curr Ads:independent-Curr Ads:chain      -312.514525 0.0000000
## New Ads:independent-Curr Ads:chain       -342.597025 0.0000000
## No Ads:independent-Curr Ads:chain        -387.649859 0.0000000
## No Ads:chain-New Ads:chain                -87.901037 0.0000000
## Curr Ads:independent-New Ads:chain       -312.688775 0.0000000
## New Ads:independent-New Ads:chain        -342.771275 0.0000000
## No Ads:independent-New Ads:chain         -387.824109 0.0000000
## Curr Ads:independent-No Ads:chain        -221.689025 0.0000000
## New Ads:independent-No Ads:chain         -251.771525 0.0000000
## No Ads:independent-No Ads:chain          -296.824359 0.0000000
## New Ads:independent-Curr Ads:independent  -27.552412 0.0000000
## No Ads:independent-Curr Ads:independent   -72.605245 0.0000000
## No Ads:independent-New Ads:independent    -42.522745 0.0000000

plot(TukeyHSD(twoWayfit))

pairwise.t.test(pageViews, interaction(adType, restaurantType),
                data = restGrades, 
                p.adjust.method = "BH",
                pool.sd = FALSE)

## 
##  Pairwise comparisons using t tests with non-pooled SD 
## 
## data:  pageViews and interaction(adType, restaurantType) 
## 
##                      Curr Ads.chain New Ads.chain No Ads.chain
## New Ads.chain        0.9            -             -           
## No Ads.chain         <2e-16         <2e-16        -           
## Curr Ads.independent <2e-16         <2e-16        <2e-16      
## New Ads.independent  <2e-16         <2e-16        <2e-16      
## No Ads.independent   <2e-16         <2e-16        <2e-16      
##                      Curr Ads.independent New Ads.independent
## New Ads.chain        -                    -                  
## No Ads.chain         -                    -                  
## Curr Ads.independent -                    -                  
## New Ads.independent  <2e-16               -                  
## No Ads.independent   <2e-16               <2e-16             
## 
## P value adjustment method: BH

# 18. ANOVA test if the normality is not satisfied
# Two-way ANOVA after transformation using boxcox
twoWayTransfit <- aov(pageViewsNew ~ adType * restaurantType, data = restGrades)
# summary of the ANOVA model
summary(twoWayTransfit)

##                          Df Sum Sq Mean Sq F value Pr(>F)    
## adType                    2   5.44    2.72    7525 <2e-16 ***
## restaurantType            1  81.43   81.43  225414 <2e-16 ***
## adType:restaurantType     2   0.48    0.24     670 <2e-16 ***
## Residuals             29994  10.83    0.00                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Before Boxcox-transformation
# normal Q-Q plot before boxcox-transformation
plot(twoWayfit, 2)

# Anderson-Darling normality test after Boxcox-transformation
ad.test(pageViewsNew)

## 
##  Anderson-Darling normality test
## 
## data:  pageViewsNew
## A = 933.12, p-value < 2.2e-16

# 19. ANOVA test if variances are not equal
# Before transformations
# residual versus fitted plot before Boxcox-transformation
plot(twoWayfit, 1)

# After boxcox transformations
# residual versus fitted plot after Boxcox-transformation
plot(twoWayTransfit, 1)

# After boxcox
leveneTest(pageViewsNew ~ adType * restaurantType, data = restGrades)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     5   656.9 < 2.2e-16 ***
##       29994                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Class 14 - ANOVA

Mudit Gupta