Assignment 5

dataset<-read.csv("C:/Users/Anshumaan/Desktop/2018 DAM/session 14/AdvertisingDataV2.csv")
attach(dataset)
View(dataset)

#Q. 1. Create a frequency table using your crucial independent variable (x)
addmargins(table(adType))

## adType
## Curr Ads  New Ads   No Ads      Sum 
##    10000    10000    10000    30000

#Q. 2. Create a table showing mean, sd, median, min, max, count of your crucial dependent variable.
dt <- data.table(dataset)
dt[, list(Count = .N,
        mean = round(mean(pageViews), 3), 
        sd = round(mean(pageViews), 3),
        median = round(median(pageViews), 3),
        min = min(pageViews),
        max = max(pageViews)), 
   by = list(adType)]

##      adType Count    mean      sd median min max
## 1:   No Ads 10000 419.779 419.779    339 145 766
## 2: Curr Ads 10000 501.191 501.191    419 209 929
## 3:  New Ads 10000 483.211 483.211    384 188 918

#Each ad type has 10000 counts each. Highest page views are with current ad followed by new ads followed by no ads, ie
# In ascending order of page views
# Curr Ads > New Ads > No Ads

#Q. 3. Draw box plots, side-by-side, showing the distribution of the dependent variable (y) w.r.t. Independent variable (x).
# Box plot of Page Views vs Ad Type
boxplot(pageViews ~ adType, data = dataset,
        main = "Boxplot of Page Views vs Ad Type",
        xlab = "Ad Type", ylab = "Page Views")

#Q. 4. Draw mean plots, showing the average value of the dependent variable (y) w.r.t. Independent variable (x).
suppressWarnings(plotmeans(pageViews ~ adType, data = dataset,
          xlab = "Ad Type", ylab = "Page Views",
          digits=2, col = "black", ccol = "blue", barwidth = 2,
          legends = TRUE, mean.labels = TRUE, frame = TRUE))

#Q. 5. Run one-way ANOVA of dependent variable (y) with respect to independent variable (x).
#One-way ANOVA 
oneWayfit <- aov(pageViews ~ adType, data = dataset)
#Summary of the ANOVA model
summary(oneWayfit)

##                Df    Sum Sq  Mean Sq F value Pr(>F)    
## adType          2  36582190 18291095   675.9 <2e-16 ***
## Residuals   29997 811743988    27061                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#As the p-value < 0.05, we can conclude that there are significant differences in page views between the different advertising types.

#Q. 6. Check ANOVA assumptions for your analysis.
# Checking for normality in each group
# Anderson Darling test 
# Null hypothesis (H0): The data is normally distributed
with(dataset, tapply(pageViews, adType, ad.test))

## $`Curr Ads`
## 
##  Anderson-Darling normality test
## 
## data:  X[[i]]
## A = 589.56, p-value < 2.2e-16
## 
## 
## $`New Ads`
## 
##  Anderson-Darling normality test
## 
## data:  X[[i]]
## A = 739.4, p-value < 2.2e-16
## 
## 
## $`No Ads`
## 
##  Anderson-Darling normality test
## 
## data:  X[[i]]
## A = 671.15, p-value < 2.2e-16

# We reject the null hypothesis since p-value is less than 0.05. Hence, the data is not normally distributed.

# Check for homogeneity of variance
# Null hypothesis (H0): The variance of page views is homogenous across different types of Ads.
leveneTest(pageViews ~ adType, data = dataset)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     2  57.346 < 2.2e-16 ***
##       29997                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# We reject the null hypothesis. There is heterogeneity in variance of page views.

#Q. 7. Generate one comparison test values and plot, for one-way ANOVA.
TukeyHSD(oneWayfit)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pageViews ~ adType, data = dataset)
## 
## $adType
##                      diff      lwr      upr p adj
## New Ads-Curr Ads -17.9798 -23.4322 -12.5274     0
## No Ads-Curr Ads  -81.4114 -86.8638 -75.9590     0
## No Ads-New Ads   -63.4316 -68.8840 -57.9792     0

plot(TukeyHSD(oneWayfit))

#Q. 8. Run ANOVA test if the normality is not satisfied. 
# Kruskal-Wallis rank sum test
kruskal.test(pageViews ~ adType, data = dataset)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  pageViews by adType
## Kruskal-Wallis chi-squared = 2865.7, df = 2, p-value < 2.2e-16

pageViewsTrans <- BoxCoxTrans(pageViews)
pageViewsTrans

## Box-Cox Transformation
## 
## 30000 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   145.0   328.0   391.0   468.1   636.0   929.0 
## 
## Largest/Smallest: 6.41 
## Sample Skewness: 0.451 
## 
## Estimated Lambda: -0.3

# append the transformed variable to dataset
dataset <- cbind(dataset, pageViewsNew = predict(pageViewsTrans, pageViews))
attach(dataset)

## The following objects are masked from dataset (pos = 3):
## 
##     adType, businessID, pageViews, phoneCalls, reservations,
##     restaurantType

# Change pageViews to transformed pageViews using BoxCox
oneWaytransfit <- aov(pageViewsNew ~ adType, data = dataset)
# Summary of the ANOVA model
summary(oneWaytransfit)

##                Df Sum Sq Mean Sq F value Pr(>F)    
## adType          2   5.44  2.7183   879.2 <2e-16 ***
## Residuals   29997  92.74  0.0031                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Q. 9.Run ANOVA test if variances are not equal.
# Welch One-way Test
# ANOVA test when variances are not same
oneway.test(pageViews ~ adType, data = dataset)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  pageViews and adType
## F = 734.5, num df = 2, denom df = 19934, p-value < 2.2e-16

#Q. 10. Create a frequency table using your crucial independent variables (x1, x2, x3,...). 
# Ad Type and Restaurant Type
addmargins(table(adType, restaurantType))

##           restaurantType
## adType     chain independent   Sum
##   Curr Ads  4000        6000 10000
##   New Ads   4000        6000 10000
##   No Ads    4000        6000 10000
##   Sum      12000       18000 30000

#Q. 11. Create a frequency table (s) showing mean, sd, median, min, max, count of your crucial dependent variables (x1, x2, x3,...).
# a. Page Views
dt <- data.table(dataset)
dt[, list(Count = .N,
        mean = round(mean(pageViews), 3),
        sd = round(mean(pageViews), 3),
        median = round(median(pageViews), 3),
        min = min(pageViews),
        max = max(pageViews)),
   by = list(restaurantType, adType)]

##    restaurantType   adType Count    mean      sd median min max
## 1:          chain   No Ads  4000 599.571 599.571    599 437 766
## 2:          chain Curr Ads  4000 690.397 690.397    690 444 929
## 3:          chain  New Ads  4000 690.571 690.571    690 440 918
## 4:    independent   No Ads  6000 299.918 299.918    300 145 450
## 5:    independent Curr Ads  6000 375.053 375.053    375 209 530
## 6:    independent  New Ads  6000 344.971 344.971    345 188 483

# b. Phone Calls
dt <- data.table(dataset)
dt[, list(Count = .N,
        mean = round(mean(phoneCalls), 3),
        sd = round(mean(phoneCalls), 3),
        median = round(median(phoneCalls), 3),
        min = min(phoneCalls),
        max = max(phoneCalls)),
   by = list(restaurantType, adType)]

##    restaurantType   adType Count   mean     sd median min max
## 1:          chain   No Ads  4000 40.069 40.069     40  22  58
## 2:          chain Curr Ads  4000 44.021 44.021     44  25  63
## 3:          chain  New Ads  4000 48.079 48.079     48  19  77
## 4:    independent   No Ads  6000 29.986 29.986     30  17  45
## 5:    independent Curr Ads  6000 32.967 32.967     33  20  50
## 6:    independent  New Ads  6000 37.472 37.472     37  22  53

#Q. 12. Draw box plots, side-by-side, showing the distribution of the dependent variable (y) w.r.t. Independent variables (x1, x2, x3,...).
# Box plot of page views by ad type and restaurant type
bwplot(pageViews ~ adType | restaurantType, data = dataset,
        main = "Boxplot of Page Views vs Ad Type and Restaurant Type",
        ylab = "Page Views",
        col = "black")

#Q. 13. Draw mean plots, showing the average value of the dependent variable (y) w.r.t. Independent variables (x1, x2, x3,...).
# Interaction Plot of Page Views by Ad Type and Restaurant Type
interaction.plot(adType, restaurantType, pageViews,
                 type = "b", col = c(1:3), leg.bty = "o",
                 leg.bg = "beige", lwd = 2, pch = c(18, 24, 22),
                 xlab = "Ad Type", ylab = "Page Views",
                 main = "Interaction Plot of Page Views by Ad Type and Restaurant Type")

#Q. 14. Run two-way ANOVA of dependent variable (y) with respect to independent variables with interaction (x1, x2, x3,...).  Choose your crucial independent variables for interaction.
# ANOVA of Page Views by Ad Type and Restaurant Type
twoWayfit <- aov(pageViews ~ adType * restaurantType, data = dataset)
# summary of the ANOVA model
summary(twoWayfit)

##                          Df    Sum Sq   Mean Sq  F value Pr(>F)    
## adType                    2  36582190  18291095   7734.8 <2e-16 ***
## restaurantType            1 738196252 738196252 312161.4 <2e-16 ***
## adType:restaurantType     2   2618217   1309109    553.6 <2e-16 ***
## Residuals             29994  70929518      2365                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Q. 15. Write test inference for Q14.
# All the p-values are less than 0.05. Hence, we reject the null hypotheses. We can conclude that there are significant differences in page views between the ad type as well as restaurant type.

#Q. 16. Check ANOVA assumptions for your analysis.
# H0: Data is normally distributed
# Checking for normality in page Views using AD test
ad.test(pageViews)

## 
##  Anderson-Darling normality test
## 
## data:  pageViews
## A = 1432.3, p-value < 2.2e-16

# Normality test is statistically significant, we reject the null hypothesis. Hence, we cannot assume normality in the data.

# Checking for homogeneity of variance with Levene test
# Null hypothesis (H0): The variance of page views are homogenous between groups.
leveneTest(pageViews ~ adType * restaurantType, data = dataset)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     5  315.24 < 2.2e-16 ***
##       29994                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# The test is statistically significant, we reject the null hypothesis. Hence, we cannot assume the homogeneity of variance between the groups.

#Q. 17. Generate pairwise comparison test values and plot, for two-way ANOVA.
TukeyHSD(twoWayfit)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pageViews ~ adType * restaurantType, data = dataset)
## 
## $adType
##                      diff       lwr       upr p adj
## New Ads-Curr Ads -17.9798 -19.59161 -16.36799     0
## No Ads-Curr Ads  -81.4114 -83.02321 -79.79959     0
## No Ads-New Ads   -63.4316 -65.04341 -61.81979     0
## 
## $restaurantType
##                        diff      lwr       upr p adj
## independent-chain -320.1988 -321.322 -319.0755     0
## 
## $`adType:restaurantType`
##                                                diff         lwr
## New Ads:chain-Curr Ads:chain                0.17425   -2.924463
## No Ads:chain-Curr Ads:chain               -90.82550  -93.924213
## Curr Ads:independent-Curr Ads:chain      -315.34325 -318.171975
## New Ads:independent-Curr Ads:chain       -345.42575 -348.254475
## No Ads:independent-Curr Ads:chain        -390.47858 -393.307308
## No Ads:chain-New Ads:chain                -90.99975  -94.098463
## Curr Ads:independent-New Ads:chain       -315.51750 -318.346225
## New Ads:independent-New Ads:chain        -345.60000 -348.428725
## No Ads:independent-New Ads:chain         -390.65283 -393.481558
## Curr Ads:independent-No Ads:chain        -224.51775 -227.346475
## New Ads:independent-No Ads:chain         -254.60025 -257.428975
## No Ads:independent-No Ads:chain          -299.65308 -302.481808
## New Ads:independent-Curr Ads:independent  -30.08250  -32.612588
## No Ads:independent-Curr Ads:independent   -75.13533  -77.665422
## No Ads:independent-New Ads:independent    -45.05283  -47.582922
##                                                  upr     p adj
## New Ads:chain-Curr Ads:chain                3.272963 0.9999854
## No Ads:chain-Curr Ads:chain               -87.726787 0.0000000
## Curr Ads:independent-Curr Ads:chain      -312.514525 0.0000000
## New Ads:independent-Curr Ads:chain       -342.597025 0.0000000
## No Ads:independent-Curr Ads:chain        -387.649859 0.0000000
## No Ads:chain-New Ads:chain                -87.901037 0.0000000
## Curr Ads:independent-New Ads:chain       -312.688775 0.0000000
## New Ads:independent-New Ads:chain        -342.771275 0.0000000
## No Ads:independent-New Ads:chain         -387.824109 0.0000000
## Curr Ads:independent-No Ads:chain        -221.689025 0.0000000
## New Ads:independent-No Ads:chain         -251.771525 0.0000000
## No Ads:independent-No Ads:chain          -296.824359 0.0000000
## New Ads:independent-Curr Ads:independent  -27.552412 0.0000000
## No Ads:independent-Curr Ads:independent   -72.605245 0.0000000
## No Ads:independent-New Ads:independent    -42.522745 0.0000000

plot(TukeyHSD(twoWayfit))

pairwise.t.test(pageViews, interaction(adType, restaurantType), data = dataset, p.adjust.method = "BH", pool.sd = FALSE)

## 
##  Pairwise comparisons using t tests with non-pooled SD 
## 
## data:  pageViews and interaction(adType, restaurantType) 
## 
##                      Curr Ads.chain New Ads.chain No Ads.chain
## New Ads.chain        0.9            -             -           
## No Ads.chain         <2e-16         <2e-16        -           
## Curr Ads.independent <2e-16         <2e-16        <2e-16      
## New Ads.independent  <2e-16         <2e-16        <2e-16      
## No Ads.independent   <2e-16         <2e-16        <2e-16      
##                      Curr Ads.independent New Ads.independent
## New Ads.chain        -                    -                  
## No Ads.chain         -                    -                  
## Curr Ads.independent -                    -                  
## New Ads.independent  <2e-16               -                  
## No Ads.independent   <2e-16               <2e-16             
## 
## P value adjustment method: BH

#Q. 18. Run ANOVA test if the normality is not satisfied. 
# Two-way ANOVA after transformation using boxcox
twoWayTransfit <- aov(pageViewsNew ~ adType * restaurantType, data = dataset)
# summary of the ANOVA model
summary(twoWayTransfit)

##                          Df Sum Sq Mean Sq F value Pr(>F)    
## adType                    2   5.44    2.72    7525 <2e-16 ***
## restaurantType            1  81.43   81.43  225414 <2e-16 ***
## adType:restaurantType     2   0.48    0.24     670 <2e-16 ***
## Residuals             29994  10.83    0.00                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Anderson-Darling normality test after transformation
ad.test(pageViewsNew)

## 
##  Anderson-Darling normality test
## 
## data:  pageViewsNew
## A = 933.12, p-value < 2.2e-16

#Q. 19. Run ANOVA test if variances are not equal.
# After Log-transformation
leveneTest(pageViewsNew ~ adType * restaurantType, data = dataset)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     5   656.9 < 2.2e-16 ***
##       29994                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Assignment 5 - ANOVA

Anshumaan

27 November 2018