library(data.table)
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(nortest)
library(car)

## Loading required package: carData

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

restaurant<-read.csv("C:/Users/ALKR/Desktop/Term V/DAM/Dataset/AdvertisingDataV2.csv")
attach(restaurant)

Q. 1. Create a frequency table using your crucial independent variable (x)

addmargins(table(adType))

## adType
## Curr Ads  New Ads   No Ads      Sum 
##    10000    10000    10000    30000

Q. 2. Create a table showing mean, sd, median, min, max, count of your crucial dependent variable.

dt <- data.table(restaurant)
dt[, list(Count = .N,
        mean = round(mean(pageViews), 3), 
        sd = round(mean(pageViews), 3),
        median = round(median(pageViews), 3),
        min = min(pageViews),
        max = max(pageViews)), 
   by = list(adType)]

##      adType Count    mean      sd median min max
## 1:   No Ads 10000 419.779 419.779    339 145 766
## 2: Curr Ads 10000 501.191 501.191    419 209 929
## 3:  New Ads 10000 483.211 483.211    384 188 918

Q. 3. Draw box plots, side-by-side, showing the distribution of the dependent variable (y) w.r.t. Independent variable (x).

Box plot of Page Views vs Ad Type

boxplot(pageViews ~ adType, data = restaurant,
        main = "Boxplot of Page Views vs Ad Type",
        xlab = "Ad Type", ylab = "Page Views")

Q. 4. Draw mean plots, showing the average value of the dependent variable (y) w.r.t. Independent variable (x).

plotmeans(pageViews ~ adType, data = restaurant,
          xlab = "Ad Type", ylab = "Page Views",
          digits=2, col = "black", ccol = "blue", barwidth = 2,
          legends = TRUE, mean.labels = TRUE, frame = TRUE)

## Warning in text.default(x, y, label = labels, col = col, ...): "frame" is
## not a graphical parameter

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is
## not a graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

Q. 5. Run one-way ANOVA of dependent variable (y) with respect to independent variable (x).

oneWayfit <- aov(pageViews ~ adType, data = restaurant)
summary(oneWayfit)

##                Df    Sum Sq  Mean Sq F value Pr(>F)    
## adType          2  36582190 18291095   675.9 <2e-16 ***
## Residuals   29997 811743988    27061                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value < 0.05, we can conclude that there are significant differences in page views between the different advertising types.

Q. 6. Check ANOVA assumptions for your analysis.

with(restaurant, tapply(pageViews, adType, ad.test))

## $`Curr Ads`
## 
##  Anderson-Darling normality test
## 
## data:  X[[i]]
## A = 589.56, p-value < 2.2e-16
## 
## 
## $`New Ads`
## 
##  Anderson-Darling normality test
## 
## data:  X[[i]]
## A = 739.4, p-value < 2.2e-16
## 
## 
## $`No Ads`
## 
##  Anderson-Darling normality test
## 
## data:  X[[i]]
## A = 671.15, p-value < 2.2e-16

Anderson Darling test Null hypothesis (H0): The data is normally distributed We reject the null hypothesis since p-value is less than 0.05. Hence, the data is not normally distributed.

leveneTest(pageViews ~ adType, data = restaurant)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     2  57.346 < 2.2e-16 ***
##       29997                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Check for homogeneity of variance Null hypothesis (H0): The variance of page views is homogenous across different types of Ads. We reject the null hypothesis. There is heterogeneity in variance of page views.

Q. 7. Generate one comparison test values and plot, for one-way ANOVA.

TukeyHSD(oneWayfit)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pageViews ~ adType, data = restaurant)
## 
## $adType
##                      diff      lwr      upr p adj
## New Ads-Curr Ads -17.9798 -23.4322 -12.5274     0
## No Ads-Curr Ads  -81.4114 -86.8638 -75.9590     0
## No Ads-New Ads   -63.4316 -68.8840 -57.9792     0

plot(TukeyHSD(oneWayfit))

Q. 8. Run ANOVA test if the normality is not satisfied.

Kruskal-Wallis rank sum test

kruskal.test(pageViews ~ adType, data = restaurant)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  pageViews by adType
## Kruskal-Wallis chi-squared = 2865.7, df = 2, p-value < 2.2e-16

pageViewsTrans <- BoxCoxTrans(pageViews)
pageViewsTrans

## Box-Cox Transformation
## 
## 30000 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   145.0   328.0   391.0   468.1   636.0   929.0 
## 
## Largest/Smallest: 6.41 
## Sample Skewness: 0.451 
## 
## Estimated Lambda: -0.3

Append the transformed variable to dataset

restaurant <- cbind(restaurant, pageViewsNew = predict(pageViewsTrans, pageViews))
attach(restaurant)

## The following objects are masked from restaurant (pos = 3):
## 
##     adType, businessID, pageViews, phoneCalls, reservations,
##     restaurantType

Change pageViews to transformed pageViews using BoxCox

oneWaytransfit <- aov(pageViewsNew ~ adType, data = restaurant)
summary(oneWaytransfit)

##                Df Sum Sq Mean Sq F value Pr(>F)    
## adType          2   5.44  2.7183   879.2 <2e-16 ***
## Residuals   29997  92.74  0.0031                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Q. 9.Run ANOVA test if variances are not equal.

Welch One-way Test. ANOVA test when variances are not same

oneway.test(pageViews ~ adType, data = restaurant)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  pageViews and adType
## F = 734.5, num df = 2, denom df = 19934, p-value < 2.2e-16

Q. 10. Create a frequency table using your crucial independent variables (x1, x2, x3,…).

addmargins(table(adType, restaurantType))

##           restaurantType
## adType     chain independent   Sum
##   Curr Ads  4000        6000 10000
##   New Ads   4000        6000 10000
##   No Ads    4000        6000 10000
##   Sum      12000       18000 30000

Q. 11. Create a frequency table (s) showing mean, sd, median, min, max, count of your crucial dependent variables (x1, x2, x3,…).

PageViews

dt <- data.table(restaurant)
dt[, list(Count = .N,
        mean = round(mean(pageViews), 3),
        sd = round(mean(pageViews), 3),
        median = round(median(pageViews), 3),
        min = min(pageViews),
        max = max(pageViews)),
   by = list(restaurantType, adType)]

##    restaurantType   adType Count    mean      sd median min max
## 1:          chain   No Ads  4000 599.571 599.571    599 437 766
## 2:          chain Curr Ads  4000 690.397 690.397    690 444 929
## 3:          chain  New Ads  4000 690.571 690.571    690 440 918
## 4:    independent   No Ads  6000 299.918 299.918    300 145 450
## 5:    independent Curr Ads  6000 375.053 375.053    375 209 530
## 6:    independent  New Ads  6000 344.971 344.971    345 188 483

PhoneCalls # b. Phone Calls

dt <- data.table(restaurant)
dt[, list(Count = .N,
        mean = round(mean(phoneCalls), 3),
        sd = round(mean(phoneCalls), 3),
        median = round(median(phoneCalls), 3),
        min = min(phoneCalls),
        max = max(phoneCalls)),
   by = list(restaurantType, adType)]

##    restaurantType   adType Count   mean     sd median min max
## 1:          chain   No Ads  4000 40.069 40.069     40  22  58
## 2:          chain Curr Ads  4000 44.021 44.021     44  25  63
## 3:          chain  New Ads  4000 48.079 48.079     48  19  77
## 4:    independent   No Ads  6000 29.986 29.986     30  17  45
## 5:    independent Curr Ads  6000 32.967 32.967     33  20  50
## 6:    independent  New Ads  6000 37.472 37.472     37  22  53

Q. 12. Draw box plots, side-by-side, showing the distribution of the dependent variable (y) w.r.t. Independent variables (x1, x2, x3,…).

Box plot of page views by ad type and restaurant type

bwplot(pageViews ~ adType | restaurantType, data = restaurant,
        main = "Boxplot of Page Views vs Ad Type and Restaurant Type",
        ylab = "Page Views",
        col = "black")

Q. 13. Draw mean plots, showing the average value of the dependent variable (y) w.r.t. Independent variables (x1, x2, x3,…).

Interaction Plot of Page Views by Ad Type and Restaurant Type

interaction.plot(adType, restaurantType, pageViews,
                 type = "b", col = c(1:3), leg.bty = "o",
                 leg.bg = "beige", lwd = 2, pch = c(18, 24, 22),
                 xlab = "Ad Type", ylab = "Page Views",
                 main = "Interaction Plot of Page Views by Ad Type and Restaurant Type")

Q. 14. Run two-way ANOVA of dependent variable (y) with respect to independent variables with interaction (x1, x2, x3,…). Choose your crucial independent variables for interaction.

ANOVA of Page Views by Ad Type and Restaurant Type

twoWayfit <- aov(pageViews ~ adType * restaurantType, data = restaurant)
summary(twoWayfit)

##                          Df    Sum Sq   Mean Sq  F value Pr(>F)    
## adType                    2  36582190  18291095   7734.8 <2e-16 ***
## restaurantType            1 738196252 738196252 312161.4 <2e-16 ***
## adType:restaurantType     2   2618217   1309109    553.6 <2e-16 ***
## Residuals             29994  70929518      2365                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Q. 15. Write test inference for Q14.

All the p-values are less than 0.05. Hence, we reject the null hypotheses. We can conclude that there are significant differences in page views between the ad type as well as restaurant type.

Q. 16. Check ANOVA assumptions for your analysis.

H0: Data is normally distributed Checking for normality in page Views using AD test

ad.test(pageViews)

## 
##  Anderson-Darling normality test
## 
## data:  pageViews
## A = 1432.3, p-value < 2.2e-16

Normality test is statistically significant, we reject the null hypothesis. Hence, we cannot assume normality in the data.

Checking for homogeneity of variance with Levene test

Null hypothesis (H0): The variance of page views are homogenous between groups.

leveneTest(pageViews ~ adType * restaurantType, data = restaurant)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     5  315.24 < 2.2e-16 ***
##       29994                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The test is statistically significant, we reject the null hypothesis. Hence, we cannot assume the homogeneity of variance between the groups.

Q. 17. Generate pairwise comparison test values and plot, for two-way ANOVA.

TukeyHSD(twoWayfit)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pageViews ~ adType * restaurantType, data = restaurant)
## 
## $adType
##                      diff       lwr       upr p adj
## New Ads-Curr Ads -17.9798 -19.59161 -16.36799     0
## No Ads-Curr Ads  -81.4114 -83.02321 -79.79959     0
## No Ads-New Ads   -63.4316 -65.04341 -61.81979     0
## 
## $restaurantType
##                        diff      lwr       upr p adj
## independent-chain -320.1988 -321.322 -319.0755     0
## 
## $`adType:restaurantType`
##                                                diff         lwr
## New Ads:chain-Curr Ads:chain                0.17425   -2.924463
## No Ads:chain-Curr Ads:chain               -90.82550  -93.924213
## Curr Ads:independent-Curr Ads:chain      -315.34325 -318.171975
## New Ads:independent-Curr Ads:chain       -345.42575 -348.254475
## No Ads:independent-Curr Ads:chain        -390.47858 -393.307308
## No Ads:chain-New Ads:chain                -90.99975  -94.098463
## Curr Ads:independent-New Ads:chain       -315.51750 -318.346225
## New Ads:independent-New Ads:chain        -345.60000 -348.428725
## No Ads:independent-New Ads:chain         -390.65283 -393.481558
## Curr Ads:independent-No Ads:chain        -224.51775 -227.346475
## New Ads:independent-No Ads:chain         -254.60025 -257.428975
## No Ads:independent-No Ads:chain          -299.65308 -302.481808
## New Ads:independent-Curr Ads:independent  -30.08250  -32.612588
## No Ads:independent-Curr Ads:independent   -75.13533  -77.665422
## No Ads:independent-New Ads:independent    -45.05283  -47.582922
##                                                  upr     p adj
## New Ads:chain-Curr Ads:chain                3.272963 0.9999854
## No Ads:chain-Curr Ads:chain               -87.726787 0.0000000
## Curr Ads:independent-Curr Ads:chain      -312.514525 0.0000000
## New Ads:independent-Curr Ads:chain       -342.597025 0.0000000
## No Ads:independent-Curr Ads:chain        -387.649859 0.0000000
## No Ads:chain-New Ads:chain                -87.901037 0.0000000
## Curr Ads:independent-New Ads:chain       -312.688775 0.0000000
## New Ads:independent-New Ads:chain        -342.771275 0.0000000
## No Ads:independent-New Ads:chain         -387.824109 0.0000000
## Curr Ads:independent-No Ads:chain        -221.689025 0.0000000
## New Ads:independent-No Ads:chain         -251.771525 0.0000000
## No Ads:independent-No Ads:chain          -296.824359 0.0000000
## New Ads:independent-Curr Ads:independent  -27.552412 0.0000000
## No Ads:independent-Curr Ads:independent   -72.605245 0.0000000
## No Ads:independent-New Ads:independent    -42.522745 0.0000000

plot(TukeyHSD(twoWayfit))

pairwise.t.test(pageViews, interaction(adType, restaurantType), data = restaurant, p.adjust.method = "BH", pool.sd = FALSE)

## 
##  Pairwise comparisons using t tests with non-pooled SD 
## 
## data:  pageViews and interaction(adType, restaurantType) 
## 
##                      Curr Ads.chain New Ads.chain No Ads.chain
## New Ads.chain        0.9            -             -           
## No Ads.chain         <2e-16         <2e-16        -           
## Curr Ads.independent <2e-16         <2e-16        <2e-16      
## New Ads.independent  <2e-16         <2e-16        <2e-16      
## No Ads.independent   <2e-16         <2e-16        <2e-16      
##                      Curr Ads.independent New Ads.independent
## New Ads.chain        -                    -                  
## No Ads.chain         -                    -                  
## Curr Ads.independent -                    -                  
## New Ads.independent  <2e-16               -                  
## No Ads.independent   <2e-16               <2e-16             
## 
## P value adjustment method: BH

Q. 18. Run ANOVA test if the normality is not satisfied.

Two-way ANOVA after transformation using boxcox

twoWayTransfit <- aov(pageViewsNew ~ adType * restaurantType, data = restaurant)
summary(twoWayTransfit)

##                          Df Sum Sq Mean Sq F value Pr(>F)    
## adType                    2   5.44    2.72    7525 <2e-16 ***
## restaurantType            1  81.43   81.43  225414 <2e-16 ***
## adType:restaurantType     2   0.48    0.24     670 <2e-16 ***
## Residuals             29994  10.83    0.00                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anderson-Darling normality test after transformation

ad.test(pageViewsNew)

## 
##  Anderson-Darling normality test
## 
## data:  pageViewsNew
## A = 933.12, p-value < 2.2e-16

Q. 19. Run ANOVA test if variances are not equal.

After Log-transformation

leveneTest(pageViewsNew ~ adType * restaurantType, data = restaurant)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     5   656.9 < 2.2e-16 ***
##       29994                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Restaurant

Sivaramakrishnan R

11/27/2018