Data Analysis using R

One-Way, Two-way anova test

Day7

———————————————————————–

rm(list=ls())
#Loading the data GSS2018.rda
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

srcFdr="D:\\D Drive\\Certificate Course\\data"
fileNm="gss2018.rda"
srcFile=paste(srcFdr,fileNm,sep="\\")
load(srcFile)
gss_2018=GSS
rm(GSS)
#Clean and transform the gss file
gssCln=gss_2018%>%select(HAPPY,SEX,DEGREE,USETECH,AGE)%>%
  mutate(HAPPY=na_if(HAPPY,0))%>%
  mutate(HAPPY=na_if(HAPPY,8))%>%
  mutate(HAPPY=na_if(HAPPY,9))%>%
  mutate(DEGREE=na_if(DEGREE,8))%>%
  mutate(DEGREE=na_if(DEGREE,9))%>%
  mutate(USETECH=na_if(USETECH,-1))%>%
  mutate(USETECH=na_if(USETECH,998))%>%
  mutate(USETECH=na_if(USETECH,999))%>%
  mutate(AGE=na_if(AGE,99))%>%
  mutate(AGE=na_if(AGE,98))%>%
  mutate(SEX=factor(x=SEX,labels = c("M","F")))%>%
  mutate(HAPPY=factor(x=HAPPY,labels = c("Very Happy",
                                         "Pretty Happy",
                                         "Not too Happy")))%>%
  mutate(DEGREE=factor(x=DEGREE,labels = c("<High Sc",
                                         "High Sc",
                                         "JC",
                                         "Bachelor",
                                         "Graduate")))

  
#mean and standard deviation by degree
useDeg=gssCln%>%drop_na(USETECH)%>%
  group_by(DEGREE)%>%
  summarize(aveT=mean(USETECH),
            sdT=sd(USETECH))
useDeg

## # A tibble: 5 × 3
##   DEGREE    aveT   sdT
##   <fct>    <dbl> <dbl>
## 1 <High Sc  24.8  36.2
## 2 High Sc   49.6  38.6
## 3 JC        62.4  35.2
## 4 Bachelor  67.9  32.1
## 5 Graduate  68.7  30.2

#Boxplot for different groups based on degree
gssCln%>%ggplot(aes(x=DEGREE,y=USETECH,fill=DEGREE))+
  geom_boxplot()+
  geom_point(alpha=.4)+
  labs(x="Degree",y="Technology Usage(%)")

## Warning: Removed 936 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 936 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Boxplot for different groups based on degree with random error
gssCln%>%ggplot(aes(x=DEGREE,y=USETECH,fill=DEGREE))+
  geom_boxplot()+
  geom_jitter(alpha=.4)+
  labs(x="Degree",y="Technology Usage(%)")

## Warning: Removed 936 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Removed 936 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Density plot for different degrees
gssCln%>%
  ggplot(aes(x=USETECH,fill=DEGREE))+
  geom_histogram(stat="density")+
  facet_wrap(~DEGREE,nrow=3)+
  theme_minimal()+
  labs(x="Technology Usage",y="Density")

## Warning in geom_histogram(stat = "density"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

## Warning: Removed 936 rows containing non-finite outside the scale range
## (`stat_density()`).

#1st case study for One-way anova
#Data of three groups
water=c(10,12,18,24,36)
juice=c(11,14,19,23,38)
cof=c(12,13,17,25,37)

#Calculate the mean of the groups
mwater=mean(water)
mjuice=mean(juice)
mcof=mean(cof)
#Loading means into a vector
gm=c(mwater,mjuice,mcof)
gm

## [1] 20.0 21.0 20.8

#calculate sum of square within group 
ssw=(water-mwater)^2
ssj=(juice-mjuice)^2
ssc=(cof-mcof)^2
#Calculate mean sum of square within group
MSW=(sum(ssw)+sum(ssj)+sum(ssc))/12
#Calculate grand mean
gb=mean(gm)
gb

## [1] 20.6

#Calculate mean square between groups
MSB=sum(5*(gm-gb)^2)/2
#F-Statitistics
MSB/MSW

## [1] 0.01273885

#P -value from F distribution
pf(MSB/MSW,df1=2,df2=12,lower.tail = FALSE)

## [1] 0.9873553

#Same example using R
grp=c(rep("g1",5),rep("g2",5),rep("g3",5))
grp

##  [1] "g1" "g1" "g1" "g1" "g1" "g2" "g2" "g2" "g2" "g2" "g3" "g3" "g3" "g3" "g3"

t=c(water,juice,cof)
df=data.frame(grp,t)
oneway.test(df$t~df$grp,var.equal = TRUE)

## 
##  One-way analysis of means
## 
## data:  df$t and df$grp
## F = 0.012739, num df = 2, denom df = 12, p-value = 0.9874

#2nd case study for One-way anova
water=c(29,29,30,31,31)
juice=c(17,18,19,19,20)
cof=c(10,11,12,12,13)
#Calculate the mean of the groups
mwater=mean(water)
mjuice=mean(juice)
mcof=mean(cof)
#Loading means into a vector
gm=c(mwater,mjuice,mcof)
gm

## [1] 30.0 18.6 11.6

#calculate sum of square within group 
ssw=(water-mwater)^2
ssj=(juice-mjuice)^2
ssc=(cof-mcof)^2
#Calculate mean sum of square within group
MSW=(sum(ssw)+sum(ssj)+sum(ssc))/12
#Calculate grand mean
gb=mean(gm)
gb

## [1] 20.06667

#Calculate mean square between groups
MSB=sum(5*(gm-gb)^2)/2
#F-Statitistics
MSB/MSW

## [1] 359.3889

#P -value from F distribution
pf(MSB/MSW,df1=2,df2=12,lower.tail = FALSE)

## [1] 1.960539e-11

#Same example using R
grp=c(rep("g1",5),rep("g2",5),rep("g3",5))
grp

##  [1] "g1" "g1" "g1" "g1" "g1" "g2" "g2" "g2" "g2" "g2" "g3" "g3" "g3" "g3" "g3"

t=c(water,juice,cof)
df=data.frame(grp,t)
oneway.test(df$t~df$grp,var.equal = TRUE)

## 
##  One-way analysis of means
## 
## data:  df$t and df$grp
## F = 359.39, num df = 2, denom df = 12, p-value = 1.961e-11

#One way anova for gss case study
oneway.test(gssCln$USETECH~gssCln$DEGREE,var.equal = TRUE)

## 
##  One-way analysis of means
## 
## data:  gssCln$USETECH and gssCln$DEGREE
## F = 43.304, num df = 4, denom df = 1404, p-value < 2.2e-16

#Post hoc analysis - Benferroni
pairwise.t.test(gssCln$USETECH,gssCln$DEGREE,
                p.adj ="bonf" )

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  gssCln$USETECH and gssCln$DEGREE 
## 
##          <High Sc High Sc JC     Bachelor
## High Sc  3.8e-11  -       -      -       
## JC       2.8e-15  0.0022  -      -       
## Bachelor < 2e-16  8.0e-13 1.0000 -       
## Graduate < 2e-16  7.3e-09 1.0000 1.0000  
## 
## P value adjustment method: bonferroni

useDeg

## # A tibble: 5 × 3
##   DEGREE    aveT   sdT
##   <fct>    <dbl> <dbl>
## 1 <High Sc  24.8  36.2
## 2 High Sc   49.6  38.6
## 3 JC        62.4  35.2
## 4 Bachelor  67.9  32.1
## 5 Graduate  68.7  30.2

#Post hoc analysis - TukeyHSd
TukeyHSD(aov(gssCln$USETECH~gssCln$DEGREE))

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = gssCln$USETECH ~ gssCln$DEGREE)
## 
## $`gssCln$DEGREE`
##                         diff       lwr      upr     p adj
## High Sc-<High Sc  24.8247754 15.145211 34.50434 0.0000000
## JC-<High Sc       37.6070313 25.201887 50.01218 0.0000000
## Bachelor-<High Sc 43.0859568 32.653180 53.51873 0.0000000
## Graduate-<High Sc 43.9107249 32.256416 55.56503 0.0000000
## JC-High Sc        12.7822558  3.362603 22.20191 0.0020352
## Bachelor-High Sc  18.2611813 11.651711 24.87065 0.0000000
## Graduate-High Sc  19.0859494 10.679691 27.49221 0.0000000
## Bachelor-JC        5.4789255 -4.713166 15.67102 0.5833665
## Graduate-JC        6.3036936 -5.135659 17.74305 0.5592907
## Graduate-Bachelor  0.8247681 -8.438819 10.08835 0.9992282

#Check Normality Assumption
gssCln%>%drop_na(USETECH)%>%
  ggplot(aes(sample=USETECH))+
  stat_qq(fill="blue",alpha=.4)+
  stat_qq_line(colour="red")+
  facet_wrap(~DEGREE)+
  labs(x="Theoritical Distribution",
      y="Trchnolgy Usage")

#Check Normality Assumption test
shapiro.test(gssCln$USETECH)

## 
##  Shapiro-Wilk normality test
## 
## data:  gssCln$USETECH
## W = 0.8645, p-value < 2.2e-16

gssCln%>%drop_na(USETECH)%>%
  group_by(DEGREE)%>%
  summarize(pval=shapiro.test(USETECH)$p.value)

## # A tibble: 5 × 2
##   DEGREE       pval
##   <fct>       <dbl>
## 1 <High Sc 1.83e-14
## 2 High Sc  5.99e-24
## 3 JC       2.92e- 9
## 4 Bachelor 1.22e-16
## 5 Graduate 4.34e-11

#Non-parametric Brown-Forsythe test
np=gssCln%>%drop_na(USETECH)%>%
  group_by(DEGREE)%>%
  mutate(st=abs(USETECH-median(USETECH)))
oneway.test(st~DEGREE,data=np)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  st and DEGREE
## F = 19.747, num df = 4.00, denom df = 364.77, p-value = 9.965e-15

#Two-way anova
#Boxplot by degree and sex
gssCln %>%drop_na(USETECH)%>%
  ggplot(aes(y = USETECH, x = DEGREE,fill=SEX)) +
  geom_boxplot( alpha = .4) +
  scale_fill_manual(values = c("blue", "magenta")) +
  theme_minimal() +
  labs(x = "Educational attainment", y = "Percent of time spent using technology")

#Calcuate mean and deviation for unique value of Degree and Sex
gssCln%>%drop_na(USETECH)%>%
  group_by(DEGREE,SEX)%>%
  summarise(useA=mean(USETECH),
            useSD=sd(USETECH))

## `summarise()` has grouped output by 'DEGREE'. You can override using the
## `.groups` argument.

## # A tibble: 10 × 4
## # Groups:   DEGREE [5]
##    DEGREE   SEX    useA useSD
##    <fct>    <fct> <dbl> <dbl>
##  1 <High Sc M      25.7  35.4
##  2 <High Sc F      23.7  37.5
##  3 High Sc  M      43.5  37.8
##  4 High Sc  F      55.9  38.6
##  5 JC       M      47.0  36.8
##  6 JC       F      70.4  31.7
##  7 Bachelor M      68.0  33.1
##  8 Bachelor F      67.7  31.2
##  9 Graduate M      72.1  29.2
## 10 Graduate F      65.9  30.9

#Mean plot based on degree and sex
gssCln%>%drop_na(USETECH)%>%
  ggplot(aes(x=DEGREE,y=USETECH,colour = SEX))+
  stat_summary(fun.y = mean,geom = "point",size=3)+
  stat_summary(fun.y = mean,geom = "line",aes(group = SEX))+
  theme_minimal()+
  labs(x = "Educational attainment", 
       y = "Percent of time spent using technology")+
  ylim(0,100)

## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#Two - way anova
techuseByDegSex=aov(gssCln$USETECH~gssCln$DEGREE*gssCln$SEX)
summary(techuseByDegSex)

##                            Df  Sum Sq Mean Sq F value   Pr(>F)    
## gssCln$DEGREE               4  221301   55325  44.209  < 2e-16 ***
## gssCln$SEX                  1   16473   16473  13.163 0.000296 ***
## gssCln$DEGREE:gssCln$SEX    4   26510    6627   5.296 0.000311 ***
## Residuals                1399 1750775    1251                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 936 observations deleted due to missingness

#Post Hoc analysis
TukeyHSD(techuseByDegSex)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = gssCln$USETECH ~ gssCln$DEGREE * gssCln$SEX)
## 
## $`gssCln$DEGREE`
##                         diff       lwr       upr     p adj
## High Sc-<High Sc  24.8247754 15.244768 34.404783 0.0000000
## JC-<High Sc       37.6070313 25.329478 49.884584 0.0000000
## Bachelor-<High Sc 43.0859568 32.760484 53.411429 0.0000000
## Graduate-<High Sc 43.9107249 32.376284 55.445165 0.0000000
## JC-High Sc        12.7822558  3.459487 22.105024 0.0017563
## Bachelor-High Sc  18.2611813 11.719691 24.802671 0.0000000
## Graduate-High Sc  19.0859494 10.766152 27.405746 0.0000000
## Bachelor-JC        5.4789255 -4.608337 15.566188 0.5733923
## Graduate-JC        6.3036936 -5.018002 17.625389 0.5490670
## Graduate-Bachelor  0.8247681 -8.343540  9.993076 0.9991960
## 
## $`gssCln$SEX`
##        diff      lwr      upr     p adj
## F-M 6.80899 3.108699 10.50928 0.0003174
## 
## $`gssCln$DEGREE:gssCln$SEX`
##                              diff         lwr         upr     p adj
## High Sc:M-<High Sc:M   17.8132060   2.7275183  32.8988937 0.0072699
## JC:M-<High Sc:M        21.3181818  -0.4992077  43.1355713 0.0619111
## Bachelor:M-<High Sc:M  42.3151914  25.7902764  58.8401064 0.0000000
## Graduate:M-<High Sc:M  46.3538961  27.5496712  65.1581210 0.0000000
## <High Sc:F-<High Sc:M  -2.0378788 -22.6075109  18.5317533 0.9999995
## High Sc:F-<High Sc:M   30.1500000  15.0344692  45.2655308 0.0000000
## JC:F-<High Sc:M        44.7418831  26.3028236  63.1809427 0.0000000
## Bachelor:F-<High Sc:M  42.0396406  25.8082011  58.2710800 0.0000000
## Graduate:F-<High Sc:M  40.1813241  22.0984520  58.2641962 0.0000000
## JC:M-High Sc:M          3.5049758 -14.4610385  21.4709901 0.9998264
## Bachelor:M-High Sc:M   24.5019854  13.5542915  35.4496792 0.0000000
## Graduate:M-High Sc:M   28.5406901  14.3851943  42.6961858 0.0000000
## <High Sc:F-High Sc:M  -19.8510848 -36.2793820  -3.4227876 0.0052315
## High Sc:F-High Sc:M    12.3367940   3.6616307  21.0119573 0.0003049
## JC:F-High Sc:M         26.9286771  13.2619985  40.5953557 0.0000000
## Bachelor:F-High Sc:M   24.2264346  13.7269673  34.7259018 0.0000000
## Graduate:F-High Sc:M   22.3681181   9.1859540  35.5502821 0.0000039
## Bachelor:M-JC:M        20.9970096   1.8065820  40.1874372 0.0192892
## Graduate:M-JC:M        25.0357143   3.8508477  46.2205808 0.0071871
## <High Sc:F-JC:M       -23.3560606 -46.1224714  -0.5896498 0.0389231
## High Sc:F-JC:M          8.8318182  -9.1592621  26.8228985 0.8690307
## JC:F-JC:M              23.4237013   2.5622868  44.2851158 0.0141081
## Bachelor:F-JC:M        20.7214588   1.7831557  39.6597618 0.0192858
## Graduate:F-JC:M        18.8631423  -1.6841193  39.4104039 0.1039186
## Graduate:M-Bachelor:M   4.0387047 -11.6416301  19.7190396 0.9983501
## <High Sc:F-Bachelor:M -44.3530702 -62.1121183 -26.5940220 0.0000000
## High Sc:F-Bachelor:M  -12.1651914 -23.1539720  -1.1764108 0.0167764
## JC:F-Bachelor:M         2.4266917 -12.8138117  17.6671952 0.9999688
## Bachelor:F-Bachelor:M  -0.2755508 -12.7548798  12.2037783 1.0000000
## Graduate:F-Bachelor:M  -2.1338673 -16.9414427  12.6737082 0.9999867
## <High Sc:F-Graduate:M -48.3917749 -68.2892584 -28.4942914 0.0000000
## High Sc:F-Graduate:M  -16.2038961 -30.3911918  -2.0166004 0.0113631
## JC:F-Graduate:M        -1.6120130 -19.2981376  16.0741116 0.9999998
## Bachelor:F-Graduate:M  -4.3142555 -19.6849976  11.0564866 0.9967894
## Graduate:F-Graduate:M  -6.1725720 -23.4870269  11.1418829 0.9816675
## High Sc:F-<High Sc:F   32.1878788  15.7321731  48.6435845 0.0000000
## JC:F-<High Sc:F        46.7797619  27.2270154  66.3325084 0.0000000
## Bachelor:F-<High Sc:F  44.0775194  26.5912218  61.5638170 0.0000000
## Graduate:F-<High Sc:F  42.2192029  23.0019908  61.4364150 0.0000000
## JC:F-High Sc:F         14.5918831   0.8922699  28.2914963 0.0261888
## Bachelor:F-High Sc:F   11.8896406   1.3473395  22.4319416 0.0133486
## Graduate:F-High Sc:F   10.0313241  -3.1849820  23.2476303 0.3233313
## Bachelor:F-JC:F        -2.7022425 -17.6240305  12.2195454 0.9999069
## Graduate:F-JC:F        -4.5605590 -21.4777217  12.3566037 0.9976459
## Graduate:F-Bachelor:F  -1.8583165 -16.3376501  12.6210171 0.9999951