# NOTE(review): this removes every (non-hidden) object from the environment the
# script is run in, including anything the user had before running it.
# Consider dropping it and running the script in a fresh R session instead.
rm(list=ls())
#Loading the data GSS2018.rda
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Locate and load the GSS 2018 extract, then keep it under the name gss_2018.
# NOTE(review): srcFdr is a machine-specific absolute path; adjust it (or make
# it configurable) before running on another machine.
srcFdr <- "D:\\D Drive\\Certificate Course\\data"
fileNm <- "gss2018.rda"
# file.path() joins the components with a separator R accepts on every
# platform, instead of pasting a hard-coded "\\".
srcFile <- file.path(srcFdr, fileNm)
if (!file.exists(srcFile)) {
  stop("GSS data file not found: ", srcFile, call. = FALSE)
}
# load() restores the saved objects into the current environment and returns
# their names; the rest of the script needs an object called GSS.
loadedObjs <- load(srcFile)
if (!("GSS" %in% loadedObjs)) {
  stop("Expected an object named 'GSS' in ", srcFile, call. = FALSE)
}
gss_2018 <- GSS
rm(GSS)
# Clean and recode the five GSS variables used in this analysis.
# Step 1: the listed numeric codes are converted to NA (presumably the survey's
#         not-applicable / don't-know / no-answer codes - confirm against the
#         GSS codebook).
# Step 2: SEX, HAPPY and DEGREE become labelled factors. factor() assigns the
#         labels to the sorted distinct values, so this relies on exactly
#         2 / 3 / 5 distinct non-missing codes remaining after step 1.
gssCln <- gss_2018 %>%
  select(HAPPY, SEX, DEGREE, USETECH, AGE) %>%
  mutate(
    HAPPY = HAPPY %>% na_if(0) %>% na_if(8) %>% na_if(9),
    DEGREE = DEGREE %>% na_if(8) %>% na_if(9),
    USETECH = USETECH %>% na_if(-1) %>% na_if(998) %>% na_if(999),
    AGE = AGE %>% na_if(99) %>% na_if(98)
  ) %>%
  mutate(
    SEX = factor(SEX, labels = c("M", "F")),
    HAPPY = factor(
      HAPPY,
      labels = c("Very Happy", "Pretty Happy", "Not too Happy")
    ),
    DEGREE = factor(
      DEGREE,
      labels = c("<High Sc", "High Sc", "JC", "Bachelor", "Graduate")
    )
  )
# Mean and standard deviation of technology use within each degree level.
# Rows with a missing USETECH are dropped first, so mean() and sd() need no
# na.rm argument.
useDeg <- gssCln %>%
  drop_na(USETECH) %>%
  group_by(DEGREE) %>%
  summarise(
    aveT = mean(USETECH),
    sdT = sd(USETECH)
  )
useDeg
## # A tibble: 5 × 3
## DEGREE aveT sdT
## <fct> <dbl> <dbl>
## 1 <High Sc 24.8 36.2
## 2 High Sc 49.6 38.6
## 3 JC 62.4 35.2
## 4 Bachelor 67.9 32.1
## 5 Graduate 68.7 30.2
# Boxplot of technology use for each degree group, with the raw points overlaid.
# Rows with a missing USETECH are dropped before plotting (as the later plots
# in this script already do), so ggplot2 no longer warns about removed rows.
gssCln %>%
  drop_na(USETECH) %>%
  ggplot(aes(x = DEGREE, y = USETECH, fill = DEGREE)) +
  geom_boxplot() +
  geom_point(alpha = .4) +
  labs(x = "Degree", y = "Technology Usage(%)")

# Boxplot of technology use for each degree group, with the raw points jittered
# (random horizontal/vertical noise) so overlapping observations are visible.
# Rows with a missing USETECH are dropped before plotting (as the later plots
# in this script already do), so ggplot2 no longer warns about removed rows.
gssCln %>%
  drop_na(USETECH) %>%
  ggplot(aes(x = DEGREE, y = USETECH, fill = DEGREE)) +
  geom_boxplot() +
  geom_jitter(alpha = .4) +
  labs(x = "Degree", y = "Technology Usage(%)")

# Density plot of technology use, one panel per degree group.
# geom_density() is the dedicated layer for a kernel density estimate; the
# previous geom_histogram(stat = "density") produced an "Ignoring unknown
# parameters" warning because the histogram's binning arguments do not apply
# to the density stat. Missing USETECH rows are dropped up front so no
# removed-rows warning is emitted either.
gssCln %>%
  drop_na(USETECH) %>%
  ggplot(aes(x = USETECH, fill = DEGREE)) +
  geom_density() +
  facet_wrap(~DEGREE, nrow = 3) +
  theme_minimal() +
  labs(x = "Technology Usage", y = "Density")

#1st case study for One-way anova
# Hand-computed one-way ANOVA for three small groups, cross-checked against
# oneway.test(). Group sizes and degrees of freedom are derived from the data
# instead of being hard-coded, so the arithmetic stays correct if the vectors
# are edited or become unequal in length.
#Data of three groups
water <- c(10, 12, 18, 24, 36)
juice <- c(11, 14, 19, 23, 38)
cof <- c(12, 13, 17, 25, 37)
#Calculate the mean of the groups
mwater <- mean(water)
mjuice <- mean(juice)
mcof <- mean(cof)
#Loading means into a vector
gm <- c(mwater, mjuice, mcof)
gm
## [1] 20.0 21.0 20.8
#Group sizes, number of groups and total number of observations
grpN <- c(length(water), length(juice), length(cof))
nGrp <- length(grpN)
nObs <- sum(grpN)
#Squared deviations of each observation from its own group mean
ssw <- (water - mwater)^2
ssj <- (juice - mjuice)^2
ssc <- (cof - mcof)^2
#Mean square within: pooled within-group sum of squares / (N - k)
MSW <- (sum(ssw) + sum(ssj) + sum(ssc)) / (nObs - nGrp)
#Grand mean over all observations (equals the mean of the group means only
#when the groups are the same size)
gb <- mean(c(water, juice, cof))
gb
## [1] 20.6
#Mean square between: size-weighted squared deviations of the group means
#from the grand mean / (k - 1)
MSB <- sum(grpN * (gm - gb)^2) / (nGrp - 1)
#F-statistic
MSB / MSW
## [1] 0.01273885
#P-value from the F distribution (upper tail)
pf(MSB / MSW, df1 = nGrp - 1, df2 = nObs - nGrp, lower.tail = FALSE)
## [1] 0.9873553
#Same example using R
grp <- rep(c("g1", "g2", "g3"), times = grpN)
grp
## [1] "g1" "g1" "g1" "g1" "g1" "g2" "g2" "g2" "g2" "g2" "g3" "g3" "g3" "g3" "g3"
t <- c(water, juice, cof)
df <- data.frame(grp, t)
oneway.test(df$t ~ df$grp, var.equal = TRUE)
##
## One-way analysis of means
##
## data: df$t and df$grp
## F = 0.012739, num df = 2, denom df = 12, p-value = 0.9874
#2nd case study for One-way anova
# Same hand computation as a one-way ANOVA on three groups whose means are far
# apart relative to their spread. Group sizes and degrees of freedom are
# derived from the data instead of being hard-coded.
water <- c(29, 29, 30, 31, 31)
juice <- c(17, 18, 19, 19, 20)
cof <- c(10, 11, 12, 12, 13)
#Calculate the mean of the groups
mwater <- mean(water)
mjuice <- mean(juice)
mcof <- mean(cof)
#Loading means into a vector
gm <- c(mwater, mjuice, mcof)
gm
## [1] 30.0 18.6 11.6
#Group sizes, number of groups and total number of observations
grpN <- c(length(water), length(juice), length(cof))
nGrp <- length(grpN)
nObs <- sum(grpN)
#Squared deviations of each observation from its own group mean
ssw <- (water - mwater)^2
ssj <- (juice - mjuice)^2
ssc <- (cof - mcof)^2
#Mean square within: pooled within-group sum of squares / (N - k)
MSW <- (sum(ssw) + sum(ssj) + sum(ssc)) / (nObs - nGrp)
#Grand mean over all observations (equals the mean of the group means only
#when the groups are the same size)
gb <- mean(c(water, juice, cof))
gb
## [1] 20.06667
#Mean square between: size-weighted squared deviations of the group means
#from the grand mean / (k - 1)
MSB <- sum(grpN * (gm - gb)^2) / (nGrp - 1)
#F-statistic
MSB / MSW
## [1] 359.3889
#P-value from the F distribution (upper tail)
pf(MSB / MSW, df1 = nGrp - 1, df2 = nObs - nGrp, lower.tail = FALSE)
## [1] 1.960539e-11
#Same example using R
grp <- rep(c("g1", "g2", "g3"), times = grpN)
grp
## [1] "g1" "g1" "g1" "g1" "g1" "g2" "g2" "g2" "g2" "g2" "g3" "g3" "g3" "g3" "g3"
t <- c(water, juice, cof)
df <- data.frame(grp, t)
oneway.test(df$t ~ df$grp, var.equal = TRUE)
##
## One-way analysis of means
##
## data: df$t and df$grp
## F = 359.39, num df = 2, denom df = 12, p-value = 1.961e-11
# One-way ANOVA for the GSS case study: technology use compared across degree
# groups, with equal group variances assumed (classic F test).
oneway.test(
  formula = gssCln$USETECH ~ gssCln$DEGREE,
  var.equal = TRUE
)
##
## One-way analysis of means
##
## data: gssCln$USETECH and gssCln$DEGREE
## F = 43.304, num df = 4, denom df = 1404, p-value < 2.2e-16
#Post hoc analysis - Bonferroni
# Pairwise t tests between degree groups with Bonferroni-adjusted p-values.
# The argument name and the method are written out in full rather than relying
# on partial matching of `p.adj` / "bonf".
pairwise.t.test(
  x = gssCln$USETECH,
  g = gssCln$DEGREE,
  p.adjust.method = "bonferroni"
)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: gssCln$USETECH and gssCln$DEGREE
##
## <High Sc High Sc JC Bachelor
## High Sc 3.8e-11 - - -
## JC 2.8e-15 0.0022 - -
## Bachelor < 2e-16 8.0e-13 1.0000 -
## Graduate < 2e-16 7.3e-09 1.0000 1.0000
##
## P value adjustment method: bonferroni
# Show the per-degree means and SDs (aveT, sdT) again so they can be read
# alongside the pairwise comparisons.
useDeg
## # A tibble: 5 × 3
## DEGREE aveT sdT
## <fct> <dbl> <dbl>
## 1 <High Sc 24.8 36.2
## 2 High Sc 49.6 38.6
## 3 JC 62.4 35.2
## 4 Bachelor 67.9 32.1
## 5 Graduate 68.7 30.2
# Post hoc analysis - Tukey HSD on the one-way aov fit of technology use by
# degree group.
TukeyHSD(
  x = aov(formula = gssCln$USETECH ~ gssCln$DEGREE)
)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = gssCln$USETECH ~ gssCln$DEGREE)
##
## $`gssCln$DEGREE`
## diff lwr upr p adj
## High Sc-<High Sc 24.8247754 15.145211 34.50434 0.0000000
## JC-<High Sc 37.6070313 25.201887 50.01218 0.0000000
## Bachelor-<High Sc 43.0859568 32.653180 53.51873 0.0000000
## Graduate-<High Sc 43.9107249 32.256416 55.56503 0.0000000
## JC-High Sc 12.7822558 3.362603 22.20191 0.0020352
## Bachelor-High Sc 18.2611813 11.651711 24.87065 0.0000000
## Graduate-High Sc 19.0859494 10.679691 27.49221 0.0000000
## Bachelor-JC 5.4789255 -4.713166 15.67102 0.5833665
## Graduate-JC 6.3036936 -5.135659 17.74305 0.5592907
## Graduate-Bachelor 0.8247681 -8.438819 10.08835 0.9992282
# Check the normality assumption: normal Q-Q plot of technology use, one panel
# per degree group, with a red reference line.
# The points are coloured via `colour`: the default point shape has no fill,
# so the previous `fill = "blue"` had no visible effect. The axis-label typos
# are corrected as well.
gssCln %>%
  drop_na(USETECH) %>%
  ggplot(aes(sample = USETECH)) +
  stat_qq(colour = "blue", alpha = .4) +
  stat_qq_line(colour = "red") +
  facet_wrap(~DEGREE) +
  labs(
    x = "Theoretical Distribution",
    y = "Technology Usage"
  )

#Check Normality Assumption test
# Shapiro-Wilk test on USETECH pooled over all respondents (no grouping here;
# the per-degree tests follow below).
shapiro.test(gssCln$USETECH)
##
## Shapiro-Wilk normality test
##
## data: gssCln$USETECH
## W = 0.8645, p-value < 2.2e-16
# Shapiro-Wilk p-value computed separately within each degree group
# (rows with a missing USETECH are removed first).
gssCln %>%
  drop_na(USETECH) %>%
  group_by(DEGREE) %>%
  summarise(
    pval = shapiro.test(USETECH)$p.value
  )
## # A tibble: 5 × 2
## DEGREE pval
## <fct> <dbl>
## 1 <High Sc 1.83e-14
## 2 High Sc 5.99e-24
## 3 JC 2.92e- 9
## 4 Bachelor 1.22e-16
## 5 Graduate 4.34e-11
# Brown-Forsythe-style check of equal spread across degree groups:
# st is each observation's absolute deviation from its own group's median, and
# oneway.test() (default var.equal = FALSE, i.e. not assuming equal variances)
# then compares the mean of st across groups.
# ungroup() is added so np is not left as a grouped tibble after the per-group
# median has been computed.
np <- gssCln %>%
  drop_na(USETECH) %>%
  group_by(DEGREE) %>%
  mutate(st = abs(USETECH - median(USETECH))) %>%
  ungroup()
oneway.test(st ~ DEGREE, data = np)
##
## One-way analysis of means (not assuming equal variances)
##
## data: st and DEGREE
## F = 19.747, num df = 4.00, denom df = 364.77, p-value = 9.965e-15
# Two-way ANOVA section
# Side-by-side boxplots of technology use for each degree group, split and
# filled by sex.
ggplot(
  data = drop_na(gssCln, USETECH),
  mapping = aes(x = DEGREE, y = USETECH, fill = SEX)
) +
  geom_boxplot(alpha = .4) +
  scale_fill_manual(values = c("blue", "magenta")) +
  theme_minimal() +
  labs(
    x = "Educational attainment",
    y = "Percent of time spent using technology"
  )

# Calculate the mean and standard deviation of technology use for every
# combination of degree and sex (missing USETECH rows removed first).
gssCln %>%
  drop_na(USETECH) %>%
  group_by(DEGREE, SEX) %>%
  summarise(
    useA = mean(USETECH),
    useSD = sd(USETECH)
  )
## `summarise()` has grouped output by 'DEGREE'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 4
## # Groups: DEGREE [5]
## DEGREE SEX useA useSD
## <fct> <fct> <dbl> <dbl>
## 1 <High Sc M 25.7 35.4
## 2 <High Sc F 23.7 37.5
## 3 High Sc M 43.5 37.8
## 4 High Sc F 55.9 38.6
## 5 JC M 47.0 36.8
## 6 JC F 70.4 31.7
## 7 Bachelor M 68.0 33.1
## 8 Bachelor F 67.7 31.2
## 9 Graduate M 72.1 29.2
## 10 Graduate F 65.9 30.9
# Mean plot: average technology use per degree group, one line per sex.
# stat_summary() takes the summary function through `fun`; the `fun.y` argument
# used previously is deprecated since ggplot2 3.3.0 and produced a warning.
gssCln %>%
  drop_na(USETECH) %>%
  ggplot(aes(x = DEGREE, y = USETECH, colour = SEX)) +
  stat_summary(fun = mean, geom = "point", size = 3) +
  # group = SEX connects the group means of each sex across the degree levels
  stat_summary(fun = mean, geom = "line", aes(group = SEX)) +
  theme_minimal() +
  labs(
    x = "Educational attainment",
    y = "Percent of time spent using technology"
  ) +
  ylim(0, 100)

# Two-way ANOVA: technology use modelled by degree, sex and their interaction
# (`*` expands to both main effects plus DEGREE:SEX).
techuseByDegSex <- aov(
  formula = gssCln$USETECH ~ gssCln$DEGREE * gssCln$SEX
)
summary(object = techuseByDegSex)
## Df Sum Sq Mean Sq F value Pr(>F)
## gssCln$DEGREE 4 221301 55325 44.209 < 2e-16 ***
## gssCln$SEX 1 16473 16473 13.163 0.000296 ***
## gssCln$DEGREE:gssCln$SEX 4 26510 6627 5.296 0.000311 ***
## Residuals 1399 1750775 1251
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 936 observations deleted due to missingness
# Post hoc analysis: Tukey HSD comparisons for every term of the two-way fit
# (degree, sex, and the degree-by-sex cells).
TukeyHSD(x = techuseByDegSex)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = gssCln$USETECH ~ gssCln$DEGREE * gssCln$SEX)
##
## $`gssCln$DEGREE`
## diff lwr upr p adj
## High Sc-<High Sc 24.8247754 15.244768 34.404783 0.0000000
## JC-<High Sc 37.6070313 25.329478 49.884584 0.0000000
## Bachelor-<High Sc 43.0859568 32.760484 53.411429 0.0000000
## Graduate-<High Sc 43.9107249 32.376284 55.445165 0.0000000
## JC-High Sc 12.7822558 3.459487 22.105024 0.0017563
## Bachelor-High Sc 18.2611813 11.719691 24.802671 0.0000000
## Graduate-High Sc 19.0859494 10.766152 27.405746 0.0000000
## Bachelor-JC 5.4789255 -4.608337 15.566188 0.5733923
## Graduate-JC 6.3036936 -5.018002 17.625389 0.5490670
## Graduate-Bachelor 0.8247681 -8.343540 9.993076 0.9991960
##
## $`gssCln$SEX`
## diff lwr upr p adj
## F-M 6.80899 3.108699 10.50928 0.0003174
##
## $`gssCln$DEGREE:gssCln$SEX`
## diff lwr upr p adj
## High Sc:M-<High Sc:M 17.8132060 2.7275183 32.8988937 0.0072699
## JC:M-<High Sc:M 21.3181818 -0.4992077 43.1355713 0.0619111
## Bachelor:M-<High Sc:M 42.3151914 25.7902764 58.8401064 0.0000000
## Graduate:M-<High Sc:M 46.3538961 27.5496712 65.1581210 0.0000000
## <High Sc:F-<High Sc:M -2.0378788 -22.6075109 18.5317533 0.9999995
## High Sc:F-<High Sc:M 30.1500000 15.0344692 45.2655308 0.0000000
## JC:F-<High Sc:M 44.7418831 26.3028236 63.1809427 0.0000000
## Bachelor:F-<High Sc:M 42.0396406 25.8082011 58.2710800 0.0000000
## Graduate:F-<High Sc:M 40.1813241 22.0984520 58.2641962 0.0000000
## JC:M-High Sc:M 3.5049758 -14.4610385 21.4709901 0.9998264
## Bachelor:M-High Sc:M 24.5019854 13.5542915 35.4496792 0.0000000
## Graduate:M-High Sc:M 28.5406901 14.3851943 42.6961858 0.0000000
## <High Sc:F-High Sc:M -19.8510848 -36.2793820 -3.4227876 0.0052315
## High Sc:F-High Sc:M 12.3367940 3.6616307 21.0119573 0.0003049
## JC:F-High Sc:M 26.9286771 13.2619985 40.5953557 0.0000000
## Bachelor:F-High Sc:M 24.2264346 13.7269673 34.7259018 0.0000000
## Graduate:F-High Sc:M 22.3681181 9.1859540 35.5502821 0.0000039
## Bachelor:M-JC:M 20.9970096 1.8065820 40.1874372 0.0192892
## Graduate:M-JC:M 25.0357143 3.8508477 46.2205808 0.0071871
## <High Sc:F-JC:M -23.3560606 -46.1224714 -0.5896498 0.0389231
## High Sc:F-JC:M 8.8318182 -9.1592621 26.8228985 0.8690307
## JC:F-JC:M 23.4237013 2.5622868 44.2851158 0.0141081
## Bachelor:F-JC:M 20.7214588 1.7831557 39.6597618 0.0192858
## Graduate:F-JC:M 18.8631423 -1.6841193 39.4104039 0.1039186
## Graduate:M-Bachelor:M 4.0387047 -11.6416301 19.7190396 0.9983501
## <High Sc:F-Bachelor:M -44.3530702 -62.1121183 -26.5940220 0.0000000
## High Sc:F-Bachelor:M -12.1651914 -23.1539720 -1.1764108 0.0167764
## JC:F-Bachelor:M 2.4266917 -12.8138117 17.6671952 0.9999688
## Bachelor:F-Bachelor:M -0.2755508 -12.7548798 12.2037783 1.0000000
## Graduate:F-Bachelor:M -2.1338673 -16.9414427 12.6737082 0.9999867
## <High Sc:F-Graduate:M -48.3917749 -68.2892584 -28.4942914 0.0000000
## High Sc:F-Graduate:M -16.2038961 -30.3911918 -2.0166004 0.0113631
## JC:F-Graduate:M -1.6120130 -19.2981376 16.0741116 0.9999998
## Bachelor:F-Graduate:M -4.3142555 -19.6849976 11.0564866 0.9967894
## Graduate:F-Graduate:M -6.1725720 -23.4870269 11.1418829 0.9816675
## High Sc:F-<High Sc:F 32.1878788 15.7321731 48.6435845 0.0000000
## JC:F-<High Sc:F 46.7797619 27.2270154 66.3325084 0.0000000
## Bachelor:F-<High Sc:F 44.0775194 26.5912218 61.5638170 0.0000000
## Graduate:F-<High Sc:F 42.2192029 23.0019908 61.4364150 0.0000000
## JC:F-High Sc:F 14.5918831 0.8922699 28.2914963 0.0261888
## Bachelor:F-High Sc:F 11.8896406 1.3473395 22.4319416 0.0133486
## Graduate:F-High Sc:F 10.0313241 -3.1849820 23.2476303 0.3233313
## Bachelor:F-JC:F -2.7022425 -17.6240305 12.2195454 0.9999069
## Graduate:F-JC:F -4.5605590 -21.4777217 12.3566037 0.9976459
## Graduate:F-Bachelor:F -1.8583165 -16.3376501 12.6210171 0.9999951