#Load nhanes_2015-2016_ch6.csv file and other libraries
rm(list=ls())
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(tidyr)
library(lsr)
library(moments)
library(BSDA)
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
#Loading the data
setwd("D:\\D Drive\\Ph.D. Course Work\\Ph.D. 2024\\Data")
pres=read_csv("nhanes_2015-2016_ch6.csv")
## Rows: 9544 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cycle, file_name
## dbl (69): SEQN, SDDSRVYR, RIDSTATR, RIAGENDR, RIDAGEYR, RIDAGEMN, RIDRETH1, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(pres)
## [1] "SEQN" "cycle" "SDDSRVYR" "RIDSTATR" "RIAGENDR"
## [6] "RIDAGEYR" "RIDAGEMN" "RIDRETH1" "RIDRETH3" "RIDEXMON"
## [11] "RIDEXAGM" "DMQMILIZ" "DMQADFC" "DMDBORN4" "DMDCITZN"
## [16] "DMDYRSUS" "DMDEDUC3" "DMDEDUC2" "DMDMARTL" "RIDEXPRG"
## [21] "SIALANG" "SIAPROXY" "SIAINTRP" "FIALANG" "FIAPROXY"
## [26] "FIAINTRP" "MIALANG" "MIAPROXY" "MIAINTRP" "AIALANGA"
## [31] "DMDHHSIZ" "DMDFMSIZ" "DMDHHSZA" "DMDHHSZB" "DMDHHSZE"
## [36] "DMDHRGND" "DMDHRAGE" "DMDHRBR4" "DMDHREDU" "DMDHRMAR"
## [41] "DMDHSEDU" "WTINT2YR" "WTMEC2YR" "SDMVPSU" "SDMVSTRA"
## [46] "INDHHIN2" "INDFMIN2" "INDFMPIR" "PEASCCT1" "BPXCHR"
## [51] "BPAARM" "BPACSZ" "BPXPLS" "BPXPULS" "BPXPTY"
## [56] "BPXML1" "BPXSY1" "BPXDI1" "BPAEN1" "BPXSY2"
## [61] "BPXDI2" "BPAEN2" "BPXSY3" "BPXDI3" "BPAEN3"
## [66] "BPXSY4" "BPXDI4" "BPAEN4" "file_name" "begin_year"
## [71] "end_year"
# 1 sample t-test
presCln1=pres%>%drop_na(BPXSY1)%>%rename(systolic=BPXSY1)%>%
rename(daistolic=BPXDI1)
#Plot the histogram for systolic and diastolic pressure
plot1=presCln1%>%ggplot(aes(x=systolic,fill = systolic>120))+
geom_histogram(color="white")+
theme_minimal()+
labs(x="Systolic Pressure (mmhg)",y="particpants")+
scale_fill_manual(values=c("lightgreen","orange"),
labels=c("Normal","at risk"),
name="sytolic\nblood pressure")
plot2=presCln1%>%ggplot(aes(x=daistolic,fill = daistolic>80))+
geom_histogram(color="white")+
theme_minimal()+
labs(x="diastolic Pressure (mmhg)",y="particpants")+
scale_fill_manual(values=c("lightgreen","orange"),
labels=c("Normal","at risk"),
name="diastolic\nblood pressure")
grid.arrange(plot1,plot2,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

samp_stat1=presCln1%>%summarize(avgSP=mean(systolic),
sdSP=sd(systolic),
n=n())
#Density Plot
presCln1%>%ggplot(aes(x=systolic))+
geom_density()

#Boxplot
presCln1%>%ggplot(aes(y=systolic))+geom_boxplot()+
theme_minimal()

#T test intermidiate calculation
effect=samp_stat1$avgSP -120
noise_SE=samp_stat1$sdSP/sqrt(samp_stat1$n)
tstat=effect/noise_SE
tstat
## [1] 2.449079
x=seq(-3,3,.01)
prob=dt(x,samp_stat1$n-1)
tdgraph=data.frame(x,prob)
ggplot(tdgraph,aes(x=x,y=prob))+
geom_point(colour="red")+
labs(x="t",y="probability",title = "t-distribution")

#p_value
pt(tstat,samp_stat1$n-1,lower.tail = FALSE)
## [1] 0.007173036
#critical region with significance lvl 5%
qt(.025,samp_stat1$n-1,lower.tail = FALSE)
## [1] 1.960296
# t-test by R
t.test(pres$BPXSY1,mu=120,alternative = "greater")
##
## One Sample t-test
##
## data: pres$BPXSY1
## t = 2.4491, df = 7144, p-value = 0.007173
## alternative hypothesis: true mean is greater than 120
## 95 percent confidence interval:
## 120.1771 Inf
## sample estimates:
## mean of x
## 120.5394
t.test(pres$BPXSY1,mu=120,alternative = "two.sided")
##
## One Sample t-test
##
## data: pres$BPXSY1
## t = 2.4491, df = 7144, p-value = 0.01435
## alternative hypothesis: true mean is not equal to 120
## 95 percent confidence interval:
## 120.1077 120.9711
## sample estimates:
## mean of x
## 120.5394
#95% confidence Interval
tUpper=qt(.025,samp_stat1$n-1,lower.tail = FALSE)
tLower=qt(.025,samp_stat1$n-1,lower.tail = TRUE)
ciU=samp_stat1$avgSP+noise_SE*tUpper
ciL=samp_stat1$avgSP+noise_SE*tLower
ci=c(ciL,ciU)
ci
## [1] 120.1077 120.9711
#99% confidence Interval
tUpper=qt(.005,samp_stat1$n-1,lower.tail = FALSE)
tLower=qt(.005,samp_stat1$n-1,lower.tail = TRUE)
ciU=samp_stat1$avgSP+noise_SE*tUpper
ciL=samp_stat1$avgSP+noise_SE*tLower
ci=c(ciL,ciU)
ci
## [1] 119.9719 121.1069
#Effect size statistics
d=effect/samp_stat1$sdSP
d
## [1] 0.02897354
cohensD(pres$BPXSY1,mu=120)
## [1] 0.02897354
#People age >=65
samp_stat65=pres%>%drop_na(BPXSY1)%>%filter(RIDAGEYR>=65)
t.test(samp_stat65$BPXSY1,mu=120)
##
## One Sample t-test
##
## data: samp_stat65$BPXSY1
## t = 29.238, df = 1232, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 120
## 95 percent confidence interval:
## 135.4832 137.7106
## sample estimates:
## mean of x
## 136.5969
# 2 sample t-test
unique(pres$RIAGENDR)
## [1] 1 2
#change the value 1 to male and 2 to female
presCln21=pres%>%
mutate(RIAGENDR=as.character(RIAGENDR))%>%
mutate(RIAGENDR=recode_factor(RIAGENDR,'1'="M",'2'="F"))%>%
rename(sex=RIAGENDR,systolic=BPXSY1)
#Mean systolic pressure for male and female
samp_stat2=presCln21%>%drop_na(systolic)%>%
group_by(sex)%>%
summarize(avep=mean(systolic),
sdp=sd(systolic),
n=n())
#density plot of Mean systolic pressure for male and female
presCln21%>%
ggplot(aes(x=systolic,fill=sex))+
geom_density(alpha=.4)+
labs(x="systolic Blood Pressure",y="Probability density")
## Warning: Removed 2399 rows containing non-finite outside the scale range
## (`stat_density()`).

#Calculating t statistics
tstat=(samp_stat2[1,2]-samp_stat2[2,2])/
sqrt(samp_stat2[1,3]^2/samp_stat2[1,4]+samp_stat2[2,3]^2/samp_stat2[2,4])
tstat
## avep
## 1 7.313489
#t distribution curve with df=7143
x=seq(-8,8,.01)
prob=dt(x,7143)
tdgraph=data.frame(x,prob)
ggplot(tdgraph,aes(x=x,y=prob))+
geom_point(colour="red")+
labs(x="t",y="probability",title = "t-distribution")

#Calculating p-value for two tailed test
p_value=pt(q=-7.313489,df=7143,lower.tail = TRUE)+
pt(q=7.313489,df=7143,lower.tail = FALSE)
p_value
## [1] 2.886288e-13
p_value=2*pt(q=7.313489,df=7143,lower.tail = FALSE)
p_value
## [1] 2.886288e-13
p_value=2*pt(q=-7.313489,df=7143,lower.tail = TRUE)
p_value
## [1] 2.886288e-13
p_value=2*pt(q=7.313489,df=7143,lower.tail = FALSE)
p_value
## [1] 2.886288e-13
#2-sample t-test in R assuming variance unequal
t.test(presCln21$systolic~presCln21$sex)
##
## Welch Two Sample t-test
##
## data: presCln21$systolic by presCln21$sex
## t = 7.3135, df = 7143, p-value = 2.886e-13
## alternative hypothesis: true difference in means between group M and group F is not equal to 0
## 95 percent confidence interval:
## 2.347882 4.067432
## sample estimates:
## mean in group M mean in group F
## 122.1767 118.9690
#2-sample t-test in R assuming variance equal
t.test(presCln21$systolic~presCln21$sex,var.equal=TRUE)
##
## Two Sample t-test
##
## data: presCln21$systolic by presCln21$sex
## t = 7.3071, df = 7143, p-value = 3.026e-13
## alternative hypothesis: true difference in means between group M and group F is not equal to 0
## 95 percent confidence interval:
## 2.347126 4.068187
## sample estimates:
## mean in group M mean in group F
## 122.1767 118.9690
#Effectsize statistics
d=(samp_stat2[1,2]-samp_stat2[2,2])/
sqrt((samp_stat2[1,3]^2+samp_stat2[2,3]^2)/2)
d
## avep
## 1 0.1730045
cohensD(presCln21$systolic~presCln21$sex,method="unequal")
## [1] 0.1730045
#Create the difference of systolic pressure
presCln22=pres%>%rename(systolic1=BPXSY1)%>%
rename(systolic2=BPXSY2)%>%
mutate(diff=systolic1-systolic2)%>%
mutate(RIAGENDR=recode_factor(RIAGENDR,"1"="M","2"="F"))%>%
rename(sex=RIAGENDR)%>%
drop_na(diff)
#Plot the histogram for pressure difference
presCln22%>%ggplot(aes(x=diff))+
geom_histogram(fill="blue",color="white")+
theme_minimal()+
labs(x="Systolic Pressure Difference (mmhg)",y="frequecy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

samp_stat3=presCln22%>%summarize(diffa=mean(diff),
diffsd=sd(diff),
n=n())
#T test intermidiate calculation
effect=samp_stat3$diffa
noise_SE=samp_stat3$diffsd/sqrt(samp_stat3$n)
tstat=effect/noise_SE
tstat
## [1] 9.376237
t.test(presCln22$systolic1,presCln22$systolic2,paired=TRUE)
##
## Paired t-test
##
## data: presCln22$systolic1 and presCln22$systolic2
## t = 9.3762, df = 7100, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 0.4310514 0.6589360
## sample estimates:
## mean difference
## 0.5449937
#Effectsize statistics
d=effect/samp_stat3$diffsd
d
## [1] 0.1112676
cohensD(pres$BPXSY1,pres$BPXSY2,method="paired")
## [1] 0.1112676
#t-test Assumptions - 1 sample t -test
presCln1%>%ggplot(aes(x=systolic))+
geom_histogram(fill="blue",color="white")+
theme_minimal()+
labs(x="Systolic Pressure (mmhg)",y="particpants")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Q-Q plot
presCln1%>%ggplot(aes(sample=systolic))+
stat_qq(colour="blue",alpha=.4)+
stat_qq_line(colour="red")+
labs(x="Theritcal Normal distribution",
y="systolic presurre")

#Calculating skewness
skewness(presCln1$systolic)
## [1] 1.070148
#t-test Assumptions - independent sample t -test
presCln21%>%ggplot(aes(x=systolic))+
geom_histogram(fill="blue",color="white")+
theme_minimal()+
facet_wrap(~sex,nrow = 2)+
labs(x="Systolic Pressure (mmhg)",y="particpants")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2399 rows containing non-finite outside the scale range
## (`stat_bin()`).

#Calculating skewness for Female
skewness(presCln21%>%filter(sex=="F")%>%select(systolic),
na.rm=TRUE)
## systolic
## 1.118464
#Calculating skewness for Male
skewness(presCln21%>%filter(sex=="M")%>%select(systolic),
na.rm=TRUE)
## systolic
## 1.061398
#Q-Q plot
presCln21%>%ggplot(aes(sample=systolic))+
stat_qq(colour="blue",alpha=.4)+
stat_qq_line(colour="red")+
facet_wrap(~sex)+
labs(x="Theritcal Normal distribution",
y="systolic presurre")
## Warning: Removed 2399 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 2399 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

#t-test Assumptions - dependent sample t -test
presCln22%>%ggplot(aes(x=diff))+
geom_histogram(fill="blue",color="white")+
theme_minimal()+
labs(x="Systolic Pressure Difference (mmhg)",y="frequecy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

skewness(presCln22%>%select(diff))
## diff
## 0.2351292
#Q-Q plot
presCln22%>%ggplot(aes(sample=diff))+
stat_qq(fill="blue",alpha=.4)+
stat_qq_line(colour="red")+
labs(x="Theritcal Normal distribution",
y="systolic presurre")+
theme_minimal()

#non-parametric test for 1 sample t test
median(presCln1$systolic,na.rm=TRUE)
## [1] 118
SIGN.test(presCln1$systolic,md=120)
##
## One-sample Sign-Test
##
## data: presCln1$systolic
## s = 3004, p-value < 2.2e-16
## alternative hypothesis: true median is not equal to 120
## 95 percent confidence interval:
## 116 118
## sample estimates:
## median of x
## 118
##
## Achieved and Interpolated Confidence Intervals:
##
## Conf.Level L.E.pt U.E.pt
## Lower Achieved CI 0.9477 116 118
## Interpolated CI 0.9500 116 118
## Upper Achieved CI 0.9505 116 118
#non-parametric test for dependent 2 sample t test
median(presCln22$diff,na.rm=TRUE)
## [1] 0
wilcox.test(presCln22$systolic1,presCln22$systolic2,paired=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: presCln22$systolic1 and presCln22$systolic2
## V = 9549959, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
#non-parametric test for independent 2 sample t test
wilcox.test(presCln21$systolic~presCln21$sex,paird=FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: presCln21$systolic by presCln21$sex
## W = 7186882, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0