Data Analysis using R

Two sample t test case study(parametric & non parametric)

Day5

———————————————————————–

#Load nhanes_2015-2016_ch6.csv file and other libraries

rm(list=ls())
library(readr)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(tidyr)
library(lsr)
library(moments)
library(BSDA)

## Loading required package: lattice

## 
## Attaching package: 'BSDA'

## The following object is masked from 'package:datasets':
## 
##     Orange

#Loading the data
setwd("D:\\D Drive\\Ph.D. Course Work\\Ph.D. 2024\\Data")
pres=read_csv("nhanes_2015-2016_ch6.csv")

## Rows: 9544 Columns: 71

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): cycle, file_name
## dbl (69): SEQN, SDDSRVYR, RIDSTATR, RIAGENDR, RIDAGEYR, RIDAGEMN, RIDRETH1, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

names(pres)

##  [1] "SEQN"       "cycle"      "SDDSRVYR"   "RIDSTATR"   "RIAGENDR"  
##  [6] "RIDAGEYR"   "RIDAGEMN"   "RIDRETH1"   "RIDRETH3"   "RIDEXMON"  
## [11] "RIDEXAGM"   "DMQMILIZ"   "DMQADFC"    "DMDBORN4"   "DMDCITZN"  
## [16] "DMDYRSUS"   "DMDEDUC3"   "DMDEDUC2"   "DMDMARTL"   "RIDEXPRG"  
## [21] "SIALANG"    "SIAPROXY"   "SIAINTRP"   "FIALANG"    "FIAPROXY"  
## [26] "FIAINTRP"   "MIALANG"    "MIAPROXY"   "MIAINTRP"   "AIALANGA"  
## [31] "DMDHHSIZ"   "DMDFMSIZ"   "DMDHHSZA"   "DMDHHSZB"   "DMDHHSZE"  
## [36] "DMDHRGND"   "DMDHRAGE"   "DMDHRBR4"   "DMDHREDU"   "DMDHRMAR"  
## [41] "DMDHSEDU"   "WTINT2YR"   "WTMEC2YR"   "SDMVPSU"    "SDMVSTRA"  
## [46] "INDHHIN2"   "INDFMIN2"   "INDFMPIR"   "PEASCCT1"   "BPXCHR"    
## [51] "BPAARM"     "BPACSZ"     "BPXPLS"     "BPXPULS"    "BPXPTY"    
## [56] "BPXML1"     "BPXSY1"     "BPXDI1"     "BPAEN1"     "BPXSY2"    
## [61] "BPXDI2"     "BPAEN2"     "BPXSY3"     "BPXDI3"     "BPAEN3"    
## [66] "BPXSY4"     "BPXDI4"     "BPAEN4"     "file_name"  "begin_year"
## [71] "end_year"

# 1 sample t-test
presCln1=pres%>%drop_na(BPXSY1)%>%rename(systolic=BPXSY1)%>%
  rename(daistolic=BPXDI1)
#Plot the histogram for systolic and diastolic pressure
plot1=presCln1%>%ggplot(aes(x=systolic,fill = systolic>120))+
  geom_histogram(color="white")+
  theme_minimal()+
  labs(x="Systolic Pressure (mmhg)",y="particpants")+
  scale_fill_manual(values=c("lightgreen","orange"),
                    labels=c("Normal","at risk"),
                    name="sytolic\nblood pressure")

plot2=presCln1%>%ggplot(aes(x=daistolic,fill = daistolic>80))+
  geom_histogram(color="white")+
  theme_minimal()+
  labs(x="diastolic Pressure (mmhg)",y="particpants")+
  scale_fill_manual(values=c("lightgreen","orange"),
                    labels=c("Normal","at risk"),
                    name="diastolic\nblood pressure")

grid.arrange(plot1,plot2,ncol=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

samp_stat1=presCln1%>%summarize(avgSP=mean(systolic),
                                   sdSP=sd(systolic),
                                   n=n())
#Density Plot
presCln1%>%ggplot(aes(x=systolic))+
  geom_density()

#Boxplot
presCln1%>%ggplot(aes(y=systolic))+geom_boxplot()+
  theme_minimal()

#T test intermidiate calculation
effect=samp_stat1$avgSP -120
noise_SE=samp_stat1$sdSP/sqrt(samp_stat1$n)

tstat=effect/noise_SE
tstat

## [1] 2.449079

x=seq(-3,3,.01)
prob=dt(x,samp_stat1$n-1)
tdgraph=data.frame(x,prob)
ggplot(tdgraph,aes(x=x,y=prob))+
  geom_point(colour="red")+
  labs(x="t",y="probability",title = "t-distribution")

#p_value
pt(tstat,samp_stat1$n-1,lower.tail = FALSE)

## [1] 0.007173036

#critical region with significance lvl 5%
qt(.025,samp_stat1$n-1,lower.tail = FALSE)

## [1] 1.960296

# t-test by R
t.test(pres$BPXSY1,mu=120,alternative = "greater")

## 
##  One Sample t-test
## 
## data:  pres$BPXSY1
## t = 2.4491, df = 7144, p-value = 0.007173
## alternative hypothesis: true mean is greater than 120
## 95 percent confidence interval:
##  120.1771      Inf
## sample estimates:
## mean of x 
##  120.5394

t.test(pres$BPXSY1,mu=120,alternative = "two.sided")

## 
##  One Sample t-test
## 
## data:  pres$BPXSY1
## t = 2.4491, df = 7144, p-value = 0.01435
## alternative hypothesis: true mean is not equal to 120
## 95 percent confidence interval:
##  120.1077 120.9711
## sample estimates:
## mean of x 
##  120.5394

#95% confidence Interval
tUpper=qt(.025,samp_stat1$n-1,lower.tail = FALSE)
tLower=qt(.025,samp_stat1$n-1,lower.tail = TRUE)
ciU=samp_stat1$avgSP+noise_SE*tUpper
ciL=samp_stat1$avgSP+noise_SE*tLower
ci=c(ciL,ciU)
ci

## [1] 120.1077 120.9711

#99% confidence Interval
tUpper=qt(.005,samp_stat1$n-1,lower.tail = FALSE)
tLower=qt(.005,samp_stat1$n-1,lower.tail = TRUE)
ciU=samp_stat1$avgSP+noise_SE*tUpper
ciL=samp_stat1$avgSP+noise_SE*tLower
ci=c(ciL,ciU)
ci

## [1] 119.9719 121.1069

#Effect size statistics 
d=effect/samp_stat1$sdSP
d

## [1] 0.02897354

cohensD(pres$BPXSY1,mu=120)

## [1] 0.02897354

#People age >=65
samp_stat65=pres%>%drop_na(BPXSY1)%>%filter(RIDAGEYR>=65)
t.test(samp_stat65$BPXSY1,mu=120)

## 
##  One Sample t-test
## 
## data:  samp_stat65$BPXSY1
## t = 29.238, df = 1232, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 120
## 95 percent confidence interval:
##  135.4832 137.7106
## sample estimates:
## mean of x 
##  136.5969

# 2 sample t-test
unique(pres$RIAGENDR)

## [1] 1 2

#change the value 1 to male and 2 to female
presCln21=pres%>%
  mutate(RIAGENDR=as.character(RIAGENDR))%>%
  mutate(RIAGENDR=recode_factor(RIAGENDR,'1'="M",'2'="F"))%>%
  rename(sex=RIAGENDR,systolic=BPXSY1)
#Mean systolic pressure for male and female
samp_stat2=presCln21%>%drop_na(systolic)%>%
  group_by(sex)%>%
  summarize(avep=mean(systolic),
            sdp=sd(systolic),
            n=n())
#density plot of Mean systolic pressure for male and female
presCln21%>%
  ggplot(aes(x=systolic,fill=sex))+
  geom_density(alpha=.4)+
  labs(x="systolic Blood Pressure",y="Probability density")

## Warning: Removed 2399 rows containing non-finite outside the scale range
## (`stat_density()`).

#Calculating t statistics
tstat=(samp_stat2[1,2]-samp_stat2[2,2])/
  sqrt(samp_stat2[1,3]^2/samp_stat2[1,4]+samp_stat2[2,3]^2/samp_stat2[2,4])
tstat

##       avep
## 1 7.313489

#t distribution curve with df=7143
x=seq(-8,8,.01)
prob=dt(x,7143)
tdgraph=data.frame(x,prob)
ggplot(tdgraph,aes(x=x,y=prob))+
  geom_point(colour="red")+
  labs(x="t",y="probability",title = "t-distribution")

#Calculating p-value for two tailed test
p_value=pt(q=-7.313489,df=7143,lower.tail = TRUE)+
  pt(q=7.313489,df=7143,lower.tail = FALSE)
p_value

## [1] 2.886288e-13

p_value=2*pt(q=7.313489,df=7143,lower.tail = FALSE)
p_value

## [1] 2.886288e-13

p_value=2*pt(q=-7.313489,df=7143,lower.tail = TRUE)
p_value

## [1] 2.886288e-13

p_value=2*pt(q=7.313489,df=7143,lower.tail = FALSE)
p_value

## [1] 2.886288e-13

#2-sample t-test in R assuming variance unequal
t.test(presCln21$systolic~presCln21$sex)

## 
##  Welch Two Sample t-test
## 
## data:  presCln21$systolic by presCln21$sex
## t = 7.3135, df = 7143, p-value = 2.886e-13
## alternative hypothesis: true difference in means between group M and group F is not equal to 0
## 95 percent confidence interval:
##  2.347882 4.067432
## sample estimates:
## mean in group M mean in group F 
##        122.1767        118.9690

#2-sample t-test in R assuming variance equal
t.test(presCln21$systolic~presCln21$sex,var.equal=TRUE)

## 
##  Two Sample t-test
## 
## data:  presCln21$systolic by presCln21$sex
## t = 7.3071, df = 7143, p-value = 3.026e-13
## alternative hypothesis: true difference in means between group M and group F is not equal to 0
## 95 percent confidence interval:
##  2.347126 4.068187
## sample estimates:
## mean in group M mean in group F 
##        122.1767        118.9690

#Effectsize statistics
d=(samp_stat2[1,2]-samp_stat2[2,2])/
  sqrt((samp_stat2[1,3]^2+samp_stat2[2,3]^2)/2)
d

##        avep
## 1 0.1730045

cohensD(presCln21$systolic~presCln21$sex,method="unequal")

## [1] 0.1730045

#Create the difference of systolic pressure
presCln22=pres%>%rename(systolic1=BPXSY1)%>%
  rename(systolic2=BPXSY2)%>%
  mutate(diff=systolic1-systolic2)%>%
  mutate(RIAGENDR=recode_factor(RIAGENDR,"1"="M","2"="F"))%>%
  rename(sex=RIAGENDR)%>%
  drop_na(diff)
#Plot the histogram for pressure difference
presCln22%>%ggplot(aes(x=diff))+
  geom_histogram(fill="blue",color="white")+
  theme_minimal()+
  labs(x="Systolic Pressure Difference (mmhg)",y="frequecy")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

samp_stat3=presCln22%>%summarize(diffa=mean(diff),
                    diffsd=sd(diff),
                    n=n())
#T test intermidiate calculation
effect=samp_stat3$diffa
noise_SE=samp_stat3$diffsd/sqrt(samp_stat3$n)

tstat=effect/noise_SE
tstat

## [1] 9.376237

t.test(presCln22$systolic1,presCln22$systolic2,paired=TRUE)

## 
##  Paired t-test
## 
## data:  presCln22$systolic1 and presCln22$systolic2
## t = 9.3762, df = 7100, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  0.4310514 0.6589360
## sample estimates:
## mean difference 
##       0.5449937

#Effectsize statistics
d=effect/samp_stat3$diffsd
d

## [1] 0.1112676

cohensD(pres$BPXSY1,pres$BPXSY2,method="paired")

## [1] 0.1112676

#t-test Assumptions - 1 sample t -test
presCln1%>%ggplot(aes(x=systolic))+
  geom_histogram(fill="blue",color="white")+
  theme_minimal()+
  labs(x="Systolic Pressure (mmhg)",y="particpants")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Q-Q plot
presCln1%>%ggplot(aes(sample=systolic))+
  stat_qq(colour="blue",alpha=.4)+
  stat_qq_line(colour="red")+
  labs(x="Theritcal Normal distribution",
       y="systolic presurre")

#Calculating skewness
skewness(presCln1$systolic)

## [1] 1.070148

#t-test Assumptions - independent sample t -test
presCln21%>%ggplot(aes(x=systolic))+
  geom_histogram(fill="blue",color="white")+
  theme_minimal()+
  facet_wrap(~sex,nrow = 2)+
  labs(x="Systolic Pressure (mmhg)",y="particpants")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2399 rows containing non-finite outside the scale range
## (`stat_bin()`).

#Calculating skewness for Female
skewness(presCln21%>%filter(sex=="F")%>%select(systolic),
         na.rm=TRUE)

## systolic 
## 1.118464

#Calculating skewness for Male
skewness(presCln21%>%filter(sex=="M")%>%select(systolic),
         na.rm=TRUE)

## systolic 
## 1.061398

#Q-Q plot
presCln21%>%ggplot(aes(sample=systolic))+
  stat_qq(colour="blue",alpha=.4)+
  stat_qq_line(colour="red")+
  facet_wrap(~sex)+
  labs(x="Theritcal Normal distribution",
       y="systolic presurre")

## Warning: Removed 2399 rows containing non-finite outside the scale range
## (`stat_qq()`).

## Warning: Removed 2399 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

#t-test Assumptions - dependent sample t -test
presCln22%>%ggplot(aes(x=diff))+
  geom_histogram(fill="blue",color="white")+
  theme_minimal()+
  labs(x="Systolic Pressure Difference (mmhg)",y="frequecy")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

skewness(presCln22%>%select(diff))

##      diff 
## 0.2351292

#Q-Q plot
presCln22%>%ggplot(aes(sample=diff))+
  stat_qq(fill="blue",alpha=.4)+
  stat_qq_line(colour="red")+
  labs(x="Theritcal Normal distribution",
       y="systolic presurre")+
  theme_minimal()

#non-parametric test for 1 sample t test
median(presCln1$systolic,na.rm=TRUE)

## [1] 118

SIGN.test(presCln1$systolic,md=120)

## 
##  One-sample Sign-Test
## 
## data:  presCln1$systolic
## s = 3004, p-value < 2.2e-16
## alternative hypothesis: true median is not equal to 120
## 95 percent confidence interval:
##  116 118
## sample estimates:
## median of x 
##         118 
## 
## Achieved and Interpolated Confidence Intervals: 
## 
##                   Conf.Level L.E.pt U.E.pt
## Lower Achieved CI     0.9477    116    118
## Interpolated CI       0.9500    116    118
## Upper Achieved CI     0.9505    116    118

#non-parametric test for dependent 2 sample t test
median(presCln22$diff,na.rm=TRUE)

## [1] 0

wilcox.test(presCln22$systolic1,presCln22$systolic2,paired=TRUE)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  presCln22$systolic1 and presCln22$systolic2
## V = 9549959, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

#non-parametric test for independent 2 sample t test
wilcox.test(presCln21$systolic~presCln21$sex,paird=FALSE)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  presCln21$systolic by presCln21$sex
## W = 7186882, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0