This article is a practice about some basic statistical analysis,
which is a part
of FETP basis data analysis course.
library(tidyverse)
japan <- haven::read_dta("japan_studen.dta")
head(japan$knowledge,10)
## [1] 87 93 85 76 76 80 77 99 73 65
## knowledge variable is a continous data
## Is knowledge variable a normal distribution?
#1.Histogram plot
hist(japan$knowledge, col ="pink",
xlab = "Knowledge",
main = "Histogram of knowledge",
breaks = 20)
#2.Q-Q plot
qqnorm(japan$knowledge)
qqline(japan$knowledge,
lwd=2,
col= "red")
#3.Shapiro test
shapiro.test(japan$knowledge)
##
## Shapiro-Wilk normality test
##
## data: japan$knowledge
## W = 0.98977, p-value = 0.01269
Conclusion knowledge variable is a normal distribution.
Let do some inference.
## Find 95% CI of knowledge in population.
knowledge<- lm(japan$knowledge~1)
confint(knowledge, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 79.48577 82.11534
Example of distibution test.
##Use japan_covid_viral_load dataset.
japan_viral_load <- haven::read_dta("japan_covid_viralload.dta")
summary(japan_viral_load$vl_init)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.41 7.91 72.14 15533.14 1876.45 175000.00
# Let exam japan_viral_load$vl_init distribution
hist(japan_viral_load$vl_init,
col = "pink",
breaks = 10,
main = "Histogram of viral load on normal scale",
xlab = "Number of viral load")
##Histogram show skewed to right
# Change to log scale to fix this skewed problem
hist(log(japan_viral_load$vl_init),
col = "pink",
main = "Histogram of viral load on log scale",
xlab = "Number of viral load on log scale")
shapiro.test(log(japan_viral_load$vl_init))
##
## Shapiro-Wilk normality test
##
## data: log(japan_viral_load$vl_init)
## W = 0.94775, p-value = 0.1739
qqnorm(log(japan_viral_load$vl_init))
qqline(log(japan_viral_load$vl_init),
col="red",
lwd=2)
## All show normal distribution
Example of statistical comparison.
## t test comparison for continuous data example
burnout <- haven::read_dta("burnout_french.dta")
hist(burnout$burnoutscore)
## Assume burnoutscore is normal distributed
boxplot(burnout$burnoutscore~burnout$GENDER,
xlab = "Gender",
ylab = "Burnout score",
col = "darkseagreen2",
main ="Boxplot of burnout score for each gender")
Is this a statistical significantly difference?
## Use t test for comparing bunoutscore by gender.
t.test(x=filter(burnout,GENDER==1)$burnoutscore,
y=filter(burnout,GENDER==2)$burnoutscore)
##
## Welch Two Sample t-test
##
## data: filter(burnout, GENDER == 1)$burnoutscore and filter(burnout, GENDER == 2)$burnoutscore
## t = 4.238, df = 846.42, p-value = 2.502e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.513348 6.849630
## sample estimates:
## mean of x mean of y
## 72.77855 68.09707
Conclusion : According to P value <0.01,so burnout score for each
gender is
a statistical significantly difference.
## ANOVA example for comparison more than 2 groups
boxplot(burnout$burnoutscore~burnout$STATUS,
xlab = "Status",
ylab = "Burnout score",
col = "darkseagreen",
main = "Boxplot of burnout score for each status")
summary(aov(data = burnout, burnoutscore~STATUS))
## Df Sum Sq Mean Sq F value Pr(>F)
## STATUS 1 804 804.0 2.253 0.134
## Residuals 1320 471060 356.9
## 53 observations deleted due to missingness
According to P value more than 0.05 ,so burnout score for each gender
is
not a statistical significantly difference.
## Paired t test example
fasting <- haven::read_dta("fasting.dta")
## fasting group 5 pre-fasting vs post-fasting
t.test(x = filter(fasting,fast_group==5)$HDLCmmollpre,
y = filter(fasting,fast_group==5)$HDLCmmollpost,
paired = TRUE)
##
## Paired t-test
##
## data: filter(fasting, fast_group == 5)$HDLCmmollpre and filter(fasting, fast_group == 5)$HDLCmmollpost
## t = 21.673, df = 655, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1730440 0.2075231
## sample estimates:
## mean of the differences
## 0.1902835
## fasting group 10 pre-fasting vs post-fasting
t.test(x = filter(fasting,fast_group==10)$HDLCmmollpre,
y = filter(fasting,fast_group==10)$HDLCmmollpost,
paired = TRUE)
##
## Paired t-test
##
## data: filter(fasting, fast_group == 10)$HDLCmmollpre and filter(fasting, fast_group == 10)$HDLCmmollpost
## t = 21.032, df = 529, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2083191 0.2512432
## sample estimates:
## mean of the differences
## 0.2297811
P value show both comparison are a statistical significant.
## non-normal distribution comparison example
ghana_covid <- haven::read_dta("ghana_covid.dta")
hist(ghana_covid$Cycle_Threshold,
main = "Histogram of CT value",
xlab = "CT value",
col = "cornflowerblue")
summary(ghana_covid$Cycle_Threshold)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.09 27.58 34.59 32.38 38.11 45.45
# skwed to right
boxplot(ghana_covid$Cycle_Threshold~ghana_covid$Sex,
xlab ="Sex (F= female, M= male)",
ylab ="CT value",
col ="pink",
main = "Boxplot of CT value for each sex")
#Is this a statistical significant ?
# use Wilcoxon's test instead of t test because of non parametric.
wilcox.test(filter(ghana_covid,sexno ==1)$Cycle_Threshold,
filter(ghana_covid,sexno ==0)$Cycle_Threshold )
##
## Wilcoxon rank sum test with continuity correction
##
## data: filter(ghana_covid, sexno == 1)$Cycle_Threshold and filter(ghana_covid, sexno == 0)$Cycle_Threshold
## W = 9211987, p-value = 0.0007103
## alternative hypothesis: true location shift is not equal to 0
P value show statistical significantly difference in CT value for each sex.
## More than 2 group and non normal distribution comparison
ghana_covid<- ghana_covid%>%
mutate("age_group"=case_when(Age_years %in% c(0:50) ~1,
Age_years %in% c(51:70) ~2,
Age_years >= 71 ~3))
boxplot(ghana_covid$Cycle_Threshold~ghana_covid$age_group,
xlab = "Age group",
ylab = "CT value",
col = "orange",
main = "Boxplot of CT value for age group" )
##Is this a statistical significant ?
#Since Cycle_Threshold is non normal distribution as mention previously,
#to compare more than 2 group we will use Kruskal-Wallis 's test
kruskal.test(ghana_covid$Cycle_Threshold~ghana_covid$age_group,
na.action ="na.omit" )
##
## Kruskal-Wallis rank sum test
##
## data: ghana_covid$Cycle_Threshold by ghana_covid$age_group
## Kruskal-Wallis chi-squared = 6.104, df = 2, p-value = 0.04726
According to P-value , CT value for each age group is statistical significantly difference.