This article is a practice about some basic statistical analysis, which is a part
of FETP basis data analysis course.

library(tidyverse)
japan <- haven::read_dta("japan_studen.dta")

head(japan$knowledge,10)
##  [1] 87 93 85 76 76 80 77 99 73 65
## knowledge variable is a continous data
## Is knowledge variable a normal distribution?
#1.Histogram plot
hist(japan$knowledge, col ="pink",
     xlab = "Knowledge", 
     main = "Histogram of knowledge",
     breaks = 20)

#2.Q-Q plot
qqnorm(japan$knowledge)
qqline(japan$knowledge, 
       lwd=2, 
       col= "red")

#3.Shapiro test
shapiro.test(japan$knowledge)
## 
##  Shapiro-Wilk normality test
## 
## data:  japan$knowledge
## W = 0.98977, p-value = 0.01269

Conclusion knowledge variable is a normal distribution.
Let do some inference.

## Find 95% CI of knowledge in population.
knowledge<- lm(japan$knowledge~1)
confint(knowledge, level=0.95)
##                2.5 %   97.5 %
## (Intercept) 79.48577 82.11534

Example of distibution test.

##Use japan_covid_viral_load dataset.
japan_viral_load <- haven::read_dta("japan_covid_viralload.dta")
summary(japan_viral_load$vl_init)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      0.41      7.91     72.14  15533.14   1876.45 175000.00
# Let exam japan_viral_load$vl_init distribution
hist(japan_viral_load$vl_init, 
     col = "pink",
     breaks = 10,
     main = "Histogram of viral load on normal scale",
     xlab = "Number of viral load")

##Histogram show skewed to right
# Change to log scale to fix this skewed problem
hist(log(japan_viral_load$vl_init),
     col = "pink",
     main = "Histogram of viral load on log scale",
     xlab = "Number of viral load on log scale")

shapiro.test(log(japan_viral_load$vl_init))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(japan_viral_load$vl_init)
## W = 0.94775, p-value = 0.1739
qqnorm(log(japan_viral_load$vl_init))
qqline(log(japan_viral_load$vl_init), 
       col="red", 
       lwd=2)

## All show normal distribution

Example of statistical comparison.

## t test comparison for continuous data example
burnout <- haven::read_dta("burnout_french.dta")

hist(burnout$burnoutscore)

## Assume burnoutscore is normal distributed

boxplot(burnout$burnoutscore~burnout$GENDER,
        xlab = "Gender",
        ylab = "Burnout score",
        col = "darkseagreen2",
        main ="Boxplot of burnout score for each gender")

Is this a statistical significantly difference?

## Use t test for comparing bunoutscore by gender.
t.test(x=filter(burnout,GENDER==1)$burnoutscore,
       y=filter(burnout,GENDER==2)$burnoutscore)
## 
##  Welch Two Sample t-test
## 
## data:  filter(burnout, GENDER == 1)$burnoutscore and filter(burnout, GENDER == 2)$burnoutscore
## t = 4.238, df = 846.42, p-value = 2.502e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.513348 6.849630
## sample estimates:
## mean of x mean of y 
##  72.77855  68.09707

Conclusion : According to P value <0.01,so burnout score for each gender is
a statistical significantly difference.

## ANOVA example for comparison more than 2 groups 

boxplot(burnout$burnoutscore~burnout$STATUS,
        xlab = "Status",
        ylab = "Burnout score",
        col = "darkseagreen",
        main = "Boxplot of burnout score for each status")

summary(aov(data = burnout, burnoutscore~STATUS))
##               Df Sum Sq Mean Sq F value Pr(>F)
## STATUS         1    804   804.0   2.253  0.134
## Residuals   1320 471060   356.9               
## 53 observations deleted due to missingness

According to P value more than 0.05 ,so burnout score for each gender is
not a statistical significantly difference.

## Paired t test example

fasting <- haven::read_dta("fasting.dta")

## fasting group 5 pre-fasting vs post-fasting
t.test(x = filter(fasting,fast_group==5)$HDLCmmollpre,
       y = filter(fasting,fast_group==5)$HDLCmmollpost,
       paired = TRUE)
## 
##  Paired t-test
## 
## data:  filter(fasting, fast_group == 5)$HDLCmmollpre and filter(fasting, fast_group == 5)$HDLCmmollpost
## t = 21.673, df = 655, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1730440 0.2075231
## sample estimates:
## mean of the differences 
##               0.1902835
## fasting group 10 pre-fasting vs post-fasting
t.test(x = filter(fasting,fast_group==10)$HDLCmmollpre,
       y = filter(fasting,fast_group==10)$HDLCmmollpost,
       paired = TRUE)
## 
##  Paired t-test
## 
## data:  filter(fasting, fast_group == 10)$HDLCmmollpre and filter(fasting, fast_group == 10)$HDLCmmollpost
## t = 21.032, df = 529, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2083191 0.2512432
## sample estimates:
## mean of the differences 
##               0.2297811

P value show both comparison are a statistical significant.

## non-normal distribution comparison example
ghana_covid <- haven::read_dta("ghana_covid.dta")
hist(ghana_covid$Cycle_Threshold,
     main = "Histogram of CT value",
     xlab = "CT value",
     col = "cornflowerblue")

summary(ghana_covid$Cycle_Threshold)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.09   27.58   34.59   32.38   38.11   45.45
# skwed to right
boxplot(ghana_covid$Cycle_Threshold~ghana_covid$Sex, 
        xlab ="Sex (F= female, M= male)",
        ylab ="CT value",
        col ="pink",
        main = "Boxplot of CT value for each sex")

#Is this a statistical significant ?
# use Wilcoxon's test instead of t test because of non parametric.
wilcox.test(filter(ghana_covid,sexno ==1)$Cycle_Threshold,
            filter(ghana_covid,sexno ==0)$Cycle_Threshold )
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  filter(ghana_covid, sexno == 1)$Cycle_Threshold and filter(ghana_covid, sexno == 0)$Cycle_Threshold
## W = 9211987, p-value = 0.0007103
## alternative hypothesis: true location shift is not equal to 0

P value show statistical significantly difference in CT value for each sex.

## More than 2 group and non normal distribution comparison
ghana_covid<- ghana_covid%>%
                  mutate("age_group"=case_when(Age_years %in% c(0:50) ~1,
                                               Age_years %in% c(51:70) ~2,
                                               Age_years >= 71 ~3))

boxplot(ghana_covid$Cycle_Threshold~ghana_covid$age_group,
        xlab = "Age group",
        ylab = "CT value",
        col = "orange",
        main = "Boxplot of CT value for age group" )

##Is this a statistical significant ?
#Since Cycle_Threshold is non normal distribution as mention previously,
#to compare more than 2 group we will use Kruskal-Wallis 's test
kruskal.test(ghana_covid$Cycle_Threshold~ghana_covid$age_group,
             na.action  ="na.omit" )
## 
##  Kruskal-Wallis rank sum test
## 
## data:  ghana_covid$Cycle_Threshold by ghana_covid$age_group
## Kruskal-Wallis chi-squared = 6.104, df = 2, p-value = 0.04726

According to P-value , CT value for each age group is statistical significantly difference.