Required packages

library(ggplot2)
library(ggpubr)
library(reshape2)
library(car)
#install.packages("reshape2")

One-sample t-test

Manual calculation

# Draw a reproducible sample of 30 observations from N(mean = 50, sd = 10).
set.seed(123)
sample_data <- rnorm(30, mean = 50, sd = 10)

mean(sample_data)
## [1] 49.52896
sd(sample_data)
## [1] 9.810307
# Hypothesised mean under H0, significance level and degrees of freedom.
# Deriving df from the data avoids a hard-coded 29 going stale.
mu_0 <- 55
alpha <- 0.05
n <- length(sample_data)
df_t <- n - 1
# Standard error of the sample mean.
SE <- sd(sample_data) / sqrt(n)

# Test statistic: distance between the sample mean and mu_0 in SE units.
t_cal <- (mean(sample_data) - mu_0) / SE
print(t_cal)
## [1] -3.054553
# Lower-tail critical value for a two-sided test (alpha / 2 in each tail).
t_crit <- qt(alpha / 2, df = df_t, lower.tail = TRUE)

t_crit
## [1] -2.04523
# Two-sided decision rule: compare magnitudes so that a large positive
# statistic rejects H0 just like a large negative one.
abs(t_cal) >= abs(t_crit) # decision : reject null
## [1] TRUE
# Two-sided p-value: twice the tail area beyond |t_cal|. Using -abs() keeps
# the result valid (and <= 1) whatever the sign of t_cal.
p_value <- 2 * pt(-abs(t_cal), df = df_t)
p_value # To find p value
## [1] 0.0047971
mean(sample_data) - abs(t_crit) * SE # Lower limit of CI
## [1] 45.86573
mean(sample_data) + abs(t_crit) * SE # Upper limit of CI
## [1] 53.19219
mean(sample_data) + c(-1, 1) * abs(t_crit) * SE # in one line
## [1] 45.86573 53.19219

Direct calculation

# Two-sided one-sample t-test of H0: mu = 55 (default 95% confidence level).
t.test(x = sample_data, mu = 55)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -3.0546, df = 29, p-value = 0.004797
## alternative hypothesis: true mean is not equal to 55
## 95 percent confidence interval:
##  45.86573 53.19219
## sample estimates:
## mean of x 
##  49.52896
# Same test; only the confidence interval changes, now at the 99% level.
t.test(x = sample_data, conf.level = 0.99, mu = 55)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -3.0546, df = 29, p-value = 0.004797
## alternative hypothesis: true mean is not equal to 55
## 99 percent confidence interval:
##  44.59198 54.46595
## sample estimates:
## mean of x 
##  49.52896
# Same test; only the confidence interval changes, now at the 90% level.
t.test(x = sample_data, conf.level = 0.90, mu = 55)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -3.0546, df = 29, p-value = 0.004797
## alternative hypothesis: true mean is not equal to 55
## 90 percent confidence interval:
##  46.48564 52.57228
## sample estimates:
## mean of x 
##  49.52896

Testing normality using the Shapiro-Wilk test

# Shapiro-Wilk normality test on the simulated sample.
shapiro.test(x = sample_data)
## 
##  Shapiro-Wilk normality test
## 
## data:  sample_data
## W = 0.97894, p-value = 0.7966

\(H_0 :\) The data follow a normal distribution.
\(H_1 :\) The data do not follow a normal distribution.

Since the p-value (0.7966) is greater than the level of significance (\(\alpha\) = 0.05), we do not have enough statistical evidence to reject the null hypothesis; the data are consistent with a normal distribution.

Hypothesis:
\(H_0: \mu = 50\)
\(H_1: \mu \neq 50\)

Using function, perform the two tailed t-test:

# Two-sided one-sample t-test of H0: mu = 50.
t.test(x = sample_data, mu = 50)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -0.26299, df = 29, p-value = 0.7944
## alternative hypothesis: true mean is not equal to 50
## 95 percent confidence interval:
##  45.86573 53.19219
## sample estimates:
## mean of x 
##  49.52896

Since the p-value (0.7944) is greater than the level of significance (\(\alpha\) = 0.05), we do not have enough statistical evidence to reject the null hypothesis.

Hypothesis:
\(H_0: \mu \le 50\)
\(H_1: \mu > 50\)

Using function, perform the one tailed t-test:

# Upper-tailed one-sample t-test: the alternative is that the mean exceeds 50.
t.test(x = sample_data, alternative = "greater", mu = 50)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -0.26299, df = 29, p-value = 0.6028
## alternative hypothesis: true mean is greater than 50
## 95 percent confidence interval:
##  46.48564      Inf
## sample estimates:
## mean of x 
##  49.52896

Since the p-value (0.6028) is greater than the level of significance (\(\alpha\) = 0.05), we do not have enough statistical evidence to reject the null hypothesis.

Hypothesis:
\(H_0: \mu \ge 50\)
\(H_1: \mu < 50\)

Using function, perform the one tailed t-test:

# Lower-tailed one-sample t-test: the alternative is that the mean is below 50.
t.test(x = sample_data, alternative = "less", mu = 50)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -0.26299, df = 29, p-value = 0.3972
## alternative hypothesis: true mean is less than 50
## 95 percent confidence interval:
##      -Inf 52.57228
## sample estimates:
## mean of x 
##  49.52896

Since the p-value (0.3972) is greater than the level of significance (\(\alpha\) = 0.05), we do not have enough statistical evidence to reject the null hypothesis.

Visualization

# Density-scaled histogram of the sample with a kernel density curve on top.
ggplot(data = data.frame(value = sample_data), mapping = aes(x = value)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 10,
    fill = "blue",
    alpha = 0.5
  ) +
  geom_density(colour = "red", linewidth = 1) +
  labs(
    title = "Sample data distribution",
    x = "Value ",
    y = "Density"
  ) +
  theme_minimal()

Two-sample t-test (independent-samples t-test)

Using a built-in dataset: mtcars (comparing mpg for automatic vs manual cars).

# Load the built-in mtcars data set into the workspace.
data(mtcars)

Split into two groups based on transmission type :

# mpg values per transmission code (am == 0: automatic, am == 1: manual).
auto_mpg <- mtcars[mtcars$am == 0, "mpg"]
manual_mpg <- mtcars[mtcars$am == 1, "mpg"]

Check homogeneity of variance (Levene's test):

# Levene's test of equal mpg variance across transmission groups, based on
# deviations from each group's median.
leveneTest(mpg ~ factor(am), data = mtcars, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
##       Df F value  Pr(>F)  
## group  1  4.1876 0.04957 *
##       30                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same test, based on deviations from each group's mean instead.
leveneTest(mpg ~ factor(am), data = mtcars, center = "mean")
## Levene's Test for Homogeneity of Variance (center = "mean")
##       Df F value  Pr(>F)  
## group  1   5.921 0.02113 *
##       30                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since the p-value is less than the level of significance (\(\alpha\) = 0.05) for both choices of center, we have enough statistical evidence to reject the null hypothesis of equal variances; the two groups appear to have unequal variances.

| Significance code | p-value range | Meaning |
|---|---|---|
| *** (3 stars) | p < 0.001 | Highly significant |
| ** (2 stars) | p < 0.01 | Very significant |
| * (1 star) | p < 0.05 | Significant |
| . (dot) | p < 0.1 | Weak evidence / marginal |
| (none) | p ≥ 0.1 | Not significant |

Perform the two-sample t-test:

# Student's two-sample t-test of automatic vs manual mpg.
t.test(auto_mpg ,manual_mpg  , var.equal = TRUE) # Assuming equal variances (pooled variance estimate)
## 
##  Two Sample t-test
## 
## data:  auto_mpg and manual_mpg
## t = -4.1061, df = 30, p-value = 0.000285
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -10.84837  -3.64151
## sample estimates:
## mean of x mean of y 
##  17.14737  24.39231

Hypothesis:
\(H_0:\) Automatic cars and manual cars have equal average mpg.
\(H_1:\) Automatic cars and manual cars have unequal average mpg.

Since the p-value (0.000285) is less than the level of significance (\(\alpha\) = 0.05), we have enough statistical evidence to reject the null hypothesis.

Welch’s t-test for unequal variances:

# Welch's two-sample t-test: the group variances are not assumed equal.
t.test(x = auto_mpg, y = manual_mpg, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  auto_mpg and manual_mpg
## t = -3.7671, df = 18.332, p-value = 0.001374
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -11.280194  -3.209684
## sample estimates:
## mean of x mean of y 
##  17.14737  24.39231

Since the p-value (0.001374) is less than the level of significance (\(\alpha\) = 0.05), we have enough statistical evidence to reject the null hypothesis.

Visualizing the data

# Box plots of mpg per transmission type with the raw points jittered on top.
ggplot(
  data = mtcars,
  mapping = aes(x = factor(am), y = mpg, fill = factor(am))
) +
  geom_boxplot(alpha = 0.6) +
  geom_jitter(width = 0.2, alpha = 0.7) +
  labs(
    title = "MPG Comparison : Automatic vs Manual",
    x = " Transmission (0 = Auto , 1 = Manual)",
    y = "Miles per Gallon"
  ) +
  scale_fill_manual(
    values = c("blue", "red"),
    labels = c("Automatic", "Manual")
  ) +
  theme_minimal()

Paired sample t-test

Generate 30 observations for each group :

# Baseline scores: 30 draws from N(mean = 300, sd = 10), rounded to whole
# numbers. The fixed seed makes the draws reproducible.
set.seed(123)
before <- round(rnorm(n = 30, mean = 300, sd = 10))

summary(before)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   280.0   293.2   299.5   299.6   305.0   318.0
# Follow-up scores: each baseline plus a rounded gain from N(mean = 5, sd = 5).
after <- round(rnorm(n = 30, mean = 5, sd = 5)) + before
summary(after)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   283.0   298.2   305.0   305.5   312.2   325.0

Perform the paired t-test:

# Two-sided paired t-test on the per-subject differences (after - before).
t.test(x = after, y = before, alternative = "two.sided", paired = TRUE)
## 
##  Paired t-test
## 
## data:  after and before
## t = 7.7811, df = 29, p-value = 1.398e-08
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  4.373779 7.492888
## sample estimates:
## mean difference 
##        5.933333
# One row per subject: an identifier plus the before and after scores.
df <- data.frame(ID = seq_len(30), Before = before, After = after)

Hypothesis:
\(H_0:\) The true average GRE score of the students is the same before and after.
\(H_1:\) The true average GRE score of the students is not the same before and after.

Perform the paired t-test:

# Paired t-test of after vs before; "two.sided" is the default alternative,
# spelled out here to match the stated hypotheses.
t.test(after, before, alternative = "two.sided", paired = TRUE)
## 
##  Paired t-test
## 
## data:  after and before
## t = 7.7811, df = 29, p-value = 1.398e-08
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  4.373779 7.492888
## sample estimates:
## mean difference 
##        5.933333

Visualization

# Reshape to long format: one row per (ID, condition) pair, keyed by ID.
df_long <- melt(data = df, id.vars = "ID")
# One point per measurement, with a line joining each subject's two values.
ggplot(
  data = df_long,
  mapping = aes(x = variable, y = value, group = ID)
) +
  geom_point(mapping = aes(colour = variable), size = 3) +
  geom_line(colour = "blue", alpha = 0.85) +
  labs(
    title = "Paired samples (Before vs After)",
    x = "Condition",
    y = "Values"
  ) +
  theme_minimal()

# Paired box plot of the long-format data, annotated with a paired t-test.
ggpaired(
  df_long,
  x = "variable",
  y = "value",
  color = "variable",
  palette = "jco",
  line.color = "gray",
  line.size = 0.4
) +
  stat_compare_means(method = "t.test", paired = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.