If `echo = FALSE` is set for a chunk, its code will not be shown in the output.

Packages

Loading required packages:

library(ggplot2)
library(dplyr)
#install.packages("reshape2")
library(scales)
library(car)
library(melt)
library(data.table)
library(reshape2)

Setting global theme

# Make the black-and-white theme the default for every ggplot drawn afterwards.
theme_set(new = theme_bw())

Manual calculation (hard-coded, step by step)

# Manual one-sample, two-sided t-test of H0: mu = mu_0 on simulated data.
set.seed(123)
sample_data <- rnorm(30, mean = 50, sd = 10)
sample_data
##  [1] 44.39524 47.69823 65.58708 50.70508 51.29288 67.15065 54.60916 37.34939
##  [9] 43.13147 45.54338 62.24082 53.59814 54.00771 51.10683 44.44159 67.86913
## [17] 54.97850 30.33383 57.01356 45.27209 39.32176 47.82025 39.73996 42.71109
## [25] 43.74961 33.13307 58.37787 51.53373 38.61863 62.53815
mean(sample_data)
## [1] 49.52896
sd(sample_data)
## [1] 9.810307
# Hypothesised population mean under H0 (we assumed this value)
mu_0 <- 55
# Degrees of freedom of the one-sample t statistic: n - 1
deg_free <- length(sample_data) - 1
# Calculating standard error of the sample mean
SE <- sd(sample_data) / sqrt(length(sample_data))


# determination of calculated t value
t_cal <- (mean(sample_data) - mu_0) / SE
print(t_cal)
## [1] -3.054553
# determination of critical t value at the 5% level of significance
# (two-sided test, so alpha / 2 = 0.025 in each tail; this is the lower-tail quantile)
t_crit <- qt(0.05 / 2, df = deg_free, lower.tail = TRUE)
print(t_crit)
## [1] -2.04523
# Two-sided decision rule: reject H0 when |t_cal| >= |t_crit|.
# Comparing absolute values keeps the rule valid for positive t statistics too.
abs(t_cal) >= abs(t_crit)   # Decision is reject null
## [1] TRUE
# determination of p value (two sided): twice the upper-tail area beyond |t_cal|,
# which is correct whatever the sign of t_cal
p_value <- 2 * pt(abs(t_cal), df = deg_free, lower.tail = FALSE)
p_value
## [1] 0.0047971
# determination of confidence interval (lower and upper limit respectively)
mean(sample_data) - abs(t_crit) * SE
## [1] 45.86573
mean(sample_data) + abs(t_crit) * SE
## [1] 53.19219
# Both confidence limits in a single expression
mean(sample_data) + c(-1, 1) * abs(t_crit) * SE
## [1] 45.86573 53.19219

Using the built-in `t.test()` function

# Two-sided test of H0: mu = 55 with the default 95% confidence interval (alpha = 0.05)
t.test(x = sample_data, mu = 55)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -3.0546, df = 29, p-value = 0.004797
## alternative hypothesis: true mean is not equal to 55
## 95 percent confidence interval:
##  45.86573 53.19219
## sample estimates:
## mean of x 
##  49.52896
# Same test with a 99% confidence interval (alpha = 0.01)
t.test(x = sample_data, conf.level = 0.99, mu = 55)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -3.0546, df = 29, p-value = 0.004797
## alternative hypothesis: true mean is not equal to 55
## 99 percent confidence interval:
##  44.59198 54.46595
## sample estimates:
## mean of x 
##  49.52896
# Same test with a 90% confidence interval (alpha = 0.10)
t.test(x = sample_data, conf.level = 0.90, mu = 55)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -3.0546, df = 29, p-value = 0.004797
## alternative hypothesis: true mean is not equal to 55
## 90 percent confidence interval:
##  46.48564 52.57228
## sample estimates:
## mean of x 
##  49.52896

Using built-in functions

Generate sample data

# Fix the RNG seed so the 30 normal draws (mean 50, sd 10) are reproducible.
set.seed(seed = 123)
sample_data <- rnorm(n = 30, sd = 10, mean = 50)

Normality test using shapiro-wilk test

# Shapiro-Wilk test; H0 is that the sample comes from a normal distribution.
shapiro.test(x = sample_data)
## 
##  Shapiro-Wilk normality test
## 
## data:  sample_data
## W = 0.97894, p-value = 0.7966

\(H_0:\) The data follow a normal distribution.
\(H_1:\) The data do not follow a normal distribution.

Since the p-value is greater than the level of significance (\(\alpha = 0.05\)), we do not have enough statistical evidence to reject the null hypothesis, so the data can be treated as normally distributed.

One sample t-test

Hypothesis:
\(H_0: \mu = 50\)
\(H_1: \mu \neq 50\)

using function, perform the two tailed t-test

# Two-tailed one-sample t-test of H0: mu = 50 (two.sided is the default alternative).
t.test(x = sample_data, alternative = "two.sided", mu = 50)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -0.26299, df = 29, p-value = 0.7944
## alternative hypothesis: true mean is not equal to 50
## 95 percent confidence interval:
##  45.86573 53.19219
## sample estimates:
## mean of x 
##  49.52896

Hypothesis:
\(H_0: \mu \leq 50\)
\(H_1: \mu > 50\)

using function, perform the one tailed t-test when alternative is greater

# Upper-tailed one-sample t-test: H1 is that the true mean exceeds 50.
t.test(x = sample_data, alternative = "greater", mu = 50)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -0.26299, df = 29, p-value = 0.6028
## alternative hypothesis: true mean is greater than 50
## 95 percent confidence interval:
##  46.48564      Inf
## sample estimates:
## mean of x 
##  49.52896

Hypothesis:
\(H_0: \mu \geq 58\)
\(H_1: \mu < 58\)

Using function, perform the one tailed t-test when alternative is less

# Lower-tailed one-sample t-test: H1 is that the true mean is below 58.
t.test(x = sample_data, alternative = "less", mu = 58)
## 
##  One Sample t-test
## 
## data:  sample_data
## t = -4.7295, df = 29, p-value = 2.689e-05
## alternative hypothesis: true mean is less than 58
## 95 percent confidence interval:
##      -Inf 52.57228
## sample estimates:
## mean of x 
##  49.52896

Visualization

# Histogram on the density scale with a kernel density curve overlaid.
ggplot(data = data.frame(value = sample_data), mapping = aes(x = value)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 10,
    fill = "blue",
    alpha = 0.5
  ) +
  geom_density(color = "red", linewidth = 1) +
  labs(title = "Sample data distribution", x = "Value", y = "Density")

Two sample t test

Use the built-in dataset `mtcars` (comparing mpg for automatic vs. manual cars)

# Load the mtcars data set into the workspace.
data("mtcars")


# Extract mpg per transmission group (am == 0: automatic, am == 1: manual).
auto_mpg <- with(mtcars, mpg[am == 0])
manual_mpg <- with(mtcars, mpg[am == 1])

\(H_0:\) Automatic and manual cars have equal average mpg.
\(H_1:\) Automatic and manual cars have unequal average mpg.

Mean value

# Sample mean of mpg for the automatic group
mean(x = auto_mpg)
## [1] 17.14737
# Sample mean of mpg for the manual group
mean(x = manual_mpg)
## [1] 24.39231

Variance

# Sample variance of mpg for the automatic group
var(x = auto_mpg)
## [1] 14.6993
# Sample variance of mpg for the manual group
var(x = manual_mpg)
## [1] 38.02577

Normality test for both groups

# Shapiro-Wilk normality check, automatic group
shapiro.test(x = auto_mpg)
## 
##  Shapiro-Wilk normality test
## 
## data:  auto_mpg
## W = 0.97677, p-value = 0.8987
# Shapiro-Wilk normality check, manual group
shapiro.test(x = manual_mpg)
## 
##  Shapiro-Wilk normality test
## 
## data:  manual_mpg
## W = 0.9458, p-value = 0.5363

Check variance homogeneity (Levene’s test)

# Levene's test (centered at the group means) for equal mpg variance across transmission types.
car::leveneTest(mpg ~ factor(am), data = mtcars, center = "mean")
## Levene's Test for Homogeneity of Variance (center = "mean")
##       Df F value  Pr(>F)  
## group  1   5.921 0.02113 *
##       30                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Perform two sample t-test

# Student's two-sample t-test using a pooled variance estimate.
t.test(x = auto_mpg, y = manual_mpg, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  auto_mpg and manual_mpg
## t = -4.1061, df = 30, p-value = 0.000285
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -10.84837  -3.64151
## sample estimates:
## mean of x mean of y 
##  17.14737  24.39231
# The sample variances show that the two groups do not have equal variance,
# but here we assume that they do, hence var.equal = TRUE.

Welch Two Sample t-test

# Welch's two-sample t-test: no equal-variance assumption.
t.test(x = auto_mpg, y = manual_mpg, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  auto_mpg and manual_mpg
## t = -3.7671, df = 18.332, p-value = 0.001374
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -11.280194  -3.209684
## sample estimates:
## mean of x mean of y 
##  17.14737  24.39231
# In real-life scenarios the variances of the two samples are unequal in most cases,
# so var.equal = FALSE is used; this is called Welch's two-sample t-test.

Visualization

# Box plot of mpg by transmission type, with the raw observations jittered on top.
ggplot(mtcars, aes(x = factor(am), y = mpg, fill = factor(am))) +
  # `alpha` was previously misspelled `aplha`, so ggplot2 ignored it with a warning
  # and the boxes were drawn fully opaque.
  geom_boxplot(alpha = 0.6) +
  geom_jitter(width = 0.2, alpha = 0.7) +
  labs(
    title = "MPG comparison: Automatic vs Manual",
    x = "Transmission (0 = Auto, 1 = Manual)",
    y = "Miles per gallon"
  ) +
  scale_fill_manual(
    values = c("blue", "red"),
    labels = c("Automatic", "Manual")
  )

Paired sample t-test

Generate 30 observations

# Simulate 30 paired scores: a rounded baseline plus a rounded gain per subject.
set.seed(seed = 123)
before <- round(rnorm(n = 30, mean = 300, sd = 10))
summary(before)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   280.0   293.2   299.5   299.6   305.0   318.0
after <- before + round(rnorm(n = 30, mean = 5, sd = 5))
summary(after)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   283.0   298.2   305.0   305.5   312.2   325.0
# One row per subject, holding both measurements side by side.
df <- data.frame(ID = seq_len(30), Before = before, After = after)

\(H_0:\) The true average GRE score of the students is the same before and after the course.
\(H_1:\) The true average GRE score of the students is not the same before and after the course.

Perform paired t-test

# Paired, two-sided t-test on the after - before differences.
# `alternative` was previously misspelled `alterative`; t.test() silently absorbed
# it through `...`, so the two-sided test ran only because it is the default.
t.test(x = after, y = before, paired = TRUE, alternative = "two.sided")
## 
##  Paired t-test
## 
## data:  after and before
## t = 7.7811, df = 29, p-value = 1.398e-08
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  4.373779 7.492888
## sample estimates:
## mean difference 
##        5.933333

Visualization

# Reshape to long format: one row per subject and condition (Before / After).
df_long <- reshape2::melt(data = df, id.vars = "ID")
# One point per measurement; a line joins each subject's two measurements.
ggplot(data = df_long, mapping = aes(x = variable, y = value, group = ID)) +
  geom_point(mapping = aes(color = variable), size = 2) +
  geom_line(alpha = 0.8) +
  labs(title = "Paired sample (Before vs after)", x = "Conditions", y = "Values")

# Paired box plot with connecting lines and a paired t-test p-value annotation.
# The visible setup never attaches ggpubr, so both functions are namespace-qualified
# to keep this chunk runnable on its own.
ggpubr::ggpaired(
  df_long,
  x = "variable",
  y = "value",
  color = "variable",
  line.color = "grey",
  line.size = 0.5,
  palette = "jco"
) +
  ggpubr::stat_compare_means(paired = TRUE, method = "t.test")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.