library(ggplot2)
library(dplyr)
#install.packages("reshape2")
library(scales)
library(car)
library(melt)
library(data.table)
library(reshape2)
# Make ggplot2's black-and-white theme the default for the plots drawn in this script.
theme_set(theme_bw())
# ---- One-sample t-test computed by hand ----
# Draw a reproducible sample of 30 values from N(mean = 50, sd = 10).
set.seed(123)
sample_data <- rnorm(30, mean = 50, sd = 10)
sample_data
## [1] 44.39524 47.69823 65.58708 50.70508 51.29288 67.15065 54.60916 37.34939
## [9] 43.13147 45.54338 62.24082 53.59814 54.00771 51.10683 44.44159 67.86913
## [17] 54.97850 30.33383 57.01356 45.27209 39.32176 47.82025 39.73996 42.71109
## [25] 43.74961 33.13307 58.37787 51.53373 38.61863 62.53815
mean(sample_data)
## [1] 49.52896
sd(sample_data)
## [1] 9.810307
# Hypothesised population mean under H0 (an assumed value for this exercise).
mu_0 <- 55
# Degrees of freedom, derived from the sample instead of being hard-coded.
deg_free <- length(sample_data) - 1
# Standard error of the sample mean
SE <- sd(sample_data) / sqrt(length(sample_data))
# Calculated t statistic
t_cal <- (mean(sample_data) - mu_0) / SE
print(t_cal)
## [1] -3.054553
# Critical t value at the 5% level of significance (95% confidence), lower tail
t_crit <- qt(.05 / 2, df = deg_free, lower.tail = TRUE)
print(t_crit)
## [1] -2.04523
# Two-sided decision rule: compare magnitudes so the test also rejects when the
# statistic falls in the upper tail.
abs(t_cal) >= abs(t_crit) # Decision is reject null
## [1] TRUE
# Two-sided p value: twice the tail area beyond |t_cal|. This form is valid for
# either sign of t_cal.
p_value <- 2 * pt(-abs(t_cal), df = deg_free)
print(p_value)
## [1] 0.0047971
# Confidence interval limits (lower, then upper)
mean(sample_data) - abs(t_crit) * SE
## [1] 45.86573
mean(sample_data) + abs(t_crit) * SE
## [1] 53.19219
# Both confidence limits in one expression
mean(sample_data) + c(-1, 1) * abs(t_crit) * SE
## [1] 45.86573 53.19219
# Built-in one-sample t-test of H0: mu = 55 at three confidence levels.
# As the outputs show, conf.level changes only the reported interval; t, df and
# the p-value are the same in all three runs.
t.test(x = sample_data, mu = 55, conf.level = 0.95) # 5% level of significance (the default)
##
## One Sample t-test
##
## data: sample_data
## t = -3.0546, df = 29, p-value = 0.004797
## alternative hypothesis: true mean is not equal to 55
## 95 percent confidence interval:
## 45.86573 53.19219
## sample estimates:
## mean of x
## 49.52896
t.test(x = sample_data, conf.level = 0.99, mu = 55) # 1% level of significance
##
## One Sample t-test
##
## data: sample_data
## t = -3.0546, df = 29, p-value = 0.004797
## alternative hypothesis: true mean is not equal to 55
## 99 percent confidence interval:
## 44.59198 54.46595
## sample estimates:
## mean of x
## 49.52896
t.test(x = sample_data, conf.level = 0.90, mu = 55) # 10% level of significance
##
## One Sample t-test
##
## data: sample_data
## t = -3.0546, df = 29, p-value = 0.004797
## alternative hypothesis: true mean is not equal to 55
## 90 percent confidence interval:
## 46.48564 52.57228
## sample estimates:
## mean of x
## 49.52896
Generate sample data
# Rebuild the same 30-value sample (same seed) and check it for normality.
set.seed(seed = 123)
sample_data <- rnorm(n = 30, mean = 50, sd = 10)
shapiro.test(x = sample_data)
##
## Shapiro-Wilk normality test
##
## data: sample_data
## W = 0.97894, p-value = 0.7966
\(H_0:\) The data follow a normal
distribution.
\(H_1:\) The data do not follow a normal
distribution.
Since the p-value (0.7966) is greater than the level of significance (\(\alpha\) = 0.05), we do not have enough statistical evidence to reject the null hypothesis of normality.
One sample t-test
Hypothesis:
\(H_0: \mu = 50\)
\(H_1: \mu \neq 50\)
Using the t.test() function, perform the two-tailed t-test
# Two-sided test of H0: mu = 50 (the alternative is spelled out; it is also the default).
t.test(x = sample_data, alternative = "two.sided", mu = 50)
##
## One Sample t-test
##
## data: sample_data
## t = -0.26299, df = 29, p-value = 0.7944
## alternative hypothesis: true mean is not equal to 50
## 95 percent confidence interval:
## 45.86573 53.19219
## sample estimates:
## mean of x
## 49.52896
Hypothesis:
\(H_0: \mu \le 50\)
\(H_1: \mu > 50\)
Using the t.test() function, perform the one-tailed t-test when the alternative is "greater"
# Upper-tailed test: H1 states that the true mean exceeds 50.
t.test(x = sample_data, alternative = "greater", mu = 50)
##
## One Sample t-test
##
## data: sample_data
## t = -0.26299, df = 29, p-value = 0.6028
## alternative hypothesis: true mean is greater than 50
## 95 percent confidence interval:
## 46.48564 Inf
## sample estimates:
## mean of x
## 49.52896
Hypothesis:
\(H_0: \mu \ge 58\)
\(H_1: \mu < 58\)
Using the t.test() function, perform the one-tailed t-test when the alternative is "less"
# Lower-tailed test: H1 states that the true mean is below 58.
t.test(x = sample_data, alternative = "less", mu = 58)
##
## One Sample t-test
##
## data: sample_data
## t = -4.7295, df = 29, p-value = 2.689e-05
## alternative hypothesis: true mean is less than 58
## 95 percent confidence interval:
## -Inf 52.57228
## sample estimates:
## mean of x
## 49.52896
# Histogram of the sample on the density scale (10 bins) with a density curve
# drawn over it.
ggplot(data = data.frame(value = sample_data), mapping = aes(x = value)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 10,
    alpha = 0.5,
    fill = "blue"
  ) +
  geom_density(linewidth = 1, color = "red") +
  labs(title = "Sample data distribution", x = "Value", y = "Density")
Use the built-in dataset mtcars (comparing mpg for automatic vs manual cars)
data("mtcars")
# Separate mpg into two vectors by the transmission code stored in `am`.
auto_mpg <- with(mtcars, mpg[am == 0]) # am == 0: automatic
manual_mpg <- with(mtcars, mpg[am == 1]) # am == 1: manual
\(H_0:\) Automatic cars and manual
cars have equal mean mpg.
\(H_1:\) Automatic cars and manual cars
have unequal mean mpg.
Mean value
# Average mpg within each transmission group.
mean(x = auto_mpg)
## [1] 17.14737
mean(x = manual_mpg)
## [1] 24.39231
Variance
# Sample variance of mpg within each transmission group.
var(x = auto_mpg)
## [1] 14.6993
var(x = manual_mpg)
## [1] 38.02577
Normality test for both groups
# Shapiro-Wilk normality check, run separately on each transmission group.
shapiro.test(x = auto_mpg)
##
## Shapiro-Wilk normality test
##
## data: auto_mpg
## W = 0.97677, p-value = 0.8987
shapiro.test(x = manual_mpg)
##
## Shapiro-Wilk normality test
##
## data: manual_mpg
## W = 0.9458, p-value = 0.5363
Check variance homogeneity (Levene’s test)
# Levene's test (from the car package) for equal mpg variance across the two
# transmission groups, centred on the group means.
car::leveneTest(mpg ~ factor(am), data = mtcars, center = "mean")
## Levene's Test for Homogeneity of Variance (center = "mean")
## Df F value Pr(>F)
## group 1 5.921 0.02113 *
## 30
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Perform two sample t-test
# Two-sample t-test that treats the two group variances as equal (pooled variance).
t.test(x = auto_mpg, y = manual_mpg, var.equal = TRUE)
##
## Two Sample t-test
##
## data: auto_mpg and manual_mpg
## t = -4.1061, df = 30, p-value = 0.000285
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10.84837 -3.64151
## sample estimates:
## mean of x mean of y
## 17.14737 24.39231
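# The sample variances (14.70 vs 38.03) and Levene's test indicate that the two
# groups do not have equal variances. var.equal = TRUE is used in the test above
# only to demonstrate the pooled-variance (equal-variance) two-sample t-test.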
Welch Two Sample t-test
# Two-sample t-test without the equal-variance assumption (Welch's test).
t.test(x = auto_mpg, y = manual_mpg, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: auto_mpg and manual_mpg
## t = -3.7671, df = 18.332, p-value = 0.001374
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -11.280194 -3.209684
## sample estimates:
## mean of x mean of y
## 17.14737 24.39231
# In real-life data the variances of two samples are rarely equal,
# so var.equal = FALSE is usually appropriate; this is Welch's two-sample t-test.
# Box plot of mpg by transmission type, with the individual cars jittered on top.
ggplot(mtcars, aes(x = factor(am), y = mpg, fill = factor(am))) +
  # The argument was misspelled `aplha`, so ggplot2 ignored it with an
  # "unknown parameters" warning and the boxes were drawn fully opaque.
  geom_boxplot(alpha = 0.6) +
  geom_jitter(width = 0.2, alpha = 0.7) +
  labs(
    title = "MPG comparison: Automatic vs Manual",
    x = "Transmission (0 = Auto, 1 = Manual)",
    y = "Miles per gallon"
  ) +
  # Fill colours and legend labels follow the order of the factor levels 0, 1.
  scale_fill_manual(
    values = c("blue", "red"),
    labels = c("Automatic", "Manual")
  )
Generate 30 observations
# Simulate paired scores: 30 whole-number "before" values, then add a
# whole-number gain to each one to get the matching "after" value.
set.seed(123)
before <- round(rnorm(n = 30, mean = 300, sd = 10))
summary(before)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 280.0 293.2 299.5 299.6 305.0 318.0
after <- round(rnorm(n = 30, mean = 5, sd = 5)) + before
summary(after)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 283.0 298.2 305.0 305.5 312.2 325.0
# One row per subject, holding both measurements side by side.
df <- data.frame(ID = seq_len(30), Before = before, After = after)
\(H_0:\) The true mean GRE score of the students is the same before and after the course. \(H_1:\) The true mean GRE score of the students is not the same before and after the course.
Perform paired t-test
# Two-sided paired t-test of After against Before. The argument used to be
# misspelled `alterative`; t.test() absorbed that through `...` without using
# it, so the requested alternative never actually reached the test.
t.test(x = after, y = before, paired = TRUE, alternative = "two.sided")
##
## Paired t-test
##
## data: after and before
## t = 7.7811, df = 29, p-value = 1.398e-08
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 4.373779 7.492888
## sample estimates:
## mean difference
## 5.933333
#Visualization
# Reshape to long format: one row per ID per condition (columns ID, variable, value).
# melt() is exported by both data.table and reshape2, and both are attached, so
# the reshape2 version is named explicitly instead of relying on attach order.
df_long <- reshape2::melt(df, id.vars = "ID")
# One point per measurement, with a line joining the two measurements that
# share an ID.
ggplot(df_long, aes(x = variable, y = value, group = ID)) +
  geom_point(aes(color = variable), size = 2) +
  geom_line(alpha = 0.8) +
  labs(
    title = "Paired sample (Before vs after)",
    x = "Conditions",
    y = "Values"
  )
# Paired box plot with connecting lines, annotated with a paired t-test p-value.
# ggpaired() and stat_compare_means() belong to ggpubr, which none of the
# library() calls at the top of this script attaches, so both are namespace-qualified.
ggpubr::ggpaired(
  df_long,
  x = "variable",
  y = "value",
  color = "variable",
  line.color = "grey",
  line.size = .5,
  palette = "jco"
) +
  ggpubr::stat_compare_means(paired = TRUE, method = "t.test")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.