library(car)
## Loading required package: carData
library(readxl)
library(BSDA)
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following objects are masked from 'package:carData':
##
## Vocab, Wool
## The following object is masked from 'package:datasets':
##
## Orange
library(kableExtra)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
apple = read_excel('datasets.xlsx', sheet = 'z_t_test', range = 'B4:C24')
apple
## # A tibble: 20 × 2
## sr. wts
## <dbl> <dbl>
## 1 1 92.4
## 2 2 89.2
## 3 3 88.4
## 4 4 91.4
## 5 5 91.1
## 6 6 77.2
## 7 7 97.0
## 8 8 82.2
## 9 9 85.6
## 10 10 90
## 11 11 90.0
## 12 12 87.8
## 13 13 91.7
## 14 14 92.1
## 15 15 88.8
## 16 16 87.7
## 17 17 86.8
## 18 18 87.8
## 19 19 88.4
## 20 20 93.5
# H0: mean weight of apples = 90 g
mean(apple$wts)
## [1] 88.9535
# Assumption
# Normality (boxplot, qqPlot)
boxplot(apple$wts, col = 'red')

qqPlot(apple$wts)

## [1] 6 7
# data seems not be normal
# still we will run t.test just for demonstration
# mu = 90, to which we are comparing
t.test(apple$wts, mu = 90)
##
## One Sample t-test
##
## data: apple$wts
## t = -1.1189, df = 19, p-value = 0.2771
## alternative hypothesis: true mean is not equal to 90
## 95 percent confidence interval:
## 86.99592 90.91108
## sample estimates:
## mean of x
## 88.9535
# p > 0.05, sufficient evidence to support H0
# H0 cannot be rejected
# Therefore, we can conclude that the average weight of the 20 apples is statistically equal to 90 g.
# If population parameter is known and sample size is large (n>=30), z-test can be done
z.test(apple$wts, mu = 92, sigma.x = 5)
##
## One-sample z-Test
##
## data: apple$wts
## z = -2.7249, p-value = 0.006433
## alternative hypothesis: true mean is not equal to 92
## 95 percent confidence interval:
## 86.76219 91.14481
## sample estimates:
## mean of x
## 88.9535
# H0: true mean = 92
# p < 0.05, insufficient evidence, H0 rejected
# Conclusion: true mean is not equal to 92
# Check the CI: [86.76, 91.14]: does not contain 92.
# Q2: Read dataset
baby = read_excel('datasets.xlsx', sheet = 'z_t_test', range = 'L23:Q498')
str(baby)
## tibble [475 × 6] (S3: tbl_df/tbl/data.frame)
## $ sr. : num [1:475] 1 2 3 4 5 6 7 8 9 10 ...
## $ Sex : num [1:475] 1 2 1 1 1 2 2 2 1 1 ...
## $ Age_mothers : num [1:475] 26 20 24 20 53 51 84 10 56 30 ...
## $ Height : num [1:475] 78 86 73 80 97 98 102 66 93 80 ...
## $ Weight : num [1:475] 9.8 9.5 10 10 15 14.1 15.5 8.5 14 10.5 ...
## $ Breast_feeding: num [1:475] 1 1 1 1 0 0 0 1 0 1 ...
# add a variable named 'BMI' using mutate()
# z-score = standardized score
baby = baby %>% mutate(BMI = Weight/(Height/100)^2)
baby = baby %>% mutate(z_weight = (Weight - mean(Weight))/sd(Weight))
str(baby)
## tibble [475 × 8] (S3: tbl_df/tbl/data.frame)
## $ sr. : num [1:475] 1 2 3 4 5 6 7 8 9 10 ...
## $ Sex : num [1:475] 1 2 1 1 1 2 2 2 1 1 ...
## $ Age_mothers : num [1:475] 26 20 24 20 53 51 84 10 56 30 ...
## $ Height : num [1:475] 78 86 73 80 97 98 102 66 93 80 ...
## $ Weight : num [1:475] 9.8 9.5 10 10 15 14.1 15.5 8.5 14 10.5 ...
## $ Breast_feeding: num [1:475] 1 1 1 1 0 0 0 1 0 1 ...
## $ BMI : num [1:475] 16.1 12.8 18.8 15.6 15.9 ...
## $ z_weight : num [1:475] -0.471 -0.558 -0.412 -0.412 1.051 ...
kbl(head(baby))
sr.
|
Sex
|
Age_mothers
|
Height
|
Weight
|
Breast_feeding
|
BMI
|
z_weight
|
1
|
1
|
26
|
78
|
9.8
|
1
|
16.10782
|
-0.4706895
|
2
|
2
|
20
|
86
|
9.5
|
1
|
12.84478
|
-0.5584933
|
3
|
1
|
24
|
73
|
10.0
|
1
|
18.76525
|
-0.4121537
|
4
|
1
|
20
|
80
|
10.0
|
1
|
15.62500
|
-0.4121537
|
5
|
1
|
53
|
97
|
15.0
|
0
|
15.94218
|
1.0512416
|
6
|
2
|
51
|
98
|
14.1
|
0
|
14.68138
|
0.7878304
|
summary(baby$z_weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.60725 -0.85117 0.02686 0.00000 0.75856 2.95366
# Count the number of babies with z-score less than -2
category = cut(baby$z_weight, breaks = c(-Inf, -2, Inf),
labels = c('Wasted', 'Normal'),
right = FALSE)
# frequency or percentage table of category
table(category)*100/length(category)
## category
## Wasted Normal
## 2.105263 97.894737
# Ans: 2.11% of the babies are wasted.
# H0: Average ages of mothers of male babies and female babies are equal
# Two sample t-test
# Assumption:
# Both groups normally distributed
# Equal variance
# Sex: Factor variables with two levels (two groups)
baby$Sex = as.factor(baby$Sex)
str(baby)
## tibble [475 × 8] (S3: tbl_df/tbl/data.frame)
## $ sr. : num [1:475] 1 2 3 4 5 6 7 8 9 10 ...
## $ Sex : Factor w/ 2 levels "1","2": 1 2 1 1 1 2 2 2 1 1 ...
## $ Age_mothers : num [1:475] 26 20 24 20 53 51 84 10 56 30 ...
## $ Height : num [1:475] 78 86 73 80 97 98 102 66 93 80 ...
## $ Weight : num [1:475] 9.8 9.5 10 10 15 14.1 15.5 8.5 14 10.5 ...
## $ Breast_feeding: num [1:475] 1 1 1 1 0 0 0 1 0 1 ...
## $ BMI : num [1:475] 16.1 12.8 18.8 15.6 15.9 ...
## $ z_weight : num [1:475] -0.471 -0.558 -0.412 -0.412 1.051 ...
boxplot(Age_mothers ~ Sex, data = baby)

# Bring two plots in a single canvas, 1 row, 2 columns
par(mfrow = c(1,2))
# for males
qqnorm(baby$Age_mothers[baby$Sex == 1])
qqline(baby$Age_mothers[baby$Sex == 1])
# for females
qqnorm(baby$Age_mothers[baby$Sex == 2])
qqline(baby$Age_mothers[baby$Sex == 2])

par(mfrow = c(1,1))
# qqPlot() in car package can drwas qqplot in a single function
qqPlot(Age_mothers ~ Sex, data = baby)

# For large sample, we can consider this as normally distributed
# Check equality of variance (Levene Test)
# H0: Variances are equal for both groups
car::leveneTest(Age_mothers ~ Sex, data = baby)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 4.3132 0.03836 *
## 473
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, variances are not equal
# t test is not applicable, Welch test is suggested
# t.test() function is for both t test and Welch test
# t test
t.test(Age_mothers ~ Sex, data = baby, var.equal = TRUE)
##
## Two Sample t-test
##
## data: Age_mothers by Sex
## t = 1.7464, df = 473, p-value = 0.08139
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## -0.4613116 7.8323660
## sample estimates:
## mean in group 1 mean in group 2
## 37.39070 33.70517
# Welch test
t.test(Age_mothers ~ Sex, data = baby, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: Age_mothers by Sex
## t = 1.7495, df = 472.58, p-value = 0.08086
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## -0.4540092 7.8250636
## sample estimates:
## mean in group 1 mean in group 2
## 37.39070 33.70517
# Wilcoxon test, if the data is not normal
wilcox.test(Age_mothers ~ Sex, data = baby)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Age_mothers by Sex
## W = 30675, p-value = 0.09632
## alternative hypothesis: true location shift is not equal to 0
# H0: There is no association between the sexes and their breast feeding preferences
# Chi-square test
contingency = table(baby$Sex, baby$Breast_feeding)
contingency
##
## 0 1 2
## 1 133 110 0
## 2 117 114 1
chisq.test(contingency)
## Warning in chisq.test(contingency): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: contingency
## X-squared = 1.8417, df = 2, p-value = 0.3982
# Cell frequency is less than 5 for more than 20% of the cells, so we cannot rely on chi-square test
# Fisher's exact test is suggested
fisher.test(contingency)
##
## Fisher's Exact Test for Count Data
##
## data: contingency
## p-value = 0.3821
## alternative hypothesis: two.sided
# p > 0.05, H0 cannot be rejected
# Conclusion: Breast feeding preference is not associated with the sexes of the babies
# H0: There is no correlation between the height and weight of the babies
scatterplot(x = baby$Height, y = baby$Weight)

# Pearson correlation test
cor.test(baby$Height, baby$Weight)
##
## Pearson's product-moment correlation
##
## data: baby$Height and baby$Weight
## t = 56.116, df = 473, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9195956 0.9432609
## sample estimates:
## cor
## 0.9324209
# Same formula without dollar notation
with(data = baby, cor.test(Height, Weight))
##
## Pearson's product-moment correlation
##
## data: Height and Weight
## t = 56.116, df = 473, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9195956 0.9432609
## sample estimates:
## cor
## 0.9324209
# p < 0.05, H0 rejected
# r = 0.93, strong positive correlation
# Conclusion: Weight of the babies increases with the increase in their height.
# Paired t-test
# H0: There is no difference in the weight of the students before and after the diet in 60 days interval
# Read the dataset
students = read_excel('datasets.xlsx', sheet = 'z_t_test', range = 'C31:E61')
head(students)
## # A tibble: 6 × 3
## sr. before after
## <dbl> <dbl> <dbl>
## 1 1 76.2 74.9
## 2 2 92.2 91.0
## 3 3 85.9 85.0
## 4 4 76.5 74.2
## 5 5 53.6 51.5
## 6 6 75.9 73.3
# The data is paired: before-after situation, two observations for the same individuals
# before and after in two columns, so we cannot use the formula for t.test()
# Moreover, paired t-test function does not need formula
t.test(students$before, students$after, paired = TRUE)
##
## Paired t-test
##
## data: students$before and students$after
## t = 3.7797, df = 29, p-value = 0.000725
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 0.3393565 1.1396511
## sample estimates:
## mean difference
## 0.7395038
# p < 0.05, H0 rejected
# Conclusion: There is a significant difference in the weight of the students before and after the diet in 60 days interval
# Mean difference is 0.74 kg, which is significant
# Check the assumptions
# Normality: qqplot
qqPlot(students$before - students$after)

## [1] 15 16
# Boxplot() in car package writes axis title
Boxplot(students$before - students$after)

# So, the differences are normally distributed
# This differences are checked against the null hypothesis of zero mean difference