library(car)
## Loading required package: carData
library(readxl)
library(BSDA)
## Loading required package: lattice
## 
## Attaching package: 'BSDA'
## The following objects are masked from 'package:carData':
## 
##     Vocab, Wool
## The following object is masked from 'package:datasets':
## 
##     Orange
library(kableExtra)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::recode()     masks car::recode()
## ✖ purrr::some()       masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
apple = read_excel('datasets.xlsx', sheet = 'z_t_test', range = 'B4:C24')

apple
## # A tibble: 20 × 2
##      sr.   wts
##    <dbl> <dbl>
##  1     1  92.4
##  2     2  89.2
##  3     3  88.4
##  4     4  91.4
##  5     5  91.1
##  6     6  77.2
##  7     7  97.0
##  8     8  82.2
##  9     9  85.6
## 10    10  90  
## 11    11  90.0
## 12    12  87.8
## 13    13  91.7
## 14    14  92.1
## 15    15  88.8
## 16    16  87.7
## 17    17  86.8
## 18    18  87.8
## 19    19  88.4
## 20    20  93.5
# H0: mean weight of apples = 90 g
mean(apple$wts)
## [1] 88.9535
# Assumption
# Normality (boxplot, qqPlot)
boxplot(apple$wts, col = 'red')

qqPlot(apple$wts)

## [1] 6 7
# data seems not be normal
# still we will run t.test just for demonstration
# mu = 90, to which we are comparing
t.test(apple$wts, mu = 90)
## 
##  One Sample t-test
## 
## data:  apple$wts
## t = -1.1189, df = 19, p-value = 0.2771
## alternative hypothesis: true mean is not equal to 90
## 95 percent confidence interval:
##  86.99592 90.91108
## sample estimates:
## mean of x 
##   88.9535
# p > 0.05, sufficient evidence to support H0
# H0 cannot be rejected
# Therefore, we can conclude that the average weight of the 20 apples is statistically equal to 90 g.

# If population parameter is known and sample size is large (n>=30), z-test can be done


z.test(apple$wts, mu = 92, sigma.x = 5)
## 
##  One-sample z-Test
## 
## data:  apple$wts
## z = -2.7249, p-value = 0.006433
## alternative hypothesis: true mean is not equal to 92
## 95 percent confidence interval:
##  86.76219 91.14481
## sample estimates:
## mean of x 
##   88.9535
# H0: true mean = 92
# p < 0.05, insufficient evidence, H0 rejected
# Conclusion: true mean is not equal to 92
# Check the CI: [86.76, 91.14]: does not contain 92.

# Q2: Read dataset
baby = read_excel('datasets.xlsx', sheet = 'z_t_test', range = 'L23:Q498')

str(baby)
## tibble [475 × 6] (S3: tbl_df/tbl/data.frame)
##  $ sr.           : num [1:475] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Sex           : num [1:475] 1 2 1 1 1 2 2 2 1 1 ...
##  $ Age_mothers   : num [1:475] 26 20 24 20 53 51 84 10 56 30 ...
##  $ Height        : num [1:475] 78 86 73 80 97 98 102 66 93 80 ...
##  $ Weight        : num [1:475] 9.8 9.5 10 10 15 14.1 15.5 8.5 14 10.5 ...
##  $ Breast_feeding: num [1:475] 1 1 1 1 0 0 0 1 0 1 ...
# add a variable named 'BMI' using mutate()
# z-score = standardized score

baby = baby %>% mutate(BMI = Weight/(Height/100)^2)
baby = baby %>% mutate(z_weight = (Weight - mean(Weight))/sd(Weight))

str(baby)
## tibble [475 × 8] (S3: tbl_df/tbl/data.frame)
##  $ sr.           : num [1:475] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Sex           : num [1:475] 1 2 1 1 1 2 2 2 1 1 ...
##  $ Age_mothers   : num [1:475] 26 20 24 20 53 51 84 10 56 30 ...
##  $ Height        : num [1:475] 78 86 73 80 97 98 102 66 93 80 ...
##  $ Weight        : num [1:475] 9.8 9.5 10 10 15 14.1 15.5 8.5 14 10.5 ...
##  $ Breast_feeding: num [1:475] 1 1 1 1 0 0 0 1 0 1 ...
##  $ BMI           : num [1:475] 16.1 12.8 18.8 15.6 15.9 ...
##  $ z_weight      : num [1:475] -0.471 -0.558 -0.412 -0.412 1.051 ...
kbl(head(baby))
sr. Sex Age_mothers Height Weight Breast_feeding BMI z_weight
1 1 26 78 9.8 1 16.10782 -0.4706895
2 2 20 86 9.5 1 12.84478 -0.5584933
3 1 24 73 10.0 1 18.76525 -0.4121537
4 1 20 80 10.0 1 15.62500 -0.4121537
5 1 53 97 15.0 0 15.94218 1.0512416
6 2 51 98 14.1 0 14.68138 0.7878304
summary(baby$z_weight)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.60725 -0.85117  0.02686  0.00000  0.75856  2.95366
# Count the number of babies with z-score less than -2

category = cut(baby$z_weight, breaks = c(-Inf, -2, Inf),
    labels = c('Wasted', 'Normal'),
    right = FALSE)
# frequency or percentage table of category
table(category)*100/length(category)
## category
##    Wasted    Normal 
##  2.105263 97.894737
# Ans: 2.11% of the babies are wasted.

# H0: Average ages of mothers of male babies and female babies are equal
# Two sample t-test
# Assumption:
# Both groups normally distributed
# Equal variance
# Sex: Factor variables with two levels (two groups)

baby$Sex = as.factor(baby$Sex)
str(baby)
## tibble [475 × 8] (S3: tbl_df/tbl/data.frame)
##  $ sr.           : num [1:475] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Sex           : Factor w/ 2 levels "1","2": 1 2 1 1 1 2 2 2 1 1 ...
##  $ Age_mothers   : num [1:475] 26 20 24 20 53 51 84 10 56 30 ...
##  $ Height        : num [1:475] 78 86 73 80 97 98 102 66 93 80 ...
##  $ Weight        : num [1:475] 9.8 9.5 10 10 15 14.1 15.5 8.5 14 10.5 ...
##  $ Breast_feeding: num [1:475] 1 1 1 1 0 0 0 1 0 1 ...
##  $ BMI           : num [1:475] 16.1 12.8 18.8 15.6 15.9 ...
##  $ z_weight      : num [1:475] -0.471 -0.558 -0.412 -0.412 1.051 ...
boxplot(Age_mothers ~ Sex, data = baby)

# Bring two plots in a single canvas, 1 row, 2 columns

par(mfrow = c(1,2))
# for males
qqnorm(baby$Age_mothers[baby$Sex == 1])
qqline(baby$Age_mothers[baby$Sex == 1])
# for females
qqnorm(baby$Age_mothers[baby$Sex == 2])
qqline(baby$Age_mothers[baby$Sex == 2])

par(mfrow = c(1,1))


# qqPlot() in car package can drwas qqplot in a single function

qqPlot(Age_mothers ~ Sex, data = baby)

# For large sample, we can consider this as normally distributed

# Check equality of variance (Levene Test)
# H0: Variances are equal for both groups
car::leveneTest(Age_mothers ~ Sex, data = baby)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   1  4.3132 0.03836 *
##       473                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, variances are not equal
# t test is not applicable, Welch test is suggested
# t.test() function is for both t test and Welch test

# t test
t.test(Age_mothers ~ Sex, data = baby, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  Age_mothers by Sex
## t = 1.7464, df = 473, p-value = 0.08139
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -0.4613116  7.8323660
## sample estimates:
## mean in group 1 mean in group 2 
##        37.39070        33.70517
# Welch test
t.test(Age_mothers ~ Sex, data = baby, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  Age_mothers by Sex
## t = 1.7495, df = 472.58, p-value = 0.08086
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -0.4540092  7.8250636
## sample estimates:
## mean in group 1 mean in group 2 
##        37.39070        33.70517
# Wilcoxon test, if the data is not normal
wilcox.test(Age_mothers ~ Sex, data = baby)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Age_mothers by Sex
## W = 30675, p-value = 0.09632
## alternative hypothesis: true location shift is not equal to 0
# H0: There is no association between the sexes and their breast feeding preferences

# Chi-square test
contingency = table(baby$Sex, baby$Breast_feeding)
contingency
##    
##       0   1   2
##   1 133 110   0
##   2 117 114   1
chisq.test(contingency)
## Warning in chisq.test(contingency): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  contingency
## X-squared = 1.8417, df = 2, p-value = 0.3982
# Cell frequency is less than 5 for more than 20% of the cells, so we cannot rely on chi-square test
# Fisher's exact test is suggested
fisher.test(contingency)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency
## p-value = 0.3821
## alternative hypothesis: two.sided
# p > 0.05, H0 cannot be rejected
# Conclusion: Breast feeding preference is not associated with the sexes of the babies

# H0: There is no correlation between the height and weight of the babies
scatterplot(x = baby$Height, y = baby$Weight)

# Pearson correlation test
cor.test(baby$Height, baby$Weight)
## 
##  Pearson's product-moment correlation
## 
## data:  baby$Height and baby$Weight
## t = 56.116, df = 473, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9195956 0.9432609
## sample estimates:
##       cor 
## 0.9324209
# Same formula without dollar notation
with(data = baby, cor.test(Height, Weight))
## 
##  Pearson's product-moment correlation
## 
## data:  Height and Weight
## t = 56.116, df = 473, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9195956 0.9432609
## sample estimates:
##       cor 
## 0.9324209
# p < 0.05, H0 rejected
# r = 0.93, strong positive correlation
# Conclusion: Weight of the babies increases with the increase in their height.

# Paired t-test
# H0: There is no difference in the weight of the students before and after the diet in 60 days interval

# Read the dataset
students = read_excel('datasets.xlsx', sheet = 'z_t_test', range = 'C31:E61')
head(students)
## # A tibble: 6 × 3
##     sr. before after
##   <dbl>  <dbl> <dbl>
## 1     1   76.2  74.9
## 2     2   92.2  91.0
## 3     3   85.9  85.0
## 4     4   76.5  74.2
## 5     5   53.6  51.5
## 6     6   75.9  73.3
# The data is paired: before-after situation, two observations for the same individuals
# before and after in two columns, so we cannot use the formula for t.test()
# Moreover, paired t-test function does not need formula

t.test(students$before, students$after, paired = TRUE)
## 
##  Paired t-test
## 
## data:  students$before and students$after
## t = 3.7797, df = 29, p-value = 0.000725
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  0.3393565 1.1396511
## sample estimates:
## mean difference 
##       0.7395038
# p < 0.05, H0 rejected
# Conclusion: There is a significant difference in the weight of the students before and after the diet in 60 days interval
# Mean difference is 0.74 kg, which is significant

# Check the assumptions
# Normality: qqplot
qqPlot(students$before - students$after)

## [1] 15 16
# Boxplot() in car package writes axis title
Boxplot(students$before - students$after)

# So, the differences are normally distributed
# This differences are checked against the null hypothesis of zero mean difference