# Import the student mathematics dataset. The call treats the file as
# semicolon-separated with a header row, and uses "," as the decimal mark.
mydata <- read.table(
  file = "./student-mat.csv",
  header = TRUE,
  sep = ";",
  dec = ","
)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the five columns used in the analysis, then preview the first rows.
mydata <- select(mydata, sex, age, studytime, absences, G1)
head(mydata)
## sex age studytime absences G1
## 1 F 18 2 6 5
## 2 F 17 2 4 5
## 3 F 15 2 10 7
## 4 F 15 3 2 15
## 5 F 16 2 4 6
## 6 M 16 2 10 15
Variables:
Cortez, P. (2008). Student Performance [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C5TG7T.
# Give the columns descriptive names and convert the coded variables into
# labelled factors. Study time is stored as an ordered factor because its
# four categories are ranked from least to most weekly study hours.
mydata <- mydata %>%
  rename(gender = sex, grade = G1) %>%
  mutate(
    gender = factor(
      gender,
      levels = c("F", "M"),
      labels = c("Female", "Male")
    ),
    studytime = factor(
      studytime,
      levels = 1:4,
      labels = c("< 2 hours", "2 to 5 hours", "5 to 10 hours", "> 10 hours"),
      ordered = TRUE
    )
  )

# Per-column overview of the recoded data set.
summary(mydata)
## gender age studytime absences
## Female:208 Min. :15.0 < 2 hours :105 Min. : 0.000
## Male :187 1st Qu.:16.0 2 to 5 hours :198 1st Qu.: 0.000
## Median :17.0 5 to 10 hours: 65 Median : 4.000
## Mean :16.7 > 10 hours : 27 Mean : 5.709
## 3rd Qu.:18.0 3rd Qu.: 8.000
## Max. :22.0 Max. :75.000
## grade
## Min. : 3.00
## 1st Qu.: 8.00
## Median :11.00
## Mean :10.91
## 3rd Qu.:13.00
## Max. :19.00
Interpretation
library(psych)
# Descriptive statistics of the grade, reported separately for each gender.
describeBy(x = mydata[["grade"]], group = mydata[["gender"]])
##
## Descriptive statistics by group
## group: Female
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 208 10.62 3.23 10 10.46 2.97 4 19 15 0.36 -0.68 0.22
## ------------------------------------------------------------
## group: Male
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 187 11.23 3.39 11 11.19 4.45 3 19 16 0.1 -0.72 0.25
**Interpretation:**
Research question: Is there a difference in average mathematics grade between male and female students?
First assumption: Grade is a numeric variable.
Second assumption: Grade is normally distributed within the populations of male and female students.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
#' Histogram of grades for one gender group
#'
#' @param data Data frame with a `gender` column and a numeric `grade` column.
#' @param group Value of `gender` to plot; also used as the first word of the
#'   plot title.
#' @return A ggplot object.
grade_histogram <- function(data, group) {
  # The horizontal axis carries the grade (in points); the vertical axis of a
  # histogram is the number of observations per bin, so it is labelled as a
  # frequency rather than as points.
  ggplot(data[data$gender == group, ], aes(x = grade)) +
    geom_histogram(binwidth = 1, col = "black", fill = "purple") +
    theme_linedraw() +
    xlab("Grade (points)") +
    ylab("Frequency") +
    ggtitle(paste(group, "Students' Grades"))
}

Grade_Male <- grade_histogram(mydata, "Male")
Grade_Female <- grade_histogram(mydata, "Female")

library(ggpubr)
# Show both histograms side by side to compare the two distributions visually.
ggarrange(Grade_Male, Grade_Female,
          ncol = 2, nrow = 1)
library(ggpubr)
# Normal Q-Q plot of the grade, drawn in a separate panel for each gender.
ggqqplot(data = mydata, x = "grade", facet.by = "gender")
Third assumption: Grade has the same variance in the populations of male and female students.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
# Levene's test for equality of the grade variance across the gender groups.
leveneTest(y = mydata[["grade"]], group = mydata[["gender"]])
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.4903 0.4842
## 393
# Two-sided independent-samples t-test of the grade by gender, using the
# pooled-variance (equal variances) form of the test.
t.test(
  mydata$grade ~ mydata$gender,
  alternative = "two.sided",
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: mydata$grade by mydata$gender
## t = -1.8284, df = 393, p-value = 0.06825
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -1.26541466 0.04590623
## sample estimates:
## mean in group Female mean in group Male
## 10.62019 11.22995
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
# The t-test in this analysis is run with var.equal = TRUE (pooled variance),
# so Cohen's d is standardized with the matching pooled standard deviation
# rather than the un-pooled one.
effectsize::cohens_d(mydata$grade ~ mydata$gender,
                     pooled_sd = TRUE)
## Cohen's d | 95% CI
## -------------------------
## -0.18 | [-0.38, 0.01]
##
## - Estimated using pooled SD.
# Interpret the magnitude of d; the sign only reflects the group order.
interpret_cohens_d(0.18, rules = "sawilowsky2009")
## [1] "very small"
## (Rules: sawilowsky2009)
Results - Based on the sample data, we fail to reject the null hypothesis that the average mathematics grades of male and female students do not differ (p = 0.07). The effect size is very small (d = 0.18).
Research question: Is there a difference in mathematics grade between male and female students?
# Two-sided Wilcoxon rank-sum test of the grade by gender, using the normal
# approximation (no exact p-value) without continuity correction.
wilcox.test(
  mydata$grade ~ mydata$gender,
  alternative = "two.sided",
  exact = FALSE,
  correct = FALSE
)
##
## Wilcoxon rank sum test
##
## data: mydata$grade by mydata$gender
## W = 17322, p-value = 0.05951
## alternative hypothesis: true location shift is not equal to 0
library(effectsize)
# Effect size for the Wilcoxon rank-sum test: the test is re-run with the
# same settings and handed directly to effectsize().
effectsize(
  wilcox.test(
    mydata$grade ~ mydata$gender,
    alternative = "two.sided",
    exact = FALSE,
    correct = FALSE
  )
)
## r (rank biserial) | 95% CI
## ---------------------------------
## -0.11 | [-0.22, 0.00]
# Interpret the magnitude of the rank-biserial correlation.
interpret_rank_biserial(r = 0.11)
## [1] "small"
## (Rules: funder2019)
Results - Based on the sample data, we fail to reject the null hypothesis that the distribution locations of mathematics grades for male and female students are equal (p = 0.06). The effect size is small (r = 0.11).