V tomto cvičení používam databázu z Kaggle:
Students Performance in Exams
(https://www.kaggle.com/datasets/spscientist/students-performance-in-exams)
CSV súbor som si stiahla a uložila do priečinka:
udaje/StudentsPerformance.csv
Dataset obsahuje údaje o výsledkoch študentov z troch testov (matematika, čítanie, písanie) a o viacerých socio-demografických charakteristikách (pohlavie, etnicita, vzdelanie rodičov, typ obeda, príprava na test).
Budem analyzovať vzťah medzi:
Moja hypotéza: lepšie výsledky v ostatných predmetoch a vyššie vzdelanie rodičov zvyšujú matematické skóre.
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(lmtest)
library(sandwich)
library(car)
## Loading required package: carData
library(ggplot2)
library(patchwork)
rm(list = ls())
udaje_raw <- read.csv("udaje/StudentsPerformance.csv",
dec = ".", sep = ",", header = TRUE)
names(udaje_raw) <- make.names(names(udaje_raw))
udaje <- udaje_raw[, c("gender",
"race.ethnicity",
"parental.level.of.education",
"lunch",
"test.preparation.course",
"math.score",
"reading.score",
"writing.score")]
summary(udaje)
## gender race.ethnicity parental.level.of.education
## Length:1000 Length:1000 Length:1000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## lunch test.preparation.course math.score reading.score
## Length:1000 Length:1000 Min. : 0.00 Min. : 17.00
## Class :character Class :character 1st Qu.: 57.00 1st Qu.: 59.00
## Mode :character Mode :character Median : 66.00 Median : 70.00
## Mean : 66.09 Mean : 69.17
## 3rd Qu.: 77.00 3rd Qu.: 79.00
## Max. :100.00 Max. :100.00
## writing.score
## Min. : 10.00
## 1st Qu.: 57.75
## Median : 69.00
## Mean : 68.05
## 3rd Qu.: 79.00
## Max. :100.00
udaje$gender_num <- ifelse(udaje$gender == "female", 1, 0)
edu_map <- c(
"some high school" = 1,
"high school" = 2,
"some college" = 3,
"associate's degree" = 4,
"bachelor's degree" = 5,
"master's degree" = 6
)
udaje$parent_edu_num <- edu_map[as.character(udaje$parental.level.of.education)]
udaje$avg_rw <- (udaje$reading.score + udaje$writing.score) / 2
summary(udaje[, c("math.score", "reading.score", "writing.score",
"avg_rw", "gender_num", "parent_edu_num")])
## math.score reading.score writing.score avg_rw
## Min. : 0.00 Min. : 17.00 Min. : 10.00 Min. : 13.50
## 1st Qu.: 57.00 1st Qu.: 59.00 1st Qu.: 57.75 1st Qu.: 58.50
## Median : 66.00 Median : 70.00 Median : 69.00 Median : 69.50
## Mean : 66.09 Mean : 69.17 Mean : 68.05 Mean : 68.61
## 3rd Qu.: 77.00 3rd Qu.: 79.00 3rd Qu.: 79.00 3rd Qu.: 79.00
## Max. :100.00 Max. :100.00 Max. :100.00 Max. :100.00
## gender_num parent_edu_num
## Min. :0.000 Min. :1.000
## 1st Qu.:0.000 1st Qu.:2.000
## Median :1.000 Median :3.000
## Mean :0.518 Mean :3.081
## 3rd Qu.:1.000 3rd Qu.:4.000
## Max. :1.000 Max. :6.000
numeric_cols <- sapply(udaje, is.numeric)
column_medians <- sapply(udaje[, numeric_cols], median, na.rm = TRUE)
for (col in names(column_medians)) {
udaje[[col]][is.na(udaje[[col]])] <- column_medians[col]
}
summary(udaje[, numeric_cols])
## math.score reading.score writing.score gender_num
## Min. : 0.00 Min. : 17.00 Min. : 10.00 Min. :0.000
## 1st Qu.: 57.00 1st Qu.: 59.00 1st Qu.: 57.75 1st Qu.:0.000
## Median : 66.00 Median : 70.00 Median : 69.00 Median :1.000
## Mean : 66.09 Mean : 69.17 Mean : 68.05 Mean :0.518
## 3rd Qu.: 77.00 3rd Qu.: 79.00 3rd Qu.: 79.00 3rd Qu.:1.000
## Max. :100.00 Max. :100.00 Max. :100.00 Max. :1.000
## parent_edu_num avg_rw
## Min. :1.000 Min. : 13.50
## 1st Qu.:2.000 1st Qu.: 58.50
## Median :3.000 Median : 69.50
## Mean :3.081 Mean : 68.61
## 3rd Qu.:4.000 3rd Qu.: 79.00
## Max. :6.000 Max. :100.00
par(mfrow = c(2, 3))
par(mar = c(4, 4, 2, 1))
boxplot(udaje$math.score, main = "math.score", col = "lightblue")
boxplot(udaje$reading.score, main = "reading.score", col = "lightblue")
boxplot(udaje$writing.score, main = "writing.score", col = "lightblue")
boxplot(udaje$avg_rw, main = "avg_rw", col = "lightblue")
boxplot(udaje$parent_edu_num,main = "parent_edu_num",col = "lightblue")
boxplot(udaje$gender_num, main = "gender_num", col = "lightblue")
mtext("Boxploty vybraných premenných (StudentsPerformance)",
outer = TRUE, cex = 1.2, font = 2)
par(mfrow = c(1, 1))
math.score ~ avg_rw + parent_edu_num + gender_num
model <- lm(math.score ~ avg_rw + parent_edu_num + gender_num,
data = udaje)
summary(model)
##
## Call:
## lm(formula = math.score ~ avg_rw + parent_edu_num + gender_num,
## data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.6682 -3.9717 -0.1471 4.1783 17.6076
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.01784 0.94077 7.460 1.89e-13 ***
## avg_rw 0.97110 0.01388 69.974 < 2e-16 ***
## parent_edu_num -0.27224 0.13464 -2.022 0.0434 *
## gender_num -12.97018 0.39951 -32.465 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.066 on 996 degrees of freedom
## Multiple R-squared: 0.8405, Adjusted R-squared: 0.84
## F-statistic: 1749 on 3 and 996 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))
residuals_m1 <- residuals(model)
jarque.bera.test(residuals_m1)
##
## Jarque Bera Test
##
## data: residuals_m1
## X-squared = 1.0505, df = 2, p-value = 0.5914
outlierTest(model)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 456 -3.099328 0.0019941 NA
math.score ~ log(avg_rw) + parent_edu_num
model2 <- lm(math.score ~ I(log(avg_rw)) + parent_edu_num,
data = udaje)
summary(model2)
##
## Call:
## lm(formula = math.score ~ I(log(avg_rw)) + parent_edu_num, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.6105 -6.2275 -0.0694 6.1839 26.9306
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -148.9856 4.9715 -29.968 <2e-16 ***
## I(log(avg_rw)) 51.2798 1.2034 42.612 <2e-16 ***
## parent_edu_num -0.1337 0.1978 -0.676 0.499
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.921 on 997 degrees of freedom
## Multiple R-squared: 0.6546, Adjusted R-squared: 0.6539
## F-statistic: 944.6 on 2 and 997 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(model2)
par(mfrow = c(1, 1))
residuals_m2 <- residuals(model2)
jarque.bera.test(residuals_m2)
##
## Jarque Bera Test
##
## data: residuals_m2
## X-squared = 7.3008, df = 2, p-value = 0.02598
outlierTest(model2)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 597 3.074119 0.0021687 NA
udaje$resid_m1 <- residuals_m1
udaje$resid_m2 <- residuals_m2
p1 <- ggplot(udaje, aes(x = avg_rw, y = resid_m1^2)) +
geom_point(alpha = .6) +
geom_smooth(method = "loess", se = FALSE, color = "red")
p2 <- ggplot(udaje, aes(x = parent_edu_num, y = resid_m1^2)) +
geom_point(alpha = .6) +
geom_smooth(method = "loess", se = FALSE, color = "red")
p1 + p2
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
p3 <- ggplot(udaje, aes(x = log(avg_rw), y = resid_m2^2)) +
geom_point(alpha = .6) +
geom_smooth(method = "loess", se = FALSE, color = "red")
p4 <- ggplot(udaje, aes(x = parent_edu_num, y = resid_m2^2)) +
geom_point(alpha = .6) +
geom_smooth(method = "loess", se = FALSE, color = "red")
p3 + p4
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 2.4302, df = 3, p-value = 0.488
bptest(model2)
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 11.021, df = 2, p-value = 0.004045
coeftest(model, vcov = vcovHC(model))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.017835 0.928549 7.5579 9.272e-14 ***
## avg_rw 0.971098 0.013521 71.8227 < 2.2e-16 ***
## parent_edu_num -0.272240 0.131759 -2.0662 0.03907 *
## gender_num -12.970180 0.393774 -32.9381 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Výsledky ukazujú, že najväčší vplyv na výsledok z matematiky majú
priemerné skóre z čítania a písania a vzdelanie rodičov.
Študenti s lepšími výsledkami v ostatných predmetoch a s rodičmi s
vyšším vzdelaním dosahujú vyššie skóre z matematiky.
Logaritmická transformácia premennnej avg_rw zlepšila
diagnostiku modelu a pomohla odstrániť problémy s
heteroskedasticitou.
Druhý model sa preto javí ako stabilnejší opis vzťahu medzi skúmanými
faktormi a výkonom v matematike.