Úvod

V tomto cvičení používam databázu z Kaggle:

Students Performance in Exams
(https://www.kaggle.com/datasets/spscientist/students-performance-in-exams)

CSV súbor som si stiahla a uložila do priečinka:

udaje/StudentsPerformance.csv

Dataset obsahuje údaje o výsledkoch študentov z troch testov (matematika, čítanie, písanie) a o viacerých socio-demografických charakteristikách (pohlavie, etnicita, vzdelanie rodičov, typ obeda, príprava na test).

Budem analyzovať vzťah medzi:

Moja hypotéza: lepšie výsledky v ostatných predmetoch a vyššie vzdelanie rodičov zvyšujú matematické skóre.


Balíčky

library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(lmtest)
library(sandwich)
library(car)
## Loading required package: carData
library(ggplot2)
library(patchwork)
rm(list = ls())

Načítanie údajov

udaje_raw <- read.csv("udaje/StudentsPerformance.csv",
                      dec = ".", sep = ",", header = TRUE)
 
names(udaje_raw) <- make.names(names(udaje_raw))
 
udaje <- udaje_raw[, c("gender",
                       "race.ethnicity",
                       "parental.level.of.education",
                       "lunch",
                       "test.preparation.course",
                       "math.score",
                       "reading.score",
                       "writing.score")]
 
summary(udaje)
##     gender          race.ethnicity     parental.level.of.education
##  Length:1000        Length:1000        Length:1000                
##  Class :character   Class :character   Class :character           
##  Mode  :character   Mode  :character   Mode  :character           
##                                                                   
##                                                                   
##                                                                   
##     lunch           test.preparation.course   math.score     reading.score   
##  Length:1000        Length:1000             Min.   :  0.00   Min.   : 17.00  
##  Class :character   Class :character        1st Qu.: 57.00   1st Qu.: 59.00  
##  Mode  :character   Mode  :character        Median : 66.00   Median : 70.00  
##                                             Mean   : 66.09   Mean   : 69.17  
##                                             3rd Qu.: 77.00   3rd Qu.: 79.00  
##                                             Max.   :100.00   Max.   :100.00  
##  writing.score   
##  Min.   : 10.00  
##  1st Qu.: 57.75  
##  Median : 69.00  
##  Mean   : 68.05  
##  3rd Qu.: 79.00  
##  Max.   :100.00

Prevod kategórií na číselné premenné

udaje$gender_num <- ifelse(udaje$gender == "female", 1, 0)
 
edu_map <- c(
  "some high school"   = 1,
  "high school"        = 2,
  "some college"       = 3,
  "associate's degree" = 4,
  "bachelor's degree"  = 5,
  "master's degree"    = 6
)
 
udaje$parent_edu_num <- edu_map[as.character(udaje$parental.level.of.education)]
 
udaje$avg_rw <- (udaje$reading.score + udaje$writing.score) / 2
 
summary(udaje[, c("math.score", "reading.score", "writing.score",
                  "avg_rw", "gender_num", "parent_edu_num")])
##    math.score     reading.score    writing.score        avg_rw      
##  Min.   :  0.00   Min.   : 17.00   Min.   : 10.00   Min.   : 13.50  
##  1st Qu.: 57.00   1st Qu.: 59.00   1st Qu.: 57.75   1st Qu.: 58.50  
##  Median : 66.00   Median : 70.00   Median : 69.00   Median : 69.50  
##  Mean   : 66.09   Mean   : 69.17   Mean   : 68.05   Mean   : 68.61  
##  3rd Qu.: 77.00   3rd Qu.: 79.00   3rd Qu.: 79.00   3rd Qu.: 79.00  
##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.00  
##    gender_num    parent_edu_num 
##  Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.000   1st Qu.:2.000  
##  Median :1.000   Median :3.000  
##  Mean   :0.518   Mean   :3.081  
##  3rd Qu.:1.000   3rd Qu.:4.000  
##  Max.   :1.000   Max.   :6.000

Imputácia chýbajúcich hodnôt

numeric_cols <- sapply(udaje, is.numeric)
 
column_medians <- sapply(udaje[, numeric_cols], median, na.rm = TRUE)
 
for (col in names(column_medians)) {
  udaje[[col]][is.na(udaje[[col]])] <- column_medians[col]
}
 
summary(udaje[, numeric_cols])
##    math.score     reading.score    writing.score      gender_num   
##  Min.   :  0.00   Min.   : 17.00   Min.   : 10.00   Min.   :0.000  
##  1st Qu.: 57.00   1st Qu.: 59.00   1st Qu.: 57.75   1st Qu.:0.000  
##  Median : 66.00   Median : 70.00   Median : 69.00   Median :1.000  
##  Mean   : 66.09   Mean   : 69.17   Mean   : 68.05   Mean   :0.518  
##  3rd Qu.: 77.00   3rd Qu.: 79.00   3rd Qu.: 79.00   3rd Qu.:1.000  
##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :1.000  
##  parent_edu_num      avg_rw      
##  Min.   :1.000   Min.   : 13.50  
##  1st Qu.:2.000   1st Qu.: 58.50  
##  Median :3.000   Median : 69.50  
##  Mean   :3.081   Mean   : 68.61  
##  3rd Qu.:4.000   3rd Qu.: 79.00  
##  Max.   :6.000   Max.   :100.00

Boxploty premenných

par(mfrow = c(2, 3))
par(mar  = c(4, 4, 2, 1))
 
boxplot(udaje$math.score,    main = "math.score",    col = "lightblue")
boxplot(udaje$reading.score, main = "reading.score", col = "lightblue")
boxplot(udaje$writing.score, main = "writing.score", col = "lightblue")
boxplot(udaje$avg_rw,        main = "avg_rw",        col = "lightblue")
boxplot(udaje$parent_edu_num,main = "parent_edu_num",col = "lightblue")
boxplot(udaje$gender_num,    main = "gender_num",    col = "lightblue")
 
mtext("Boxploty vybraných premenných (StudentsPerformance)",
      outer = TRUE, cex = 1.2, font = 2)

par(mfrow = c(1, 1))

Lineárny model 1

math.score ~ avg_rw + parent_edu_num + gender_num

model <- lm(math.score ~ avg_rw + parent_edu_num + gender_num,
            data = udaje)
 
summary(model)
## 
## Call:
## lm(formula = math.score ~ avg_rw + parent_edu_num + gender_num, 
##     data = udaje)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.6682  -3.9717  -0.1471   4.1783  17.6076 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.01784    0.94077   7.460 1.89e-13 ***
## avg_rw           0.97110    0.01388  69.974  < 2e-16 ***
## parent_edu_num  -0.27224    0.13464  -2.022   0.0434 *  
## gender_num     -12.97018    0.39951 -32.465  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.066 on 996 degrees of freedom
## Multiple R-squared:  0.8405, Adjusted R-squared:   0.84 
## F-statistic:  1749 on 3 and 996 DF,  p-value: < 2.2e-16

Diagnostické grafy modelu 1

par(mfrow = c(2, 2))
plot(model)

par(mfrow = c(1, 1))

Test normality a odľahlých hodnôt

residuals_m1 <- residuals(model)
 
jarque.bera.test(residuals_m1)
## 
##  Jarque Bera Test
## 
## data:  residuals_m1
## X-squared = 1.0505, df = 2, p-value = 0.5914
outlierTest(model)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferroni p
## 456 -3.099328          0.0019941           NA

Lineárny model 2

math.score ~ log(avg_rw) + parent_edu_num

model2 <- lm(math.score ~ I(log(avg_rw)) + parent_edu_num,
             data = udaje)
 
summary(model2)
## 
## Call:
## lm(formula = math.score ~ I(log(avg_rw)) + parent_edu_num, data = udaje)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.6105  -6.2275  -0.0694   6.1839  26.9306 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -148.9856     4.9715 -29.968   <2e-16 ***
## I(log(avg_rw))   51.2798     1.2034  42.612   <2e-16 ***
## parent_edu_num   -0.1337     0.1978  -0.676    0.499    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.921 on 997 degrees of freedom
## Multiple R-squared:  0.6546, Adjusted R-squared:  0.6539 
## F-statistic: 944.6 on 2 and 997 DF,  p-value: < 2.2e-16

Diagnostické grafy modelu 2

par(mfrow = c(2, 2))
plot(model2)

par(mfrow = c(1, 1))

Normalita a odľahlé hodnoty – model 2

residuals_m2 <- residuals(model2)
 
jarque.bera.test(residuals_m2)
## 
##  Jarque Bera Test
## 
## data:  residuals_m2
## X-squared = 7.3008, df = 2, p-value = 0.02598
outlierTest(model2)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 597 3.074119          0.0021687           NA

Heteroskedasticita – vizuálne hodnotenie

udaje$resid_m1 <- residuals_m1
udaje$resid_m2 <- residuals_m2

Model 1

p1 <- ggplot(udaje, aes(x = avg_rw, y = resid_m1^2)) +
  geom_point(alpha = .6) +
  geom_smooth(method = "loess", se = FALSE, color = "red")
 
p2 <- ggplot(udaje, aes(x = parent_edu_num, y = resid_m1^2)) +
  geom_point(alpha = .6) +
  geom_smooth(method = "loess", se = FALSE, color = "red")
 
p1 + p2
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Model 2

p3 <- ggplot(udaje, aes(x = log(avg_rw), y = resid_m2^2)) +
  geom_point(alpha = .6) +
  geom_smooth(method = "loess", se = FALSE, color = "red")
 
p4 <- ggplot(udaje, aes(x = parent_edu_num, y = resid_m2^2)) +
  geom_point(alpha = .6) +
  geom_smooth(method = "loess", se = FALSE, color = "red")
 
p3 + p4
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'


Breusch–Pagan test

bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 2.4302, df = 3, p-value = 0.488
bptest(model2)
## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 11.021, df = 2, p-value = 0.004045

White robustné štandardné chyby

coeftest(model, vcov = vcovHC(model))
## 
## t test of coefficients:
## 
##                  Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)      7.017835   0.928549   7.5579 9.272e-14 ***
## avg_rw           0.971098   0.013521  71.8227 < 2.2e-16 ***
## parent_edu_num  -0.272240   0.131759  -2.0662   0.03907 *  
## gender_num     -12.970180   0.393774 -32.9381 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Záver

Výsledky ukazujú, že najväčší vplyv na výsledok z matematiky majú priemerné skóre z čítania a písania a vzdelanie rodičov.
Študenti s lepšími výsledkami v ostatných predmetoch a s rodičmi s vyšším vzdelaním dosahujú vyššie skóre z matematiky.

Logaritmická transformácia premennnej avg_rw zlepšila diagnostiku modelu a pomohla odstrániť problémy s heteroskedasticitou.
Druhý model sa preto javí ako stabilnejší opis vzťahu medzi skúmanými faktormi a výkonom v matematike.