ÚVOD – výber databázy

V tomto cvičení pracujem s reálnym CSV súborom
StudentsPerformance.csv uloženým v priečinku udaje.

Ide o známu databázu z portálu Kaggle s výsledkami testov z matematiky, čítania a písania.
Cieľom je modelovať výsledok z matematiky pomocou výsledkov z čítania a písania.


BALÍČKY

library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(lmtest)
library(sandwich)
library(car)
## Loading required package: carData
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# The workspace is intentionally not cleared here: rm(list = ls()) only
# removes global objects (it does not detach packages or reset options) and
# would wipe a user's session if this code were sourced. Every object used
# below is created by this script itself, so a fresh R session is sufficient.

IMPORT A PRÍPRAVA DATABÁZY

Načítanie údajov zo súboru udaje/StudentsPerformance.csv

# Load the raw exam-score table. The header row, comma separator and "."
# decimal mark are spelled out explicitly.
data_raw <- read.csv(file = "udaje/StudentsPerformance.csv",
                     header = TRUE, sep = ",", dec = ".")

Premenujem dôležité stĺpce a ponechám len tieto tri premenné

# Keep only the three score columns of data_raw and rename them from their
# dotted names to snake_case in the same select() step.
data <- data_raw %>%
  select(
    math_score = math.score,
    reading_score = reading.score,
    writing_score = writing.score
  )

Základná tabuľka

# Render the first ten rows as a captioned table that does not stretch to the
# full page width.
head(data, n = 10) %>%
  kable(caption = "Prvých 10 riadkov datasetu") %>%
  kable_styling(full_width = FALSE)
Prvých 10 riadkov datasetu
math_score reading_score writing_score
72 72 74
69 90 88
90 95 93
47 57 44
76 78 75
71 83 78
88 95 92
40 43 39
64 64 67
38 60 50

Základný popis premenných

# Per-column minimum, quartiles, median, mean and maximum of the three scores.
summary(data)
##    math_score     reading_score    writing_score   
##  Min.   :  0.00   Min.   : 17.00   Min.   : 10.00  
##  1st Qu.: 57.00   1st Qu.: 59.00   1st Qu.: 57.75  
##  Median : 66.00   Median : 70.00   Median : 69.00  
##  Mean   : 66.09   Mean   : 69.17   Mean   : 68.05  
##  3rd Qu.: 77.00   3rd Qu.: 79.00   3rd Qu.: 79.00  
##  Max.   :100.00   Max.   :100.00   Max.   :100.00

BOXPLOTY – kontrola anomálií

# Draw one boxplot per score in a single row of three panels so unusual
# values can be spotted, then switch back to a single-panel layout.
par(mfrow = c(1, 3))
boxplot(data[["math_score"]], main = "Math score", col = "lightblue")
boxplot(data[["reading_score"]], main = "Reading score", col = "lightpink")
boxplot(data[["writing_score"]], main = "Writing score", col = "lightgreen")

par(mfrow = c(1, 1))

MODEL 1 – základná lineárna regresia

# Regress the math score on the reading and writing scores, print a banner,
# then show the coefficient table and fit statistics.
model1 <- lm(
  formula = math_score ~ reading_score + writing_score,
  data = data
)
cat("\n*** MODEL 1 – Math ~ Reading + Writing ***\n")
## 
## *** MODEL 1 – Math ~ Reading + Writing ***
summary(model1)
## 
## Call:
## lm(formula = math_score ~ reading_score + writing_score, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.8779  -6.1750   0.2693   6.0184  24.8727 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.52409    1.32823   5.665 1.93e-08 ***
## reading_score  0.60129    0.06304   9.538  < 2e-16 ***
## writing_score  0.24942    0.06057   4.118 4.14e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.667 on 997 degrees of freedom
## Multiple R-squared:  0.674,  Adjusted R-squared:  0.6733 
## F-statistic:  1031 on 2 and 997 DF,  p-value: < 2.2e-16

Diagnostické grafy

# Show the four default lm diagnostic panels for model 1 in a 2 x 2 grid,
# then return to a single-panel layout.
par(mfrow = c(2, 2))
plot(model1, which = c(1, 2, 3, 5))

par(mfrow = c(1, 1))

Normalita a outliery

# Jarque-Bera test applied to the residuals of model 1 (the section uses it
# as the normality check). The argument expression is echoed in the printed
# "data:" line, so it is kept exactly as written.
cat("\nJarque-Bera test (model 1):\n")
## 
## Jarque-Bera test (model 1):
jarque.bera.test(residuals(model1))
## 
##  Jarque Bera Test
## 
## data:  residuals(model1)
## X-squared = 6.8011, df = 2, p-value = 0.03335
# Outlier test on model 1: reports the largest absolute studentized residual
# together with its unadjusted and Bonferroni p-values.
cat("\nOutlier test (model 1):\n")
## 
## Outlier test (model 1):
outlierTest(model1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 951 2.882065          0.0040356           NA

MODEL 2 – upravený model pre lepšiu interpretáciu

# NOTE(review): this call uses exactly the same formula and data as model1,
# so the fit and its printed summary are identical to model 1. If an adjusted
# specification was intended for model 2, it still has to be written here.
model2 <- lm(math_score ~ reading_score + writing_score, data = data)
# Banner followed by the coefficient table and fit statistics of model 2.
cat("\n*** MODEL 2 – Math ~ Reading + Writing ***\n")
## 
## *** MODEL 2 – Math ~ Reading + Writing ***
summary(model2)
## 
## Call:
## lm(formula = math_score ~ reading_score + writing_score, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.8779  -6.1750   0.2693   6.0184  24.8727 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.52409    1.32823   5.665 1.93e-08 ***
## reading_score  0.60129    0.06304   9.538  < 2e-16 ***
## writing_score  0.24942    0.06057   4.118 4.14e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.667 on 997 degrees of freedom
## Multiple R-squared:  0.674,  Adjusted R-squared:  0.6733 
## F-statistic:  1031 on 2 and 997 DF,  p-value: < 2.2e-16
# Show the four default lm diagnostic panels for model 2 in a 2 x 2 grid,
# then return to a single-panel layout.
par(mfrow = c(2, 2))
plot(model2, which = c(1, 2, 3, 5))

par(mfrow = c(1, 1))
# Jarque-Bera test applied to the residuals of model 2. The argument
# expression is echoed in the printed "data:" line, so it is kept as written.
cat("\nJarque-Bera test (model 2):\n")
## 
## Jarque-Bera test (model 2):
jarque.bera.test(residuals(model2))
## 
##  Jarque Bera Test
## 
## data:  residuals(model2)
## X-squared = 6.8011, df = 2, p-value = 0.03335

ZÁVER