V tomto cvičení pracujem s reálnym CSV súborom
StudentsPerformance.csv uloženým v priečinku
udaje.
Ide o známy dataset z Kaggle s výsledkami študentov v testoch z matematiky,
čítania a písania.
Cieľom je modelovať výsledok z matematiky (math_score) pomocou výsledkov
z čítania (reading_score) a písania (writing_score).
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(lmtest)
library(sandwich)
library(car)
## Loading required package: carData
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
# Load the student performance data and build the analysis data frame `data`.
#
# `rm(list = ls())` was removed: wiping the global environment from inside a
# script silently destroys the caller's objects and does not reset attached
# packages or options, so it gives no real "clean session" guarantee.
data_raw <- read.csv(
  "udaje/StudentsPerformance.csv",
  header = TRUE,
  sep = ",",
  dec = "."
)

# Every later step reads `data`, but only `data_raw` was ever assigned, so
# `data` would resolve to the function utils::data(). Derive it explicitly.
# Header punctuation is normalised to underscores because read.csv() turns a
# header such as "math score" into "math.score".
# NOTE(review): assumes the CSV holds the three score columns under names that
# normalise to the ones below - stopifnot() fails fast if it does not.
score_cols <- c("math_score", "reading_score", "writing_score")
clean_names <- gsub("[^A-Za-z0-9]+", "_", names(data_raw))
stopifnot(all(score_cols %in% clean_names))
data <- setNames(data_raw, clean_names)[, score_cols, drop = FALSE]
# Render the first ten rows of `data` as a table that is not stretched to the
# full page width.
kable_styling(
  kable(
    head(data, n = 10),
    caption = "Prvých 10 riadkov datasetu"
  ),
  full_width = FALSE
)
| math_score | reading_score | writing_score |
|---|---|---|
| 72 | 72 | 74 |
| 69 | 90 | 88 |
| 90 | 95 | 93 |
| 47 | 57 | 44 |
| 76 | 78 | 75 |
| 71 | 83 | 78 |
| 88 | 95 | 92 |
| 40 | 43 | 39 |
| 64 | 64 | 67 |
| 38 | 60 | 50 |
# Per-column descriptive statistics (min, quartiles, median, mean, max).
data %>%
  summary()
## math_score reading_score writing_score
## Min. : 0.00 Min. : 17.00 Min. : 10.00
## 1st Qu.: 57.00 1st Qu.: 59.00 1st Qu.: 57.75
## Median : 66.00 Median : 70.00 Median : 69.00
## Mean : 66.09 Mean : 69.17 Mean : 68.05
## 3rd Qu.: 77.00 3rd Qu.: 79.00 3rd Qu.: 79.00
## Max. :100.00 Max. :100.00 Max. :100.00
# Draw the three score distributions side by side as boxplots.
# par() returns the previous values of the settings it changes; keeping them
# and passing them back restores whatever layout was active before, instead
# of unconditionally forcing a 1x1 layout.
old_par <- par(mfrow = c(1, 3))
boxplot(data$math_score, main = "Math score", col = "lightblue")
boxplot(data$reading_score, main = "Reading score", col = "lightpink")
boxplot(data$writing_score, main = "Writing score", col = "lightgreen")
par(old_par)
# Model 1: linear regression of the math score on the reading and writing
# scores, fitted on the columns of `data`.
model1 <- lm(math_score ~ reading_score + writing_score, data = data)
# Banner so the model output is easy to locate in the console / knitted log.
cat("\n*** MODEL 1 – Math ~ Reading + Writing ***\n")
##
## *** MODEL 1 – Math ~ Reading + Writing ***
# Prints the call, residual quantiles, coefficient table, R-squared and the
# overall F-statistic. The printed call echoes the lm() expression verbatim,
# so the fit above is intentionally left in its literal form.
summary(model1)
##
## Call:
## lm(formula = math_score ~ reading_score + writing_score, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.8779 -6.1750 0.2693 6.0184 24.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.52409 1.32823 5.665 1.93e-08 ***
## reading_score 0.60129 0.06304 9.538 < 2e-16 ***
## writing_score 0.24942 0.06057 4.118 4.14e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.667 on 997 degrees of freedom
## Multiple R-squared: 0.674, Adjusted R-squared: 0.6733
## F-statistic: 1031 on 2 and 997 DF, p-value: < 2.2e-16
# Diagnostic plots for model 1, arranged on a 2x2 grid so the panels drawn by
# plot(model1) share one page.
# The previous par() values are saved and restored afterwards rather than
# hard-coding a reset to a 1x1 layout, so any layout set earlier survives.
old_par <- par(mfrow = c(2, 2))
plot(model1)
par(old_par)
# Jarque-Bera test on the residuals of model 1 (jarque.bera.test comes from
# the tseries package; its null hypothesis is normality per the package docs).
cat("\nJarque-Bera test (model 1):\n")
##
## Jarque-Bera test (model 1):
# The "data:" label in the printed result is taken from the argument
# expression, so the call is kept as a single nested expression.
# The recorded run below reports p-value = 0.03335.
jarque.bera.test(residuals(model1))
##
## Jarque Bera Test
##
## data: residuals(model1)
## X-squared = 6.8011, df = 2, p-value = 0.03335
# Outlier check for model 1 using outlierTest() (car package): it reports the
# largest absolute studentized residual together with its unadjusted and
# Bonferroni p-values.
cat("\nOutlier test (model 1):\n")
##
## Outlier test (model 1):
# In the recorded run below no observation has a Bonferroni p-value < 0.05.
outlierTest(model1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 951 2.882065 0.0040356 NA
# Model 2: linear regression of the math score on the reading and writing
# scores, fitted on `data`.
# NOTE(review): this formula and data argument are identical to the ones used
# for model1, and the recorded summary below matches model 1 number for
# number. Confirm whether a different specification or a cleaned data set was
# intended here - TODO verify with the author.
model2 <- lm(math_score ~ reading_score + writing_score, data = data)
# Banner so the model output is easy to locate in the console / knitted log.
cat("\n*** MODEL 2 – Math ~ Reading + Writing ***\n")
##
## *** MODEL 2 – Math ~ Reading + Writing ***
# Prints the call, residual quantiles, coefficient table, R-squared and the
# overall F-statistic for model 2.
summary(model2)
##
## Call:
## lm(formula = math_score ~ reading_score + writing_score, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.8779 -6.1750 0.2693 6.0184 24.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.52409 1.32823 5.665 1.93e-08 ***
## reading_score 0.60129 0.06304 9.538 < 2e-16 ***
## writing_score 0.24942 0.06057 4.118 4.14e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.667 on 997 degrees of freedom
## Multiple R-squared: 0.674, Adjusted R-squared: 0.6733
## F-statistic: 1031 on 2 and 997 DF, p-value: < 2.2e-16
# Diagnostic plots for model 2, arranged on a 2x2 grid so the panels drawn by
# plot(model2) share one page.
# The previous par() values are saved and restored afterwards rather than
# hard-coding a reset to a 1x1 layout, so any layout set earlier survives.
old_par <- par(mfrow = c(2, 2))
plot(model2)
par(old_par)
# Jarque-Bera test on the residuals of model 2 (jarque.bera.test comes from
# the tseries package; its null hypothesis is normality per the package docs).
cat("\nJarque-Bera test (model 2):\n")
##
## Jarque-Bera test (model 2):
# The "data:" label in the printed result is taken from the argument
# expression, so the call is kept as a single nested expression.
# The recorded run below reports p-value = 0.03335.
jarque.bera.test(residuals(model2))
##
## Jarque Bera Test
##
## data: residuals(model2)
## X-squared = 6.8011, df = 2, p-value = 0.03335