options(repos = c(CRAN = "https://cloud.r-project.org"))
Sys.unsetenv("RSPM")
install.packages(c("rsconnect", "rmarkdown", "knitr"))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
need <- c("haven", "dplyr", "lmtest", "sandwich", "broom")
to_install <- setdiff(need, rownames(installed.packages()))
if (length(to_install) > 0) install.packages(to_install, dependencies = TRUE)
library(haven); library(dplyr); library(lmtest); library(sandwich); library(broom)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
df <- haven::read_dta("all.dta") %>% as.data.frame()
names(df)[1:30]
## [1] "player" "rank" "s1" "s2"
## [5] "s3" "swon" "b365" "gender"
## [9] "tylocation" "tytournament" "tyseries" "tycourt"
## [13] "tysurface" "tyround" "year" "month"
## [17] "day" "rankdiff" "s1diff" "s2diff"
## [21] "s3diff" "swondiff" "b365diff" "s1w"
## [25] "s2w" "s3w" "rankindex" "rankop"
## [29] "rankindexop" "rankindexdiff"
dim(df)
## [1] 112138 30
# Ľahký wrangling (kategórie ako faktory)
df <- df %>% mutate(
gender = factor(gender),
tysurface = factor(tysurface),
year = as.integer(year)
)
# Zvolené premenné (ak niektorá chýba, automaticky sa vynechá)
vars_needed <- c("b365","rankdiff","s1","s2","s3","swon","gender","tysurface","year")
vars_needed <- intersect(vars_needed, names(df))
df_model <- df %>% dplyr::select(all_of(vars_needed)) %>% na.omit()
dim(df_model); head(df_model)
## [1] 38582 9
## b365 rankdiff s1 s2 s3 swon gender tysurface year
## 4 2.62 -13 6 6 3 1 0 5 2012
## 7 3.75 174 3 6 3 1 1 5 2016
## 9 4.00 291 7 4 4 1 1 1 2008
## 24 2.37 -4 4 6 3 1 1 5 2015
## 25 8.00 227 7 2 3 1 1 5 2015
## 26 1.30 -50 6 6 3 1 1 5 2014
# Dynamicky poskladaná formula
rhs <- c()
for (v in c("rankdiff","s1","s2","s3","swon","gender","tysurface","year")) {
if (v %in% names(df_model)) rhs <- c(rhs, v)
}
stopifnot("b365" %in% names(df_model))
form <- as.formula(paste("b365 ~", paste(rhs, collapse = " + ")))
form
## b365 ~ rankdiff + s1 + s2 + s3 + swon + gender + tysurface +
## year
# OLS
m_ols <- lm(form, data = df_model)
summary(m_ols)
##
## Call:
## lm(formula = form, data = df_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3216 -0.6884 -0.3176 0.2827 25.9022
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.860e+01 3.368e+00 8.490 < 2e-16 ***
## rankdiff 5.320e-03 6.313e-05 84.268 < 2e-16 ***
## s1 -5.370e-02 4.790e-03 -11.209 < 2e-16 ***
## s2 -4.656e-02 4.726e-03 -9.851 < 2e-16 ***
## s3 -6.735e-02 5.470e-03 -12.312 < 2e-16 ***
## swon -2.908e-01 2.069e-02 -14.053 < 2e-16 ***
## gender1 1.206e-01 1.419e-02 8.498 < 2e-16 ***
## tysurface2 -4.251e-02 7.190e-02 -0.591 0.554
## tysurface3 -6.302e-02 7.516e-02 -0.838 0.402
## tysurface4 -4.067e-01 5.633e-01 -0.722 0.470
## tysurface5 -1.078e-02 7.145e-02 -0.151 0.880
## year -1.244e-02 1.678e-03 -7.415 1.24e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.369 on 38570 degrees of freedom
## Multiple R-squared: 0.2042, Adjusted R-squared: 0.204
## F-statistic: 899.9 on 11 and 38570 DF, p-value: < 2.2e-16
# Breusch–Pagan
bp <- lmtest::bptest(m_ols)
bp
##
## studentized Breusch-Pagan test
##
## data: m_ols
## BP = 625.62, df = 11, p-value < 2.2e-16
# White
make_white_rhs <- function(vars) {
keep <- intersect(vars, names(df_model))
if (length(keep) == 0) return(~ 1)
terms <- unlist(lapply(keep, function(v) c(v, paste0("I(", v, "^2)"))))
as.formula(paste("~", paste(terms, collapse = " + ")))
}
white_rhs <- make_white_rhs(c("rankdiff","s1","s2","s3","swon"))
white <- lmtest::bptest(m_ols, varformula = white_rhs, studentize = TRUE, data = df_model)
white
##
## studentized Breusch-Pagan test
##
## data: m_ols
## BP = 1073.8, df = 10, p-value < 2.2e-16
aux_form <- as.formula(paste("log(residuals(m_ols)^2) ~", paste(rhs, collapse = " + ")))
aux <- lm(aux_form, data = df_model)
sigma_hat2 <- exp(fitted(aux)) # odhad var(e|X)
w <- 1 / sigma_hat2 # WLS váhy (menšia váha pri väčšej variancii)
# WLS/FGLS model
m_wls <- lm(form, data = df_model, weights = w)
summary(m_wls)
##
## Call:
## lm(formula = form, data = df_model, weights = w)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.874 -1.464 -0.688 0.738 45.040
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.586e+01 2.770e+00 5.725 1.04e-08 ***
## rankdiff 1.824e-03 3.117e-05 58.517 < 2e-16 ***
## s1 -3.408e-02 4.037e-03 -8.442 < 2e-16 ***
## s2 -3.465e-02 3.974e-03 -8.719 < 2e-16 ***
## s3 -4.019e-02 5.564e-03 -7.224 5.15e-13 ***
## swon -3.380e-01 1.834e-02 -18.428 < 2e-16 ***
## gender1 8.945e-02 1.153e-02 7.757 8.89e-15 ***
## tysurface2 -1.138e-03 7.308e-02 -0.016 0.988
## tysurface3 -7.731e-03 7.486e-02 -0.103 0.918
## tysurface4 2.939e-03 5.143e-01 0.006 0.995
## tysurface5 1.186e-02 7.284e-02 0.163 0.871
## year -6.270e-03 1.379e-03 -4.545 5.50e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.502 on 38570 degrees of freedom
## Multiple R-squared: 0.1244, Adjusted R-squared: 0.1241
## F-statistic: 497.9 on 11 and 38570 DF, p-value: < 2.2e-16
Na porovnanie použijeme Residuals vs Fitted — priamo ukazuje, či sa rozptyl mení s predikovanou hodnotou (lievik = heteroskedasticita).
par(mfrow=c(1,1))
plot(m_ols, which = 1)
Vysvetlivka (Graf 1 – OLS):
- Čo skúma: vzťah medzi predikovanými hodnotami (X) a
reziduami (Y).
- Čo hľadám: náhodný „mrak“ okolo 0 bez trendu. Ak sa
rozptyl mení s X (tvar lievika), ide o
heteroskedasticitu.
- Dôsledok: OLS koeficienty sú stále nestranné, ale
štandardné chyby sú skreslené
par(mfrow=c(1,1))
plot(m_wls, which = 1)
Vysvetlivka (Graf 2 – WLS/FGLS):
- Čo skúma: to isté čo Graf 1, ale po aplikácii váženia
podľa odhadnutej variability chýb.
- Čo hľadám: rozptyl bodov by mal byť
stabilnejší naprieč fitted hodnotami (menej
„lievikovitý“).
- Dôsledok: Po korekcii sú odhady efektívnejšie a
intervaly/spoľahlivostné testy sú spoľahlivejšie.
data.frame(
Test = c("Breusch–Pagan", "White"),
p_value = signif(c(bp$p.value, white$p.value), 4),
stringsAsFactors = FALSE
)
## Test p_value
## 1 Breusch–Pagan 4.609e-127
## 2 White 2.396e-224