library(CVXR)
##
## Attaching package: 'CVXR'
## The following object is masked from 'package:stats':
##
## power
library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(WRTDStidal)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks CVXR::id()
## ✖ purrr::is_vector() masks CVXR::is_vector()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(quantreg)
## Loading required package: SparseM
##
## Attaching package: 'quantreg'
##
## The following object is masked from 'package:survival':
##
## untangle.specials
library(PogromcyDanych)
## Loading required package: SmarterPoland
## Loading required package: httr
## Loading required package: htmltools
Teraz Wasza kolej ;-)
Waszym zadaniem dzisiaj jest zamodelowanie - porównanie KMNK oraz regresji kwantylowej (różno-poziomowej) dla zmiennej “earnings” - wynagrodzenia.
Dobierz i przetestuj predyktory, kwantyle dla modeli. Wykonaj testy różnic współczynnikow dla finalnych modeli.
W przypadku problemów - obejrzyj video tutorial (włącz polskie napisy) oraz wejdź na jego stronę ze źródłami. Możesz również wykorzystać w/w przykłady.
data("CPSSW9298")
# ?CPSSW9298
unique(CPSSW9298$year)
## [1] 1992 1998
## Levels: 1992 1998
data98<-CPSSW9298 %>% filter(CPSSW9298$year==1998)
data98%>%
ggplot(aes(age,earnings))+
geom_point()
data(data98)
## Warning in data(data98): data set 'data98' not found
p <- ggplot(data = data98) +
geom_point(mapping = aes(x = age, y = earnings), color = "blue")
taus <- c(0.1, 0.25, 0.5, 0.75, 0.90, 0.95)
fits <- data.frame(
coef(lm(earnings ~ age, data = data98)),
sapply(taus, function(x) coef(rq(formula = earnings ~ age, data = data98, tau = x))))
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
names(fits) <- c("OLS", sprintf("$\\tau_{%0.2f}$", taus))
nf <- ncol(fits)
colors <- colorRampPalette(colors = c("black", "red"))(nf)
p <- p + geom_abline(intercept = fits[1, 1], slope = fits[2, 1], color = colors[1], linewidth = 1.5)
for (i in seq_len(nf)[-1]) {
p <- p + geom_abline(intercept = fits[1, i], slope = fits[2, i], color = colors[i])
}
p
data(data98)
## Warning in data(data98): data set 'data98' not found
p <- ggplot(data = data98) +
geom_point(mapping = aes(x = gender, y = earnings), color = "blue")
taus <- c(0.1, 0.25, 0.5, 0.75, 0.90, 0.95)
fits <- data.frame(
coef(lm(earnings ~ gender, data = data98)),
sapply(taus, function(x) coef(rq(formula = earnings ~ gender, data = data98, tau = x))))
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
names(fits) <- c("OLS", sprintf("$\\tau_{%0.2f}$", taus))
nf <- ncol(fits)
colors <- colorRampPalette(colors = c("black", "red"))(nf)
p <- p + geom_abline(intercept = fits[1, 1], slope = fits[2, 1], color = colors[1], linewidth = 1.5)
for (i in seq_len(nf)[-1]) {
p <- p + geom_abline(intercept = fits[1, i], slope = fits[2, i], color = colors[i])
}
p
knitr::kable(fits, format = "html", caption = "Oszacowania z KMNK oraz `quantreg`") %>%
kable_styling("striped") %>%
column_spec(1:8, background = "#ececec")
| OLS | \(\tau_{0.10}\) | \(\tau_{0.25}\) | \(\tau_{0.50}\) | \(\tau_{0.75}\) | \(\tau_{0.90}\) | \(\tau_{0.95}\) | |
|---|---|---|---|---|---|---|---|
| (Intercept) | 14.805467 | 7.019231 | 9.615385 | 13.461538 | 18.376068 | 24.038462 | 28.846153 |
| genderfemale | -2.044285 | -1.019231 | -1.442308 | -1.923077 | -2.991453 | -2.884615 | -3.846153 |
##
## Wyniki regresji kwantylowych
## ================================================================
## Dependent variable:
## -------------------------------------------------
## log(earnings)
## (1) (2) (3) (4) (5)
## ----------------------------------------------------------------
## age 0.018*** 0.023*** 0.026*** 0.026*** 0.024***
## (0.003) (0.003) (0.002) (0.003) (0.003)
##
## degreebachelor 0.413*** 0.394*** 0.379*** 0.377*** 0.403***
## (0.016) (0.015) (0.013) (0.016) (0.018)
##
## genderfemale -0.169*** -0.179*** -0.197*** -0.211*** -0.205***
## (0.016) (0.015) (0.013) (0.015) (0.019)
##
## Constant 1.504*** 1.663*** 1.783*** 2.043*** 2.266***
## (0.083) (0.079) (0.070) (0.083) (0.096)
##
## ----------------------------------------------------------------
## Observations 5,911 5,911 5,911 5,911 5,911
## ================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## model kwantylowy
model1 <- rq(log(earnings) ~ age+degree+gender,tau = 0.1, data = data98)
reszty1 <- resid(model1)
## bezwarunkowy (pusty) model kwantylowy
model2 <- rq(log(earnings) ~ 1, tau = 0.1,data=data98)
reszty2 <- resid(model2)
goodfit(reszty1, reszty2, 0.1)
## [1] 0.07776773
## r2 modelu KMNK dla porównania
model_lm <- lm(log(earnings) ~ age+degree+gender, data = data98)
summary(model_lm)$r.squared
## [1] 0.1828058
kmnk <- lm(log(earnings) ~ age + factor(degree) + factor(gender), data = data98)
summary(kmnk)
##
## Call:
## lm(formula = log(earnings) ~ age + factor(degree) + factor(gender),
## data = data98)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.96583 -0.27644 0.02536 0.30209 1.50215
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.78845 0.06230 28.71 <2e-16 ***
## age 0.02142 0.00207 10.35 <2e-16 ***
## factor(degree)bachelor 0.38277 0.01173 32.64 <2e-16 ***
## factor(gender)female -0.18003 0.01182 -15.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4457 on 5907 degrees of freedom
## Multiple R-squared: 0.1828, Adjusted R-squared: 0.1824
## F-statistic: 440.5 on 3 and 5907 DF, p-value: < 2.2e-16
kwantyle <- c(0.25, 0.50, 0.75)
reg_kwantylowa <- rq(log(earnings) ~ age + factor(degree) + factor(gender),
tau = kwantyle,
data = data98)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
summary(reg_kwantylowa, se = "boot")
##
## Call: rq(formula = log(earnings) ~ age + factor(degree) + factor(gender),
## tau = kwantyle, data = data98)
##
## tau: [1] 0.25
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 1.53975 0.09088 16.94319 0.00000
## age 0.02026 0.00303 6.68133 0.00000
## factor(degree)bachelor 0.39850 0.01752 22.74413 0.00000
## factor(gender)female -0.18133 0.01884 -9.62618 0.00000
##
## Call: rq(formula = log(earnings) ~ age + factor(degree) + factor(gender),
## tau = kwantyle, data = data98)
##
## tau: [1] 0.5
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 1.71413 0.08494 20.18135 0.00000
## age 0.02479 0.00278 8.90182 0.00000
## factor(degree)bachelor 0.39563 0.01416 27.94466 0.00000
## factor(gender)female -0.19526 0.01371 -14.24517 0.00000
##
## Call: rq(formula = log(earnings) ~ age + factor(degree) + factor(gender),
## tau = kwantyle, data = data98)
##
## tau: [1] 0.75
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 1.94914 0.07423 26.25850 0.00000
## age 0.02666 0.00251 10.61619 0.00000
## factor(degree)bachelor 0.36898 0.01390 26.53678 0.00000
## factor(gender)female -0.21198 0.01289 -16.44662 0.00000
kwantyle <- c(0.25, 0.50)
reg_kwantylowa <- rq(log(earnings) ~ age + factor(degree) + factor(gender),
tau = kwantyle,
data = data98)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
anova(reg_kwantylowa, test = "Wald", joint = TRUE)
## Quantile Regression Analysis of Deviance Table
##
## Model: log(earnings) ~ age + factor(degree) + factor(gender)
## Joint Test of Equality of Slopes: tau in { 0.25 0.5 }
##
## Df Resid Df F value Pr(>F)
## 1 3 11819 1.5073 0.2104
anova(reg_kwantylowa, test = "Wald", joint=FALSE)
## Quantile Regression Analysis of Deviance Table
##
## Model: log(earnings) ~ age + factor(degree) + factor(gender)
## Tests of Equality of Distinct Slopes: tau in { 0.25 0.5 }
##
## Df Resid Df F value Pr(>F)
## age 1 11821 3.3290 0.06809 .
## factor(degree)bachelor 1 11821 0.0416 0.83830
## factor(gender)female 1 11821 0.9879 0.32028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kwantyle <- c(0.25, 0.50, 0.75)
reg_kwantylowa <- rq(log(earnings) ~ age + factor(degree) + factor(gender),
tau = kwantyle,
data = data98)
anova(reg_kwantylowa, test = "Wald", joint = TRUE)
## Quantile Regression Analysis of Deviance Table
##
## Model: log(earnings) ~ age + factor(degree) + factor(gender)
## Joint Test of Equality of Slopes: tau in { 0.25 0.5 0.75 }
##
## Df Resid Df F value Pr(>F)
## 1 6 17727 2.0933 0.05064 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(reg_kwantylowa, test = "Wald", joint=FALSE)
## Quantile Regression Analysis of Deviance Table
##
## Model: log(earnings) ~ age + factor(degree) + factor(gender)
## Tests of Equality of Distinct Slopes: tau in { 0.25 0.5 0.75 }
##
## Df Resid Df F value Pr(>F)
## age 2 17731 2.2269 0.1079
## factor(degree)bachelor 2 17731 2.1488 0.1167
## factor(gender)female 2 17731 1.4965 0.2239
## Model kwantylowy
model1 <- rq(log(earnings) ~ age + factor(degree) + factor(gender), tau = 0.5, data = data98)
reszty1 <- resid(model1)
## Bezwarunkowy (pusty) model kwantylowy
model2 <- rq(log(earnings) ~ 1, tau = 0.5, data = data98)
reszty2 <- resid(model2)
## Test dopasowania (goodness-of-fit)
goodfit(reszty1, reszty2, 0.5)
## [1] 0.1052646
## R^2 modelu KMNK dla porównania
model_lm <- lm(log(earnings) ~ age + factor(degree) + factor(gender), data = data98)
summary(model_lm)$r.squared
## [1] 0.1828058
Analiza danych CPSSW9298 dla 1998 roku dotyczyła wpływu wieku, wykształcenia i płci na logarytm zarobków. Model KMNK pokazał 18% zmienności (R2=0.18), pokazując, że wiek i wykształcenie zwiększają zarobki, a kobiety zarabiają mniej niż mężczyźni. Regresja kwantylowa (τ={0.25,0.50,0.75}) wykazała niewielkie różnice między współczynnikami (p=0.05), jednak wpływ predyktorów pozostawał stabilny w każdej części rozkładu (p>0.05). Podsumowując, regresja kwantylowa nie pokazała istotnych różnic między kwantylami, ale potwierdziła stabilność wpływu analizowanych zmiennych na zarobki. Wyniki obu metod były do siebie zbliżone.