library(CVXR)
## 
## Attaching package: 'CVXR'
## The following object is masked from 'package:stats':
## 
##     power
library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(WRTDStidal)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks CVXR::id()
## ✖ purrr::is_vector() masks CVXR::is_vector()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::recode()    masks car::recode()
## ✖ purrr::some()      masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(quantreg)
## Loading required package: SparseM
## 
## Attaching package: 'quantreg'
## 
## The following object is masked from 'package:survival':
## 
##     untangle.specials
library(PogromcyDanych)
## Loading required package: SmarterPoland
## Loading required package: httr
## Loading required package: htmltools

Zadanie

Teraz Wasza kolej ;-)

Waszym zadaniem dzisiaj jest zamodelowanie - porównanie KMNK oraz regresji kwantylowej (różno-poziomowej) dla zmiennej “earnings” - wynagrodzenia.

Dobierz i przetestuj predyktory, kwantyle dla modeli. Wykonaj testy różnic współczynnikow dla finalnych modeli.

W przypadku problemów - obejrzyj video tutorial (włącz polskie napisy) oraz wejdź na jego stronę ze źródłami. Możesz również wykorzystać w/w przykłady.

data("CPSSW9298")
# ?CPSSW9298 
unique(CPSSW9298$year)
## [1] 1992 1998
## Levels: 1992 1998
data98<-CPSSW9298 %>% filter(CPSSW9298$year==1998)
  data98%>%
  ggplot(aes(age,earnings))+
    geom_point()

data(data98) 
## Warning in data(data98): data set 'data98' not found
p <- ggplot(data = data98) +
    geom_point(mapping = aes(x = age, y = earnings), color = "blue")
taus <- c(0.1, 0.25, 0.5, 0.75, 0.90, 0.95)
fits <- data.frame(
    coef(lm(earnings ~ age, data = data98)),
    sapply(taus, function(x) coef(rq(formula = earnings ~ age, data = data98, tau = x))))
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
names(fits) <- c("OLS", sprintf("$\\tau_{%0.2f}$", taus))
nf <- ncol(fits)
colors <- colorRampPalette(colors = c("black", "red"))(nf)
p <- p + geom_abline(intercept = fits[1, 1], slope = fits[2, 1], color = colors[1], linewidth = 1.5)
for (i in seq_len(nf)[-1]) {
    p <- p + geom_abline(intercept = fits[1, i], slope = fits[2, i], color = colors[i])
}
p

data(data98) 
## Warning in data(data98): data set 'data98' not found
p <- ggplot(data = data98) +
    geom_point(mapping = aes(x = gender, y = earnings), color = "blue")
taus <- c(0.1, 0.25, 0.5, 0.75, 0.90, 0.95)
fits <- data.frame(
    coef(lm(earnings ~ gender, data = data98)),
    sapply(taus, function(x) coef(rq(formula = earnings ~ gender, data = data98, tau = x))))
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
names(fits) <- c("OLS", sprintf("$\\tau_{%0.2f}$", taus))
nf <- ncol(fits)
colors <- colorRampPalette(colors = c("black", "red"))(nf)
p <- p + geom_abline(intercept = fits[1, 1], slope = fits[2, 1], color = colors[1], linewidth = 1.5)
for (i in seq_len(nf)[-1]) {
    p <- p + geom_abline(intercept = fits[1, i], slope = fits[2, i], color = colors[i])
}
p

knitr::kable(fits, format = "html", caption = "Oszacowania z KMNK oraz `quantreg`") %>%
    kable_styling("striped") %>%
    column_spec(1:8, background = "#ececec")
Oszacowania z KMNK oraz quantreg
OLS \(\tau_{0.10}\) \(\tau_{0.25}\) \(\tau_{0.50}\) \(\tau_{0.75}\) \(\tau_{0.90}\) \(\tau_{0.95}\)
(Intercept) 14.805467 7.019231 9.615385 13.461538 18.376068 24.038462 28.846153
genderfemale -2.044285 -1.019231 -1.442308 -1.923077 -2.991453 -2.884615 -3.846153
## 
## Wyniki regresji kwantylowych
## ================================================================
##                               Dependent variable:               
##                -------------------------------------------------
##                                  log(earnings)                  
##                   (1)       (2)       (3)       (4)       (5)   
## ----------------------------------------------------------------
## age            0.018***  0.023***  0.026***  0.026***  0.024*** 
##                 (0.003)   (0.003)   (0.002)   (0.003)   (0.003) 
##                                                                 
## degreebachelor 0.413***  0.394***  0.379***  0.377***  0.403*** 
##                 (0.016)   (0.015)   (0.013)   (0.016)   (0.018) 
##                                                                 
## genderfemale   -0.169*** -0.179*** -0.197*** -0.211*** -0.205***
##                 (0.016)   (0.015)   (0.013)   (0.015)   (0.019) 
##                                                                 
## Constant       1.504***  1.663***  1.783***  2.043***  2.266*** 
##                 (0.083)   (0.079)   (0.070)   (0.083)   (0.096) 
##                                                                 
## ----------------------------------------------------------------
## Observations     5,911     5,911     5,911     5,911     5,911  
## ================================================================
## Note:                                *p<0.1; **p<0.05; ***p<0.01
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

## model kwantylowy
model1 <- rq(log(earnings) ~ age+degree+gender,tau = 0.1, data = data98)
reszty1 <- resid(model1)

## bezwarunkowy (pusty) model kwantylowy
model2 <- rq(log(earnings) ~ 1, tau = 0.1,data=data98)
reszty2 <- resid(model2)

goodfit(reszty1, reszty2, 0.1)
## [1] 0.07776773
## r2 modelu KMNK dla porównania
model_lm <- lm(log(earnings) ~ age+degree+gender, data = data98)

summary(model_lm)$r.squared
## [1] 0.1828058
kmnk <- lm(log(earnings) ~ age + factor(degree) + factor(gender), data = data98)
summary(kmnk)
## 
## Call:
## lm(formula = log(earnings) ~ age + factor(degree) + factor(gender), 
##     data = data98)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.96583 -0.27644  0.02536  0.30209  1.50215 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.78845    0.06230   28.71   <2e-16 ***
## age                     0.02142    0.00207   10.35   <2e-16 ***
## factor(degree)bachelor  0.38277    0.01173   32.64   <2e-16 ***
## factor(gender)female   -0.18003    0.01182  -15.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4457 on 5907 degrees of freedom
## Multiple R-squared:  0.1828, Adjusted R-squared:  0.1824 
## F-statistic: 440.5 on 3 and 5907 DF,  p-value: < 2.2e-16
kwantyle <- c(0.25, 0.50, 0.75)

reg_kwantylowa <- rq(log(earnings) ~ age + factor(degree) + factor(gender), 
                     tau = kwantyle, 
                     data = data98)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
summary(reg_kwantylowa, se = "boot")
## 
## Call: rq(formula = log(earnings) ~ age + factor(degree) + factor(gender), 
##     tau = kwantyle, data = data98)
## 
## tau: [1] 0.25
## 
## Coefficients:
##                        Value    Std. Error t value  Pr(>|t|)
## (Intercept)             1.53975  0.09088   16.94319  0.00000
## age                     0.02026  0.00303    6.68133  0.00000
## factor(degree)bachelor  0.39850  0.01752   22.74413  0.00000
## factor(gender)female   -0.18133  0.01884   -9.62618  0.00000
## 
## Call: rq(formula = log(earnings) ~ age + factor(degree) + factor(gender), 
##     tau = kwantyle, data = data98)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                        Value     Std. Error t value   Pr(>|t|) 
## (Intercept)              1.71413   0.08494   20.18135   0.00000
## age                      0.02479   0.00278    8.90182   0.00000
## factor(degree)bachelor   0.39563   0.01416   27.94466   0.00000
## factor(gender)female    -0.19526   0.01371  -14.24517   0.00000
## 
## Call: rq(formula = log(earnings) ~ age + factor(degree) + factor(gender), 
##     tau = kwantyle, data = data98)
## 
## tau: [1] 0.75
## 
## Coefficients:
##                        Value     Std. Error t value   Pr(>|t|) 
## (Intercept)              1.94914   0.07423   26.25850   0.00000
## age                      0.02666   0.00251   10.61619   0.00000
## factor(degree)bachelor   0.36898   0.01390   26.53678   0.00000
## factor(gender)female    -0.21198   0.01289  -16.44662   0.00000
kwantyle <- c(0.25, 0.50)

reg_kwantylowa <- rq(log(earnings) ~ age + factor(degree) + factor(gender), 
                     tau = kwantyle, 
                     data = data98)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
anova(reg_kwantylowa, test = "Wald", joint = TRUE)
## Quantile Regression Analysis of Deviance Table
## 
## Model: log(earnings) ~ age + factor(degree) + factor(gender)
## Joint Test of Equality of Slopes: tau in {  0.25 0.5  }
## 
##   Df Resid Df F value Pr(>F)
## 1  3    11819  1.5073 0.2104
anova(reg_kwantylowa, test = "Wald", joint=FALSE)
## Quantile Regression Analysis of Deviance Table
## 
## Model: log(earnings) ~ age + factor(degree) + factor(gender)
## Tests of Equality of Distinct Slopes: tau in {  0.25 0.5  }
## 
##                        Df Resid Df F value  Pr(>F)  
## age                     1    11821  3.3290 0.06809 .
## factor(degree)bachelor  1    11821  0.0416 0.83830  
## factor(gender)female    1    11821  0.9879 0.32028  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kwantyle <- c(0.25, 0.50, 0.75)

reg_kwantylowa <- rq(log(earnings) ~ age + factor(degree) + factor(gender), 
                     tau = kwantyle, 
                     data = data98)

anova(reg_kwantylowa, test = "Wald", joint = TRUE)
## Quantile Regression Analysis of Deviance Table
## 
## Model: log(earnings) ~ age + factor(degree) + factor(gender)
## Joint Test of Equality of Slopes: tau in {  0.25 0.5 0.75  }
## 
##   Df Resid Df F value  Pr(>F)  
## 1  6    17727  2.0933 0.05064 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(reg_kwantylowa, test = "Wald", joint=FALSE)
## Quantile Regression Analysis of Deviance Table
## 
## Model: log(earnings) ~ age + factor(degree) + factor(gender)
## Tests of Equality of Distinct Slopes: tau in {  0.25 0.5 0.75  }
## 
##                        Df Resid Df F value Pr(>F)
## age                     2    17731  2.2269 0.1079
## factor(degree)bachelor  2    17731  2.1488 0.1167
## factor(gender)female    2    17731  1.4965 0.2239
## Model kwantylowy
model1 <- rq(log(earnings) ~ age + factor(degree) + factor(gender), tau = 0.5, data = data98)
reszty1 <- resid(model1)

## Bezwarunkowy (pusty) model kwantylowy
model2 <- rq(log(earnings) ~ 1, tau = 0.5, data = data98)
reszty2 <- resid(model2)

## Test dopasowania (goodness-of-fit)
goodfit(reszty1, reszty2, 0.5)
## [1] 0.1052646
## R^2 modelu KMNK dla porównania
model_lm <- lm(log(earnings) ~ age + factor(degree) + factor(gender), data = data98)
summary(model_lm)$r.squared
## [1] 0.1828058

Analiza danych CPSSW9298 dla 1998 roku dotyczyła wpływu wieku, wykształcenia i płci na logarytm zarobków. Model KMNK pokazał 18% zmienności (R2=0.18), pokazując, że wiek i wykształcenie zwiększają zarobki, a kobiety zarabiają mniej niż mężczyźni. Regresja kwantylowa (τ={0.25,0.50,0.75}) wykazała niewielkie różnice między współczynnikami (p=0.05), jednak wpływ predyktorów pozostawał stabilny w każdej części rozkładu (p>0.05). Podsumowując, regresja kwantylowa nie pokazała istotnych różnic między kwantylami, ale potwierdziła stabilność wpływu analizowanych zmiennych na zarobki. Wyniki obu metod były do siebie zbliżone.