## Warning: pakiet 'CVXR' został zbudowany w wersji R 4.3.3
## Warning: pakiet 'lmtest' został zbudowany w wersji R 4.3.3
## Warning: pakiet 'WRTDStidal' został zbudowany w wersji R 4.3.3
## Warning: pakiet 'ggplot2' został zbudowany w wersji R 4.3.3
## Warning in .recacheSubclasses(def@className, def, env): niezdefiniowana
## podklasa "ndiMatrix" klasy "replValueSp"; definicja nie została zaktualizowana
## Warning: pakiet 'tidyverse' został zbudowany w wersji R 4.3.3
## Warning: pakiet 'lubridate' został zbudowany w wersji R 4.3.3
## Warning: pakiet 'kableExtra' został zbudowany w wersji R 4.3.3
## Warning: pakiet 'PogromcyDanych' został zbudowany w wersji R 4.3.3
## Warning: pakiet 'SmarterPoland' został zbudowany w wersji R 4.3.3
data("CPSSW9298")
#?CPSSW9298
# Podział na dwa zbiory danych względem lat (1992 oraz 1998)
data_1992 <- CPSSW9298 %>% filter(CPSSW9298$year==1992)
data_1998 <- CPSSW9298 %>% filter(CPSSW9298$year==1998)
head(data_1992)
## year earnings degree gender age
## 1 1992 11.188811 bachelor male 29
## 2 1992 10.000000 bachelor male 33
## 3 1992 5.769231 highschool male 30
## 4 1992 14.957265 bachelor male 31
## 5 1992 8.660096 bachelor female 26
## 6 1992 7.788462 highschool female 31
summary(data_1992)
## year earnings degree gender age
## 1992:7590 Min. : 1.840 highschool:4628 male :4354 Min. :25.00
## 1998: 0 1st Qu.: 7.692 bachelor :2962 female:3236 1st Qu.:27.00
## Median :10.577 Median :30.00
## Mean :11.654 Mean :29.71
## 3rd Qu.:14.423 3rd Qu.:32.00
## Max. :44.231 Max. :34.00
head(data_1998)
## year earnings degree gender age
## 7591 1998 10.341881 bachelor male 28
## 7592 1998 9.615385 highschool female 33
## 7593 1998 9.575923 bachelor male 29
## 7594 1998 4.516994 bachelor female 29
## 7595 1998 12.500000 highschool male 31
## 7596 1998 8.173077 highschool male 26
summary(data_1998)
## year earnings degree gender age
## 1992: 0 Min. : 2.098 highschool:3308 male :3461 Min. :25.00
## 1998:5911 1st Qu.: 9.135 bachelor :2603 female:2450 1st Qu.:27.00
## Median :12.577 Median :30.00
## Mean :13.958 Mean :29.71
## 3rd Qu.:17.308 3rd Qu.:32.00
## Max. :49.451 Max. :34.00
Podział danych ze względu na rok, gdzie w oryginalnym zbiorze istnieją tylko dwie wartości (1992 i 1998 rok), pozwala oprzeć na tym wyodrębnieniu pytania badawcze. Dane pochodzące z Current Population Surveys (CPS), zestawione przez Stock, Watson (2007) w ogólnej formie pozwalają na modelowanie zależności pomiędzy zarobkami a edukacją czy płcią i wiekiem. Wartym uwagi jest fakt, że tylko te zmienne znajdują się w tym konkretnym zbiorze danych i w całości nie wyjaśniają zjawiska jakim są zarobki. Jest to poza zakresem tego zadania, które koncentruje się na pokazaniu działania regresji kwantylowej.
Przedstawiona powyżej statystyka opisowa dla każdej ze zmiennych w dwóch podzbiorach, pokazuje że pomiędzy rokiem 1992 a 1998 zarobki przesuneły się w górę. Założyć możemy, że był to pośredni efekt inflacji. W tej pracy rozważone zostało jednak, czy rzeczywiście jest to wyłącznie różnica wzrostu cen.
taus <- c(0.1, 0.25, 0.5, 0.75, 0.90, 0.95)
p_92 <- ggplot(data=data_1992) +
geom_point(mapping = aes(x=age, y=earnings), color="red") +
ggtitle('Earnings vs age') +
labs(x="Age", y="Earnings", title=("Quantile Regression with selected tau values for 1992 data")) +
theme_minimal()
fits_92 <- data.frame(
coef(lm(earnings~age, data=data_1992)),
sapply(taus, function(x) coef(rq(formula=earnings~age, data=data_1992, tau=x)))
)
names(fits_92) <- c("OLS", sprintf("τ = %0.2f", taus))
nf_92 <- ncol(fits_92)
colors_92 <- c('black', colorRampPalette(colors=c("brown", "yellow"))(nf_92-1))
lines_df <- data.frame(
intercept = as.numeric(fits_92[1, ]),
slope = as.numeric(fits_92[2, ]),
method = names(fits_92)
)
p_92 <- p_92 +
geom_abline(data=lines_df, aes(intercept=intercept, slope=slope, color=method), size=1.5) +
scale_color_manual(values=colors_92) +
guides(color=guide_legend(title="Legend"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p_92
p_98 <- ggplot(data=data_1998) +
geom_point(mapping = aes(x=age, y=earnings), color="red") +
ggtitle('Earnings vs age') +
labs(x="Age", y="Earnings", title=("Quantile Regression with selected tau values for 1998 data")) +
theme_minimal()
fits_98 <- data.frame(
coef(lm(earnings~age, data=data_1998)),
sapply(taus, function(x) coef(rq(formula=earnings~age, data=data_1998, tau=x)))
)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
names(fits_98) <- c("OLS", sprintf("τ = %0.2f", taus))
nf_98 <- ncol(fits_98)
colors_98 <- c('black', colorRampPalette(colors=c("brown", "yellow"))(nf_98-1))
lines_df <- data.frame(
intercept = as.numeric(fits_98[1, ]),
slope = as.numeric(fits_98[2, ]),
method = names(fits_98)
)
p_98 <- p_98 +
geom_abline(data=lines_df, aes(intercept=intercept, slope=slope, color=method), size=1.5) +
scale_color_manual(values=colors_98) +
guides(color=guide_legend(title="Legend"))
p_98
Powyższe wykresy przedstawiają dopasowanie regresji kwantylowych dla
współczynnika tau równego (0.1, 0.25, 0.5, 0.75, 0.9, 0.95) dla dwóch
podzbiorów danych 1992 i 1998. Przechodzę do przedstawienia tabeli z
oszacowanymi współczynnikami.
knitr::kable(fits_92, caption="Estimations for OLS and quantile regression for 1992 data") %>%
kable_styling(full_width=FALSE, 'striped') %>%
row_spec(0, background='#000000', color='#FFFFFF')
| OLS | τ = 0.10 | τ = 0.25 | τ = 0.50 | τ = 0.75 | τ = 0.90 | τ = 0.95 | |
|---|---|---|---|---|---|---|---|
| (Intercept) | 2.6211438 | 3.5714290 | 4.0528865 | 2.8605754 | 0.8146346 | 2.0699801 | -0.5698020 |
| age | 0.3039981 | 0.0686813 | 0.1213942 | 0.2644231 | 0.4674146 | 0.5675747 | 0.7523149 |
Ekwiwaletnie można zapisać to tak:
q10 <- rq(earnings~age, data=data_1992, tau=0.1)
q25 <- rq(earnings~age, data=data_1992, tau=0.25)
q50 <- rq(earnings~age, data=data_1992, tau=0.50)
q75 <- rq(earnings~age, data=data_1992, tau=0.75)
q90 <- rq(earnings~age, data=data_1992, tau=0.90)
q95 <- rq(earnings~age, data=data_1992, tau=0.95)
stargazer(q10, q25, q50, q75, q90, q95, title = "Quantile Regression Results 1992", type="text")
##
## Quantile Regression Results 1992
## ==================================================================
## Dependent variable:
## -----------------------------------------------------
## earnings
## (1) (2) (3) (4) (5) (6)
## ------------------------------------------------------------------
## age 0.069*** 0.121*** 0.264*** 0.467*** 0.568*** 0.752***
## (0.020) (0.020) (0.024) (0.033) (0.058) (0.062)
##
## Constant 3.571*** 4.053*** 2.861*** 0.815 2.070 -0.570
## (0.608) (0.582) (0.673) (0.974) (1.636) (1.798)
##
## ------------------------------------------------------------------
## Observations 7,590 7,590 7,590 7,590 7,590 7,590
## ==================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
knitr::kable(fits_98, caption="Estimations for OLS and quantile regression for 1998 data") %>%
kable_styling(full_width=FALSE, 'striped') %>%
row_spec(0, background='#000000', color='#FFFFFF')
| OLS | τ = 0.10 | τ = 0.25 | τ = 0.50 | τ = 0.75 | τ = 0.90 | τ = 0.95 | |
|---|---|---|---|---|---|---|---|
| (Intercept) | 4.7459177 | 3.6538465 | 4.3269191 | 5.344723 | 2.9784996 | 0.9915855 | 4.2307709 |
| age | 0.3100626 | 0.0961538 | 0.1602565 | 0.247204 | 0.4791506 | 0.7391827 | 0.7692307 |
q10 <- rq(earnings~age, data=data_1998, tau=0.1)
q25 <- rq(earnings~age, data=data_1998, tau=0.25)
q50 <- rq(earnings~age, data=data_1998, tau=0.50)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
q75 <- rq(earnings~age, data=data_1998, tau=0.75)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
q90 <- rq(earnings~age, data=data_1998, tau=0.90)
q95 <- rq(earnings~age, data=data_1998, tau=0.95)
stargazer(q10, q25, q50, q75, q90, q95, title="Quantile Regression Results 1998", type="text")
##
## Quantile Regression Results 1998
## ==================================================================
## Dependent variable:
## -----------------------------------------------------
## earnings
## (1) (2) (3) (4) (5) (6)
## ------------------------------------------------------------------
## age 0.096*** 0.160*** 0.247*** 0.479*** 0.739*** 0.769***
## (0.030) (0.030) (0.033) (0.048) (0.084) (0.129)
##
## Constant 3.654*** 4.327*** 5.345*** 2.978** 0.992 4.231
## (0.888) (0.908) (0.971) (1.412) (2.454) (3.977)
##
## ------------------------------------------------------------------
## Observations 5,911 5,911 5,911 5,911 5,911 5,911
## ==================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
W przypadku obu zbiorów danych dla wszystkich przyjętych wartości współczynnika tau, zmienna wiek jest istotna. Wyraz wolny był istotny jedynie dla trzech wartości tau: 0.1, 0.25, 0.50. Ich wizualizacja znajduje się poniżej.
my_qr <- rq(earnings~age, data=data_1992, tau=c(0.10, 0.25, 0.50))
intercept_slope <- my_qr %>%
coef() %>%
t() %>%
data.frame() %>%
rename(intercept=X.Intercept., slope=age) %>%
mutate(quantile=row.names(.))
ggplot() +
geom_point(data=data_1992, aes(age, earnings), alpha=0.5) +
geom_abline(data=intercept_slope, aes(intercept=intercept, slope=slope, color=quantile)) +
theme_minimal() +
labs(x="Age", y="Earnings", title="Quantile Regression with selected tau = 0.10, 0.25 and 0.50 for 1992 data")
my_qr <- rq(earnings~age, data=data_1998, tau=c(0.10, 0.25, 0.50))
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
intercept_slope <- my_qr %>%
coef() %>%
t() %>%
data.frame() %>%
rename(intercept=X.Intercept., slope=age) %>%
mutate(quantile=row.names(.))
ggplot() +
geom_point(data=data_1998, aes(age, earnings), alpha=0.5) +
geom_abline(data=intercept_slope, aes(intercept=intercept, slope=slope, color=quantile)) +
theme_minimal() +
labs(x="Age", y="Earnings", title="Quantile Regression with selected tau: 0.10, 0.25 and 0.50 for 1998 data")
Zaczynam od oszacowania regresji najmniejszych kwadratów.
model_lm_92 <- lm(earnings~age+factor(degree)+factor(gender), data=data_1992)
summary(model_lm_92)
##
## Call:
## lm(formula = earnings ~ age + factor(degree) + factor(gender),
## data = data_1992)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.172 -3.324 -0.606 2.580 32.651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.62877 0.61537 1.022 0.307
## age 0.34221 0.02041 16.771 <2e-16 ***
## factor(degree)bachelor 4.38740 0.11758 37.314 <2e-16 ***
## factor(gender)female -2.00554 0.11581 -17.318 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.981 on 7586 degrees of freedom
## Multiple R-squared: 0.1956, Adjusted R-squared: 0.1953
## F-statistic: 614.9 on 3 and 7586 DF, p-value: < 2.2e-16
model_lm_98 <- lm(earnings~age+factor(degree)+factor(gender), data=data_1998)
summary(model_lm_98)
##
## Call:
## lm(formula = earnings ~ age + factor(degree) + factor(gender),
## data = data_1998)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.448 -4.142 -0.890 2.978 32.109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3003 0.8729 3.781 0.000158 ***
## age 0.3144 0.0290 10.843 < 2e-16 ***
## factor(degree)bachelor 5.3357 0.1643 32.473 < 2e-16 ***
## factor(gender)female -2.4932 0.1656 -15.055 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.245 on 5907 degrees of freedom
## Multiple R-squared: 0.1823, Adjusted R-squared: 0.1819
## F-statistic: 439 on 3 and 5907 DF, p-value: < 2.2e-16
Czas na oszacowanie warunkowych regresji kwantylowych z współczynnikami tau wybranymi najpierw jako (0.1, 0.25, 0.5). Należy zwrócić uwagę, że wszystkie zmienne okazały się statystycznie istotne. Wartość współczynnika determinacji (R^2) nie zaskakuje, gdyż taka możliwość została zaznaczona już na wstępie. Celem jest regresja kwantylowa, której model znajduje się poniżej.
taus_new <- c(0.1, 0.25, 0.5)
model_rq_92 <- rq(earnings~age+factor(degree)+factor(gender), tau=taus_new, data=data_1992)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
summary(model_rq_92, se='boot')
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new, data = data_1992)
##
## tau: [1] 0.1
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 2.14286 0.58713 3.64969 0.00026
## age 0.10989 0.02117 5.19042 0.00000
## factor(degree)bachelor 2.48077 0.15031 16.50484 0.00000
## factor(gender)female -0.67433 0.13122 -5.13890 0.00000
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new, data = data_1992)
##
## tau: [1] 0.25
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 1.92427 0.69438 2.77120 0.00560
## age 0.18222 0.02359 7.72437 0.00000
## factor(degree)bachelor 3.41880 0.13885 24.62290 0.00000
## factor(gender)female -1.14850 0.12885 -8.91378 0.00000
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new, data = data_1992)
##
## tau: [1] 0.5
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 0.73078 0.69935 1.04494 0.29608
## age 0.31731 0.02434 13.03491 0.00000
## factor(degree)bachelor 4.33269 0.13038 33.23021 0.00000
## factor(gender)female -1.91923 0.10931 -17.55814 0.00000
model_rq_98 <- rq(earnings~age+factor(degree)+factor(gender), tau=taus_new, data=data_1998)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
summary(model_rq_98, se='boot')
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new, data = data_1998)
##
## tau: [1] 0.1
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 3.97694 0.93591 4.24927 0.00002
## age 0.08099 0.03141 2.57885 0.00994
## factor(degree)bachelor 2.97773 0.19504 15.26691 0.00000
## factor(gender)female -0.98077 0.16451 -5.96187 0.00000
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new, data = data_1998)
##
## tau: [1] 0.25
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 3.12432 0.87431 3.57348 0.00036
## age 0.18444 0.03003 6.14186 0.00000
## factor(degree)bachelor 3.80303 0.19336 19.66788 0.00000
## factor(gender)female -1.64320 0.15368 -10.69257 0.00000
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new, data = data_1998)
##
## tau: [1] 0.5
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 2.82967 0.89914 3.14707 0.00166
## age 0.30220 0.03091 9.77730 0.00000
## factor(degree)bachelor 4.98626 0.18267 27.29630 0.00000
## factor(gender)female -2.40385 0.15025 -15.99933 0.00000
Warto zauważyć, że w przypadku danych z 1992 roku wraz ze wzrostem wartości współczynnika Tau, wyraz wolny modelu okazał się nieistotny statystycznie. Dla danych z 1998 działo się podobnie, jednak z inną dynamiką.
A jak to wygląda dla innych wartości współczynników? Teraz współczynniki tau zostały dobrane skokowo co 40 kwartyli (0.1, 0.5, 0.9).
taus_new_2 <- c(0.1, 0.5, 0.9)
model_rq_92 <- rq(earnings~age+factor(degree)+factor(gender), tau=taus_new_2, data=data_1992)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
summary(model_rq_92, se='boot')
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new_2, data = data_1992)
##
## tau: [1] 0.1
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 2.14286 0.56164 3.81536 0.00014
## age 0.10989 0.02021 5.43819 0.00000
## factor(degree)bachelor 2.48077 0.15516 15.98845 0.00000
## factor(gender)female -0.67433 0.11787 -5.72096 0.00000
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new_2, data = data_1992)
##
## tau: [1] 0.5
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 0.73078 0.68526 1.06642 0.28627
## age 0.31731 0.02385 13.30620 0.00000
## factor(degree)bachelor 4.33269 0.13241 32.72283 0.00000
## factor(gender)female -1.91923 0.13440 -14.27959 0.00000
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new_2, data = data_1992)
##
## tau: [1] 0.9
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 1.37363 1.36098 1.00929 0.31287
## age 0.52198 0.04562 11.44141 0.00000
## factor(degree)bachelor 6.05769 0.23699 25.56089 0.00000
## factor(gender)female -3.17308 0.24091 -13.17102 0.00000
model_rq_98 <- rq(earnings~age+factor(degree)+factor(gender), tau=taus_new_2, data=data_1998)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
summary(model_rq_98, se='boot')
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new_2, data = data_1998)
##
## tau: [1] 0.1
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 3.97694 0.87505 4.54481 0.00001
## age 0.08099 0.02986 2.71241 0.00670
## factor(degree)bachelor 2.97773 0.20322 14.65242 0.00000
## factor(gender)female -0.98077 0.14888 -6.58760 0.00000
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new_2, data = data_1998)
##
## tau: [1] 0.5
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 2.82967 0.82427 3.43293 0.00060
## age 0.30220 0.02769 10.91459 0.00000
## factor(degree)bachelor 4.98626 0.18420 27.07035 0.00000
## factor(gender)female -2.40385 0.15959 -15.06301 0.00000
##
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender),
## tau = taus_new_2, data = data_1998)
##
## tau: [1] 0.9
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 5.76923 2.35903 2.44559 0.01449
## age 0.48077 0.07993 6.01510 0.00000
## factor(degree)bachelor 8.76068 0.39299 22.29243 0.00000
## factor(gender)female -4.43376 0.39885 -11.11628 0.00000
Po wykonanym modelowaniu można przystąpić do statystycznej weryfikacji współczynników, a dokładniej mówiąc statystycznej różnicy pomiędzy wybranymi kwantylami warunkowymi. W tym celu wykorzystany zostanie test WALDA. Hipoteza zerowa w tym teście zakłada liniowe ograniczenia dla współczynników.
taus_test <- c(0.1, 0.25)
rq_check_coef <- rq(earnings~age+factor(gender)+factor(degree), tau=taus_test, data=data_1992)
anova(rq_check_coef, test="Wald", joint=TRUE)
## Quantile Regression Analysis of Deviance Table
##
## Model: earnings ~ age + factor(gender) + factor(degree)
## Joint Test of Equality of Slopes: tau in { 0.1 0.25 }
##
## Df Resid Df F value Pr(>F)
## 1 3 15177 28.503 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(rq_check_coef, test = "Wald", joint=FALSE)
## Quantile Regression Analysis of Deviance Table
##
## Model: earnings ~ age + factor(gender) + factor(degree)
## Tests of Equality of Distinct Slopes: tau in { 0.1 0.25 }
##
## Df Resid Df F value Pr(>F)
## age 1 15179 15.357 8.937e-05 ***
## factor(gender)female 1 15179 20.938 4.783e-06 ***
## factor(degree)bachelor 1 15179 52.511 4.485e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rq_check_coef <- rq(earnings~age+factor(gender)+factor(degree), tau=taus_test, data=data_1998)
anova(rq_check_coef, test="Wald", joint=TRUE)
## Quantile Regression Analysis of Deviance Table
##
## Model: earnings ~ age + factor(gender) + factor(degree)
## Joint Test of Equality of Slopes: tau in { 0.1 0.25 }
##
## Df Resid Df F value Pr(>F)
## 1 3 11819 20.488 3.091e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(rq_check_coef, test="Wald", joint=FALSE)
## Quantile Regression Analysis of Deviance Table
##
## Model: earnings ~ age + factor(gender) + factor(degree)
## Tests of Equality of Distinct Slopes: tau in { 0.1 0.25 }
##
## Df Resid Df F value Pr(>F)
## age 1 11821 17.630 2.703e-05 ***
## factor(gender)female 1 11821 22.025 2.721e-06 ***
## factor(degree)bachelor 1 11821 22.789 1.829e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
W przypadku kwantyli 0.1 i 0.25 istnieją statystycznie istotne różnice. Oznacza to, że współczynniki dla tych dwóch różnych kwantyli zmiennych są różne dla każdej zmiennej.Taka sytuacja ma miejsce zarówno dla obu zbiorów danych. Teraz wybrane zostaną losowe kwartyle odmienne dla obu zbiorów, tym razem trzy kwartyle.
taus_random <- sample(taus, size=3, replace=TRUE)
rq_check_coef <- rq(earnings~age+factor(gender)+factor(degree), tau=taus_random, data=data_1992)
anova(rq_check_coef, test="Wald", joint=TRUE)
## Quantile Regression Analysis of Deviance Table
##
## Model: earnings ~ age + factor(gender) + factor(degree)
## Joint Test of Equality of Slopes: tau in { 0.75 0.95 }
##
## Df Resid Df F value Pr(>F)
## 1 3 15177 16.434 1.168e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(rq_check_coef, test="Wald", joint=FALSE)
## Quantile Regression Analysis of Deviance Table
##
## Model: earnings ~ age + factor(gender) + factor(degree)
## Tests of Equality of Distinct Slopes: tau in { 0.75 0.95 }
##
## Df Resid Df F value Pr(>F)
## age 1 15179 11.185 0.0008267 ***
## factor(gender)female 1 15179 25.038 5.685e-07 ***
## factor(degree)bachelor 1 15179 15.937 6.579e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rq_check_coef <- rq(earnings~age+factor(gender)+factor(degree), tau=taus_random, data=data_1998)
anova(rq_check_coef, test="Wald", joint=TRUE)
## Quantile Regression Analysis of Deviance Table
##
## Model: earnings ~ age + factor(gender) + factor(degree)
## Joint Test of Equality of Slopes: tau in { 0.75 0.95 }
##
## Df Resid Df F value Pr(>F)
## 1 3 11819 22.59 1.41e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(rq_check_coef, test="Wald", joint=FALSE)
## Quantile Regression Analysis of Deviance Table
##
## Model: earnings ~ age + factor(gender) + factor(degree)
## Tests of Equality of Distinct Slopes: tau in { 0.75 0.95 }
##
## Df Resid Df F value Pr(>F)
## age 1 11821 0.0321 0.857772
## factor(gender)female 1 11821 8.5762 0.003412 **
## factor(degree)bachelor 1 11821 61.8001 4.108e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Współczynniki w dalszym ciągu są statystycznie różne.
Biorąc pod uwagę, wyniki modeli dla lat 1992 i 1998 zbadane zostanie dopasowanie dla modeli z współczynnikami tau (0.1, 0.75, 0.95)
model_rq_92_01 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.1, data=data_1992)
resid_92_01 <- resid(model_rq_92_01)
model_empty_01 <- rq(earnings~1, tau=0.1, data=data_1992)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
resid_empty_01 <- resid(model_empty_01)
goodfit(resid_92_01, resid_empty_01, 0.1)
## [1] 0.0507487
model_rq_92_075 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.75, data=data_1992)
resid_92_075 <- resid(model_rq_92_075)
model_empty_075 <- rq(earnings~1, tau=0.75, data=data_1992)
resid_empty_075 <- resid(model_empty_075)
goodfit(resid_92_075, resid_empty_075, 0.75)
## [1] 0.1401108
model_rq_92_095 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.95, data=data_1992)
resid_92_095 <- resid(model_rq_92_095)
model_empty_095 <- rq(earnings~1, tau=0.95, data=data_1992)
resid_empty_095 <- resid(model_empty_095)
goodfit(resid_92_095, resid_empty_095, 0.95)
## [1] 0.148776
Wywołajmy jeszcze raz model regresji liniowej.
summary(model_lm_92)$r.squared
## [1] 0.1956149
W przypadku danych dla 1992 roku najlepiej radził sobie model najmniejszych kwadratów, z wynikim dopasowania niespełna 20%.
model_rq_98_01 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.1, data=data_1998)
resid_98_01 <- resid(model_rq_98_01)
model_empty_01_98 <- rq(earnings~1, tau=0.1, data=data_1998)
resid_empty_01_98 <- resid(model_empty_01_98)
goodfit(resid_98_01, resid_empty_01_98, 0.1)
## [1] 0.0553968
model_rq_98_075 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.75, data=data_1998)
resid_98_075 <- resid(model_rq_98_075)
model_empty_075_98 <- rq(earnings~1, tau=0.75, data=data_1998)
resid_empty_075_98 <- resid(model_empty_075)
goodfit(resid_98_075, resid_empty_075, 0.75)
## [1] 0.147787
model_rq_98_095 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.95, data=data_1998)
resid_98_095 <- resid(model_rq_98_095)
model_empty_095_98 <- rq(earnings~1, tau=0.95, data=data_1998)
resid_empty_095_98 <- resid(model_empty_095_98)
goodfit(resid_98_095, resid_empty_095_98, 0.95)
## [1] 0.1503992
Wywołajmy jeszcze raz model regresji liniowej.
summary(model_lm_98)$r.squared
## [1] 0.1823063
Również w przypadku danych z 1998 roku najlepsza okazała się regresja liniowa.