## Warning: pakiet 'CVXR' został zbudowany w wersji R 4.3.3

## Warning: pakiet 'lmtest' został zbudowany w wersji R 4.3.3

## Warning: pakiet 'WRTDStidal' został zbudowany w wersji R 4.3.3

## Warning: pakiet 'ggplot2' został zbudowany w wersji R 4.3.3

## Warning in .recacheSubclasses(def@className, def, env): niezdefiniowana
## podklasa "ndiMatrix" klasy "replValueSp"; definicja nie została zaktualizowana

## Warning: pakiet 'tidyverse' został zbudowany w wersji R 4.3.3

## Warning: pakiet 'lubridate' został zbudowany w wersji R 4.3.3

## Warning: pakiet 'kableExtra' został zbudowany w wersji R 4.3.3

## Warning: pakiet 'PogromcyDanych' został zbudowany w wersji R 4.3.3

## Warning: pakiet 'SmarterPoland' został zbudowany w wersji R 4.3.3

Wczytywanie danych

data("CPSSW9298")
#?CPSSW9298 

# Podział na dwa zbiory danych względem lat (1992 oraz 1998)

data_1992 <- CPSSW9298 %>% filter(CPSSW9298$year==1992)
data_1998 <- CPSSW9298 %>% filter(CPSSW9298$year==1998)

head(data_1992)

##   year  earnings     degree gender age
## 1 1992 11.188811   bachelor   male  29
## 2 1992 10.000000   bachelor   male  33
## 3 1992  5.769231 highschool   male  30
## 4 1992 14.957265   bachelor   male  31
## 5 1992  8.660096   bachelor female  26
## 6 1992  7.788462 highschool female  31

summary(data_1992)

##    year         earnings             degree        gender          age       
##  1992:7590   Min.   : 1.840   highschool:4628   male  :4354   Min.   :25.00  
##  1998:   0   1st Qu.: 7.692   bachelor  :2962   female:3236   1st Qu.:27.00  
##              Median :10.577                                   Median :30.00  
##              Mean   :11.654                                   Mean   :29.71  
##              3rd Qu.:14.423                                   3rd Qu.:32.00  
##              Max.   :44.231                                   Max.   :34.00

head(data_1998)

##      year  earnings     degree gender age
## 7591 1998 10.341881   bachelor   male  28
## 7592 1998  9.615385 highschool female  33
## 7593 1998  9.575923   bachelor   male  29
## 7594 1998  4.516994   bachelor female  29
## 7595 1998 12.500000 highschool   male  31
## 7596 1998  8.173077 highschool   male  26

summary(data_1998)

##    year         earnings             degree        gender          age       
##  1992:   0   Min.   : 2.098   highschool:3308   male  :3461   Min.   :25.00  
##  1998:5911   1st Qu.: 9.135   bachelor  :2603   female:2450   1st Qu.:27.00  
##              Median :12.577                                   Median :30.00  
##              Mean   :13.958                                   Mean   :29.71  
##              3rd Qu.:17.308                                   3rd Qu.:32.00  
##              Max.   :49.451                                   Max.   :34.00

Podział danych ze względu na rok, gdzie w oryginalnym zbiorze istnieją tylko dwie wartości (1992 i 1998 rok), pozwala oprzeć na tym wyodrębnieniu pytania badawcze. Dane pochodzące z Current Population Surveys (CPS), zestawione przez Stock, Watson (2007) w ogólnej formie pozwalają na modelowanie zależności pomiędzy zarobkami a edukacją czy płcią i wiekiem. Wartym uwagi jest fakt, że tylko te zmienne znajdują się w tym konkretnym zbiorze danych i w całości nie wyjaśniają zjawiska jakim są zarobki. Jest to poza zakresem tego zadania, które koncentruje się na pokazaniu działania regresji kwantylowej.

Przedstawiona powyżej statystyka opisowa dla każdej ze zmiennych w dwóch podzbiorach, pokazuje że pomiędzy rokiem 1992 a 1998 zarobki przesuneły się w górę. Założyć możemy, że był to pośredni efekt inflacji. W tej pracy rozważone zostało jednak, czy rzeczywiście jest to wyłącznie różnica wzrostu cen.

taus <- c(0.1, 0.25, 0.5, 0.75, 0.90, 0.95)

p_92 <- ggplot(data=data_1992) +
     geom_point(mapping = aes(x=age, y=earnings), color="red") + 
     ggtitle('Earnings vs age') +
     labs(x="Age", y="Earnings", title=("Quantile Regression with selected tau values for 1992 data")) +
     theme_minimal()

fits_92 <- data.frame(
    coef(lm(earnings~age, data=data_1992)),
    sapply(taus, function(x) coef(rq(formula=earnings~age, data=data_1992, tau=x)))
)

names(fits_92) <- c("OLS", sprintf("τ = %0.2f", taus))
nf_92 <- ncol(fits_92)

colors_92 <- c('black', colorRampPalette(colors=c("brown", "yellow"))(nf_92-1))

lines_df <- data.frame(
  intercept = as.numeric(fits_92[1, ]),
  slope = as.numeric(fits_92[2, ]),
  method = names(fits_92)
)


p_92 <- p_92 + 
  geom_abline(data=lines_df, aes(intercept=intercept, slope=slope, color=method), size=1.5) +
  scale_color_manual(values=colors_92) +
  guides(color=guide_legend(title="Legend"))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

p_92

p_98 <- ggplot(data=data_1998) +
     geom_point(mapping = aes(x=age, y=earnings), color="red") + 
     ggtitle('Earnings vs age') +
     labs(x="Age", y="Earnings", title=("Quantile Regression with selected tau values for 1998 data")) +
     theme_minimal()

fits_98 <- data.frame(
    coef(lm(earnings~age, data=data_1998)),
    sapply(taus, function(x) coef(rq(formula=earnings~age, data=data_1998, tau=x)))
)

## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

names(fits_98) <- c("OLS", sprintf("τ = %0.2f", taus))
nf_98 <- ncol(fits_98)

colors_98 <- c('black', colorRampPalette(colors=c("brown", "yellow"))(nf_98-1))

lines_df <- data.frame(
  intercept = as.numeric(fits_98[1, ]),
  slope = as.numeric(fits_98[2, ]),
  method = names(fits_98)
)


p_98 <- p_98 + 
  geom_abline(data=lines_df, aes(intercept=intercept, slope=slope, color=method), size=1.5) +
  scale_color_manual(values=colors_98) +
  guides(color=guide_legend(title="Legend"))

p_98

Powyższe wykresy przedstawiają dopasowanie regresji kwantylowych dla współczynnika tau równego (0.1, 0.25, 0.5, 0.75, 0.9, 0.95) dla dwóch podzbiorów danych 1992 i 1998. Przechodzę do przedstawienia tabeli z oszacowanymi współczynnikami.

knitr::kable(fits_92, caption="Estimations for OLS and quantile regression for 1992 data") %>%
    kable_styling(full_width=FALSE, 'striped') %>% 
    row_spec(0, background='#000000', color='#FFFFFF')

Estimations for OLS and quantile regression for 1992 data
	OLS	τ = 0.10	τ = 0.25	τ = 0.50	τ = 0.75	τ = 0.90	τ = 0.95
(Intercept)	2.6211438	3.5714290	4.0528865	2.8605754	0.8146346	2.0699801	-0.5698020
age	0.3039981	0.0686813	0.1213942	0.2644231	0.4674146	0.5675747	0.7523149

Ekwiwaletnie można zapisać to tak:

q10 <- rq(earnings~age, data=data_1992, tau=0.1)
q25 <- rq(earnings~age, data=data_1992, tau=0.25)
q50 <- rq(earnings~age, data=data_1992, tau=0.50)
q75 <- rq(earnings~age, data=data_1992, tau=0.75)
q90 <- rq(earnings~age, data=data_1992, tau=0.90)
q95 <- rq(earnings~age, data=data_1992, tau=0.95)

stargazer(q10, q25, q50, q75, q90, q95, title = "Quantile Regression Results 1992", type="text")

## 
## Quantile Regression Results 1992
## ==================================================================
##                               Dependent variable:                 
##              -----------------------------------------------------
##                                    earnings                       
##                (1)      (2)      (3)      (4)      (5)      (6)   
## ------------------------------------------------------------------
## age          0.069*** 0.121*** 0.264*** 0.467*** 0.568*** 0.752***
##              (0.020)  (0.020)  (0.024)  (0.033)  (0.058)  (0.062) 
##                                                                   
## Constant     3.571*** 4.053*** 2.861***  0.815    2.070    -0.570 
##              (0.608)  (0.582)  (0.673)  (0.974)  (1.636)  (1.798) 
##                                                                   
## ------------------------------------------------------------------
## Observations  7,590    7,590    7,590    7,590    7,590    7,590  
## ==================================================================
## Note:                                  *p<0.1; **p<0.05; ***p<0.01

knitr::kable(fits_98, caption="Estimations for OLS and quantile regression for 1998 data") %>%
    kable_styling(full_width=FALSE, 'striped') %>% 
    row_spec(0, background='#000000', color='#FFFFFF')

Estimations for OLS and quantile regression for 1998 data
	OLS	τ = 0.10	τ = 0.25	τ = 0.50	τ = 0.75	τ = 0.90	τ = 0.95
(Intercept)	4.7459177	3.6538465	4.3269191	5.344723	2.9784996	0.9915855	4.2307709
age	0.3100626	0.0961538	0.1602565	0.247204	0.4791506	0.7391827	0.7692307

q10 <- rq(earnings~age, data=data_1998, tau=0.1)
q25 <- rq(earnings~age, data=data_1998, tau=0.25)
q50 <- rq(earnings~age, data=data_1998, tau=0.50)

## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

q75 <- rq(earnings~age, data=data_1998, tau=0.75)

## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

q90 <- rq(earnings~age, data=data_1998, tau=0.90)
q95 <- rq(earnings~age, data=data_1998, tau=0.95)

stargazer(q10, q25, q50, q75, q90, q95, title="Quantile Regression Results 1998", type="text")

## 
## Quantile Regression Results 1998
## ==================================================================
##                               Dependent variable:                 
##              -----------------------------------------------------
##                                    earnings                       
##                (1)      (2)      (3)      (4)      (5)      (6)   
## ------------------------------------------------------------------
## age          0.096*** 0.160*** 0.247*** 0.479*** 0.739*** 0.769***
##              (0.030)  (0.030)  (0.033)  (0.048)  (0.084)  (0.129) 
##                                                                   
## Constant     3.654*** 4.327*** 5.345*** 2.978**   0.992    4.231  
##              (0.888)  (0.908)  (0.971)  (1.412)  (2.454)  (3.977) 
##                                                                   
## ------------------------------------------------------------------
## Observations  5,911    5,911    5,911    5,911    5,911    5,911  
## ==================================================================
## Note:                                  *p<0.1; **p<0.05; ***p<0.01

W przypadku obu zbiorów danych dla wszystkich przyjętych wartości współczynnika tau, zmienna wiek jest istotna. Wyraz wolny był istotny jedynie dla trzech wartości tau: 0.1, 0.25, 0.50. Ich wizualizacja znajduje się poniżej.

my_qr <- rq(earnings~age, data=data_1992, tau=c(0.10, 0.25, 0.50))

intercept_slope <- my_qr %>% 
  coef() %>% 
  t() %>% 
  data.frame() %>%
  rename(intercept=X.Intercept., slope=age) %>% 
  mutate(quantile=row.names(.))

ggplot() + 
  geom_point(data=data_1992, aes(age, earnings), alpha=0.5) + 
  geom_abline(data=intercept_slope, aes(intercept=intercept, slope=slope, color=quantile)) + 
  theme_minimal() + 
  labs(x="Age", y="Earnings", title="Quantile Regression with selected tau = 0.10, 0.25 and 0.50 for 1992 data")

my_qr <- rq(earnings~age, data=data_1998, tau=c(0.10, 0.25, 0.50))

## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

intercept_slope <- my_qr %>% 
  coef() %>% 
  t() %>% 
  data.frame() %>% 
  rename(intercept=X.Intercept., slope=age) %>% 
  mutate(quantile=row.names(.))

ggplot() + 
  geom_point(data=data_1998, aes(age, earnings), alpha=0.5) + 
  geom_abline(data=intercept_slope, aes(intercept=intercept, slope=slope, color=quantile)) + 
  theme_minimal() +
  labs(x="Age", y="Earnings", title="Quantile Regression with selected tau: 0.10, 0.25 and 0.50 for 1998 data")

Modelowanie zależności regresyjnej

Zaczynam od oszacowania regresji najmniejszych kwadratów.

model_lm_92 <- lm(earnings~age+factor(degree)+factor(gender), data=data_1992)
summary(model_lm_92)

## 
## Call:
## lm(formula = earnings ~ age + factor(degree) + factor(gender), 
##     data = data_1992)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.172  -3.324  -0.606   2.580  32.651 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.62877    0.61537   1.022    0.307    
## age                     0.34221    0.02041  16.771   <2e-16 ***
## factor(degree)bachelor  4.38740    0.11758  37.314   <2e-16 ***
## factor(gender)female   -2.00554    0.11581 -17.318   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.981 on 7586 degrees of freedom
## Multiple R-squared:  0.1956, Adjusted R-squared:  0.1953 
## F-statistic: 614.9 on 3 and 7586 DF,  p-value: < 2.2e-16

model_lm_98 <- lm(earnings~age+factor(degree)+factor(gender), data=data_1998)
summary(model_lm_98)

## 
## Call:
## lm(formula = earnings ~ age + factor(degree) + factor(gender), 
##     data = data_1998)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.448  -4.142  -0.890   2.978  32.109 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.3003     0.8729   3.781 0.000158 ***
## age                      0.3144     0.0290  10.843  < 2e-16 ***
## factor(degree)bachelor   5.3357     0.1643  32.473  < 2e-16 ***
## factor(gender)female    -2.4932     0.1656 -15.055  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.245 on 5907 degrees of freedom
## Multiple R-squared:  0.1823, Adjusted R-squared:  0.1819 
## F-statistic:   439 on 3 and 5907 DF,  p-value: < 2.2e-16

Czas na oszacowanie warunkowych regresji kwantylowych z współczynnikami tau wybranymi najpierw jako (0.1, 0.25, 0.5). Należy zwrócić uwagę, że wszystkie zmienne okazały się statystycznie istotne. Wartość współczynnika determinacji (R^2) nie zaskakuje, gdyż taka możliwość została zaznaczona już na wstępie. Celem jest regresja kwantylowa, której model znajduje się poniżej.

taus_new <- c(0.1, 0.25, 0.5)

model_rq_92 <- rq(earnings~age+factor(degree)+factor(gender), tau=taus_new, data=data_1992)

## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

summary(model_rq_92, se='boot')

## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new, data = data_1992)
## 
## tau: [1] 0.1
## 
## Coefficients:
##                        Value    Std. Error t value  Pr(>|t|)
## (Intercept)             2.14286  0.58713    3.64969  0.00026
## age                     0.10989  0.02117    5.19042  0.00000
## factor(degree)bachelor  2.48077  0.15031   16.50484  0.00000
## factor(gender)female   -0.67433  0.13122   -5.13890  0.00000
## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new, data = data_1992)
## 
## tau: [1] 0.25
## 
## Coefficients:
##                        Value    Std. Error t value  Pr(>|t|)
## (Intercept)             1.92427  0.69438    2.77120  0.00560
## age                     0.18222  0.02359    7.72437  0.00000
## factor(degree)bachelor  3.41880  0.13885   24.62290  0.00000
## factor(gender)female   -1.14850  0.12885   -8.91378  0.00000
## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new, data = data_1992)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                        Value     Std. Error t value   Pr(>|t|) 
## (Intercept)              0.73078   0.69935    1.04494   0.29608
## age                      0.31731   0.02434   13.03491   0.00000
## factor(degree)bachelor   4.33269   0.13038   33.23021   0.00000
## factor(gender)female    -1.91923   0.10931  -17.55814   0.00000

model_rq_98 <- rq(earnings~age+factor(degree)+factor(gender), tau=taus_new, data=data_1998)

## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

summary(model_rq_98, se='boot')

## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new, data = data_1998)
## 
## tau: [1] 0.1
## 
## Coefficients:
##                        Value    Std. Error t value  Pr(>|t|)
## (Intercept)             3.97694  0.93591    4.24927  0.00002
## age                     0.08099  0.03141    2.57885  0.00994
## factor(degree)bachelor  2.97773  0.19504   15.26691  0.00000
## factor(gender)female   -0.98077  0.16451   -5.96187  0.00000
## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new, data = data_1998)
## 
## tau: [1] 0.25
## 
## Coefficients:
##                        Value     Std. Error t value   Pr(>|t|) 
## (Intercept)              3.12432   0.87431    3.57348   0.00036
## age                      0.18444   0.03003    6.14186   0.00000
## factor(degree)bachelor   3.80303   0.19336   19.66788   0.00000
## factor(gender)female    -1.64320   0.15368  -10.69257   0.00000
## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new, data = data_1998)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                        Value     Std. Error t value   Pr(>|t|) 
## (Intercept)              2.82967   0.89914    3.14707   0.00166
## age                      0.30220   0.03091    9.77730   0.00000
## factor(degree)bachelor   4.98626   0.18267   27.29630   0.00000
## factor(gender)female    -2.40385   0.15025  -15.99933   0.00000

Warto zauważyć, że w przypadku danych z 1992 roku wraz ze wzrostem wartości współczynnika Tau, wyraz wolny modelu okazał się nieistotny statystycznie. Dla danych z 1998 działo się podobnie, jednak z inną dynamiką.

A jak to wygląda dla innych wartości współczynników? Teraz współczynniki tau zostały dobrane skokowo co 40 kwartyli (0.1, 0.5, 0.9).

taus_new_2 <- c(0.1, 0.5, 0.9)

model_rq_92 <- rq(earnings~age+factor(degree)+factor(gender), tau=taus_new_2, data=data_1992)

## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

summary(model_rq_92, se='boot')

## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new_2, data = data_1992)
## 
## tau: [1] 0.1
## 
## Coefficients:
##                        Value    Std. Error t value  Pr(>|t|)
## (Intercept)             2.14286  0.56164    3.81536  0.00014
## age                     0.10989  0.02021    5.43819  0.00000
## factor(degree)bachelor  2.48077  0.15516   15.98845  0.00000
## factor(gender)female   -0.67433  0.11787   -5.72096  0.00000
## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new_2, data = data_1992)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                        Value     Std. Error t value   Pr(>|t|) 
## (Intercept)              0.73078   0.68526    1.06642   0.28627
## age                      0.31731   0.02385   13.30620   0.00000
## factor(degree)bachelor   4.33269   0.13241   32.72283   0.00000
## factor(gender)female    -1.91923   0.13440  -14.27959   0.00000
## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new_2, data = data_1992)
## 
## tau: [1] 0.9
## 
## Coefficients:
##                        Value     Std. Error t value   Pr(>|t|) 
## (Intercept)              1.37363   1.36098    1.00929   0.31287
## age                      0.52198   0.04562   11.44141   0.00000
## factor(degree)bachelor   6.05769   0.23699   25.56089   0.00000
## factor(gender)female    -3.17308   0.24091  -13.17102   0.00000

model_rq_98 <- rq(earnings~age+factor(degree)+factor(gender), tau=taus_new_2, data=data_1998)

## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

summary(model_rq_98, se='boot')

## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new_2, data = data_1998)
## 
## tau: [1] 0.1
## 
## Coefficients:
##                        Value    Std. Error t value  Pr(>|t|)
## (Intercept)             3.97694  0.87505    4.54481  0.00001
## age                     0.08099  0.02986    2.71241  0.00670
## factor(degree)bachelor  2.97773  0.20322   14.65242  0.00000
## factor(gender)female   -0.98077  0.14888   -6.58760  0.00000
## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new_2, data = data_1998)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                        Value     Std. Error t value   Pr(>|t|) 
## (Intercept)              2.82967   0.82427    3.43293   0.00060
## age                      0.30220   0.02769   10.91459   0.00000
## factor(degree)bachelor   4.98626   0.18420   27.07035   0.00000
## factor(gender)female    -2.40385   0.15959  -15.06301   0.00000
## 
## Call: rq(formula = earnings ~ age + factor(degree) + factor(gender), 
##     tau = taus_new_2, data = data_1998)
## 
## tau: [1] 0.9
## 
## Coefficients:
##                        Value     Std. Error t value   Pr(>|t|) 
## (Intercept)              5.76923   2.35903    2.44559   0.01449
## age                      0.48077   0.07993    6.01510   0.00000
## factor(degree)bachelor   8.76068   0.39299   22.29243   0.00000
## factor(gender)female    -4.43376   0.39885  -11.11628   0.00000

Testy współczynników

Po wykonanym modelowaniu można przystąpić do statystycznej weryfikacji współczynników, a dokładniej mówiąc statystycznej różnicy pomiędzy wybranymi kwantylami warunkowymi. W tym celu wykorzystany zostanie test WALDA. Hipoteza zerowa w tym teście zakłada liniowe ograniczenia dla współczynników.

taus_test <- c(0.1, 0.25)
rq_check_coef <- rq(earnings~age+factor(gender)+factor(degree), tau=taus_test, data=data_1992)
anova(rq_check_coef, test="Wald", joint=TRUE)

## Quantile Regression Analysis of Deviance Table
## 
## Model: earnings ~ age + factor(gender) + factor(degree)
## Joint Test of Equality of Slopes: tau in {  0.1 0.25  }
## 
##   Df Resid Df F value    Pr(>F)    
## 1  3    15177  28.503 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(rq_check_coef, test = "Wald", joint=FALSE)

## Quantile Regression Analysis of Deviance Table
## 
## Model: earnings ~ age + factor(gender) + factor(degree)
## Tests of Equality of Distinct Slopes: tau in {  0.1 0.25  }
## 
##                        Df Resid Df F value    Pr(>F)    
## age                     1    15179  15.357 8.937e-05 ***
## factor(gender)female    1    15179  20.938 4.783e-06 ***
## factor(degree)bachelor  1    15179  52.511 4.485e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

rq_check_coef <- rq(earnings~age+factor(gender)+factor(degree), tau=taus_test, data=data_1998)
anova(rq_check_coef, test="Wald", joint=TRUE)

## Quantile Regression Analysis of Deviance Table
## 
## Model: earnings ~ age + factor(gender) + factor(degree)
## Joint Test of Equality of Slopes: tau in {  0.1 0.25  }
## 
##   Df Resid Df F value    Pr(>F)    
## 1  3    11819  20.488 3.091e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(rq_check_coef, test="Wald", joint=FALSE)

## Quantile Regression Analysis of Deviance Table
## 
## Model: earnings ~ age + factor(gender) + factor(degree)
## Tests of Equality of Distinct Slopes: tau in {  0.1 0.25  }
## 
##                        Df Resid Df F value    Pr(>F)    
## age                     1    11821  17.630 2.703e-05 ***
## factor(gender)female    1    11821  22.025 2.721e-06 ***
## factor(degree)bachelor  1    11821  22.789 1.829e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

W przypadku kwantyli 0.1 i 0.25 istnieją statystycznie istotne różnice. Oznacza to, że współczynniki dla tych dwóch różnych kwantyli zmiennych są różne dla każdej zmiennej.Taka sytuacja ma miejsce zarówno dla obu zbiorów danych. Teraz wybrane zostaną losowe kwartyle odmienne dla obu zbiorów, tym razem trzy kwartyle.

taus_random <- sample(taus, size=3, replace=TRUE)
rq_check_coef <- rq(earnings~age+factor(gender)+factor(degree), tau=taus_random, data=data_1992)
anova(rq_check_coef, test="Wald", joint=TRUE)

## Quantile Regression Analysis of Deviance Table
## 
## Model: earnings ~ age + factor(gender) + factor(degree)
## Joint Test of Equality of Slopes: tau in {  0.75 0.95  }
## 
##   Df Resid Df F value    Pr(>F)    
## 1  3    15177  16.434 1.168e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(rq_check_coef, test="Wald", joint=FALSE)

## Quantile Regression Analysis of Deviance Table
## 
## Model: earnings ~ age + factor(gender) + factor(degree)
## Tests of Equality of Distinct Slopes: tau in {  0.75 0.95  }
## 
##                        Df Resid Df F value    Pr(>F)    
## age                     1    15179  11.185 0.0008267 ***
## factor(gender)female    1    15179  25.038 5.685e-07 ***
## factor(degree)bachelor  1    15179  15.937 6.579e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

rq_check_coef <- rq(earnings~age+factor(gender)+factor(degree), tau=taus_random, data=data_1998)
anova(rq_check_coef, test="Wald", joint=TRUE)

## Quantile Regression Analysis of Deviance Table
## 
## Model: earnings ~ age + factor(gender) + factor(degree)
## Joint Test of Equality of Slopes: tau in {  0.75 0.95  }
## 
##   Df Resid Df F value   Pr(>F)    
## 1  3    11819   22.59 1.41e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(rq_check_coef, test="Wald", joint=FALSE)

## Quantile Regression Analysis of Deviance Table
## 
## Model: earnings ~ age + factor(gender) + factor(degree)
## Tests of Equality of Distinct Slopes: tau in {  0.75 0.95  }
## 
##                        Df Resid Df F value    Pr(>F)    
## age                     1    11821  0.0321  0.857772    
## factor(gender)female    1    11821  8.5762  0.003412 ** 
## factor(degree)bachelor  1    11821 61.8001 4.108e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Współczynniki w dalszym ciągu są statystycznie różne.

Weryfikacja dopasowania regresji kwantylowej

Biorąc pod uwagę, wyniki modeli dla lat 1992 i 1998 zbadane zostanie dopasowanie dla modeli z współczynnikami tau (0.1, 0.75, 0.95)

model_rq_92_01 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.1, data=data_1992)
resid_92_01 <- resid(model_rq_92_01)

model_empty_01 <- rq(earnings~1, tau=0.1, data=data_1992)

## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

resid_empty_01 <- resid(model_empty_01)

goodfit(resid_92_01, resid_empty_01, 0.1)

## [1] 0.0507487

model_rq_92_075 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.75, data=data_1992)
resid_92_075 <- resid(model_rq_92_075)

model_empty_075 <- rq(earnings~1, tau=0.75, data=data_1992)
resid_empty_075 <- resid(model_empty_075)

goodfit(resid_92_075, resid_empty_075, 0.75)

## [1] 0.1401108

model_rq_92_095 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.95, data=data_1992)
resid_92_095 <- resid(model_rq_92_095)

model_empty_095 <- rq(earnings~1, tau=0.95, data=data_1992)
resid_empty_095 <- resid(model_empty_095)

goodfit(resid_92_095, resid_empty_095, 0.95)

## [1] 0.148776

Wywołajmy jeszcze raz model regresji liniowej.

summary(model_lm_92)$r.squared

## [1] 0.1956149

W przypadku danych dla 1992 roku najlepiej radził sobie model najmniejszych kwadratów, z wynikim dopasowania niespełna 20%.

model_rq_98_01 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.1, data=data_1998)
resid_98_01 <- resid(model_rq_98_01)

model_empty_01_98 <- rq(earnings~1, tau=0.1, data=data_1998)
resid_empty_01_98 <- resid(model_empty_01_98)

goodfit(resid_98_01, resid_empty_01_98, 0.1)

## [1] 0.0553968

model_rq_98_075 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.75, data=data_1998)
resid_98_075 <- resid(model_rq_98_075)

model_empty_075_98 <- rq(earnings~1, tau=0.75, data=data_1998)
resid_empty_075_98 <- resid(model_empty_075)

goodfit(resid_98_075, resid_empty_075, 0.75)

## [1] 0.147787

model_rq_98_095 <- rq(earnings~age+factor(gender)+factor(degree), tau=0.95, data=data_1998)
resid_98_095 <- resid(model_rq_98_095)

model_empty_095_98 <- rq(earnings~1, tau=0.95, data=data_1998)
resid_empty_095_98 <- resid(model_empty_095_98)

goodfit(resid_98_095, resid_empty_095_98, 0.95)

## [1] 0.1503992

Wywołajmy jeszcze raz model regresji liniowej.

summary(model_lm_98)$r.squared

## [1] 0.1823063

Również w przypadku danych z 1998 roku najlepsza okazała się regresja liniowa.

Nieklasyczne metody statystyki

Regresja kwantylowa - Raport 4

Piotr Szypuliński

2024-12-01

Wczytywanie danych

Modelowanie zależności regresyjnej

Testy współczynników

Weryfikacja dopasowania regresji kwantylowej