rm(list=ls())
library(lmtest) # podpora regresie
library(outliers) # analyza odlahlych hodnot (outliers)
library(gptstudio)
library(kableExtra)
library(knitr)
library(dplyr)
library(broom)
library(corrplot)
Súbor Life_Expectancy_Data obsahuje databázu determinantov očakávanej dĺžky života. Import údajov urobíme nasledovne
# import the dataset and create a data.frame udaje
#
udaje_svet <- read.csv("udaje/Life-Expectancy-Data-Updated.csv",header=TRUE,sep=",",dec=".",check.names = TRUE)
head(udaje_svet)
## Country Region Year Infant_deaths Under_five_deaths
## 1 Turkiye Middle East 2015 11.1 13.0
## 2 Spain European Union 2015 2.7 3.3
## 3 India Asia 2007 51.5 67.9
## 4 Guyana South America 2006 32.8 40.5
## 5 Israel Middle East 2012 3.4 4.3
## 6 Costa Rica Central America and Caribbean 2006 9.8 11.2
## Adult_mortality Alcohol_consumption Hepatitis_B Measles BMI Polio Diphtheria
## 1 105.8240 1.32 97 65 27.8 97 97
## 2 57.9025 10.35 97 94 26.0 97 97
## 3 201.0765 1.57 60 35 21.2 67 64
## 4 222.1965 5.68 93 74 25.3 92 93
## 5 57.9510 2.89 97 89 27.0 94 94
## 6 95.2200 4.19 88 86 26.4 89 89
## Incidents_HIV GDP_per_capita Population_mln Thinness_ten_nineteen_years
## 1 0.08 11006 78.53 4.9
## 2 0.09 25742 46.44 0.6
## 3 0.13 1076 1183.21 27.1
## 4 0.79 4146 0.75 5.7
## 5 0.08 33995 7.91 1.2
## 6 0.16 9110 4.35 2.0
## Thinness_five_nine_years Schooling Economy_status_Developed
## 1 4.8 7.8 0
## 2 0.5 9.7 1
## 3 28.0 5.0 0
## 4 5.5 7.9 0
## 5 1.1 12.8 1
## 6 1.9 7.9 0
## Economy_status_Developing Life_expectancy
## 1 1 76.5
## 2 0 82.8
## 3 1 65.4
## 4 1 67.0
## 5 0 81.7
## 6 1 78.2
#
Cieľom tohto Notebooku je predstaviť kvantilovú regresiu.
Táto metóda je užitočná, keď nechceme opísať iba priemerný vplyv vysvetľujúcej premennej na závislú premennú, ale aj to, ako sa tento vplyv líši pre pozorovania nachádzajúce sa v rôznych častiach podmieneného rozdelenia závislej premennej.
Kvantilná regresia je užitočným rozšírením klasickej regresnej analýzy. Zatiaľ čo OLS sa zameriava na podmienený priemer, kvantilová regresia nám umožňuje študovať celé podmienené rozdelenie závislej premennej.
Je obzvlášť cenná, keď:
attach(udaje_svet)
udaje_2010 <- udaje_svet[udaje_svet$Year == 2010,]
m_ols <- lm(Life_expectancy ~ BMI+Incidents_HIV, data = udaje_2010)
m_q10 <- rq(Life_expectancy ~ BMI+Incidents_HIV, tau = 0.10, data = udaje_2010)
m_q50 <- rq(Life_expectancy ~ BMI+Incidents_HIV, tau = 0.50, data = udaje_2010)
m_q90 <- rq(Life_expectancy ~ BMI+Incidents_HIV, tau = 0.90, data = udaje_2010)
summary(m_ols)
##
## Call:
## lm(formula = Life_expectancy ~ BMI + Incidents_HIV, data = udaje_2010)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.4255 -3.6038 0.4032 3.7182 16.8764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.3907 5.3664 3.800 0.000199 ***
## BMI 2.0214 0.2109 9.584 < 2e-16 ***
## Incidents_HIV -2.0767 0.2243 -9.256 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.028 on 176 degrees of freedom
## Multiple R-squared: 0.5392, Adjusted R-squared: 0.534
## F-statistic: 103 on 2 and 176 DF, p-value: < 2.2e-16
summary(m_q10)
##
## Call: rq(formula = Life_expectancy ~ BMI + Incidents_HIV, tau = 0.1,
## data = udaje_2010)
##
## tau: [1] 0.1
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 14.68779 -8.00551 26.31112
## BMI 1.93425 1.50474 2.86904
## Incidents_HIV -1.89857 -218.07604 -1.34142
summary(m_q50)
##
## Call: rq(formula = Life_expectancy ~ BMI + Incidents_HIV, tau = 0.5,
## data = udaje_2010)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 18.68258 5.57831 28.68301
## BMI 2.10055 1.74020 2.65154
## Incidents_HIV -2.04712 -2.85600 -1.82943
summary(m_q90)
##
## Call: rq(formula = Life_expectancy ~ BMI + Incidents_HIV, tau = 0.9,
## data = udaje_2010)
##
## tau: [1] 0.9
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 44.12046 19.51152 87.03861
## BMI 1.39110 -0.14409 2.33236
## Incidents_HIV -2.01919 -2.21696 282.56776
m_q10 <- rq(Life_expectancy ~ BMI+Incidents_HIV, tau = 0.10, data = udaje_2010)
m_q50 <- rq(Life_expectancy ~ BMI+Incidents_HIV, tau = 0.50, data = udaje_2010)
m_q90 <- rq(Life_expectancy ~ BMI+Incidents_HIV, tau = 0.90, data = udaje_2010)
coef_table <- rbind(
OLS = coef(m_ols),
Q10 = coef(m_q10),
Q50 = coef(m_q50),
Q90 = coef(m_q90)
)
coef_table
## (Intercept) BMI Incidents_HIV
## OLS 20.39069 2.021406 -2.076693
## Q10 14.68779 1.934251 -1.898573
## Q50 18.68258 2.100548 -2.047116
## Q90 44.12046 1.391099 -2.019191
ggplot(udaje_2010, aes(x=BMI,y=Life_expectancy)) +
geom_point(alpha = 0.5) +
geom_abline(
intercept = coef(m_ols)[1],
slope = coef(m_ols)[2],
linewidth = 1
) +
geom_abline(
intercept = coef(m_q10)[1],
slope = coef(m_q10)[2],
linetype = "dashed",
color = "pink",
linewidth = 1
) +
geom_abline(
intercept = coef(m_q50)[1],
slope = coef(m_q50)[2],
linetype = "dotdash",
color = "cyan",
linewidth = 1
) +
geom_abline(
intercept = coef(m_q90)[1],
slope = coef(m_q90)[2],
linetype = "longdash",
color = "orange",
linewidth = 1
) +
labs(
title = "Vyrovnané priamky OLS a kvanilových regresií",
subtitle = "čierna = OLS, ružová, azúrová, oranžová - Q10,Q50, Q90",
x = "x",
y = "y"
) +
theme_minimal()