Chương 6: Hồi quy OLS

library(dplyr) # tiền xử lý

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2) # vẽ hình
library(car) # phân tích phần dư

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

DATA_RAW <- read.csv('data/application_ols.csv')

1 . Phân tích tương quan

1.1. Hiệp phương sai

cov(DATA_RAW$AMT_CREDIT, DATA_RAW$AMT_INCOME_TOTAL)

## [1] 15812947446

cov(select(DATA_RAW, -FLAG_OWN_CAR), use = "complete.obs")

##                    AMT_CREDIT AMT_INCOME_TOTAL AMT_GOODS_PRICE
## AMT_CREDIT       163700128801      15809719843    148029477098
## AMT_INCOME_TOTAL  15809719843      35221070102     14892206290
## AMT_GOODS_PRICE  148029477098      14892206290    137571966488

cov(select(DATA_RAW, -FLAG_OWN_CAR), use = "pairwise.complete.obs")

##                    AMT_CREDIT AMT_INCOME_TOTAL AMT_GOODS_PRICE
## AMT_CREDIT       163652881494      15812947446    148029477098
## AMT_INCOME_TOTAL  15812947446      35193056111     14892206290
## AMT_GOODS_PRICE  148029477098      14892206290    137571966488

1.2. Hệ số tương quan (-1, 1)

cor(DATA_RAW$AMT_INCOME_TOTAL, DATA_RAW$AMT_CREDIT)

## [1] 0.2083639

cor(select(DATA_RAW, -FLAG_OWN_CAR), use="complete.obs")

##                  AMT_CREDIT AMT_INCOME_TOTAL AMT_GOODS_PRICE
## AMT_CREDIT        1.0000000        0.2082085       0.9864129
## AMT_INCOME_TOTAL  0.2082085        1.0000000       0.2139404
## AMT_GOODS_PRICE   0.9864129        0.2139404       1.0000000

cor(
  select(DATA_RAW, -FLAG_OWN_CAR), 
  use="pairwise.complete.obs"
)

##                  AMT_CREDIT AMT_INCOME_TOTAL AMT_GOODS_PRICE
## AMT_CREDIT        1.0000000        0.2083639       0.9864129
## AMT_INCOME_TOTAL  0.2083639        1.0000000       0.2139404
## AMT_GOODS_PRICE   0.9864129        0.2139404       1.0000000

2. Hồi quy đơn

application_ols <- DATA_RAW %>% 
  rename(
    CI = AMT_INCOME_TOTAL,
    CD = AMT_CREDIT
  ) %>% 
  mutate(
    CI = CI / 1000,
    CD = CD / 1000
  )

m1 <- lm(CD ~ CI, data = application_ols)

summary(m1)

## 
## Call:
## lm(formula = CD ~ CI, data = application_ols)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7935.6  -308.9   -90.3   214.4  2676.0 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 522.84486    3.78043  138.30   <2e-16 ***
## CI            0.44932    0.01491   30.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 395.7 on 19998 degrees of freedom
## Multiple R-squared:  0.04342,    Adjusted R-squared:  0.04337 
## F-statistic: 907.6 on 1 and 19998 DF,  p-value: < 2.2e-16

# khoảng tin cậy 95%
confint(m1)

##                   2.5 %      97.5 %
## (Intercept) 515.4349018 530.2548226
## CI            0.4200869   0.4785533

# bảng anova
anova(m1)

predict(m1, data.frame(CI = c(100, 200)))

##        1        2 
## 567.7769 612.7089

data.frame(CI = c(100, 200))

ggplot(application_ols, aes(log(CI), log(CD))) +
  geom_point() + 
  geom_smooth(method = "lm", se = F)

## `geom_smooth()` using formula = 'y ~ x'

m2 <- lm(log(CD) ~ log(CI), data = application_ols)
summary(m2)

## 
## Call:
## lm(formula = log(CD) ~ log(CI), data = application_ols)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.72317 -0.45131  0.03133  0.49430  2.28875 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.044480   0.047034   64.73   <2e-16 ***
## log(CI)     0.622560   0.009354   66.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6517 on 19998 degrees of freedom
## Multiple R-squared:  0.1813, Adjusted R-squared:  0.1813 
## F-statistic:  4430 on 1 and 19998 DF,  p-value: < 2.2e-16

summary(lm(CD ~ CI + I(CI^2), application_ols))

## 
## Call:
## lm(formula = CD ~ CI + I(CI^2), data = application_ols)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3735.7  -274.1   -85.8   206.7  2444.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.438e+02  5.225e+00   65.81   <2e-16 ***
## CI           1.534e+00  2.702e-02   56.77   <2e-16 ***
## I(CI^2)     -9.072e-05  1.925e-06  -47.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 375.4 on 19997 degrees of freedom
## Multiple R-squared:  0.139,  Adjusted R-squared:  0.1389 
## F-statistic:  1614 on 2 and 19997 DF,  p-value: < 2.2e-16

ggplot(application_ols, aes(log(CI), log(CD))) +
  geom_point() + 
  geom_smooth(method = "lm", 
              formula = "y ~ x + I(x^2) + I(x^3)", 
              se = F)

summary(lm(CD ~ CI - 1, application_ols))

## 
## Call:
## lm(formula = CD ~ CI - 1, data = application_ols)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32382     26    206    497   2655 
## 
## Coefficients:
##    Estimate Std. Error t value Pr(>|t|)    
## CI  1.83652    0.01544     119   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 553.4 on 19999 degrees of freedom
## Multiple R-squared:  0.4144, Adjusted R-squared:  0.4143 
## F-statistic: 1.415e+04 on 1 and 19999 DF,  p-value: < 2.2e-16

summary(lm(CD ~ . - FLAG_OWN_CAR, application_ols))

## 
## Call:
## lm(formula = CD ~ . - FLAG_OWN_CAR, data = application_ols)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -786.41  -40.45   -7.29   37.29  465.48 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.060e+01  8.780e-01  23.458   <2e-16 ***
## CI              -6.383e-03  2.565e-03  -2.488   0.0128 *  
## AMT_GOODS_PRICE  1.077e-03  1.298e-06 829.530   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.46 on 19971 degrees of freedom
##   (26 observations deleted due to missingness)
## Multiple R-squared:  0.973,  Adjusted R-squared:  0.973 
## F-statistic: 3.601e+05 on 2 and 19971 DF,  p-value: < 2.2e-16

3. Hồi quy bội

application_ols <- application_ols %>% 
  rename(
    CAR = FLAG_OWN_CAR,
    GP = AMT_GOODS_PRICE
  ) %>% 
  mutate(GP = GP / 1000)

m3 <- lm(CD ~ CI + GP, application_ols)
summary(m3)

## 
## Call:
## lm(formula = CD ~ CI + GP, data = application_ols)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -786.41  -40.45   -7.29   37.29  465.48 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 20.596267   0.878014  23.458   <2e-16 ***
## CI          -0.006383   0.002565  -2.488   0.0128 *  
## GP           1.076706   0.001298 829.530   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.46 on 19971 degrees of freedom
##   (26 observations deleted due to missingness)
## Multiple R-squared:  0.973,  Adjusted R-squared:  0.973 
## F-statistic: 3.601e+05 on 2 and 19971 DF,  p-value: < 2.2e-16

summary(lm(CD ~ CAR, data = application_ols))

## 
## Call:
## lm(formula = CD ~ CAR, data = application_ols)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -623.40 -308.40  -92.33  224.27 2706.60 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  562.858      3.512   160.3   <2e-16 ***
## CARY         105.544      5.965    17.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 401.4 on 19998 degrees of freedom
## Multiple R-squared:  0.01542,    Adjusted R-squared:  0.01537 
## F-statistic: 313.1 on 1 and 19998 DF,  p-value: < 2.2e-16

summarise(group_by(application_ols, CAR), mean(CD))

# khác nhau giữa 2 nhóm khi được điều chỉnh với thu nhập
summary(lm(CD ~ CI + CAR, data=application_ols))

## 
## Call:
## lm(formula = CD ~ CI + CAR, data = application_ols)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7517.2  -300.3   -88.1   214.9  2626.4 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 497.91691    4.13923  120.29   <2e-16 ***
## CI            0.42273    0.01495   28.27   <2e-16 ***
## CARY         84.99738    5.89395   14.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 393.6 on 19997 degrees of freedom
## Multiple R-squared:  0.05326,    Adjusted R-squared:  0.05317 
## F-statistic: 562.5 on 2 and 19997 DF,  p-value: < 2.2e-16

4. Phân tích phần dư

m1 <- lm(log(CD) ~ log(CI), data=application_ols)

# View(m1)

hist(m1$residuals)

boxplot(m1$residuals, horizontal = T)

# phần dư chuẩn hoá
stdres <- rstandard(m1)
hist(stdres)

boxplot(stdres, horizontal = T)

qqnorm(stdres)
qqline(stdres)

# kiểm định phân phối chuẩn
nortest::lillie.test(stdres)

## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  stdres
## D = 0.032923, p-value < 2.2e-16

# phân phối không phải pp chuẩn
# H0: là phân phối chuẩn
# H1: không phải pp chuẩn
# p-value < alpha (5%) -> bác bỏ H0

# thử test trên 1 phân phốt chuẩn bất kì
nortest::lillie.test(rnorm(100000))

## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  rnorm(1e+05)
## D = 0.0021552, p-value = 0.3145

hist(application_ols$CD)

# kiểm tra mối quan hệ tuyến tính
crPlots(m1)

# kiểm định phương sai bất biến
# h0: phương sai sai số không đổi
# h1: phương sai sai số thay đổi
ncvTest(m1)

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 49.00403, Df = 1, p = 2.5544e-12

# vẽ biểu đồ kiểm tra giả định phương sai bất biến
spreadLevelPlot(m1)

## 
## Suggested power transformation:  2.248717

influencePlot(
  m1,
  # id.method="identify",
  main="Influence Plot",
  sub="Circle size is proportial to Cook's Distance"
)

# Bỏ đi outliers đã tìm thấy bên trên
outliers_idx <- c(13348, 16060, 18287)
# application_ols1 <- application_ols[-outliers_idx, ]
# application_ols1

# cách khác, dùng hàm slice (dplyr): lấy quan sát mong muốn trong bảng
application_ols1 <- slice(application_ols, -outliers_idx)
nrow(application_ols1)

## [1] 19997