library(dplyr) # tiền xử lý
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # vẽ hình
library(car) # phân tích phần dư
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
DATA_RAW <- read.csv('data/application_ols.csv')
1 . Phân tích tương quan
1.1. Hiệp phương sai
cov(DATA_RAW$AMT_CREDIT, DATA_RAW$AMT_INCOME_TOTAL)
## [1] 15812947446
cov(select(DATA_RAW, -FLAG_OWN_CAR), use = "complete.obs")
## AMT_CREDIT AMT_INCOME_TOTAL AMT_GOODS_PRICE
## AMT_CREDIT 163700128801 15809719843 148029477098
## AMT_INCOME_TOTAL 15809719843 35221070102 14892206290
## AMT_GOODS_PRICE 148029477098 14892206290 137571966488
cov(select(DATA_RAW, -FLAG_OWN_CAR), use = "pairwise.complete.obs")
## AMT_CREDIT AMT_INCOME_TOTAL AMT_GOODS_PRICE
## AMT_CREDIT 163652881494 15812947446 148029477098
## AMT_INCOME_TOTAL 15812947446 35193056111 14892206290
## AMT_GOODS_PRICE 148029477098 14892206290 137571966488
1.2. Hệ số tương quan (-1, 1)
cor(DATA_RAW$AMT_INCOME_TOTAL, DATA_RAW$AMT_CREDIT)
## [1] 0.2083639
cor(select(DATA_RAW, -FLAG_OWN_CAR), use="complete.obs")
## AMT_CREDIT AMT_INCOME_TOTAL AMT_GOODS_PRICE
## AMT_CREDIT 1.0000000 0.2082085 0.9864129
## AMT_INCOME_TOTAL 0.2082085 1.0000000 0.2139404
## AMT_GOODS_PRICE 0.9864129 0.2139404 1.0000000
cor(
select(DATA_RAW, -FLAG_OWN_CAR),
use="pairwise.complete.obs"
)
## AMT_CREDIT AMT_INCOME_TOTAL AMT_GOODS_PRICE
## AMT_CREDIT 1.0000000 0.2083639 0.9864129
## AMT_INCOME_TOTAL 0.2083639 1.0000000 0.2139404
## AMT_GOODS_PRICE 0.9864129 0.2139404 1.0000000
2. Hồi quy đơn
application_ols <- DATA_RAW %>%
rename(
CI = AMT_INCOME_TOTAL,
CD = AMT_CREDIT
) %>%
mutate(
CI = CI / 1000,
CD = CD / 1000
)
m1 <- lm(CD ~ CI, data = application_ols)
summary(m1)
##
## Call:
## lm(formula = CD ~ CI, data = application_ols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7935.6 -308.9 -90.3 214.4 2676.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 522.84486 3.78043 138.30 <2e-16 ***
## CI 0.44932 0.01491 30.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 395.7 on 19998 degrees of freedom
## Multiple R-squared: 0.04342, Adjusted R-squared: 0.04337
## F-statistic: 907.6 on 1 and 19998 DF, p-value: < 2.2e-16
# khoảng tin cậy 95%
confint(m1)
## 2.5 % 97.5 %
## (Intercept) 515.4349018 530.2548226
## CI 0.4200869 0.4785533
# bảng anova
anova(m1)
predict(m1, data.frame(CI = c(100, 200)))
## 1 2
## 567.7769 612.7089
data.frame(CI = c(100, 200))
ggplot(application_ols, aes(log(CI), log(CD))) +
geom_point() +
geom_smooth(method = "lm", se = F)
## `geom_smooth()` using formula = 'y ~ x'

m2 <- lm(log(CD) ~ log(CI), data = application_ols)
summary(m2)
##
## Call:
## lm(formula = log(CD) ~ log(CI), data = application_ols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.72317 -0.45131 0.03133 0.49430 2.28875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.044480 0.047034 64.73 <2e-16 ***
## log(CI) 0.622560 0.009354 66.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6517 on 19998 degrees of freedom
## Multiple R-squared: 0.1813, Adjusted R-squared: 0.1813
## F-statistic: 4430 on 1 and 19998 DF, p-value: < 2.2e-16
summary(lm(CD ~ CI + I(CI^2), application_ols))
##
## Call:
## lm(formula = CD ~ CI + I(CI^2), data = application_ols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3735.7 -274.1 -85.8 206.7 2444.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.438e+02 5.225e+00 65.81 <2e-16 ***
## CI 1.534e+00 2.702e-02 56.77 <2e-16 ***
## I(CI^2) -9.072e-05 1.925e-06 -47.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 375.4 on 19997 degrees of freedom
## Multiple R-squared: 0.139, Adjusted R-squared: 0.1389
## F-statistic: 1614 on 2 and 19997 DF, p-value: < 2.2e-16
ggplot(application_ols, aes(log(CI), log(CD))) +
geom_point() +
geom_smooth(method = "lm",
formula = "y ~ x + I(x^2) + I(x^3)",
se = F)

summary(lm(CD ~ CI - 1, application_ols))
##
## Call:
## lm(formula = CD ~ CI - 1, data = application_ols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32382 26 206 497 2655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## CI 1.83652 0.01544 119 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 553.4 on 19999 degrees of freedom
## Multiple R-squared: 0.4144, Adjusted R-squared: 0.4143
## F-statistic: 1.415e+04 on 1 and 19999 DF, p-value: < 2.2e-16
summary(lm(CD ~ . - FLAG_OWN_CAR, application_ols))
##
## Call:
## lm(formula = CD ~ . - FLAG_OWN_CAR, data = application_ols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -786.41 -40.45 -7.29 37.29 465.48
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.060e+01 8.780e-01 23.458 <2e-16 ***
## CI -6.383e-03 2.565e-03 -2.488 0.0128 *
## AMT_GOODS_PRICE 1.077e-03 1.298e-06 829.530 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.46 on 19971 degrees of freedom
## (26 observations deleted due to missingness)
## Multiple R-squared: 0.973, Adjusted R-squared: 0.973
## F-statistic: 3.601e+05 on 2 and 19971 DF, p-value: < 2.2e-16
3. Hồi quy bội
application_ols <- application_ols %>%
rename(
CAR = FLAG_OWN_CAR,
GP = AMT_GOODS_PRICE
) %>%
mutate(GP = GP / 1000)
m3 <- lm(CD ~ CI + GP, application_ols)
summary(m3)
##
## Call:
## lm(formula = CD ~ CI + GP, data = application_ols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -786.41 -40.45 -7.29 37.29 465.48
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.596267 0.878014 23.458 <2e-16 ***
## CI -0.006383 0.002565 -2.488 0.0128 *
## GP 1.076706 0.001298 829.530 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.46 on 19971 degrees of freedom
## (26 observations deleted due to missingness)
## Multiple R-squared: 0.973, Adjusted R-squared: 0.973
## F-statistic: 3.601e+05 on 2 and 19971 DF, p-value: < 2.2e-16
summary(lm(CD ~ CAR, data = application_ols))
##
## Call:
## lm(formula = CD ~ CAR, data = application_ols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.40 -308.40 -92.33 224.27 2706.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 562.858 3.512 160.3 <2e-16 ***
## CARY 105.544 5.965 17.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 401.4 on 19998 degrees of freedom
## Multiple R-squared: 0.01542, Adjusted R-squared: 0.01537
## F-statistic: 313.1 on 1 and 19998 DF, p-value: < 2.2e-16
summarise(group_by(application_ols, CAR), mean(CD))
# khác nhau giữa 2 nhóm khi được điều chỉnh với thu nhập
summary(lm(CD ~ CI + CAR, data=application_ols))
##
## Call:
## lm(formula = CD ~ CI + CAR, data = application_ols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7517.2 -300.3 -88.1 214.9 2626.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 497.91691 4.13923 120.29 <2e-16 ***
## CI 0.42273 0.01495 28.27 <2e-16 ***
## CARY 84.99738 5.89395 14.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 393.6 on 19997 degrees of freedom
## Multiple R-squared: 0.05326, Adjusted R-squared: 0.05317
## F-statistic: 562.5 on 2 and 19997 DF, p-value: < 2.2e-16
4. Phân tích phần dư
m1 <- lm(log(CD) ~ log(CI), data=application_ols)
# View(m1)
hist(m1$residuals)

boxplot(m1$residuals, horizontal = T)

# phần dư chuẩn hoá
stdres <- rstandard(m1)
hist(stdres)

boxplot(stdres, horizontal = T)

qqnorm(stdres)
qqline(stdres)

# kiểm định phân phối chuẩn
nortest::lillie.test(stdres)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: stdres
## D = 0.032923, p-value < 2.2e-16
# phân phối không phải pp chuẩn
# H0: là phân phối chuẩn
# H1: không phải pp chuẩn
# p-value < alpha (5%) -> bác bỏ H0
# thử test trên 1 phân phốt chuẩn bất kì
nortest::lillie.test(rnorm(100000))
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: rnorm(1e+05)
## D = 0.0021552, p-value = 0.3145
hist(application_ols$CD)

# kiểm tra mối quan hệ tuyến tính
crPlots(m1)

# kiểm định phương sai bất biến
# h0: phương sai sai số không đổi
# h1: phương sai sai số thay đổi
ncvTest(m1)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 49.00403, Df = 1, p = 2.5544e-12
# vẽ biểu đồ kiểm tra giả định phương sai bất biến
spreadLevelPlot(m1)

##
## Suggested power transformation: 2.248717
influencePlot(
m1,
# id.method="identify",
main="Influence Plot",
sub="Circle size is proportial to Cook's Distance"
)
# Bỏ đi outliers đã tìm thấy bên trên
outliers_idx <- c(13348, 16060, 18287)
# application_ols1 <- application_ols[-outliers_idx, ]
# application_ols1
# cách khác, dùng hàm slice (dplyr): lấy quan sát mong muốn trong bảng
application_ols1 <- slice(application_ols, -outliers_idx)
nrow(application_ols1)
## [1] 19997