library(ISLR)
data("Default")
dim(Default)
## [1] 10000 4
Total rows are 1000 and total columns are 4.
names(Default)
## [1] "default" "student" "balance" "income"
print(lapply(Default, class))
## $default
## [1] "factor"
##
## $student
## [1] "factor"
##
## $balance
## [1] "numeric"
##
## $income
## [1] "numeric"
4 variables in the dataset.
default and student are factor.
balance and income are numeric.
Default$default <- as.numeric(Default$default)
Default$student <- as.numeric(Default$student)
Default$default[Default$default == 1] <- 0
Default$default[Default$default == 2] <- 1
Default$student[Default$student == 1] <- 0
Default$student[Default$student == 2] <- 1
logit_model <- glm(Default$default ~ Default$student + Default$balance + Default$income + Default$student:Default$income, family = "binomial")
summary(logit_model)
##
## Call:
## glm(formula = Default$default ~ Default$student + Default$balance +
## Default$income + Default$student:Default$income, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4638 -0.1420 -0.0558 -0.0203 3.7406
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.081e+01 5.031e-01 -21.495 <2e-16 ***
## Default$student -9.343e-01 6.067e-01 -1.540 0.124
## Default$balance 5.737e-03 2.319e-04 24.736 <2e-16 ***
## Default$income 1.644e-06 8.633e-06 0.190 0.849
## Default$student:Default$income 1.429e-05 2.772e-05 0.516 0.606
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.3 on 9995 degrees of freedom
## AIC: 1581.3
##
## Number of Fisher Scoring iterations: 8
When throwing an interaction term in a multivariate regression model, its constituent terms should be included too. In order to obtain the interaction effect with respect to student/income, it also depends on the value of the other constituent term income/student theoretically. In R,it does not matter if the constituent terms are not included. Because R will automatically consider constituent terms when dealing with interaction effect.
logit.coef <- exp(logit_model$coefficients)
logit.coef
## (Intercept) Default$student
## 0.0000201236 0.3928464305
## Default$balance Default$income
## 1.0057533348 1.0000016439
## Default$student:Default$income
## 1.0000142938
當學生 = 1 會使 odd ratio(default = 1 / default = 0)的比例 0.39
左右
當 balance 增加一單位,對 odd-ratio 沒有影響(約 1:1)
當
income 增加一單位,對 odd-ratio 沒有影響(約 1:1)
交乘項對 odd ratio
沒有影響(約 1:1)
require(AER)
data("HMDA")
print(dim(HMDA))
## [1] 2380 14
print(names(HMDA))
## [1] "deny" "pirat" "hirat" "lvrat" "chist" "mhist"
## [7] "phist" "unemp" "selfemp" "insurance" "condomin" "afam"
## [13] "single" "hschool"
print(lapply(HMDA, class))
## $deny
## [1] "factor"
##
## $pirat
## [1] "numeric"
##
## $hirat
## [1] "numeric"
##
## $lvrat
## [1] "numeric"
##
## $chist
## [1] "factor"
##
## $mhist
## [1] "factor"
##
## $phist
## [1] "factor"
##
## $unemp
## [1] "numeric"
##
## $selfemp
## [1] "factor"
##
## $insurance
## [1] "factor"
##
## $condomin
## [1] "factor"
##
## $afam
## [1] "factor"
##
## $single
## [1] "factor"
##
## $hschool
## [1] "factor"
howManyFactor = 0
howMantNumeric = 0
for (i in lapply(HMDA, class)){
if (i == "factor"){
howManyFactor = howManyFactor + 1
}else if(i == "numeric"){
howMantNumeric = howMantNumeric + 1
}
}
cat(howManyFactor, "factors", howMantNumeric, "numerics")
## 10 factors 4 numerics
2380 rows 14 columns.
14 variables in this dataset.
10 factors
and 4 numeric.
HMDA$deny <- as.numeric(HMDA$deny)
HMDA$selfemp <- as.numeric(HMDA$selfemp)
HMDA$insurance <- as.numeric(HMDA$insurance)
HMDA$deny[HMDA$deny == 1] <- 0
HMDA$deny[HMDA$deny == 2] <- 1
HMDA$selfemp[HMDA$selfemp == 1] <- 0
HMDA$selfemp[HMDA$selfemp == 2] <- 1
HMDA$insurance[HMDA$insurance == 1] <- 0
HMDA$insurance[HMDA$insurance == 2] <- 1
library(mfx)
probit_model_1 <- glm(deny ~ lvrat + selfemp + insurance, family = binomial(link = "probit"),data = HMDA)
summary(probit_model_1)
##
## Call:
## glm(formula = deny ~ lvrat + selfemp + insurance, family = binomial(link = "probit"),
## data = HMDA)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4359 -0.5156 -0.4491 -0.3438 2.8783
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3456 0.1816 -12.914 < 2e-16 ***
## lvrat 1.3665 0.2280 5.994 2.05e-09 ***
## selfemp 0.3220 0.1012 3.181 0.00147 **
## insurance 2.4593 0.2571 9.566 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1744.2 on 2379 degrees of freedom
## Residual deviance: 1526.9 on 2376 degrees of freedom
## AIC: 1534.9
##
## Number of Fisher Scoring iterations: 6
summary:
According to P value of the estimator of three variables
in model1, they all significantly affect the occuring of deny. Ivrat and
insurance reject H0 at alomst 0% level, and selfemp rejects H0 at .001
level.
probit_model_2 <- glm(deny ~ lvrat + I(lvrat^2) + selfemp + insurance, family = binomial(link = "probit"),data = HMDA)
summary(probit_model_2)
##
## Call:
## glm(formula = deny ~ lvrat + I(lvrat^2) + selfemp + insurance,
## family = binomial(link = "probit"), data = HMDA)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7640 -0.5195 -0.4547 -0.3362 3.0775
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.6966 0.3768 -7.157 8.27e-13 ***
## lvrat 2.3034 0.9192 2.506 0.01222 *
## I(lvrat^2) -0.6009 0.5822 -1.032 0.30201
## selfemp 0.3265 0.1014 3.219 0.00129 **
## insurance 2.5054 0.2624 9.547 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1744.2 on 2379 degrees of freedom
## Residual deviance: 1525.3 on 2375 degrees of freedom
## AIC: 1535.3
##
## Number of Fisher Scoring iterations: 10
summary:
According to P value of the estimator of four variables
in model2, insurance rejects H0 at 0% level. selfemp rejects H0 Pvalue
close to .001. lvrat rejects H0 Pvalue close to .01.
library(lmtest)
lrtest(probit_model_1, probit_model_2)
## Likelihood ratio test
##
## Model 1: deny ~ lvrat + selfemp + insurance
## Model 2: deny ~ lvrat + I(lvrat^2) + selfemp + insurance
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 4 -763.46
## 2 5 -762.63 1 1.6525 0.1986
##please excuse me for using simpler method doing LRT
Since the Pvalue of chi-square tests on these two model is around
20%, two models are not significantly different.
I guess the
coefficient estimate of lvrat from model1 and model2 are different due
to multicollinearity between lrvat and lrvat^2 in model2.
Multicollinearity could cause coefficient understated and less
precise.(Check Appendix)
probit_mfx_2 <- probitmfx(deny ~ lvrat + I(lvrat^2) + selfemp + insurance, data = HMDA)
probit_mfx_2$mfxest
## dF/dx Std. Err. z P>|z|
## lvrat 0.41809553 0.16379451 2.552561 1.069341e-02
## I(lvrat^2) -0.10907717 0.10502625 -1.038571 2.990045e-01
## selfemp 0.06889121 0.02432568 2.832036 4.625263e-03
## insurance 0.78905133 0.05086386 15.513005 2.833021e-54
One unit increase in lvrat is associated with an increase in the
probability of being denied by 41.8%
One unit increase in lvrat^2
is associated with an decrease in the probability of being denied by
0.11%
Being selfemp is associated with an increase in the
probability of being denied by 6.88%
Having insurance is associated
with an increase in the probability of being denied by 78.90%
library(caret)
set.seed(100)
train_index <- createDataPartition(HMDA$lvrat, p = .75, list = FALSE)
training <- HMDA[train_index,]
testing <- HMDA[-train_index,]
training.fit <- glm(deny ~ lvrat + I(lvrat^2) + selfemp + insurance, family = "binomial", data = training)
pred <- predict(training.fit, testing, type = "response")
comparison <- cbind(testing$deny, pred)
head(comparison, 20)
## pred
## 2 0 0.13525239
## 9 1 0.94024417
## 13 1 0.82260348
## 17 0 0.06315843
## 23 0 0.09531994
## 26 0 0.06019327
## 30 0 0.12722557
## 31 0 0.11983052
## 32 0 0.08032843
## 34 0 0.13012821
## 37 0 0.07379399
## 44 1 0.08177513
## 46 0 0.12957066
## 50 0 0.07082011
## 52 0 0.14015747
## 53 0 0.08496673
## 54 0 0.13311176
## 56 0 0.13845499
## 62 0 0.10278397
## 64 0 0.09000095
library(olsrr)
VIF_test_model <- lm(deny ~ lvrat + I(lvrat^2) + selfemp + insurance, data = HMDA)
ols_vif_tol(VIF_test_model)
## Variables Tolerance VIF
## 1 lvrat 0.07242128 13.808096
## 2 I(lvrat^2) 0.07195969 13.896669
## 3 selfemp 0.99672536 1.003285
## 4 insurance 0.97387557 1.026825
It seems that GLM did not work with VIF. I’m not sure if it’s appropriate to assume probit_model_2 as lm. And, I did this re-modelling is just to take VIF test on variables in probit_model_2.
The result shows lvrat and lvrat^2 VIF are too large which means high collinearity.