library("ISLR")
library("AER")
## 載入需要的套件:car
## 載入需要的套件:carData
## 載入需要的套件:lmtest
## 載入需要的套件:zoo
##
## 載入套件:'zoo'
## 下列物件被遮斷自 'package:base':
##
## as.Date, as.Date.numeric
## 載入需要的套件:sandwich
## 載入需要的套件:survival
library("broom")
library("dplyr")
##
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:car':
##
## recode
## 下列物件被遮斷自 'package:stats':
##
## filter, lag
## 下列物件被遮斷自 'package:base':
##
## intersect, setdiff, setequal, union
data(Default)
Check data dimension and class.
dim(Default)
## [1] 10000 4
names(Default)
## [1] "default" "student" "balance" "income"
# There are 10000 rows and 4 columns in the data, 4 variables in total.
class(Default$default)
## [1] "factor"
class(Default$balance)
## [1] "numeric"
class(Default$student)
## [1] "factor"
class(Default$income)
## [1] "numeric"
Convert variables to factor class, then fit a logit model.
## Converting variables
Default$default <- ifelse(Default$default == "Yes", 1, 0)
Default$student <- ifelse(Default$student == "Yes", 1, 0)
logit.fit1 <- glm(default ~ balance + student*income,
family = binomial(link = "logit"),
data = Default
)
summary(logit.fit1)
##
## Call:
## glm(formula = default ~ balance + student * income, family = binomial(link = "logit"),
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.081e+01 5.031e-01 -21.495 <2e-16 ***
## balance 5.737e-03 2.319e-04 24.736 <2e-16 ***
## student -9.343e-01 6.067e-01 -1.540 0.124
## income 1.644e-06 8.633e-06 0.190 0.849
## student:income 1.429e-05 2.772e-05 0.516 0.606
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.3 on 9995 degrees of freedom
## AIC: 1581.3
##
## Number of Fisher Scoring iterations: 8
We shouldn’t add the constituent terms of student and income in the model, because the constituent terms themselves are not significant.
Transform to odds-ration term.
logit.coef <- exp(logit.fit1$coefficients)
logit.coef
## (Intercept) balance student income student:income
## 0.0000201236 1.0057533348 0.3928464305 1.0000016439 1.0000142938
logit.df2 <- tidy(logit.fit1)
logit.df2 %>%
mutate(OddRatio = exp(estimate),
var.diag = diag(vcov(logit.fit1)),
OddRatio.se = sqrt(OddRatio ^ 2 * var.diag)
)
## # A tibble: 5 × 8
## term estimate std.error statistic p.value OddRatio var.diag OddRatio.se
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Interce… -1.08e+1 5.03e-1 -21.5 1.73e-102 2.01e-5 2.53e- 1 0.0000101
## 2 balance 5.74e-3 2.32e-4 24.7 4.44e-135 1.01e+0 5.38e- 8 0.000233
## 3 student -9.34e-1 6.07e-1 -1.54 1.24e- 1 3.93e-1 3.68e- 1 0.238
## 4 income 1.64e-6 8.63e-6 0.190 8.49e- 1 1.00e+0 7.45e-11 0.00000863
## 5 student:… 1.43e-5 2.77e-5 0.516 6.06e- 1 1.00e+0 7.68e-10 0.0000277
We see that the neither student, income nor the interaction effect of student and income has a significant effect even after being odd-ratio-ized, they all have a p value larger than 0.1.
library("roperators")
## Since R doesn't have a "+=" syntax, I use roperators to do the similar thing.
data(HMDA)
dim(HMDA)
## [1] 2380 14
names(HMDA)
## [1] "deny" "pirat" "hirat" "lvrat" "chist" "mhist"
## [7] "phist" "unemp" "selfemp" "insurance" "condomin" "afam"
## [13] "single" "hschool"
## There are 2380 rows and 14 columns, 14 variables in HMDA.
factor_count = 0
num_count = 0
for (x in c(names(HMDA))) {
print(class(HMDA[[x]]))
ifelse(class(HMDA[[x]]) == "factor", factor_count %+=% 1, num_count %+=% 1)
}
## [1] "factor"
## [1] "numeric"
## [1] "numeric"
## [1] "numeric"
## [1] "factor"
## [1] "factor"
## [1] "factor"
## [1] "numeric"
## [1] "factor"
## [1] "factor"
## [1] "factor"
## [1] "factor"
## [1] "factor"
## [1] "factor"
factor_count
## [1] 10
num_count
## [1] 4
## There are 10 factor class variables and 4 numeric class variables in HMDA.
## Converting variables
HMDA$deny <- ifelse(HMDA$deny == "yes", 1, 0)
HMDA$selfemp <- ifelse(HMDA$selfemp == "yes", 1, 0)
HMDA$insurance <- ifelse(HMDA$insurance == "yes", 1, 0)
Now we do the probit fitting.
probit.fit1 <- glm(deny ~ lvrat + selfemp + insurance,
family = binomial(link = "probit"),
data = HMDA
)
probit.fit2 <- glm(deny ~ lvrat + I(lvrat ^ 2) + selfemp + insurance,
family = binomial(link = "probit"),
data = HMDA
)
summary(probit.fit1)
##
## Call:
## glm(formula = deny ~ lvrat + selfemp + insurance, family = binomial(link = "probit"),
## data = HMDA)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3456 0.1816 -12.914 < 2e-16 ***
## lvrat 1.3665 0.2280 5.994 2.05e-09 ***
## selfemp 0.3220 0.1012 3.181 0.00147 **
## insurance 2.4593 0.2571 9.566 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1744.2 on 2379 degrees of freedom
## Residual deviance: 1526.9 on 2376 degrees of freedom
## AIC: 1534.9
##
## Number of Fisher Scoring iterations: 6
summary(probit.fit2)
##
## Call:
## glm(formula = deny ~ lvrat + I(lvrat^2) + selfemp + insurance,
## family = binomial(link = "probit"), data = HMDA)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.6966 0.3768 -7.157 8.27e-13 ***
## lvrat 2.3034 0.9192 2.506 0.01222 *
## I(lvrat^2) -0.6009 0.5822 -1.032 0.30201
## selfemp 0.3265 0.1014 3.219 0.00129 **
## insurance 2.5054 0.2624 9.547 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1744.2 on 2379 degrees of freedom
## Residual deviance: 1525.3 on 2375 degrees of freedom
## AIC: 1535.3
##
## Number of Fisher Scoring iterations: 10
## likelihood test
lrtest(probit.fit1, probit.fit2)
## Likelihood ratio test
##
## Model 1: deny ~ lvrat + selfemp + insurance
## Model 2: deny ~ lvrat + I(lvrat^2) + selfemp + insurance
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 4 -763.46
## 2 5 -762.63 1 1.6525 0.1986
The 2 models are not significantly different. The second model takes on a squared variable while keeping the original variable, therefore not causing a significant difference.
library(mfx)
## 載入需要的套件:MASS
##
## 載入套件:'MASS'
## 下列物件被遮斷自 'package:dplyr':
##
## select
## 載入需要的套件:betareg
probit.fit2.mfx <- probitmfx(deny ~ lvrat + I(lvrat ^ 2) + selfemp + insurance,
data = HMDA
)
probit.fit2.mfx$mfxest
## dF/dx Std. Err. z P>|z|
## lvrat 0.41809553 0.16379451 2.552561 1.069341e-02
## I(lvrat^2) -0.10907717 0.10502625 -1.038571 2.990045e-01
## selfemp 0.06889121 0.02432568 2.832036 4.625263e-03
## insurance 0.78905133 0.05086386 15.513005 2.833021e-54
We see that in terms of p value, selfemp and insurance have a very significant effect on deny, while lvrat ^ 2 and lvrat are both not significant to deny.
Train-test split.
library(caret)
## 載入需要的套件:ggplot2
##
## 載入套件:'ggplot2'
## 下列物件被遮斷自 'package:roperators':
##
## %+%
## 載入需要的套件:lattice
##
## 載入套件:'caret'
## 下列物件被遮斷自 'package:survival':
##
## cluster
set.seed(100)
train_index <- createDataPartition(HMDA$lvrat, p = .75, list = FALSE)
training <- HMDA[train_index, ]
testing <- HMDA[-train_index, ]
## Doing a trained model fitting
training.fit <- glm(deny ~ lvrat + I(lvrat ^ 2) + selfemp + insurance,
family = binomial(link = "probit"),
data = HMDA
)
## Predicting using the 25% testing data
pred <- predict(training.fit, testing, type = "response")
comparison <- cbind(testing$deny, pred)
head(comparison, 20)
## pred
## 2 0 0.13921242
## 9 1 0.91800178
## 13 1 0.80880520
## 17 0 0.07157168
## 23 0 0.10089955
## 26 0 0.06883311
## 30 0 0.13109380
## 31 0 0.12385385
## 32 0 0.08724569
## 34 0 0.13399464
## 37 0 0.08130421
## 44 1 0.08856000
## 46 0 0.13343456
## 50 0 0.07859352
## 52 0 0.14434144
## 53 0 0.09146015
## 54 0 0.13701664
## 56 0 0.14254486
## 62 0 0.10776909
## 64 0 0.09604133