Assignment 2

Download Assignment 2-2.pdf

Load necessary file and libraries.

library("ISLR")
library("AER")
## 載入需要的套件:car
## 載入需要的套件:carData
## 載入需要的套件:lmtest
## 載入需要的套件:zoo
## 
## 載入套件:'zoo'
## 下列物件被遮斷自 'package:base':
## 
##     as.Date, as.Date.numeric
## 載入需要的套件:sandwich
## 載入需要的套件:survival
library("broom")
library("dplyr")
## 
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:car':
## 
##     recode
## 下列物件被遮斷自 'package:stats':
## 
##     filter, lag
## 下列物件被遮斷自 'package:base':
## 
##     intersect, setdiff, setequal, union
data(Default)

Starting Problem 2

Check data dimension and class.

dim(Default)
## [1] 10000     4
names(Default)
## [1] "default" "student" "balance" "income"
# There are 10000 rows and 4 columns in the data, 4 variables in total. 

class(Default$default)
## [1] "factor"
class(Default$balance)
## [1] "numeric"
class(Default$student)
## [1] "factor"
class(Default$income)
## [1] "numeric"

Convert variables to factor class, then fit a logit model.

## Converting variables
Default$default <- ifelse(Default$default == "Yes", 1, 0)
Default$student <- ifelse(Default$student == "Yes", 1, 0)


logit.fit1 <- glm(default ~ balance + student*income,
                  family = binomial(link = "logit"),
                  data = Default
)

summary(logit.fit1)
## 
## Call:
## glm(formula = default ~ balance + student * income, family = binomial(link = "logit"), 
##     data = Default)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.081e+01  5.031e-01 -21.495   <2e-16 ***
## balance         5.737e-03  2.319e-04  24.736   <2e-16 ***
## student        -9.343e-01  6.067e-01  -1.540    0.124    
## income          1.644e-06  8.633e-06   0.190    0.849    
## student:income  1.429e-05  2.772e-05   0.516    0.606    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.3  on 9995  degrees of freedom
## AIC: 1581.3
## 
## Number of Fisher Scoring iterations: 8

We shouldn’t add the constituent terms of student and income in the model, because the constituent terms themselves are not significant.

Transform to odds-ration term.

logit.coef <- exp(logit.fit1$coefficients)
logit.coef
##    (Intercept)        balance        student         income student:income 
##   0.0000201236   1.0057533348   0.3928464305   1.0000016439   1.0000142938
logit.df2 <- tidy(logit.fit1)

logit.df2 %>%
  mutate(OddRatio = exp(estimate),
  var.diag = diag(vcov(logit.fit1)),
  OddRatio.se = sqrt(OddRatio ^ 2 * var.diag)
)
## # A tibble: 5 × 8
##   term      estimate std.error statistic   p.value OddRatio var.diag OddRatio.se
##   <chr>        <dbl>     <dbl>     <dbl>     <dbl>    <dbl>    <dbl>       <dbl>
## 1 (Interce… -1.08e+1   5.03e-1   -21.5   1.73e-102  2.01e-5 2.53e- 1  0.0000101 
## 2 balance    5.74e-3   2.32e-4    24.7   4.44e-135  1.01e+0 5.38e- 8  0.000233  
## 3 student   -9.34e-1   6.07e-1    -1.54  1.24e-  1  3.93e-1 3.68e- 1  0.238     
## 4 income     1.64e-6   8.63e-6     0.190 8.49e-  1  1.00e+0 7.45e-11  0.00000863
## 5 student:…  1.43e-5   2.77e-5     0.516 6.06e-  1  1.00e+0 7.68e-10  0.0000277

We see that the neither student, income nor the interaction effect of student and income has a significant effect even after being odd-ratio-ized, they all have a p value larger than 0.1.

Starting Problem 3 and 4

library("roperators")
## Since R doesn't have a "+=" syntax, I use roperators to do the similar thing. 

data(HMDA)

dim(HMDA)
## [1] 2380   14
names(HMDA)
##  [1] "deny"      "pirat"     "hirat"     "lvrat"     "chist"     "mhist"    
##  [7] "phist"     "unemp"     "selfemp"   "insurance" "condomin"  "afam"     
## [13] "single"    "hschool"
## There are 2380 rows and 14 columns, 14 variables in HMDA.

factor_count = 0
num_count = 0
for (x in c(names(HMDA))) {
  print(class(HMDA[[x]]))
  ifelse(class(HMDA[[x]]) == "factor", factor_count %+=% 1, num_count %+=% 1) 
}
## [1] "factor"
## [1] "numeric"
## [1] "numeric"
## [1] "numeric"
## [1] "factor"
## [1] "factor"
## [1] "factor"
## [1] "numeric"
## [1] "factor"
## [1] "factor"
## [1] "factor"
## [1] "factor"
## [1] "factor"
## [1] "factor"
factor_count
## [1] 10
num_count
## [1] 4
## There are 10 factor class variables and 4 numeric class variables in HMDA. 

## Converting variables
HMDA$deny <- ifelse(HMDA$deny == "yes", 1, 0)
HMDA$selfemp <- ifelse(HMDA$selfemp == "yes", 1, 0)
HMDA$insurance <- ifelse(HMDA$insurance == "yes", 1, 0)

Now we do the probit fitting.

probit.fit1 <- glm(deny ~ lvrat + selfemp + insurance,
                   family = binomial(link = "probit"),
                   data = HMDA
)

probit.fit2 <- glm(deny ~ lvrat + I(lvrat ^ 2) + selfemp + insurance,
                   family = binomial(link = "probit"),
                   data = HMDA
)

summary(probit.fit1)
## 
## Call:
## glm(formula = deny ~ lvrat + selfemp + insurance, family = binomial(link = "probit"), 
##     data = HMDA)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.3456     0.1816 -12.914  < 2e-16 ***
## lvrat         1.3665     0.2280   5.994 2.05e-09 ***
## selfemp       0.3220     0.1012   3.181  0.00147 ** 
## insurance     2.4593     0.2571   9.566  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1744.2  on 2379  degrees of freedom
## Residual deviance: 1526.9  on 2376  degrees of freedom
## AIC: 1534.9
## 
## Number of Fisher Scoring iterations: 6
summary(probit.fit2)
## 
## Call:
## glm(formula = deny ~ lvrat + I(lvrat^2) + selfemp + insurance, 
##     family = binomial(link = "probit"), data = HMDA)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.6966     0.3768  -7.157 8.27e-13 ***
## lvrat         2.3034     0.9192   2.506  0.01222 *  
## I(lvrat^2)   -0.6009     0.5822  -1.032  0.30201    
## selfemp       0.3265     0.1014   3.219  0.00129 ** 
## insurance     2.5054     0.2624   9.547  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1744.2  on 2379  degrees of freedom
## Residual deviance: 1525.3  on 2375  degrees of freedom
## AIC: 1535.3
## 
## Number of Fisher Scoring iterations: 10
## likelihood test
lrtest(probit.fit1, probit.fit2)
## Likelihood ratio test
## 
## Model 1: deny ~ lvrat + selfemp + insurance
## Model 2: deny ~ lvrat + I(lvrat^2) + selfemp + insurance
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   4 -763.46                     
## 2   5 -762.63  1 1.6525     0.1986

The 2 models are not significantly different. The second model takes on a squared variable while keeping the original variable, therefore not causing a significant difference.

library(mfx)
## 載入需要的套件:MASS
## 
## 載入套件:'MASS'
## 下列物件被遮斷自 'package:dplyr':
## 
##     select
## 載入需要的套件:betareg
probit.fit2.mfx <- probitmfx(deny ~ lvrat + I(lvrat ^ 2) + selfemp + insurance,
                             data = HMDA
)
probit.fit2.mfx$mfxest
##                  dF/dx  Std. Err.         z        P>|z|
## lvrat       0.41809553 0.16379451  2.552561 1.069341e-02
## I(lvrat^2) -0.10907717 0.10502625 -1.038571 2.990045e-01
## selfemp     0.06889121 0.02432568  2.832036 4.625263e-03
## insurance   0.78905133 0.05086386 15.513005 2.833021e-54

We see that in terms of p value, selfemp and insurance have a very significant effect on deny, while lvrat ^ 2 and lvrat are both not significant to deny.

Starting Problem 5

Train-test split.

library(caret)
## 載入需要的套件:ggplot2
## 
## 載入套件:'ggplot2'
## 下列物件被遮斷自 'package:roperators':
## 
##     %+%
## 載入需要的套件:lattice
## 
## 載入套件:'caret'
## 下列物件被遮斷自 'package:survival':
## 
##     cluster
set.seed(100)

train_index <- createDataPartition(HMDA$lvrat, p = .75, list = FALSE)

training <- HMDA[train_index, ]
testing <- HMDA[-train_index, ]

## Doing a trained model fitting
training.fit <- glm(deny ~ lvrat + I(lvrat ^ 2) + selfemp + insurance,
                       family = binomial(link = "probit"),
                       data = HMDA
)

## Predicting using the 25% testing data
pred <- predict(training.fit, testing, type = "response")

comparison <- cbind(testing$deny, pred)
head(comparison, 20)
##            pred
## 2  0 0.13921242
## 9  1 0.91800178
## 13 1 0.80880520
## 17 0 0.07157168
## 23 0 0.10089955
## 26 0 0.06883311
## 30 0 0.13109380
## 31 0 0.12385385
## 32 0 0.08724569
## 34 0 0.13399464
## 37 0 0.08130421
## 44 1 0.08856000
## 46 0 0.13343456
## 50 0 0.07859352
## 52 0 0.14434144
## 53 0 0.09146015
## 54 0 0.13701664
## 56 0 0.14254486
## 62 0 0.10776909
## 64 0 0.09604133

End of Homework 2.