Question 5

library(ISLR)   # Default data set
library(boot)   # boot(), used in Questions 6 and 9

set.seed(123)
df <- Default
log.fit <- glm(default ~ income + balance, family = binomial, data = df)
summary(log.fit)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
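
As an aside (an added illustration, not part of the original answer), the coefficients are on the log-odds scale, so exponentiating them gives odds ratios:

exp(coef(log.fit))   # multiplicative change in the odds of default per unit increase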
log.probs <- predict(log.fit, type = "response")   # fitted P(default = Yes)
log.pred <- ifelse(log.probs > .5, "Yes", "No")    # classify at the 0.5 cutoff
table(log.pred, df$default)
##         
## log.pred   No  Yes
##      No  9629  225
##      Yes   38  108
# Total (misclassification) error rate from a 2x2 confusion matrix
terror <- function(table){
  1 - ((table[1,1] + table[2,2]) / sum(table))
}
terror(table(log.pred, df$default))
## [1] 0.0263
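
Note that this is the training error rate, since the predictions are made on the same observations used to fit the model. Equivalently, it can be computed without the confusion matrix (a one-line check added here):

mean(log.pred != df$default)   # same misclassification rate as terror()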

Using a .50 Split

split <- .50
index <- sample(1:nrow(df), round(nrow(df) * split))
train <- df[index, ]
test <- df[-index, ]

log.fit <- glm(default ~ income + balance, family = binomial, data = train)
summary(log.fit)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2810  -0.1348  -0.0529  -0.0185   3.7767  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.194e+01  6.451e-01 -18.504  < 2e-16 ***
## income       2.210e-05  7.381e-06   2.995  0.00275 ** 
## balance      5.874e-03  3.362e-04  17.474  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1429.88  on 4999  degrees of freedom
## Residual deviance:  752.69  on 4997  degrees of freedom
## AIC: 758.69
## 
## Number of Fisher Scoring iterations: 8
log.probs <- predict(log.fit, test, type = "response")
log.pred <- ifelse(log.probs > .5, "Yes", "No")
table(log.pred, test$default)
##         
## log.pred   No  Yes
##      No  4808  117
##      Yes   21   54
terror(table(log.pred, test$default))
## [1] 0.0276

Using a .75 Split

split <- .75
index <- sample(1:nrow(df), round(nrow(df) * split))
train <- df[index, ]
test <- df[-index, ]

log.fit <- glm(default ~ income + balance, family = binomial, data = train)
summary(log.fit)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4523  -0.1480  -0.0600  -0.0224   3.4593  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.139e+01  4.890e-01 -23.298  < 2e-16 ***
## income       2.072e-05  5.664e-06   3.658 0.000254 ***
## balance      5.566e-03  2.564e-04  21.703  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2219  on 7499  degrees of freedom
## Residual deviance: 1218  on 7497  degrees of freedom
## AIC: 1224
## 
## Number of Fisher Scoring iterations: 8
log.probs <- predict(log.fit, test, type = "response")
log.pred <- ifelse(log.probs > .5, "Yes", "No")
table(log.pred, test$default)
##         
## log.pred   No  Yes
##      No  2415   52
##      Yes    6   27
terror(table(log.pred, test$default))
## [1] 0.0232

Using a .25 Split

split <- .25
index <- sample(1:nrow(df), round(nrow(df) * split))
train <- df[index, ]
test <- df[-index, ]

log.fit <- glm(default ~ income + balance, family = binomial, data = train)
summary(log.fit)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3753  -0.1354  -0.0529  -0.0202   3.7947  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.112e+01  8.734e-01 -12.730   <2e-16 ***
## income       4.087e-06  1.045e-05   0.391    0.696    
## balance      5.718e-03  4.655e-04  12.285   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 708.14  on 2499  degrees of freedom
## Residual deviance: 368.05  on 2497  degrees of freedom
## AIC: 374.05
## 
## Number of Fisher Scoring iterations: 8
log.probs <- predict(log.fit, test, type = "response")
log.pred <- ifelse(log.probs > .5, "Yes", "No")
table(log.pred, test$default)
##         
## log.pred   No  Yes
##      No  7215  173
##      Yes   32   80
terror(table(log.pred, test$default))
## [1] 0.02733333
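
Collecting the three validation-set estimates from the runs above in one place (values copied from the outputs, not recomputed):

# Validation error rate by training-set fraction
c(`.50` = 0.0276, `.75` = 0.0232, `.25` = 0.02733333)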

Using a .75 Split with student

split <- .75
index <- sample(1:nrow(df), round(nrow(df) * split))
train <- df[index, ]
test <- df[-index, ]

log.fit <- glm(default ~ income + balance + student, family = binomial, data = train)
summary(log.fit)
## 
## Call:
## glm(formula = default ~ income + balance + student, family = binomial, 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4543  -0.1513  -0.0607  -0.0226   3.6464  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.080e+01  5.576e-01 -19.370   <2e-16 ***
## income       7.777e-06  9.411e-06   0.826   0.4086    
## balance      5.592e-03  2.624e-04  21.308   <2e-16 ***
## studentYes  -5.489e-01  2.732e-01  -2.009   0.0445 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2178.7  on 7499  degrees of freedom
## Residual deviance: 1214.3  on 7496  degrees of freedom
## AIC: 1222.3
## 
## Number of Fisher Scoring iterations: 8
log.probs <- predict(log.fit, test, type = "response")
log.pred <- ifelse(log.probs > .5, "Yes", "No")
table(log.pred, test$default)
##         
## log.pred   No  Yes
##      No  2404   53
##      Yes   11   32
terror(table(log.pred, test$default))
## [1] 0.0256

Adding student to the model does not appear to meaningfully reduce the validation error rate.
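
Because each estimate depends on a single random split, it can help to average over several splits (a sketch; val.err is a helper name introduced here, not part of the original code):

val.err <- function(split) {
  index <- sample(1:nrow(df), round(nrow(df) * split))
  train <- df[index, ]
  test <- df[-index, ]
  fit <- glm(default ~ income + balance, family = binomial, data = train)
  pred <- ifelse(predict(fit, test, type = "response") > .5, "Yes", "No")
  mean(pred != test$default)   # validation error for this split
}
mean(replicate(20, val.err(.50)))   # average over 20 random .50 splits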

Question 6

log.fit <- glm(default ~ income + balance, family = binomial, data = df)
summary(log.fit)$coef
##                  Estimate   Std. Error    z value      Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income       2.080898e-05 4.985167e-06   4.174178  2.990638e-05
## balance      5.647103e-03 2.273731e-04  24.836280 3.638120e-136
# Returns the coefficient estimates for the bootstrap sample given by index
boot.fn <- function(data, index){
  coef(glm(default ~ income + balance, family = binomial, data = data, subset = index))
}
# Quick check of boot.fn() on a single random subsample
split <- .75
index <- sample(1:nrow(df), round(nrow(df) * split))
boot.fn(df, index)
##   (Intercept)        income       balance 
## -1.157436e+01  2.142962e-05  5.681451e-03
boot(df, boot.fn, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = df, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01 -2.793522e-02 4.209967e-01
## t2*  2.080898e-05  1.693713e-07 4.729635e-06
## t3*  5.647103e-03  1.289558e-05 2.219446e-04

We can see that the bootstrap standard errors are very close to those reported by summary() for the glm() fit.
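
For a direct comparison, one could save the boot() result and put the two sets of standard errors side by side (a sketch; boot.out is a name introduced here):

boot.out <- boot(df, boot.fn, R = 1000)
cbind(glm.se  = summary(log.fit)$coef[, "Std. Error"],
      boot.se = apply(boot.out$t, 2, sd))   # one bootstrap SE per coefficient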

Question 9

library(MASS)
## Warning: package 'MASS' was built under R version 4.0.4
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
df2 <- Boston

# Intercept-only model: the intercept estimate is the sample mean of medv
summary(glm(medv ~ 1, data = df2))$coef
##             Estimate Std. Error  t value      Pr(>|t|)
## (Intercept) 22.53281  0.4088611 55.11115 9.370624e-216
# Mean of medv on a bootstrap sample (intercept of an intercept-only model)
boot.fn <- function(data, index){
  coef(glm(medv ~ 1, data = data, subset = index))
}
boot(df2, boot.fn, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = df2, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original     bias    std. error
## t1* 22.53281 0.01299229   0.4170413

Again, the bootstrap estimate and standard error are very close to those from the glm() fit.
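
As a further check (added here, not part of the original answer), the usual formula for the standard error of a sample mean, s / sqrt(n), gives nearly the same value:

sd(df2$medv) / sqrt(nrow(df2))   # analytic SE of the mean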

# Median of medv on a bootstrap sample (data here is the medv vector itself)
median.fn <- function(data, index){
  median(data[index])
}
medboot <- boot(df2$medv, median.fn, R = 1000)
medboot.stderr <- sqrt(var(medboot$t))
# Rough 95% confidence interval: estimate +/- 2 standard errors
list(medboot$t0 - 2 * medboot.stderr, medboot$t0 + 2 * medboot.stderr)
## [[1]]
##          [,1]
## [1,] 20.42276
## 
## [[2]]
##          [,1]
## [1,] 21.97724
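
The boot package can also produce an interval directly from the saved boot object (a sketch using boot.ci()):

boot.ci(medboot, type = "norm")   # normal-approximation 95% CI for the median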
quantile(df2$medv, .1)
##   10% 
## 12.75
# Tenth percentile of medv on a bootstrap sample
tenth.fn <- function(data, index){
  quantile(data$medv[index], .1)   # use the data argument, not the global df2
}
boot(df2, tenth.fn, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = df2, statistic = tenth.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original   bias    std. error
## t1*    12.75 -0.01735   0.5071575

The bootstrap again does a good job of estimating the parameter, and it showcases the ability to obtain a standard error for a statistic whose sampling distribution we don't know in closed form.
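
As with the median, saving the boot object would let us build a confidence interval for the tenth percentile (a sketch; tenthboot is a name introduced here):

tenthboot <- boot(df2, tenth.fn, R = 1000)
boot.ci(tenthboot, type = "perc")   # percentile 95% CI for the 10th percentile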