Untitled

Problem 3

k-fold cross-validation is implemented by segmenting the data into k parts (folds) of roughly equal size (k = 5, k = 10, and so on). It then proceeds similarly to LOOCV, but instead of leaving out one observation for testing, it holds out a whole fold: the model is fit on the remaining k - 1 folds and the error is computed on the held-out fold. The process is then repeated, holding out the next fold each time, until every fold has served as the test set once, and the k error estimates are averaged.

In the validation set approach, the data is split roughly evenly between training and test sets, and it is less intensive to compute than LOOCV. Because the result depends on which observations happen to fall into the training set and which into the test set, the estimate is more random, leading to less precise results.
LOOCV has less bias, as all but one observation (n - 1) are used to fit the model each time. Depending on the dataset, this can be very intensive to compute.

Problem 5

# Load ISLR; Default is used below without being defined in this file
# (presumably the Default data set shipped with ISLR).
library(ISLR)
# Fix the RNG seed so the random splits drawn by sample() later are reproducible.
set.seed(1)
# NOTE(review): every model in this file passes data = Default explicitly, so
# this attach() looks unnecessary — confirm before removing, because
# detach(Default) is called further down.
attach(Default)
# Logistic regression of default on income and balance using all rows of Default.
glm.fit1 <- glm(default ~ income + balance, family = binomial, data = Default)
summary(glm.fit1)

## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

# Validation-set approach: draw half of the row indices at random for training.
train <- sample(nrow(Default), nrow(Default) / 2)
# Fit the logistic regression on the training rows only.
glm.fit2 <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default,
  subset = train
)
summary(glm.fit2)

## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5830  -0.1428  -0.0573  -0.0213   3.3395  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.194e+01  6.178e-01 -19.333  < 2e-16 ***
## income       3.262e-05  7.024e-06   4.644 3.41e-06 ***
## balance      5.689e-03  3.158e-04  18.014  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1523.8  on 4999  degrees of freedom
## Residual deviance:  803.3  on 4997  degrees of freedom
## AIC: 809.3
## 
## Number of Fisher Scoring iterations: 8

# Validation-set error: predict default probabilities for the held-out rows
# (those not in `train`), classify as "Yes" when the probability exceeds 0.5,
# and report the share of misclassified observations.
glm.probs <- predict(glm.fit2, newdata = Default[-train, ], type = "response")
# Size the label vector from the predictions instead of a hard-coded 5000 so
# it stays correct if the split size changes.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
mean(glm.pred != Default[-train, ]$default)

## [1] 0.0254

# Repeat the validation-set approach with a fresh random half for training.
train <- sample(nrow(Default), nrow(Default) / 2)
# Refit the same logistic regression on the new training rows.
glm.fit2 <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default,
  subset = train
)
summary(glm.fit2)

## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5420  -0.1329  -0.0512  -0.0176   3.7909  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.193e+01  6.379e-01 -18.709  < 2e-16 ***
## income       1.939e-05  6.953e-06   2.789  0.00528 ** 
## balance      5.918e-03  3.355e-04  17.641  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1490.52  on 4999  degrees of freedom
## Residual deviance:  774.41  on 4997  degrees of freedom
## AIC: 780.41
## 
## Number of Fisher Scoring iterations: 8

# Validation-set error for this split: predicted probabilities on the held-out
# rows, thresholded at 0.5, compared with the observed default status.
glm.probs <- predict(glm.fit2, newdata = Default[-train, ], type = "response")
# Length follows the number of predictions rather than a hard-coded 5000.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
mean(glm.pred != Default[-train, ]$default)

## [1] 0.0274

# Another repetition: new random training half, same model specification.
train <- sample(nrow(Default), nrow(Default) / 2)
glm.fit2 <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default,
  subset = train
)
summary(glm.fit2)

## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1634  -0.1446  -0.0553  -0.0203   3.3281  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.158e+01  6.008e-01 -19.281  < 2e-16 ***
## income       1.975e-05  6.775e-06   2.916  0.00355 ** 
## balance      5.723e-03  3.180e-04  17.996  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1543.58  on 4999  degrees of freedom
## Residual deviance:  816.44  on 4997  degrees of freedom
## AIC: 822.44
## 
## Number of Fisher Scoring iterations: 8

# Validation-set error for this split: classify held-out rows as "Yes" when the
# predicted default probability exceeds 0.5 and compute the misclassification rate.
glm.probs <- predict(glm.fit2, newdata = Default[-train, ], type = "response")
# Derive the vector length from the predictions, not a literal 5000.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
mean(glm.pred != Default[-train, ]$default)

## [1] 0.0244

# Final repetition of the random split and fit on the training half.
train <- sample(nrow(Default), nrow(Default) / 2)
glm.fit2 <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default,
  subset = train
)
summary(glm.fit2)

## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4027  -0.1517  -0.0624  -0.0233   3.6833  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.112e+01  5.816e-01 -19.120   <2e-16 ***
## income       1.638e-05  6.755e-06   2.425   0.0153 *  
## balance      5.489e-03  3.067e-04  17.897   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1503.85  on 4999  degrees of freedom
## Residual deviance:  831.51  on 4997  degrees of freedom
## AIC: 837.51
## 
## Number of Fisher Scoring iterations: 8

# Validation-set error for this split: threshold the held-out predicted
# probabilities at 0.5 and report the fraction that disagree with the truth.
glm.probs <- predict(glm.fit2, newdata = Default[-train, ], type = "response")
# Use the number of predictions instead of assuming exactly 5000 held-out rows.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
mean(glm.pred != Default[-train, ]$default)

## [1] 0.0244

The error rate varies slightly across the random splits (between 0.0244 and 0.0274), but remains relatively consistent.

# Validation-set error for the model that adds the student dummy variable.
train <- sample(nrow(Default), nrow(Default) / 2)
fit.glm <- glm(
  default ~ income + balance + student,
  data = Default,
  family = "binomial",
  subset = train
)
# Predict first, then size the label vector from these predictions; previously
# the length was taken from glm.probs, a leftover of the earlier model.
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)

## [1] 0.0278

Including the student dummy variable does not appear to reduce the validation error rate (0.0278, compared with 0.0244 to 0.0274 without it).

Problem 6

# Fit the income + balance logistic regression on a random half of Default and
# print the coefficient table with its formula-based standard errors.
# NOTE(review): the boot() call that follows resamples all of Default, so these
# half-sample standard errors are not directly comparable to it — confirm
# whether a full-data fit was intended here.
train <- sample(nrow(Default), nrow(Default) / 2)
fit.glm <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default,
  subset = train
)
summary(fit.glm)

## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5668  -0.1419  -0.0571  -0.0206   3.6786  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.194e+01  6.378e-01 -18.728  < 2e-16 ***
## income       3.181e-05  7.210e-06   4.413 1.02e-05 ***
## balance      5.688e-03  3.233e-04  17.593  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1490.52  on 4999  degrees of freedom
## Residual deviance:  801.64  on 4997  degrees of freedom
## AIC: 807.64
## 
## Number of Fisher Scoring iterations: 8

# Bootstrap statistic for boot(): refit the logistic regression of default on
# income and balance using only the rows selected by `index`.
#   data  - data frame containing default, income and balance
#   index - row indices of the resample
# Returns the coefficient vector of the refitted model.
boot.fn <- function(data, index) {
  resample_fit <- glm(
    default ~ income + balance,
    family = "binomial",
    data = data,
    subset = index
  )
  coef(resample_fit)
}
# Bootstrap the logistic-regression coefficients: 1000 resamples of Default,
# each passed to boot.fn.
library(boot)
boot(data = Default, statistic = boot.fn, R = 1000)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01 -3.936329e-02 4.347974e-01
## t2*  2.080898e-05  1.632735e-07 4.847106e-06
## t3*  5.647103e-03  1.858909e-05 2.300532e-04

Problem 9

# Switch data sets: remove Default from the search path and load MASS.
# Boston is used below without being defined in this file (presumably the
# Boston data set shipped with MASS).
detach(Default)
library(MASS)
# No attach(Boston): every reference below is written as Boston$medv, so
# attaching would only add masking risk on the search path.
# Sample mean of medv as the estimate of the population mean.
mu.hat <- mean(Boston$medv)
mu.hat

## [1] 22.53281

# Standard error of the sample mean: sample standard deviation of medv divided
# by the square root of the number of rows in Boston.
se.hat <- sd(Boston$medv) / sqrt(nrow(Boston))
se.hat

## [1] 0.4088611

# Bootstrap statistic for boot(): mean of the resampled values.
#   data  - numeric vector
#   index - positions of the resample within `data`
# Returns the mean of data[index].
boot.fn <- function(data, index) {
  mean(data[index])
}
# Bootstrap the mean of medv with 1000 resamples.
boot(data = Boston$medv, statistic = boot.fn, R = 1000)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original   bias    std. error
## t1* 22.53281 0.015433   0.4148526

The bootstrap standard error (0.4149) is slightly higher than the formula-based estimate (0.4089), but the difference is minimal.

# One-sample t-test on medv; the 95 percent confidence interval for the mean
# appears in the printed output that follows.
t.test(Boston$medv)

## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

# Approximate 95% confidence interval for the mean of medv: estimate plus or
# minus two standard errors. 22.53 is the rounded sample mean and 0.4149 the
# rounded bootstrap standard error printed earlier in this report.
CI.mu.hat <- 22.53 + c(-2, 2) * 0.4149
CI.mu.hat

## [1] 21.7002 23.3598

# Sample median of medv as the estimate of the population median.
med.hat <- median(Boston$medv)
med.hat

## [1] 21.2

# Bootstrap statistic for boot(): median of the resampled values.
#   data  - numeric vector
#   index - positions of the resample within `data`
# Returns the median of data[index].
boot.fn <- function(data, index) {
  resampled <- data[index]
  median(resampled)
}
# Bootstrap the median of medv with 1000 resamples.
boot(data = Boston$medv, statistic = boot.fn, R = 1000)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original   bias    std. error
## t1*     21.2 -0.02445   0.3676614

# Sample 10th percentile of medv.
percent.hat <- quantile(Boston$medv, probs = 0.1)
percent.hat

##   10% 
## 12.75

# Bootstrap statistic for boot(): 10th percentile of the resampled values.
#   data  - numeric vector
#   index - positions of the resample within `data`
# Returns the 0.1 quantile of data[index] (a length-one vector named "10%").
boot.fn <- function(data, index) {
  resampled <- data[index]
  quantile(resampled, probs = 0.1)
}
# Bootstrap the 10th percentile of medv with 1000 resamples.
boot(data = Boston$medv, statistic = boot.fn, R = 1000)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75  0.0315     0.49625

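The bootstrap estimate of the 10th percentile, 12.75, is equal to the value from step (g). Its standard error (about 0.496) is slightly higher than those obtained for the mean and the median, but still relatively small.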

Untitled

Charles Ponthieux

2022-10-23