# logistic regression model
glm.fit = glm(default ~ income + balance, data = Default, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# split sample set into a training set and a validation set
set.seed(1)
train = sample(10000, 5000)
# fit a multiple logistic regression model using training observations
glm.fit_train = glm(default ~ income + balance, data =
Default, family = "binomial",
subset = train)
# validation set
pred_data = Default[-train,]
# prediction
probs= predict(glm.fit_train, pred_data, type =
"response")
# replicate the word "No" 9500 times
pred = rep("No",5000)
# if the probability is more than 0.5 then set to "Yes
pred[probs > 0.5] = "Yes"
# validation set error
err = mean(pred!=pred_data$default)
err
## [1] 0.0254
# split sample set into a training set and a validation set
train1 = sample(10000, 7500)
# fit a multiple logistic regression model using training observations
glm.fit_train1 = glm(default ~ income + balance, data =
Default, subset = train1, family
= "binomial")
# validation set
pred_data1 = Default[-train1,]
# prediction
probs1= predict(glm.fit_train1, pred_data1, type = "response")
# replicate the word "No" 2500 times
pred1 = rep("No", 2500)
# if the probability is more than 0.5 then set to "Yes
pred1[probs1 > 0.5] = "Yes"
# validation set error
err1 = mean(pred1!=pred_data1$default)
err1
## [1] 0.0256
# split sample set into a training set and a validation set
train2 = sample(10000, 3500)
# fit a multiple logistic regression model using training observations
glm.fit_train2 = glm(default ~ income + balance, data =
Default, subset = train2, family
= "binomial")
# validation set
pred_data2 = Default[-train2,]
# prediction
probs2= predict(glm.fit_train2, pred_data2, type =
"response")
# replicate the word "No" 6500 times
pred2 = rep("No", 6500)
# if the probability is more than 0.5 then set to "Yes
pred2[probs2 > 0.5] = "Yes"
# validation set error
err2 = mean(pred2!=pred_data2$default)
err2
## [1] 0.02553846
# split sample set into a training set and a validation set
train3 = sample(10000, 1000)
# fit a multiple logistic regression model using training observations
glm.fit_train3 = glm(default ~ income + balance, data = Default, subset = train3,
family = "binomial")
# validation set
pred_data3 = Default[-train3,]
# prediction
probs3= predict(glm.fit_train3, pred_data3, type =
"response")
# replicate the word "No" 9000 times
pred3 = rep("No", 9000)
# if the probability is more than 0.5 then set to "Yes
pred3[probs3 > 0.5] = "Yes"
# validation set error
err3 = mean(pred3!=pred_data3$default)
err3
## [1] 0.02722222
When the data was split in 7500 training and 2500 test observations, the error rate was 0.0256. When the data was split in 5000 training and 5000 test observations, the error rate was 0.0254. When the data was split in 3500 training and 6500 test observations, the error rate was 0.0256. When the data was split in 1000 training and 9000 test observations, the error rate was 0.0272. In conclusion, the error rate varies depending on number of observations in the test set and the training set.
# split sample set into a training set and a validation set
train_stud = sample(10000, 5000)
# fit a multiple logistic regression model using training observations
glm.fit_train_stud = glm(default ~ income + balance + student,
data = Default, subset = train_stud, family = "binomial")
# validation set
pred_data_stud = Default[-train_stud,]
# prediction
probs_stud= predict(glm.fit_train_stud, pred_data_stud, type = "response")
# replicate the word "No" 2500 times
pred_stud = rep("No", 5000)
# if the probability is more than 0.5 then set to "Yes
pred_stud[probs_stud > 0.5] = "Yes"
# validation set error
err_student = mean(pred_stud!=pred_data_stud$default)
err_student
## [1] 0.0282
When the data was split in 5000 training and 5000 testing observations, using the income and balance predictors produced an error rate of 0.0254. In the same split, using the income and balance predictors and the dummy variable student produced an error rate of 0.0282. The error rate did not reduce but increased instead.
library(car)
## Loading required package: carData
# logistic regression model
glm.fit = glm(default ~ income + balance, data = Default, family = "binomial")
# standard errors for the coefficients
compareCoefs(glm.fit, digits = 4)
## Calls:
## glm(formula = default ~ income + balance, family = "binomial", data =
## Default)
##
## Model 1
## (Intercept) -11.5405
## SE 0.4348
##
## income 2.081e-05
## SE 4.985e-06
##
## balance 0.0056471
## SE 0.0002274
##
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
set.seed(1)
boot.fn <- function(data, index)
{
# input the Default data set and an index of the observations
coef_fit<- glm(default ~ income + balance, data =
data, family =
"binomial", subset = index)
return(coef(coef_fit))
}
# the coefficient estimates for income and balance
coef_est = boot(Default, boot.fn, 1000)
coef_est$t0
## (Intercept) income balance
## -1.154047e+01 2.080898e-05 5.647103e-03
std_err = boot(Default,boot.fn, 1000)
std_err
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -2.914609e-02 4.492830e-01
## t2* 2.080898e-05 1.259753e-07 4.988746e-06
## t3* 5.647103e-03 1.268814e-05 2.347490e-04
the estimated standard errors obtained using the glm() function and using the bootstrap function are relatively same.
detach(Default)
mu_hat = mean(medv)
mu_hat
## [1] 22.53281
stderr = sd(medv)/sqrt(506)
stderr
## [1] 0.4088611
library(boot)
set.seed(1)
boot.fn <- function(data, index)
{
mu <- mean(data[index])
return (mu)
}
boot_stderr = boot(medv, boot.fn, 1000)
boot_stderr
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
The standard error 0.4088611 from using the formula and the standard error 0.4106622 from the bootstrap function are relatively the same.
# confidence interval using boot.ci function
confid_boot = boot.ci(boot_stderr, conf = 0.95, type = "norm")
confid_boot
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_stderr, conf = 0.95, type = "norm")
##
## Intervals :
## Level Normal
## 95% (21.72, 23.33 )
## Calculations and Intervals on Original Scale
# confidence interval using the formula [mu_hat - 2SE(mu_hat), mu_hat + 2SE(mu_hat)]
confid_form = c(22.53281 - 2*0.4106622, 22.53281 + 2*0.4106622)
confid_form
## [1] 21.71149 23.35413
# confidence interval using t-test
confid_t_test= t.test(medv)
confid_t_test
##
## One Sample t-test
##
## data: medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
The 95% confidence interval from the formula (21.71149, 23.35413) and the t-test (21.72953, 23.33608) are very close. ### Estimate, med, for the Median Value of medv
mu_med = median(medv)
mu_med
## [1] 21.2
boot.fn <- function(data, index) {
mu <- median(data[index])
return (mu)
}
mu_med_stderr = boot(medv, boot.fn, 1000)
mu_med_stderr
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0386 0.3770241
mu_0.1 <- quantile(medv, c(0.1))
mu_0.1
## 10%
## 12.75
boot.fn <- function(data, index) {
mu <- quantile(data[index], c(0.1))
return (mu)
}
mu_0.1_stderr = boot(medv, boot.fn, 1000)
mu_0.1_stderr
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0186 0.4925766
The standard error of \(\hat{\mu}\) is 0.378725 and \({{\hat{\mu}}_{0.1}}\) is 0.4934503 are vastly different.The standard error increased when it was estimated with tenth percentile of medv in Boston suburbs.
detach(Boston)