This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.2
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.3
mydefault <- Default
# Fitting logictic regression model on default data
logr_default <- glm(default ~ income + balance, data = mydefault, family = "binomial")
summary(logr_default)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = mydefault)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(452)
sample_def = sample.split(mydefault$default, SplitRatio = 0.70)
default_train<-subset(mydefault,sample_def==TRUE)
default_val<-subset(mydefault,sample_def==FALSE)
# Fitting logictic regression model on default training data
lrdef_train <- glm(default ~ income + balance, data = default_train, family = "binomial")
def_probs=predict(lrdef_train,default_val,type="response")
def_pred=rep("No",length(default_val$default))
def_pred[def_probs > .5] = "Yes"
table(def_pred,default_val$default)
##
## def_pred No Yes
## No 2889 69
## Yes 11 31
mean(def_pred!=default_val$default) * 100
## [1] 2.666667
Validation Misclassification Rate::((11+69)/3000)=2.66%
Try 1
#Split Try 1
set.seed(297)
sample_def = sample.split(mydefault$default, SplitRatio = 0.60)
default_train<-subset(mydefault,sample_def==TRUE)
default_val<-subset(mydefault,sample_def==FALSE)
#Fit Try 1
# Fitting logictic regression model on default training data
lrdef_train <- glm(default ~ income + balance, data = default_train, family = "binomial")
#Prediction Try 1
def_probs=predict(lrdef_train,default_val,type="response")
def_pred=rep("No",length(default_val$default))
def_pred[def_probs > .5] = "Yes"
table(def_pred,default_val$default)
##
## def_pred No Yes
## No 3854 85
## Yes 13 48
#Misclassification rate try 1
mean(def_pred!=default_val$default) * 100
## [1] 2.45
Validation Misclassification Rate(Try 1)::((13+85)/4000)=2.45%
Try 2
#Split Try 1
set.seed(254)
sample_def = sample.split(mydefault$default, SplitRatio = 0.50)
default_train<-subset(mydefault,sample_def==TRUE)
default_val<-subset(mydefault,sample_def==FALSE)
#Fit Try 1
# Fitting logictic regression model on default training data
lrdef_train <- glm(default ~ income + balance, data = default_train, family = "binomial")
#Prediction Try 1
def_probs=predict(lrdef_train,default_val,type="response")
def_pred=rep("No",length(default_val$default))
def_pred[def_probs > .5] = "Yes"
table(def_pred,default_val$default)
##
## def_pred No Yes
## No 4817 113
## Yes 16 54
#Misclassification rate try 1
mean(def_pred!=default_val$default) * 100
## [1] 2.58
Validation Misclassification Rate(Try 2)::((16+113)/5000)=2.58%
Try 3
#Split Try 1
set.seed(999)
sample_def = sample.split(mydefault$default, SplitRatio = 0.75)
default_train<-subset(mydefault,sample_def==TRUE)
default_val<-subset(mydefault,sample_def==FALSE)
#Fit Try 1
# Fitting logictic regression model on default training data
lrdef_train <- glm(default ~ income + balance, data = default_train, family = "binomial")
#Prediction Try 1
def_probs=predict(lrdef_train,default_val,type="response")
def_pred=rep("No",length(default_val$default))
def_pred[def_probs > .5] = "Yes"
table(def_pred,default_val$default)
##
## def_pred No Yes
## No 2406 60
## Yes 11 23
#Misclassification rate try 1
mean(def_pred!=default_val$default) * 100
## [1] 2.84
Validation Misclassification Rate(Try 3)::((11+60)/2500)=2.84%
#including Student as a predictor
set.seed(297)
sample_def = sample.split(mydefault$default, SplitRatio = 0.60)
default_train<-subset(mydefault,sample_def==TRUE)
default_val<-subset(mydefault,sample_def==FALSE)
#Fit Try 1
# Fitting logictic regression model on default training data
lrdef_train <- glm(default ~ income + balance + student, data = default_train, family = "binomial")
#Prediction Try 1
def_probs=predict(lrdef_train,default_val,type="response")
def_pred=rep("No",length(default_val$default))
def_pred[def_probs > .5] = "Yes"
table(def_pred,default_val$default)
##
## def_pred No Yes
## No 3855 87
## Yes 12 46
#Misclassification rate try 1
mean(def_pred!=default_val$default) * 100
## [1] 2.475
set.seed(986)
# Fitting logictic regression model on default training data
lrdef_std <- glm(default ~ income + balance, data = mydefault, family = "binomial")
summary(lrdef_std)$coefficients[2:3,]
## Estimate Std. Error z value Pr(>|z|)
## income 2.080898e-05 4.985167e-06 4.174178 2.990638e-05
## balance 5.647103e-03 2.273731e-04 24.836280 3.638120e-136
boot.fn <- function(data, index){
lr_fn = glm(default ~ income + balance, data = data, subset = index,family = "binomial")
return(summary(lr_fn)$coefficients[2:3,2])
}
library(boot)
## Warning: package 'boot' was built under R version 4.3.3
set.seed(986)
boot(mydefault,boot.fn,R=100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = mydefault, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 4.985167e-06 1.076749e-08 1.282837e-07
## t2* 2.273731e-04 8.210079e-07 1.016713e-05
The standard Errors from the bootstrap are lower compared to the standard error estimates obtained from the formulas.As the bootstrap approach does not rely on any of the assumptions that standard formulas take into account. Bootstap estimates for SE(β^income) and SE(β^balance) are likely giving a more accurate estimate of the standard errors that of the summary() function.
set.seed(1)
data("Weekly")
model_1 <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial")
summary(model_1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
model_2 <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1, ], family = "binomial")
summary(model_2)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly[-1,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22324 0.06150 3.630 0.000283 ***
## Lag1 -0.03843 0.02622 -1.466 0.142683
## Lag2 0.06085 0.02656 2.291 0.021971 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1494.6 on 1087 degrees of freedom
## Residual deviance: 1486.5 on 1085 degrees of freedom
## AIC: 1492.5
##
## Number of Fisher Scoring iterations: 4
predict.glm(model_2, Weekly[1, ], type = "response") > 0.5
## 1
## TRUE
We can infer that the prediction for the first observation is “Up,” but it was misclassified since the true direction is “Down.”
error <- rep(0, dim(Weekly)[1])
for (i in 1:dim(Weekly)[1]) {
model_3 <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = "binomial")
pred.up <- predict.glm(model_3, Weekly[i, ], type = "response") > 0.5
true.up <- Weekly[i, ]$Direction == "Up"
if (pred.up != true.up)
error[i] <- 1
}
error
## [1] 1 1 0 1 0 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 0 0 1 1 1 1 1 1 0 1 1 1 1 0 1 0 0
## [38] 0 1 0 1 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1 1 0 0
## [75] 0 1 0 1 1 0 0 1 1 0 1 1 0 0 1 0 0 1 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1
## [112] 1 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 1 0 1 0 1 1 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0
## [149] 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 1 1 0 1 0 1 0 0 0 0 0 0
## [186] 0 0 1 1 0 1 0 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1
## [223] 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 1 1 0 1 0 0 1 0 0 1 0
## [260] 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0
## [297] 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0
## [334] 1 1 1 1 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 1 0 1
## [371] 1 1 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1
## [408] 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 1
## [445] 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0
## [482] 0 1 1 1 0 1 0 0 0 1 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 0
## [519] 1 1 0 0 1 1 0 0 0 1 1 0 1 0 1 1 1 1 1 0 0 0 1 0 0 0 1 1 0 1 0 0 0 1 1 1 1
## [556] 1 1 0 1 0 1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 0 1 0 1
## [593] 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 1 0 1 1 0 1 1 0 1 0 1
## [630] 0 1 1 1 1 0 1 1 0 0 0 1 1 1 1 0 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0 1 0 0 1 0 0
## [667] 0 1 1 0 1 0 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 1
## [704] 0 0 0 0 1 0 1 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1 0 1 1 0
## [741] 1 1 1 1 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1
## [778] 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1
## [815] 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1 0 1 0 1 1 0 0 1
## [852] 1 1 0 1 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 1 1 0 1 0 1
## [889] 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 0 0 0 0 1 0 0 1
## [926] 0 0 0 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 1 0 1 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1
## [963] 0 0 0 1 1 1 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 0 1 1 1 0 1 0 0 0
## [1000] 0 1 0 0 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 0 1 1 0 0 0 1 0
## [1037] 1 0 1 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 1 0 0
## [1074] 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0
mean(error)
## [1] 0.4499541
myboston <- Boston
mu_est <- mean(myboston$medv)
mu_est
## [1] 22.53281
A)μ^=22.5328
se_mu = sd(myboston$medv)/sqrt(length(myboston$medv))
se_mu
## [1] 0.4088611
#boot strap function
set.seed(123)
boot_mu_fn <- function(data,index)
return(mean(data[index]))
boot_result<- boot(myboston$medv,boot_mu_fn, R = 1000)
boot_result
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = myboston$medv, statistic = boot_mu_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.01607372 0.4045557
Hint: You can approximate a 95 % confidence interval using the formula [ μ^− 2∗SE(μ^), μ^+ 2∗SE(μ^)].
#confidence intervals using the SE from bootstrap is:
lower_bd <- mu_est - (2*0.4045557) #mu hat - 2 * SE(from bootstrap)
upper_db <- mu_est + (2*0.4045557) #mu hat + 2 * SE(from bootstrap)
lower_bd
## [1] 21.72369
upper_db
## [1] 23.34192
Lower bound is 21.72369 Upper bound is 23.34192
using One Sampe t-Test
t.test(myboston$medv)
##
## One Sample t-test
##
## data: myboston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
median(myboston$medv)
## [1] 21.2
#boot strap function
set.seed(123)
boot_med_fn <- function(data,index)
return(median(data[index]))
boot(myboston$medv,boot_med_fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = myboston$medv, statistic = boot_med_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0203 0.3676453
SE(μ^median) using the bootstrap is 0.37(rounded to 2 decimals). This is small compared to the SE(μ^mean). this indicates that the mediam estimate for the population is mostly accurate.
#boot strap function
quantile(myboston$medv,0.1)
## 10%
## 12.75
μ^0.1=12.75
set.seed(1)
boot_10q_fn <- function(data,index)
return(quantile(data[index],0.1))
boot(myboston$medv,boot_10q_fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = myboston$medv, statistic = boot_10q_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0339 0.4767526
Bootstrap μ^0.1=0.48 this is relatively small considering the 10th percentile of 12.75.