K-fold cross-validation randomly splits the observations into k groups (folds) of approximately equal size. Each fold in turn is held out as the test set while the model is fit on the remaining k - 1 folds. After running the analysis on each of these groups, the test error is estimated by the mean of the k resulting MSEs.
Disadvantage:
Advantages:
1. Not computationally intensive, therefore it is quicker.
2. We only repeat the analysis k times (e.g. k = 5 or k = 10), instead of once for every observation in the entire data set.
Disadvantage:
1. We use the entire data set, therefore it is computationally intensive.
Advantages:
Has less bias, since almost the entire data set is being used.
Produces a less variable MSE.
library(ISLR)
# Fix the random seed so the sampling done further down is reproducible.
set.seed(1)
# Logistic regression of default on income and balance, using every row.
glm.fit <- glm(
  default ~ income + balance,
  family = binomial,
  data = Default
)
print(glm.fit)
##
## Call: glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## (Intercept) income balance
## -1.154e+01 2.081e-05 5.647e-03
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
## Null Deviance: 2921
## Residual Deviance: 1579 AIC: 1585
# Validation-set approach: draw half of the row indices of Default at random
# to act as the training set.
train <- sample(nrow(Default), 0.5 * nrow(Default))
# Refit the same logistic regression on the training rows only.
glm.fit <- glm(
  default ~ income + balance,
  family = binomial,
  data = Default,
  subset = train
)
print(glm.fit)
##
## Call: glm(formula = default ~ income + balance, family = binomial,
## data = Default, subset = train)
##
## Coefficients:
## (Intercept) income balance
## -1.208e+01 1.858e-05 6.053e-03
##
## Degrees of Freedom: 4999 Total (i.e. Null); 4997 Residual
## Null Deviance: 1457
## Residual Deviance: 734.4 AIC: 740.4
# Posterior probability of default for the rows that were NOT used for fitting.
glm.probs <- predict(glm.fit, newdata = Default[-train, ], type = "response")
# Classify with a 0.5 threshold. The labels must be spelled exactly like the
# levels of Default$default ("No"/"Yes"); with any other spelling every
# comparison below is a mismatch and the error rate comes out as 1.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
# Validation-set error: share of held-out observations that are misclassified.
error <- mean(glm.pred != Default[-train, ]$default)
# Report the computed error (in percent) instead of a hand-typed number.
print(paste("our error is", round(100 * error, 2)))
# Reset the seed before the next random split.
set.seed(1)
# Full-data logistic regression, kept as the reference fit.
glm.fit <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default
)
print(glm.fit)
##
## Call: glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## (Intercept) income balance
## -1.154e+01 2.081e-05 5.647e-03
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
## Null Deviance: 2921
## Residual Deviance: 1579 AIC: 1585
# Random training set made of 75% of the rows of Default.
train <- sample(nrow(Default), 0.75 * nrow(Default))
# Logistic regression fitted on the training rows only.
glm.fit.train <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default,
  subset = train
)
print(glm.fit.train)
##
## Call: glm(formula = default ~ income + balance, family = "binomial",
## data = Default, subset = train)
##
## Coefficients:
## (Intercept) income balance
## -1.167e+01 2.115e-05 5.738e-03
##
## Degrees of Freedom: 7499 Total (i.e. Null); 7497 Residual
## Null Deviance: 2226
## Residual Deviance: 1178 AIC: 1184
# Predict on the held-out rows. The hold-out data has to be passed as
# `newdata`; predict() has no `data` argument, so `data = ...` is silently
# ignored and the fitted values for the training rows are returned instead.
glm.probs <- predict(glm.fit.train, newdata = Default[-train, ], type = "response")
# Labels must match the levels of Default$default ("No"/"Yes"), otherwise
# every prediction counts as an error.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
# Validation-set error for the 75% / 25% split.
error1 <- mean(glm.pred != Default[-train, ]$default)
# Reset the seed ahead of the following split.
set.seed(1)
# Reference logistic regression on all rows of Default.
glm.fit <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default
)
print(glm.fit)
##
## Call: glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## (Intercept) income balance
## -1.154e+01 2.081e-05 5.647e-03
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
## Null Deviance: 2921
## Residual Deviance: 1579 AIC: 1585
# Random training set made of 25% of the rows of Default.
train <- sample(nrow(Default), 0.25 * nrow(Default))
# Logistic regression fitted on the training rows only.
glm.fit.train <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default,
  subset = train
)
print(glm.fit.train)
##
## Call: glm(formula = default ~ income + balance, family = "binomial",
## data = Default, subset = train)
##
## Coefficients:
## (Intercept) income balance
## -1.233e+01 2.330e-05 6.233e-03
##
## Degrees of Freedom: 2499 Total (i.e. Null); 2497 Residual
## Null Deviance: 735.2
## Residual Deviance: 367 AIC: 373
# Predict on the held-out rows. The hold-out data has to be passed as
# `newdata`; predict() has no `data` argument, so `data = ...` is silently
# ignored and the fitted values for the training rows are returned instead.
glm.probs <- predict(glm.fit.train, newdata = Default[-train, ], type = "response")
# Labels must match the levels of Default$default ("No"/"Yes"), otherwise
# every prediction counts as an error.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
# Validation-set error for the 25% / 75% split.
error2 <- mean(glm.pred != Default[-train, ]$default)
# Reset the seed once more before the last of the repeated splits.
set.seed(1)
# Reference logistic regression on all rows of Default.
glm.fit <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default
)
print(glm.fit)
##
## Call: glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## (Intercept) income balance
## -1.154e+01 2.081e-05 5.647e-03
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
## Null Deviance: 2921
## Residual Deviance: 1579 AIC: 1585
# Random training set made of 80% of the rows of Default.
train <- sample(nrow(Default), 0.80 * nrow(Default))
# Logistic regression fitted on the training rows only.
glm.fit.train <- glm(
  default ~ income + balance,
  family = "binomial",
  data = Default,
  subset = train
)
print(glm.fit.train)
##
## Call: glm(formula = default ~ income + balance, family = "binomial",
## data = Default, subset = train)
##
## Coefficients:
## (Intercept) income balance
## -1.155e+01 1.859e-05 5.708e-03
##
## Degrees of Freedom: 7999 Total (i.e. Null); 7997 Residual
## Null Deviance: 2354
## Residual Deviance: 1255 AIC: 1261
# Predict on the held-out rows. The hold-out data has to be passed as
# `newdata`; predict() has no `data` argument, so `data = ...` is silently
# ignored and the fitted values for the training rows are returned instead.
glm.probs <- predict(glm.fit.train, newdata = Default[-train, ], type = "response")
# Labels must match the levels of Default$default ("No"/"Yes"), otherwise
# every prediction counts as an error.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
# Validation-set error for the 80% / 20% split.
error3 <- mean(glm.pred != Default[-train, ]$default)
# New 80% training sample (drawn without resetting the seed).
train <- sample(nrow(Default), 0.80 * nrow(Default))
# Same model as before with the student indicator added as a predictor.
glm.fit <- glm(
  default ~ income + balance + student,
  family = "binomial",
  data = Default,
  subset = train
)
# Probabilities for the held-out rows, then a 0.5 cut-off to get class labels.
glm.prob <- predict(glm.fit, newdata = Default[-train, ], type = "response")
glm.pred <- rep("No", length(glm.prob))
glm.pred[glm.prob > 0.5] <- "Yes"
# Share of held-out observations that are misclassified.
mean(glm.pred != Default[-train, ]$default)
## [1] 0.036
library(ISLR)
# Show the structure of Default: its dimensions plus each column's type and
# first few values.
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
# Seed the RNG so the bootstrap run later in this section is reproducible.
set.seed(1)
# Logistic regression on all rows; summary() reports the coefficient
# estimates together with their standard errors.
glm.fit <- glm(
  default ~ income + balance,
  family = binomial,
  data = Default
)
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# Bootstrap statistic: coefficients of the logistic regression of default on
# income and balance, fitted to the rows of `data` selected by `index`.
#
# data  - data frame containing the columns default, income and balance.
# index - row indices of the (re)sample, as supplied by boot().
#
# Returns the named coefficient vector of the fitted model.
#
# The model is fitted on the `data` argument rather than on the global
# `Default`, so the function works for whatever data set is handed to boot().
boot.fn <- function(data, index) {
  resampled.fit <- glm(
    default ~ income + balance,
    data = data[index, ],
    family = binomial
  )
  coef(resampled.fit)
}
library(boot)
# Bootstrap the regression coefficients with 100 resamples of Default.
boot(data = Default, statistic = boot.fn, R = 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 9.699111e-02 4.101121e-01
## t2* 2.080898e-05 6.715005e-08 4.127740e-06
## t3* 5.647103e-03 -5.733883e-05 2.105660e-04
The coefficient standard errors reported by the glm summary and those estimated by the bootstrap are approximately the same.
library(MASS)
# Per-column summary statistics for the Boston data set.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Sample mean of medv, used as the point estimate of the population mean.
mu <- mean(Boston[["medv"]])
print(mu)
## [1] 22.53281
# Standard error of the sample mean: sample standard deviation divided by the
# square root of the number of observations.
error <- sd(Boston$medv) / sqrt(nrow(Boston))
print(error)
## [1] 0.4088611
library(boot)
# Bootstrap statistic: mean of the elements of `data` selected by `index`.
#
# data  - numeric vector.
# index - positions of the (re)sample, as supplied by boot().
boot.fn <- function(data, index) {
  resample <- data[index]
  mean(resample)
}
# Bootstrap the mean of medv with 1000 resamples and show the result.
bootstrap <- boot(data = Boston$medv, statistic = boot.fn, R = 1000)
print(bootstrap)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.005122332 0.4117619
# One-sample t-test on medv; its printed result includes a 95 percent
# confidence interval for the mean, for comparison with the bootstrap interval.
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
# Approximate 95% confidence interval for the mean of medv: the observed mean
# plus/minus two bootstrap standard errors. Both values are read from the
# `bootstrap` object (t0 = statistic on the original data, t = bootstrap
# replicates) instead of being retyped from printed output, so the interval
# always agrees with the bootstrap run it is based on.
confidence.interval <- bootstrap$t0 + c(-2, 2) * sd(bootstrap$t[, 1])
confidence.interval
## [1] 21.7092 23.3568
# Sample median of medv, the point estimate of the population median.
medi <- median(Boston[["medv"]])
print(medi)
## [1] 21.2
# Bootstrap statistic: median of the elements of `data` selected by `index`.
#
# data  - numeric vector.
# index - positions of the (re)sample, as supplied by boot().
boot.fn <- function(data, index) {
  resample <- data[index]
  median(resample)
}
# Bootstrap the median of medv with 100 resamples.
boot(data = Boston$medv, statistic = boot.fn, R = 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0395 0.3988174
# Tenth percentile of medv.
Tenth <- quantile(Boston$medv, probs = 0.1)
print(Tenth)
## 10%
## 12.75
# Bootstrap statistic: tenth percentile of the elements of `data` selected by
# `index`.
#
# data  - numeric vector.
# index - positions of the (re)sample, as supplied by boot().
#
# Returns the 10% quantile of the resample (a length-one numeric named "10%").
#
# The earlier version wrote `mu<quantile(...)`, which is a comparison and not
# an assignment; it therefore returned whatever global `mu` existed, which is
# why the previously recorded bootstrap run shows the mean 22.53281 with zero
# bias and zero standard error.
boot.fn <- function(data, index) {
  quantile(data[index], probs = 0.1)
}
# Bootstrap the boot.fn statistic on medv with 10000 resamples.
boot(data = Boston$medv, statistic = boot.fn, R = 10000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 10000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0 0