# Load ISLR; the Auto data frame used throughout is expected to come from it.
library(ISLR)
# attach() places copies of Auto's columns on the search path so later code can
# refer to them by bare name (mpg, year, weight, ...). They are copies made at
# attach time: later changes to the Auto data frame are not reflected in them.
attach(Auto)
# Per-column summary of the data (console output pasted below).
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
# Binary response: 1 when a car's mpg is above the sample median, 0 otherwise.
mpg01 = as.numeric(mpg > median(mpg))
# Append the indicator to the data frame. Appending `mpg` here instead would
# only duplicate the existing column (as `mpg.1`) and leave `mpg01` out of Auto.
# NOTE: the pasted summary output below predates this and still shows `mpg.1`
# where `mpg01` now appears.
Auto = data.frame(Auto, mpg01)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
## mpg.1
## Min. : 9.00
## 1st Qu.:17.00
## Median :22.75
## Mean :23.45
## 3rd Qu.:29.00
## Max. :46.60
##
# Treat origin as categorical. `origin` on the right-hand side is the attached
# column; the result is a new global `origin` that shadows it. The column inside
# the Auto data frame is unchanged until the factor is copied back further down.
origin = as.factor(origin)
# Part II – Explore the data graphically (base graphics and ggplot2) to investigate the association between mpg01 and the other variables.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'Auto':
##
## mpg
# Scatterplot matrix of every column currently in Auto.
pairs(Auto)
# Store the factor-coded origin back into the data frame.
Auto$origin <- origin
# Binary response against vehicle weight.
plot(weight, mpg01)
# mpg against horsepower, with points coloured by displacement.
r <- ggplot(Auto, aes(mpg, horsepower, colour = displacement))
r + geom_point()
# Part III – Split the data into two sets: training and test.
# Rows with an even model year form the training set; the rest form the test set.
train <- (year %% 2) == 0
sum(train)
## [1] 210
test <- !train
sum(test)
## [1] 182
# Row subsets of the data frame, plus the held-out responses used to score
# each classifier.
Auto.train <- Auto[train, ]
Auto.test <- Auto[test, ]
mpg01.test <- mpg01[test]
library(MASS)
# Linear discriminant analysis fitted on the training rows only.
lda.fit <- lda(
  mpg01 ~ cylinders + weight + displacement + horsepower,
  data = Auto,
  subset = train
)
# Classify the test rows and report the proportion misclassified.
lda.pred <- predict(lda.fit, Auto.test)
mean(lda.pred$class != mpg01.test)
## [1] 0.1263736
# Quadratic discriminant analysis with the same predictors and training rows.
qda.fit <- qda(
  mpg01 ~ cylinders + weight + displacement + horsepower,
  data = Auto,
  subset = train
)
# Classify the test rows and report the proportion misclassified.
qda.pred <- predict(qda.fit, Auto.test)
mean(qda.pred$class != mpg01.test)
## [1] 0.1318681
# Logistic regression on the training rows, with origin included as a predictor.
glm.fit <- glm(
  mpg01 ~ cylinders + weight + displacement + horsepower + origin,
  data = Auto,
  family = binomial,
  subset = train
)
summary(glm.fit)
##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower +
## origin, family = binomial, data = Auto, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.82382 -0.02962 0.09044 0.33179 2.24987
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 16.647811 3.277619 5.079 3.79e-07 ***
## cylinders -0.816197 0.676146 -1.207 0.2274
## weight -0.002128 0.001192 -1.785 0.0742 .
## displacement -0.009215 0.019329 -0.477 0.6335
## horsepower -0.052293 0.025113 -2.082 0.0373 *
## origin2 -1.076005 0.903971 -1.190 0.2339
## origin3 0.445298 1.070818 0.416 0.6775
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.577 on 209 degrees of freedom
## Residual deviance: 80.524 on 203 degrees of freedom
## AIC: 94.524
##
## Number of Fisher Scoring iterations: 7
# Predicted probabilities for the test rows from the current glm.fit.
glm.probs <- predict(glm.fit, newdata = Auto.test, type = "response")
# Classify as 1 when the predicted probability exceeds 0.5, otherwise 0.
glm.pred <- numeric(length(glm.probs))
glm.pred[glm.probs > 0.5] <- 1
# Proportion of test rows misclassified.
mean(glm.pred != mpg01.test)
## [1] 0.1263736
# Refit the logistic regression without origin; this overwrites glm.fit.
glm.fit <- glm(
  mpg01 ~ cylinders + weight + displacement + horsepower,
  data = Auto,
  family = binomial,
  subset = train
)
summary(glm.fit)
##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower,
## family = binomial, data = Auto, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.48027 -0.03413 0.10583 0.29634 2.57584
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 17.658730 3.409012 5.180 2.22e-07 ***
## cylinders -1.028032 0.653607 -1.573 0.1158
## weight -0.002922 0.001137 -2.569 0.0102 *
## displacement 0.002462 0.015030 0.164 0.8699
## horsepower -0.050611 0.025209 -2.008 0.0447 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.58 on 209 degrees of freedom
## Residual deviance: 83.24 on 205 degrees of freedom
## AIC: 93.24
##
## Number of Fisher Scoring iterations: 7
# Predicted probabilities for the test rows from the refitted glm.fit.
glm.probs <- predict(glm.fit, newdata = Auto.test, type = "response")
# Classify as 1 when the predicted probability exceeds 0.5, otherwise 0.
glm.pred <- numeric(length(glm.probs))
glm.pred[glm.probs > 0.5] <- 1
# Proportion of test rows misclassified.
mean(glm.pred != mpg01.test)
## [1] 0.1208791
library(class)
# Predictor matrices for KNN, built from the attached columns and split by the
# train/test row masks.
# NOTE(review): the predictors are on very different scales (weight is in the
# thousands, cylinders is 3-8), so weight will dominate the Euclidean distance
# used by knn(); consider scale() before fitting.
train.X = cbind(cylinders, weight, displacement, horsepower)[train, ]
test.X = cbind(cylinders, weight, displacement, horsepower)[test, ]
train.mpg01 = mpg01[train]
# Grid of K values to try: StartOfK, StartOfK + IncrementsOfK, ...
# (LengthOfK values in total).
StartOfK = 1
IncrementsOfK = 3
LengthOfK = floor(150 / IncrementsOfK)
KK = seq(from = StartOfK, by = IncrementsOfK, length.out = LengthOfK)
# One test-error slot per K. Filled with NA so that an unfilled slot cannot be
# mistaken for a real error rate.
MeanTestError = rep(NA_real_, LengthOfK)
# Fit k-nearest-neighbours and return the test misclassification rate.
#
# Args:
#   TrainingData:    training predictors, passed to knn() as `train`.
#   TestData:        test predictors, passed to knn() as `test`.
#   TrainingOutcome: class labels for the training rows, passed to knn() as `cl`.
#   ValueOfK:        number of neighbours, passed to knn() as `k`.
#   TestOutcome:     true classes of the test rows. Defaults to the global
#                    `mpg01.test` (looked up when the function is called), so
#                    existing four-argument calls behave as before.
#
# Returns:
#   Proportion of test rows whose predicted class differs from TestOutcome.
KNNFUNCTION = function(TrainingData, TestData, TrainingOutcome, ValueOfK,
                       TestOutcome = mpg01.test) {
  knn.pred = knn(TrainingData, TestData, TrainingOutcome,
                 k = ValueOfK, prob = FALSE)
  # `!=` would silently recycle a shorter vector and return a meaningless
  # error rate, so fail loudly on a length mismatch instead.
  if (length(TestOutcome) != length(knn.pred)) {
    stop("TestOutcome must have one entry per row of TestData", call. = FALSE)
  }
  mean(knn.pred != TestOutcome)
}
# knn() breaks ties at random (they can occur for the even K values in the
# grid), so fix the seed to make the sweep reproducible.
# NOTE: the pasted summary below was produced without a seed and may differ
# slightly from a fresh run.
set.seed(1)
# Test misclassification rate for every K in the grid, in the same order as KK.
MeanTestError = vapply(
  KK,
  function(k) KNNFUNCTION(train.X, test.X, train.mpg01, k),
  numeric(1)
)
summary(MeanTestError)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1374 0.1429 0.1484 0.1520 0.1593 0.1868
# Test error as a function of K.
plot(KK, MeanTestError, type = "l")