Part I – Using the Auto data from the ISLR package. We start by creating a binary variable, “mpg01”, that contains a 1 if “mpg” contains a value above its median, and a 0 otherwise (that is, if “mpg” is at or below its median).

# Load ISLR, the package that supplies the Auto data set used throughout.
library(ISLR)
# Put Auto's columns on the search path: the later chunks refer to mpg, year,
# weight, cylinders, displacement, horsepower and origin by bare name.
# NOTE(review): attach() is generally discouraged; it is kept because the rest
# of this script depends on those bare names.
attach(Auto)
# Print per-column summary statistics for the data set.
summary(Auto)

##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365

# Binary response: 1 when a car's mpg is strictly above the median mpg,
# 0 otherwise (at or below the median). Columns are read from Auto explicitly
# so the result does not depend on what the bare name `mpg` resolves to.
mpg01 <- as.numeric(Auto$mpg > median(Auto$mpg))

# Append the new response to the data frame. Appending `mpg` here would only
# duplicate an existing column (data.frame() renames the copy to mpg.1) and
# leave mpg01 out of Auto entirely.
# NOTE: the recorded summary output that follows was captured before this fix
# and therefore still lists the duplicate column as mpg.1.
Auto <- data.frame(Auto, mpg01)
summary(Auto)

##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365  
##      mpg.1      
##  Min.   : 9.00  
##  1st Qu.:17.00  
##  Median :22.75  
##  Mean   :23.45  
##  3rd Qu.:29.00  
##  Max.   :46.60  
##

# Recode the attached origin codes as a factor. The result is a new top-level
# variable that takes precedence over the attached column from here on.
origin <- as.factor(origin)

#Part II – Using the ggplot library we explore the data graphically to investigate the association between “mpg01” and the other variables.

We may conclude that there exists some association between “mpg01” and “cylinders”, “weight”, “displacement” and “horsepower”.

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'Auto':
## 
##     mpg

# Scatterplot matrix across all columns of Auto.
pairs(Auto)

# Store the factor-coded origin back into the data frame.
Auto$origin <- origin

# Binary response plotted against vehicle weight.
plot(x = weight, y = mpg01)

# horsepower against mpg, with points coloured by displacement.
r <- ggplot(
  data = Auto,
  mapping = aes(x = mpg, y = horsepower, color = displacement)
)
r + geom_point()

# Part III – converting data into 2 sets - Training and Test.

# Rows with an even model year form the training set.
train <- year %% 2 == 0
sum(train)

## [1] 210

# Every remaining (odd-year) row forms the test set.
test <- !train
sum(test)

## [1] 182

# Row subsets of Auto, plus the true mpg01 labels for the test rows.
Auto.train <- Auto[train, ]
Auto.test <- Auto[test, ]
mpg01.test <- mpg01[test]

Part IV – Performing linear discriminant analysis (LDA) on the training data in order to predict “mpg01” using the variables that seemed most associated with “mpg01”, and then computing the test error of the model. We may conclude that we have a test error rate of 0.1264 (about 12.6%).

library(MASS)

# Fit LDA on the training rows only (selected through `subset`).
lda.fit <- lda(
  mpg01 ~ cylinders + weight + displacement + horsepower,
  data = Auto,
  subset = train
)

# Classify the held-out rows and report the share that is misclassified.
lda.pred <- predict(lda.fit, newdata = Auto.test)
mean(lda.pred$class != mpg01.test)

## [1] 0.1263736

Part V – Performing quadratic discriminant analysis (QDA) on the training data in order to predict “mpg01” using the variables that seemed most associated with “mpg01”, and then computing the test error of the model. We may conclude that we have a test error rate of 0.1319 (about 13.2%).

# Fit QDA on the training rows only (selected through `subset`).
qda.fit <- qda(
  mpg01 ~ cylinders + weight + displacement + horsepower,
  data = Auto,
  subset = train
)

# Classify the held-out rows and report the share that is misclassified.
qda.pred <- predict(qda.fit, newdata = Auto.test)
mean(qda.pred$class != mpg01.test)

## [1] 0.1318681

Part VI – Making predictions on “mpg01” using logistic regression. With “origin” included as a predictor the test error rate is 0.1264 (about 12.6%); dropping “origin” gives a test error rate of 0.1209 (about 12.1%).

# Logistic regression on the training rows, with origin as an extra predictor.
glm.fit <- glm(
  mpg01 ~ cylinders + weight + displacement + horsepower + origin,
  data = Auto,
  family = binomial,
  subset = train
)
summary(glm.fit)

## 
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower + 
##     origin, family = binomial, data = Auto, subset = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.82382  -0.02962   0.09044   0.33179   2.24987  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  16.647811   3.277619   5.079 3.79e-07 ***
## cylinders    -0.816197   0.676146  -1.207   0.2274    
## weight       -0.002128   0.001192  -1.785   0.0742 .  
## displacement -0.009215   0.019329  -0.477   0.6335    
## horsepower   -0.052293   0.025113  -2.082   0.0373 *  
## origin2      -1.076005   0.903971  -1.190   0.2339    
## origin3       0.445298   1.070818   0.416   0.6775    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 289.577  on 209  degrees of freedom
## Residual deviance:  80.524  on 203  degrees of freedom
## AIC: 94.524
## 
## Number of Fisher Scoring iterations: 7

# Predicted probabilities for the test rows, on the response scale.
glm.probs <- predict(glm.fit, newdata = Auto.test, type = "response")

# Start from all zeros and flag the rows whose probability exceeds 0.5.
glm.pred <- numeric(length(glm.probs))
glm.pred[which(glm.probs > 0.5)] <- 1

# Share of test rows that are misclassified.
mean(glm.pred != mpg01.test)

## [1] 0.1263736

# Refit the logistic regression on the training rows without origin;
# this overwrites the previous glm.fit.
glm.fit <- glm(
  mpg01 ~ cylinders + weight + displacement + horsepower,
  data = Auto,
  family = binomial,
  subset = train
)
summary(glm.fit)

## 
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower, 
##     family = binomial, data = Auto, subset = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.48027  -0.03413   0.10583   0.29634   2.57584  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  17.658730   3.409012   5.180 2.22e-07 ***
## cylinders    -1.028032   0.653607  -1.573   0.1158    
## weight       -0.002922   0.001137  -2.569   0.0102 *  
## displacement  0.002462   0.015030   0.164   0.8699    
## horsepower   -0.050611   0.025209  -2.008   0.0447 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 289.58  on 209  degrees of freedom
## Residual deviance:  83.24  on 205  degrees of freedom
## AIC: 93.24
## 
## Number of Fisher Scoring iterations: 7

# Predicted probabilities for the test rows from the current glm.fit.
glm.probs <- predict(glm.fit, newdata = Auto.test, type = "response")

# Start from all zeros and flag the rows whose probability exceeds 0.5.
glm.pred <- numeric(length(glm.probs))
glm.pred[which(glm.probs > 0.5)] <- 1

# Share of test rows that are misclassified.
mean(glm.pred != mpg01.test)

## [1] 0.1208791

Part VII – Using K-nearest neighbors to predict “mpg01”. We may conclude that we have a test error rate of about 0.132 (13.2%) for K = 30. So, a K value of 30–40 seems to perform the best.

library(class)

# Predictor matrices for the training and test rows, plus the training labels.
# Columns are taken from Auto directly rather than from attached bare names.
# NOTE(review): the predictors are on very different scales (weight runs into
# the thousands, cylinders from 3 to 8), so the distance is dominated by
# weight; consider standardising with scale() before calling knn().
knn.vars <- c("cylinders", "weight", "displacement", "horsepower")
train.X <- as.matrix(Auto[train, knn.vars])
test.X <- as.matrix(Auto[test, knn.vars])
train.mpg01 <- mpg01[train]

# Grid of K values to try: 1, 4, 7, ... (50 values in steps of 3).
StartOfK <- 1
IncrementsOfK <- 3
LengthOfK <- floor(150 / IncrementsOfK)
KK <- seq(from = StartOfK, by = IncrementsOfK, length.out = LengthOfK)

# Test error rate of a K-nearest-neighbour classifier.
#
# TrainingData    Matrix of training predictors.
# TestData        Matrix of test predictors.
# TrainingOutcome True classes of the training rows.
# ValueOfK        Number of neighbours to use.
# TestOutcome     True classes of the test rows; defaults to the script-level
#                 mpg01.test so that existing four-argument calls still work.
#
# Returns the fraction of test rows whose predicted class differs from
# TestOutcome.
KNNFUNCTION <- function(TrainingData, TestData, TrainingOutcome, ValueOfK,
                        TestOutcome = mpg01.test) {
  knn.pred <- knn(
    TrainingData, TestData, TrainingOutcome,
    k = ValueOfK, prob = FALSE
  )
  mean(knn.pred != TestOutcome)
}

# knn() breaks ties at random, so fix the seed to make the sweep reproducible.
set.seed(1)

# One test error rate per candidate K, in the same order as KK.
MeanTestError <- vapply(
  KK,
  function(k) KNNFUNCTION(train.X, test.X, train.mpg01, k, mpg01.test),
  numeric(1)
)
summary(MeanTestError)

# The output below was recorded from an earlier, unseeded run; the exact
# figures can shift slightly with random tie-breaking.
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1374  0.1429  0.1484  0.1520  0.1593  0.1868

# Test error as a function of K.
plot(KK, MeanTestError, type = "l")