Introduction

This analysis applies k-nearest neighbors (KNN), an algorithm that classifies an observation according to the classes of the k training observations closest to it in predictor space. The data come from FuelEconomy.csv, and the predictors are city miles per gallon (city), number of cylinders (cylinders), engine displacement in liters (displ), and model year (year). In particular, the analysis examines how accurately a vehicle's fuel type is classified with k = 1 and k = 5.

# Preview the first six rows and the type of each column.
head(FuelEconomy)
## # A tibble: 6 x 11
##    city cylinders displ drive   fuelType highway make   model trans VClass  year
##   <dbl>     <dbl> <dbl> <chr>   <chr>      <dbl> <chr>  <chr> <chr> <chr>  <dbl>
## 1    19         4   2   Rear-W~ Regular       25 Alfa ~ Spid~ Manu~ Two S~  1985
## 2     9        12   4.9 Rear-W~ Regular       14 Ferra~ Test~ Manu~ Two S~  1985
## 3    23         4   2.2 Front-~ Regular       33 Dodge  Char~ Manu~ Subco~  1985
## 4    10         8   5.2 Rear-W~ Regular       12 Dodge  B150~ Auto~ Vans    1985
## 5    17         4   2.2 4-Whee~ Premium       23 Subaru Lega~ Manu~ Compa~  1993
## 6    21         4   1.8 Front-~ Regular       24 Subaru Loya~ Auto~ Compa~  1993
# Number of rows and columns in the data set.
dim(FuelEconomy)
## [1] 38692    11
# Frequency of each fuel type, the response that the classifier predicts.
table(FuelEconomy$fuelType)
## 
##                         CNG                      Diesel 
##                          60                         977 
##             Gasoline or E85     Gasoline or natural gas 
##                        1287                          20 
##         Gasoline or propane                    Midgrade 
##                           8                         100 
##                     Premium     Premium and Electricity 
##                       11055                          47 
##  Premium Gas or Electricity              Premium or E85 
##                          28                         125 
##                     Regular Regular Gas and Electricity 
##                       24953                          29 
##  Regular Gas or Electricity 
##                           3
# Per-column summary: quartiles and mean for the numeric columns; length,
# class and mode for the character columns.
summary(FuelEconomy)
##       city         cylinders          displ         drive          
##  Min.   : 6.00   Min.   : 2.000   Min.   :0.60   Length:38692      
##  1st Qu.:15.00   1st Qu.: 4.000   1st Qu.:2.20   Class :character  
##  Median :17.00   Median : 6.000   Median :3.00   Mode  :character  
##  Mean   :17.81   Mean   : 5.745   Mean   :3.32                     
##  3rd Qu.:20.00   3rd Qu.: 6.000   3rd Qu.:4.30                     
##  Max.   :58.00   Max.   :16.000   Max.   :8.40                     
##    fuelType            highway          make              model          
##  Length:38692       Min.   : 9.00   Length:38692       Length:38692      
##  Class :character   1st Qu.:20.00   Class :character   Class :character  
##  Mode  :character   Median :24.00   Mode  :character   Mode  :character  
##                     Mean   :24.02                                        
##                     3rd Qu.:27.00                                        
##                     Max.   :61.00                                        
##     trans              VClass               year     
##  Length:38692       Length:38692       Min.   :1984  
##  Class :character   Class :character   1st Qu.:1992  
##  Mode  :character   Mode  :character   Median :2002  
##                                        Mean   :2002  
##                                        3rd Qu.:2011  
##                                        Max.   :2019
# Draw one boxplot per predictor, each split by fuel type, to see how the
# predictors differ across the classes.
for (predictor in c("city", "cylinders", "displ", "year")) {
  # Builds the formula `<predictor> ~ fuelType` from the column name.
  boxplot(reformulate("fuelType", response = predictor), data = FuelEconomy)
}

# n is the number of observations and nt the number of rows assigned to the
# training set; the remaining n - nt rows are held out for testing.
n <- nrow(FuelEconomy)
nt <- 36192

# Fix the RNG seed so the random training sample is reproducible.
set.seed(1)
train <- sample(seq_len(n), size = nt)

# Center and scale the four numeric predictors so that none of them dominates
# the distance computation because of its units.
x <- scale(FuelEconomy[, c("city", "cylinders", "displ", "year")])
x[1:3, ]
##            city  cylinders      displ      year
## [1,]  0.2432124 -0.9928793 -0.9711933 -1.553615
## [2,] -1.8084528  3.5597809  1.1627078 -1.553615
## [3,]  1.0638784 -0.9928793 -0.8240277 -1.553615
# Sanity check on the standardization. Both calls pool every entry of the
# matrix, so they summarize the four scaled columns together rather than
# column by column.
mean(x)
## [1] -1.846242e-15
sd(x)
## [1] 0.9999903
library(class)
# Split the standardized predictors and the class labels once, then reuse the
# pieces for both classifiers and both plots.
train_x <- x[train, ]
test_x <- x[-train, ]
train_labels <- FuelEconomy$fuelType[train]

# Predict the fuel type of every held-out row with k = 1 and k = 5.
nearest1 <- knn(train = train_x, test = test_x, cl = train_labels, k = 1)
nearest5 <- knn(train = train_x, test = test_x, cl = train_labels, k = 5)

# Training rows are drawn as open points colored by their true class; test
# rows are overlaid as filled circles colored by their predicted class.
plot(train_x, col = train_labels, cex = 0.8, main = "1-nearest neighbor")
points(test_x, bg = nearest1, pch = 21, col = grey(0.9), cex = 1.25)

plot(train_x, col = train_labels, cex = 0.8, main = "5-nearest neighbors")
points(test_x, bg = nearest5, pch = 21, col = grey(0.9), cex = 1.25)

# Percentage of held-out rows whose predicted fuel type equals the true one.
pct_correct <- function(predicted) {
  100 * sum(FuelEconomy$fuelType[-train] == predicted) / (n - nt)
}

pcorrn1 <- pct_correct(nearest1)
pcorrn5 <- pct_correct(nearest5)
pcorrn1
## [1] 84.4
pcorrn5
## [1] 82.52
# Press's Q tests whether classification accuracy is better than chance:
#   Q = (N - n_correct * K)^2 / (N * (K - 1))
# where N is the number of classified cases, n_correct the number classified
# correctly, and K the number of classes.
press_q <- function(n_obs, n_correct, n_groups) {
  (n_obs - n_correct * n_groups)^2 / (n_obs * (n_groups - 1))
}

# Only the held-out rows were classified, so N is the test-set size (n - nt),
# not the size of the full data set.
n_test <- n - nt
# The prediction factor carries one level per fuel-type class.
n_classes <- nlevels(nearest1)

numCorrn1 <- sum(FuelEconomy$fuelType[-train] == nearest1)
PressQ1 <- press_q(n_test, numCorrn1, n_classes)
PressQ1
## [1] 20716.83
# Critical value: Q is compared with a chi-square distribution on 1 degree
# of freedom.
qchisq(.95, 1)
## [1] 3.841459
numCorrn5 <- sum(FuelEconomy$fuelType[-train] == nearest5)
PressQ5 <- press_q(n_test, numCorrn5, n_classes)
PressQ5
## [1] 19713.79
# Leave-one-out cross-validated accuracy (in percent) for k = 1, ..., 10,
# computed on the full standardized data set.
k_values <- 1:10
# Preallocate the result vector instead of growing it inside the loop.
pcorr <- numeric(length(k_values))
for (k in k_values) {
  pred <- knn.cv(x, FuelEconomy$fuelType, k = k)
  pcorr[k] <- 100 * sum(FuelEconomy$fuelType == pred) / n
}
pcorr
##  [1] 83.55474 82.73545 82.41497 82.09707 82.01695 81.85413 81.73783 81.64478
##  [9] 81.62669 81.56725
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# True fuel types of the held-out rows, used as the reference for k = 5.
test_truth <- FuelEconomy$fuelType[-train]

# Keep the true and predicted labels side by side for inspection.
near5 <- data.frame(truetype = test_truth, predtype = nearest5)

# Cross-tabulate predictions against the truth and report accuracy, kappa
# and the per-class statistics.
confusionMatrix(data = nearest5, reference = test_truth)
## Confusion Matrix and Statistics
## 
##                              Reference
## Prediction                     CNG Diesel Gasoline or E85
##   CNG                            0      0               0
##   Diesel                         0     42               0
##   Gasoline or E85                1      0              24
##   Gasoline or natural gas        0      0               0
##   Gasoline or propane            0      0               0
##   Midgrade                       0      0               0
##   Premium                        1      4               7
##   Premium and Electricity        0      0               0
##   Premium Gas or Electricity     0      0               0
##   Premium or E85                 0      0               0
##   Regular                        4      7              48
##   Regular Gas and Electricity    0      0               0
##   Regular Gas or Electricity     0      0               0
##                              Reference
## Prediction                    Gasoline or natural gas Gasoline or propane
##   CNG                                               0                   0
##   Diesel                                            0                   0
##   Gasoline or E85                                   0                   0
##   Gasoline or natural gas                           0                   0
##   Gasoline or propane                               0                   0
##   Midgrade                                          0                   0
##   Premium                                           0                   1
##   Premium and Electricity                           0                   0
##   Premium Gas or Electricity                        0                   0
##   Premium or E85                                    0                   0
##   Regular                                           0                   0
##   Regular Gas and Electricity                       0                   0
##   Regular Gas or Electricity                        0                   0
##                              Reference
## Prediction                    Midgrade Premium Premium and Electricity
##   CNG                                0       0                       0
##   Diesel                             0       0                       0
##   Gasoline or E85                    0       7                       0
##   Gasoline or natural gas            0       1                       0
##   Gasoline or propane                0       0                       0
##   Midgrade                           5       1                       0
##   Premium                            0     505                       2
##   Premium and Electricity            0       0                       0
##   Premium Gas or Electricity         0       0                       0
##   Premium or E85                     0       1                       0
##   Regular                            1     188                       3
##   Regular Gas and Electricity        0       0                       0
##   Regular Gas or Electricity         0       0                       0
##                              Reference
## Prediction                    Premium Gas or Electricity Premium or E85 Regular
##   CNG                                                  0              0       0
##   Diesel                                               0              0       5
##   Gasoline or E85                                      0              0      13
##   Gasoline or natural gas                              0              0       0
##   Gasoline or propane                                  0              0       0
##   Midgrade                                             0              0       1
##   Premium                                              0              4     134
##   Premium and Electricity                              0              0       0
##   Premium Gas or Electricity                           0              0       0
##   Premium or E85                                       0              2       0
##   Regular                                              0              2    1485
##   Regular Gas and Electricity                          0              0       0
##   Regular Gas or Electricity                           0              0       0
##                              Reference
## Prediction                    Regular Gas and Electricity
##   CNG                                                   0
##   Diesel                                                0
##   Gasoline or E85                                       0
##   Gasoline or natural gas                               0
##   Gasoline or propane                                   0
##   Midgrade                                              0
##   Premium                                               1
##   Premium and Electricity                               0
##   Premium Gas or Electricity                            0
##   Premium or E85                                        0
##   Regular                                               0
##   Regular Gas and Electricity                           0
##   Regular Gas or Electricity                            0
##                              Reference
## Prediction                    Regular Gas or Electricity
##   CNG                                                  0
##   Diesel                                               0
##   Gasoline or E85                                      0
##   Gasoline or natural gas                              0
##   Gasoline or propane                                  0
##   Midgrade                                             0
##   Premium                                              0
##   Premium and Electricity                              0
##   Premium Gas or Electricity                           0
##   Premium or E85                                       0
##   Regular                                              0
##   Regular Gas and Electricity                          0
##   Regular Gas or Electricity                           0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8252          
##                  95% CI : (0.8097, 0.8399)
##     No Information Rate : 0.6552          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6276          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: CNG Class: Diesel Class: Gasoline or E85
## Sensitivity              0.0000        0.7925                 0.3038
## Specificity              1.0000        0.9980                 0.9913
## Pos Pred Value              NaN        0.8936                 0.5333
## Neg Pred Value           0.9976        0.9955                 0.9776
## Prevalence               0.0024        0.0212                 0.0316
## Detection Rate           0.0000        0.0168                 0.0096
## Detection Prevalence     0.0000        0.0188                 0.0180
## Balanced Accuracy        0.5000        0.8952                 0.6476
##                      Class: Gasoline or natural gas Class: Gasoline or propane
## Sensitivity                                      NA                     0.0000
## Specificity                                  0.9996                     1.0000
## Pos Pred Value                                   NA                        NaN
## Neg Pred Value                                   NA                     0.9996
## Prevalence                                   0.0000                     0.0004
## Detection Rate                               0.0000                     0.0000
## Detection Prevalence                         0.0004                     0.0000
## Balanced Accuracy                                NA                     0.5000
##                      Class: Midgrade Class: Premium
## Sensitivity                   0.8333         0.7183
## Specificity                   0.9992         0.9143
## Pos Pred Value                0.7143         0.7663
## Neg Pred Value                0.9996         0.8924
## Prevalence                    0.0024         0.2812
## Detection Rate                0.0020         0.2020
## Detection Prevalence          0.0028         0.2636
## Balanced Accuracy             0.9163         0.8163
##                      Class: Premium and Electricity
## Sensitivity                                   0.000
## Specificity                                   1.000
## Pos Pred Value                                  NaN
## Neg Pred Value                                0.998
## Prevalence                                    0.002
## Detection Rate                                0.000
## Detection Prevalence                          0.000
## Balanced Accuracy                             0.500
##                      Class: Premium Gas or Electricity Class: Premium or E85
## Sensitivity                                         NA                0.2500
## Specificity                                          1                0.9996
## Pos Pred Value                                      NA                0.6667
## Neg Pred Value                                      NA                0.9976
## Prevalence                                           0                0.0032
## Detection Rate                                       0                0.0008
## Detection Prevalence                                 0                0.0012
## Balanced Accuracy                                   NA                0.6248
##                      Class: Regular Class: Regular Gas and Electricity
## Sensitivity                  0.9066                             0.0000
## Specificity                  0.7065                             1.0000
## Pos Pred Value               0.8544                                NaN
## Neg Pred Value               0.7992                             0.9996
## Prevalence                   0.6552                             0.0004
## Detection Rate               0.5940                             0.0000
## Detection Prevalence         0.6952                             0.0000
## Balanced Accuracy            0.8065                             0.5000
##                      Class: Regular Gas or Electricity
## Sensitivity                                         NA
## Specificity                                          1
## Pos Pred Value                                      NA
## Neg Pred Value                                      NA
## Prevalence                                           0
## Detection Rate                                       0
## Detection Prevalence                                 0
## Balanced Accuracy                                   NA

Conclusion

The k-nearest neighbor method correctly classified the fuel type of 84.4% of the held-out vehicles when k=1 and 82.52% when k=5. In addition, cross-validation for k from 1 to 10 gave accuracies ranging from 81.56725% (k=10) to 83.55474% (k=1). For k=5, the kappa value was 0.6276, which falls within the range commonly considered substantial agreement. Finally, the accuracy reported with the confusion matrix was 82.52%, which matches the k=5 result and shows that the fuel type of a vehicle is predicted correctly about 83% of the time.