K - Nearest Neighbours

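This is a KNN (k-nearest neighbours) regression model for the Computer Hardware dataset (machine.data), which predicts PRP from the six numeric hardware attributes. The dataset is described at https://archive.ics.uci.edu/ml/datasets/Computer+Hardware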

CODE

IMPORT DATASET

# Load the Computer Hardware data from the UCI repository. The file is read
# without a header row, so the column names are supplied explicitly;
# read.table() (check.names = TRUE by default) turns the two names that
# contain a space into vendor.name / Model.Name.
hurl <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data")
dataset <- read.table(
  hurl,
  header = FALSE, # spelled out: `F` is an ordinary variable that can be reassigned
  sep = ",",
  col.names = c(
    "vendor name", "Model Name", "MYCT", "MMIN", "MMAX",
    "CACH", "CHMIN", "CHMAX", "PRP", "ERP"
  )
)
# Preview the first six rows.
head(dataset)
##   vendor.name Model.Name MYCT MMIN  MMAX CACH CHMIN CHMAX PRP ERP
## 1     adviser      32/60  125  256  6000  256    16   128 198 199
## 2      amdahl     470v/7   29 8000 32000   32     8    32 269 253
## 3      amdahl    470v/7a   29 8000 32000   32     8    32 220 253
## 4      amdahl    470v/7b   29 8000 32000   32     8    32 172 253
## 5      amdahl    470v/7c   29 8000 16000   32     8    16 132 132
## 6      amdahl     470v/b   26 8000 32000   64     8    32 318 290

FEATURE SELECTION

# Keep only columns 3 to 9 (MYCT ... PRP): the two identifier columns at the
# front and the trailing ERP column are dropped.
dataset <- dataset[, 3:9]
# Preview the reduced data frame.
head(dataset)
##   MYCT MMIN  MMAX CACH CHMIN CHMAX PRP
## 1  125  256  6000  256    16   128 198
## 2   29 8000 32000   32     8    32 269
## 3   29 8000 32000   32     8    32 220
## 4   29 8000 32000   32     8    32 172
## 5   29 8000 16000   32     8    16 132
## 6   26 8000 32000   64     8    32 318
# Inspect the column types and leading values of the reduced data frame.
str(dataset)
## 'data.frame':    209 obs. of  7 variables:
##  $ MYCT : int  125 29 29 29 29 26 23 23 23 23 ...
##  $ MMIN : int  256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
##  $ MMAX : int  6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
##  $ CACH : int  256 32 32 32 32 64 64 64 64 128 ...
##  $ CHMIN: int  16 8 8 8 8 8 16 16 16 32 ...
##  $ CHMAX: int  128 32 32 32 16 32 32 32 32 64 ...
##  $ PRP  : int  198 269 220 172 132 318 367 489 636 1144 ...
# Per-column summary statistics (min, quartiles, mean, max).
summary(dataset)
##       MYCT             MMIN            MMAX            CACH       
##  Min.   :  17.0   Min.   :   64   Min.   :   64   Min.   :  0.00  
##  1st Qu.:  50.0   1st Qu.:  768   1st Qu.: 4000   1st Qu.:  0.00  
##  Median : 110.0   Median : 2000   Median : 8000   Median :  8.00  
##  Mean   : 203.8   Mean   : 2868   Mean   :11796   Mean   : 25.21  
##  3rd Qu.: 225.0   3rd Qu.: 4000   3rd Qu.:16000   3rd Qu.: 32.00  
##  Max.   :1500.0   Max.   :32000   Max.   :64000   Max.   :256.00  
##      CHMIN            CHMAX             PRP        
##  Min.   : 0.000   Min.   :  0.00   Min.   :   6.0  
##  1st Qu.: 1.000   1st Qu.:  5.00   1st Qu.:  27.0  
##  Median : 2.000   Median :  8.00   Median :  50.0  
##  Mean   : 4.699   Mean   : 18.27   Mean   : 105.6  
##  3rd Qu.: 6.000   3rd Qu.: 24.00   3rd Qu.: 113.0  
##  Max.   :52.000   Max.   :176.00   Max.   :1150.0

SPLITTING

library(caTools)
# Reproducible 75/25 train/test partition driven by the PRP column.
set.seed(23)
split <- sample.split(dataset$PRP, SplitRatio = 0.75)
# `split` is a logical mask: TRUE rows go to training, FALSE rows to testing.
training_set <- subset(dataset, split)
test_set <- subset(dataset, !split)
# Dimensions (rows, columns) of the training partition.
dim(training_set)
## [1] 156   7
# Dimensions (rows, columns) of the held-out test partition.
dim(test_set)
## [1] 53  7

MODEL

library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
## Loading required package: lattice
# Fit a k-nearest-neighbours regression of PRP on all remaining columns with
# caret; the seed is fixed so the resampling can be reproduced.
# NOTE(review): no preProcess is requested, so the predictors enter the
# distance computation on their raw scales (MMAX reaches 64000, CHMIN only
# 52) -- consider preProcess = c("center", "scale") and re-running the report.
set.seed(1)
model <- train(
  PRP ~ .,
  data = training_set,
  method = "knn" # trailing comma removed: it passed a stray empty argument
)
# Print the resampling results and the selected k.
model
## k-Nearest Neighbors 
## 
## 156 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 156, 156, 156, 156, 156, 156, ... 
## Resampling results across tuning parameters:
## 
##   k  RMSE       Rsquared   MAE     
##   5   96.42339  0.6318845  43.69282
##   7   99.28829  0.6056066  44.31492
##   9  100.97754  0.5917683  45.79206
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
# Plot the fitted caret train object, titled "model".
plot(model , main = "model")

PREDICTION

# Predict PRP for the test rows; column 7 (PRP, the response) is dropped so
# only the six predictor columns are handed to the model.
y_pred <- predict(model, newdata = test_set[, -7])
# Print the predictions.
y_pred
##  [1] 143.11111 344.22222 344.22222 701.00000 701.00000 152.40000  22.60000
##  [8]  21.60000  35.60000  22.40000  34.50000 174.00000  30.00000  58.40000
## [15]  20.00000  52.20000  30.00000  28.60000  28.60000  19.80000  31.60000
## [22]  22.40000  40.66667  60.20000 152.40000  88.60000  20.80000  35.00000
## [29]  34.16667  20.80000  16.80000  75.14286  60.20000 113.00000  59.33333
## [36]  26.20000 121.60000 261.42857 344.22222  30.00000 102.80000  57.83333
## [43] 147.88889  57.60000  20.60000  22.40000  31.60000 261.42857 220.66667
## [50] 226.11111 241.71429  26.20000  26.20000
# Plot the predicted values, titled "prediction"; only one vector is
# supplied, so the values are drawn against their position in y_pred.
plot(y_pred , main="prediction")

Analysis

# Root-mean-squared error of the predictions on the test set.
rmse <- sqrt(mean((y_pred - test_set$PRP)^2))
rmse
## [1] 83.48779
# Squared Pearson correlation between observed and predicted PRP. This is
# the correlation-based R^2, not 1 - SSE/SST, so it does not penalise
# systematic bias in the predictions.
R2 <- cor(x = test_set$PRP, y = y_pred)^2
R2
## [1] 0.8784368