This is a k-nearest neighbors (KNN) regression workflow for the Computer Hardware (machine.data) dataset from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Computer+Hardware. The goal is to predict published relative performance (PRP) from the six numeric hardware attributes.
hurl <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data")
dataset = read.table(hurl, header = FALSE, sep = ",",
                     col.names = c("vendor name", "Model Name", "MYCT", "MMIN", "MMAX",
                                   "CACH", "CHMIN", "CHMAX", "PRP", "ERP"))
head(dataset)
## vendor.name Model.Name MYCT MMIN MMAX CACH CHMIN CHMAX PRP ERP
## 1 adviser 32/60 125 256 6000 256 16 128 198 199
## 2 amdahl 470v/7 29 8000 32000 32 8 32 269 253
## 3 amdahl 470v/7a 29 8000 32000 32 8 32 220 253
## 4 amdahl 470v/7b 29 8000 32000 32 8 32 172 253
## 5 amdahl 470v/7c 29 8000 16000 32 8 16 132 132
## 6 amdahl 470v/b 26 8000 32000 64 8 32 318 290
dataset = dataset[3:9]  # keep the six numeric predictors and the target PRP (drop vendor, model, and ERP)
head(dataset)
## MYCT MMIN MMAX CACH CHMIN CHMAX PRP
## 1 125 256 6000 256 16 128 198
## 2 29 8000 32000 32 8 32 269
## 3 29 8000 32000 32 8 32 220
## 4 29 8000 32000 32 8 32 172
## 5 29 8000 16000 32 8 16 132
## 6 26 8000 32000 64 8 32 318
str(dataset)
## 'data.frame': 209 obs. of 7 variables:
## $ MYCT : int 125 29 29 29 29 26 23 23 23 23 ...
## $ MMIN : int 256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
## $ MMAX : int 6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
## $ CACH : int 256 32 32 32 32 64 64 64 64 128 ...
## $ CHMIN: int 16 8 8 8 8 8 16 16 16 32 ...
## $ CHMAX: int 128 32 32 32 16 32 32 32 32 64 ...
## $ PRP : int 198 269 220 172 132 318 367 489 636 1144 ...
summary(dataset)
## MYCT MMIN MMAX CACH
## Min. : 17.0 Min. : 64 Min. : 64 Min. : 0.00
## 1st Qu.: 50.0 1st Qu.: 768 1st Qu.: 4000 1st Qu.: 0.00
## Median : 110.0 Median : 2000 Median : 8000 Median : 8.00
## Mean : 203.8 Mean : 2868 Mean :11796 Mean : 25.21
## 3rd Qu.: 225.0 3rd Qu.: 4000 3rd Qu.:16000 3rd Qu.: 32.00
## Max. :1500.0 Max. :32000 Max. :64000 Max. :256.00
## CHMIN CHMAX PRP
## Min. : 0.000 Min. : 0.00 Min. : 6.0
## 1st Qu.: 1.000 1st Qu.: 5.00 1st Qu.: 27.0
## Median : 2.000 Median : 8.00 Median : 50.0
## Mean : 4.699 Mean : 18.27 Mean : 105.6
## 3rd Qu.: 6.000 3rd Qu.: 24.00 3rd Qu.: 113.0
## Max. :52.000 Max. :176.00 Max. :1150.0
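Before modelling, note that the predictors sit on very different scales (MYCT tops out at 1500 while MMAX reaches 64000). KNN is distance-based, so the largest-scale features dominate the distance unless the data are standardized. A quick check of the per-column ranges (a small sketch, not part of the original run):
# Per-column min/max of the six predictors; the wide spread suggests
# centering/scaling before KNN may help (see the preProcess sketch later).
sapply(dataset[-7], range)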
### Splitting the data into training and test sets
library(caTools)
set.seed(23)
split = sample.split(dataset$PRP, SplitRatio = 0.75)  # roughly 75% of rows for training, 25% for testing
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
dim(training_set)
## [1] 156 7
dim(test_set)
## [1] 53 7
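As an optional sanity check (a sketch, not from the original run), the outcome distribution in the two subsets can be compared to confirm the split did not concentrate the large PRP values in one set:
# Compare the distribution of the target in the training and test sets
summary(training_set$PRP)
summary(test_set$PRP)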
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
## Loading required package: lattice
set.seed(1)
model <- train(
  PRP ~ .,              # predict PRP from all six remaining predictors
  data = training_set,
  method = 'knn'
)
model
## k-Nearest Neighbors
##
## 156 samples
## 6 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 156, 156, 156, 156, 156, 156, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 96.42339 0.6318845 43.69282
## 7 99.28829 0.6056066 44.31492
## 9 100.97754 0.5917683 45.79206
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
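The printout above shows that caret applied no pre-processing and only tried its default grid of k = 5, 7, 9. A natural variation is to standardize the predictors and search a wider grid with cross-validation. The sketch below uses the same caret::train() interface; the object name model_scaled, the 10-fold cross-validation setting, and the odd-k grid from 3 to 21 are illustrative choices, not values from the original run:
set.seed(1)
model_scaled <- train(
  PRP ~ .,
  data = training_set,
  method = "knn",
  preProcess = c("center", "scale"),               # put all predictors on a comparable scale
  tuneGrid = expand.grid(k = seq(3, 21, by = 2)),  # try odd k values from 3 to 21
  trControl = trainControl(method = "cv", number = 10)
)
model_scaled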
plot(model, main = "KNN tuning: RMSE vs. number of neighbors")
y_pred = predict(model, newdata = test_set[-7])  # predict PRP for the test set (column 7 is the outcome)
y_pred
## [1] 143.11111 344.22222 344.22222 701.00000 701.00000 152.40000 22.60000
## [8] 21.60000 35.60000 22.40000 34.50000 174.00000 30.00000 58.40000
## [15] 20.00000 52.20000 30.00000 28.60000 28.60000 19.80000 31.60000
## [22] 22.40000 40.66667 60.20000 152.40000 88.60000 20.80000 35.00000
## [29] 34.16667 20.80000 16.80000 75.14286 60.20000 113.00000 59.33333
## [36] 26.20000 121.60000 261.42857 344.22222 30.00000 102.80000 57.83333
## [43] 147.88889 57.60000 20.60000 22.40000 31.60000 261.42857 220.66667
## [50] 226.11111 241.71429 26.20000 26.20000
plot(y_pred, main = "Predicted PRP for the test set")
rmse <- sqrt(mean((test_set$PRP - y_pred)^2))
rmse
## [1] 83.48779
R2 <- cor(test_set$PRP, y_pred) ^ 2
R2
## [1] 0.8784368
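To round out the evaluation, the predictions can be plotted against the actual test-set values and the mean absolute error computed from the same objects; a short sketch (mae is a new name introduced here):
# Predicted vs. actual PRP on the test set; points near the identity line indicate a good fit
plot(test_set$PRP, y_pred, xlab = "Actual PRP", ylab = "Predicted PRP",
     main = "Predicted vs. actual PRP (test set)")
abline(0, 1, col = "red")
mae <- mean(abs(test_set$PRP - y_pred))   # mean absolute error on the test set
mae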