KNN for the iris data
library(class)
library(tidyverse)
package 'tidyverse' was built under R version 3.3.2
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
package 'ggplot2' was built under R version 3.3.2
package 'tidyr' was built under R version 3.3.2
package 'readr' was built under R version 3.3.2
package 'purrr' was built under R version 3.3.2
Conflicts with tidy packages -----------------------------------------------------------------------
filter(): dplyr, stats
lag(): dplyr, stats
# Per-column summary of the iris data: quartiles and mean for the four numeric
# measurements, and a count for each Species level.
summary(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
normalize the data
# Report the dimensions (rows, columns) of the normalised iris features.
dim(iris_n)
[1] 150 4
create test and train dataset
library(caret)
# Fix the RNG seed so the random train/test split (and every result derived
# from it) can be reproduced between runs.
set.seed(123)

# Random 80/20 split, sampled within each Species level; list = FALSE returns
# the selected row indices in a form usable directly for subsetting.
train_index <- createDataPartition(iris$Species, p = 0.8, list = FALSE)

# Features: the sampled rows for training, all remaining rows for testing.
iris_train <- iris_n[train_index, ]
iris_test <- iris_n[-train_index, ]

# Labels, split with the same indices so they stay aligned with the features.
train_label <- iris$Species[train_index]
test_label <- iris$Species[-train_index]
use the knn algorithm
# Compare the kNN predictions with the true test labels: confusion table plus
# overall and per-class statistics.
confusionMatrix(data = model, reference = test_label)
Confusion Matrix and Statistics
Reference
Prediction setosa versicolor virginica
setosa 10 0 0
versicolor 0 10 0
virginica 0 0 10
Overall Statistics
Accuracy : 1
95% CI : (0.8843, 1)
No Information Rate : 0.3333
P-Value [Acc > NIR] : 4.857e-15
Kappa : 1
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: setosa Class: versicolor Class: virginica
Sensitivity 1.0000 1.0000 1.0000
Specificity 1.0000 1.0000 1.0000
Pos Pred Value 1.0000 1.0000 1.0000
Neg Pred Value 1.0000 1.0000 1.0000
Prevalence 0.3333 0.3333 0.3333
Detection Rate 0.3333 0.3333 0.3333
Detection Prevalence 0.3333 0.3333 0.3333
Balanced Accuracy 1.0000 1.0000 1.0000
working with data from UCI
Download it into your current working directory and import it into RStudio using fread() from data.table, or simply read.csv().
# Per-column summary of the bc data set: counts for the two diagnosis levels
# (B, M) and quartiles/mean for the numeric columns.
# NOTE(review): `id` looks like a record identifier rather than a measurement
# -- confirm it is dropped before the features are used for distance-based
# classification.
summary(bc)
id diagnosis radius_mean texture_mean perimeter_mean area_mean
Min. : 8670 B:357 Min. : 6.981 Min. : 9.71 Min. : 43.79 Min. : 143.5
1st Qu.: 869218 M:212 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17 1st Qu.: 420.3
Median : 906024 Median :13.370 Median :18.84 Median : 86.24 Median : 551.1
Mean : 30371831 Mean :14.127 Mean :19.29 Mean : 91.97 Mean : 654.9
3rd Qu.: 8813129 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10 3rd Qu.: 782.7
Max. :911320502 Max. :28.110 Max. :39.28 Max. :188.50 Max. :2501.0
smoothness_mean compactness_mean concavity_mean concave.points_mean symmetry_mean
Min. :0.05263 Min. :0.01938 Min. :0.00000 Min. :0.00000 Min. :0.1060
1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956 1st Qu.:0.02031 1st Qu.:0.1619
Median :0.09587 Median :0.09263 Median :0.06154 Median :0.03350 Median :0.1792
Mean :0.09636 Mean :0.10434 Mean :0.08880 Mean :0.04892 Mean :0.1812
3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070 3rd Qu.:0.07400 3rd Qu.:0.1957
Max. :0.16340 Max. :0.34540 Max. :0.42680 Max. :0.20120 Max. :0.3040
fractal_dimension_mean radius_se texture_se perimeter_se area_se
Min. :0.04996 Min. :0.1115 Min. :0.3602 Min. : 0.757 Min. : 6.802
1st Qu.:0.05770 1st Qu.:0.2324 1st Qu.:0.8339 1st Qu.: 1.606 1st Qu.: 17.850
Median :0.06154 Median :0.3242 Median :1.1080 Median : 2.287 Median : 24.530
Mean :0.06280 Mean :0.4052 Mean :1.2169 Mean : 2.866 Mean : 40.337
3rd Qu.:0.06612 3rd Qu.:0.4789 3rd Qu.:1.4740 3rd Qu.: 3.357 3rd Qu.: 45.190
Max. :0.09744 Max. :2.8730 Max. :4.8850 Max. :21.980 Max. :542.200
smoothness_se compactness_se concavity_se concave.points_se symmetry_se
Min. :0.001713 Min. :0.002252 Min. :0.00000 Min. :0.000000 Min. :0.007882
1st Qu.:0.005169 1st Qu.:0.013080 1st Qu.:0.01509 1st Qu.:0.007638 1st Qu.:0.015160
Median :0.006380 Median :0.020450 Median :0.02589 Median :0.010930 Median :0.018730
Mean :0.007041 Mean :0.025478 Mean :0.03189 Mean :0.011796 Mean :0.020542
3rd Qu.:0.008146 3rd Qu.:0.032450 3rd Qu.:0.04205 3rd Qu.:0.014710 3rd Qu.:0.023480
Max. :0.031130 Max. :0.135400 Max. :0.39600 Max. :0.052790 Max. :0.078950
fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
Min. :0.0008948 Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
1st Qu.:0.0022480 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11 1st Qu.: 515.3
Median :0.0031870 Median :14.97 Median :25.41 Median : 97.66 Median : 686.5
Mean :0.0037949 Mean :16.27 Mean :25.68 Mean :107.26 Mean : 880.6
3rd Qu.:0.0045580 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40 3rd Qu.:1084.0
Max. :0.0298400 Max. :36.04 Max. :49.54 Max. :251.20 Max. :4254.0
smoothness_worst compactness_worst concavity_worst concave.points_worst symmetry_worst
Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000 Min. :0.1565
1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493 1st Qu.:0.2504
Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993 Median :0.2822
Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461 Mean :0.2901
3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140 3rd Qu.:0.3179
Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100 Max. :0.6638
fractal_dimension_worst
Min. :0.05504
1st Qu.:0.07146
Median :0.08004
Mean :0.08395
3rd Qu.:0.09208
Max. :0.20750
Split the data into training and test sets (createDataPartition() from the caret package)
# Fix the RNG seed so the random train/test split -- and therefore the
# comparison between different k values -- can be reproduced between runs.
set.seed(123)

# Single (times = 1) random 80/20 split, sampled within each diagnosis level.
index <- createDataPartition(bc$diagnosis, p = 0.8, list = FALSE, times = 1)

# Normalised features: sampled rows for training, remaining rows for testing.
train_bc <- bc_n[index, ]
test_bc <- bc_n[-index, ]

# Diagnosis labels, split with the same indices to stay aligned with the
# feature rows.
train_bc_cl <- bc$diagnosis[index]
test_bc_cl <- bc$diagnosis[-index]
apply knn
# Classify the test rows with kNN on the same train/test split, varying only
# the number of neighbours k (5, 10, 100 and 200). Each call returns the
# predicted class for every row of test_bc.
bc_model_5 <- knn(train = train_bc, test = test_bc, cl = train_bc_cl, k = 5)
bc_model_10 <- knn(train = train_bc, test = test_bc, cl = train_bc_cl, k = 10)
bc_model_100 <- knn(train = train_bc, test = test_bc, cl = train_bc_cl, k = 100)
bc_model_200 <- knn(train = train_bc, test = test_bc, cl = train_bc_cl, k = 200)
## assess model performance
# Cross-tabulate k = 5 predictions (rows) against the true labels (columns).
table(bc_model_5, test_bc_cl)
test_bc_cl
bc_model_5 B M
B 69 2
M 2 40
# Cross-tabulate k = 10 predictions (rows) against the true labels (columns).
table(bc_model_10, test_bc_cl)
test_bc_cl
bc_model_10 B M
B 69 2
M 2 40
# Cross-tabulate k = 100 predictions (rows) against the true labels (columns).
table(bc_model_100, test_bc_cl)
test_bc_cl
bc_model_100 B M
B 70 2
M 1 40
# Cross-tabulate k = 200 predictions (rows) against the true labels (columns).
table(bc_model_200, test_bc_cl)
test_bc_cl
bc_model_200 B M
B 70 9
M 1 33
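It seems that k = 100 gives the best model among the k values tried: on this split it misclassifies 3 of the 113 test cases, versus 4 for k = 5 and k = 10, and 10 for k = 200.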
LS0tDQp0aXRsZTogIktOTiBjbGFzc2lmaWNhdGlvbiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQojI2tubiBmb3IgaXJpcyBkYXRhDQoNCg0KIyMgDQpgYGB7cn0NCmxpYnJhcnkoY2xhc3MpDQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCnN1bW1hcnkoaXJpcykNCg0KDQoNCmBgYA0KIyMgbm9ybWFsaXplIHRoZSBkYXRhDQpgYGB7cn0NCm5vcm1hbGl6ZT1mdW5jdGlvbihkYXRhKXsNCiAgYT1kYXRhLW1lYW4oZGF0YSxuYS5ybSA9IFRSVUUpDQogIGI9bWF4KGRhdGEpLW1pbihkYXRhKQ0KICANCiAgDQogIHJldHVybihhL2IpDQp9DQoNCmlyaXNfbj1zYXBwbHkoaXJpc1ssMTo0XSxub3JtYWxpemUpDQoNCmlyaXNfbiU+JWRpbQ0KYGBgDQojIyBjcmVhdGUgdGVzdCBhbmQgdHJhaW4gZGF0YXNldA0KDQpgYGB7cn0NCmxpYnJhcnkoY2FyZXQpDQp0cmFpbl9pbmRleD1jcmVhdGVEYXRhUGFydGl0aW9uKGlyaXMkU3BlY2llcyxwPTAuOCxsaXN0PUZBTFNFKQ0KDQppcmlzX3RyYWluPWlyaXNfblt0cmFpbl9pbmRleCxdDQppcmlzX3Rlc3Q9aXJpc19uWy10cmFpbl9pbmRleCxdDQoNCnRyYWluX2xhYmVsPWlyaXMkU3BlY2llc1t0cmFpbl9pbmRleF0NCg0KdGVzdF9sYWJlbD1pcmlzJFNwZWNpZXNbLXRyYWluX2luZGV4XQ0KDQpgYGANCg0KIyMgdXNlIHRoZSBrbm4gYWxnb3JpdGhtDQoNCmBgYHtyfQ0KbW9kZWw9a25uKHRyYWluPWlyaXNfdHJhaW4sdGVzdD1pcmlzX3Rlc3QsY2w9dHJhaW5fbGFiZWwsaz0zKQ0KDQpsaWJyYXJ5KGUxMDcxKQ0KY29uZnVzaW9uTWF0cml4KG1vZGVsLHRlc3RfbGFiZWwpDQoNCg0KdGFibGUobW9kZWwsdGVzdF9sYWJlbCkNCmBgYA0KIyMgd29ya2luZyB3aXRoIGRhdGEgZnJvbSBbVUNJXSgiaHR0cHM6Ly9hcmNoaXZlLmljcy51Y2kuZWR1L21sL21hY2hpbmUtbGVhcm5pbmctZGF0YWJhc2VzL2JyZWFzdC1jYW5jZXItd2lzY29uc2luLyIpDQoNCg0KIyMgZG93bmxvYWQgaXQgaW50byB5b3VyIGN1cnJlbnQgd29ya2luZyBkaXJlY3RvcnkgYW5kIGltcG9ydCBpdCBpbnRvIHlvdXIgUnN0dWRpbyB1c2luZyBmcmVhZCBmcm9tIGRhdGEudGFibGUgb3IganVzdCByZWFkLmNzdg0KDQpgYGB7cn0NCmdldHdkKCkNCmJjPXJlYWQuY3N2KCJ3Yi5jc3YiKQ0KDQpzdW1tYXJ5KGJjKQ0KIyBub3JtYWxpemUgdGhlIGRhdGEgDQpiY19uPWJjJT4lc2VsZWN0KC1kaWFnbm9zaXMpJT4lc2FwcGx5KG5vcm1hbGl6ZSkNCmBgYA0KDQojIyBnZXQgZGF0YSBwYXJ0aXRpb24gZm9yIHRyYWluaW5nIGFuZCB0ZXN0ICAoY3JlYXRlRGF0YVBhcnRpdGlvbiBmcm9tIGNhcmV0IHBhY2thZ2UpDQoNCmBgYHtyfQ0KaW5kZXg9Y3JlYXRlRGF0YVBhcnRpdGlvbihiYyRkaWFnbm9zaXMscD0wLjgsbGlzdD1GQUxTRSx0aW1lcz0xKQ0KDQp0cmFpbl9iYz1iY19uW2luZGV4LF0NCnRlc3RfYmM9YmNfblstaW5kZXgsXQ0KDQp0cmFpbl9iY19jbD1iYyRkaWFnbm9zaXNbaW5kZXhdDQoNCnRlc3RfYmNfY2w9YmMkZGlhZ25vc2lzWy1pbmRl
eF0NCg0KYGBgDQojIyBhcHBseSBrbm4NCg0KYGBge3J9DQpiY19tb2RlbF81PWtubih0cmFpbj10cmFpbl9iYyx0ZXN0PXRlc3RfYmMsY2w9dHJhaW5fYmNfY2wsaz01KQ0KYmNfbW9kZWxfMTA9a25uKHRyYWluPXRyYWluX2JjLHRlc3Q9dGVzdF9iYyxjbD10cmFpbl9iY19jbCxrPTEwKQ0KYmNfbW9kZWxfMTAwPWtubih0cmFpbj10cmFpbl9iYyx0ZXN0PXRlc3RfYmMsY2w9dHJhaW5fYmNfY2wsaz0xMDApDQpiY19tb2RlbF8yMDA9a25uKHRyYWluPXRyYWluX2JjLHRlc3Q9dGVzdF9iYyxjbD10cmFpbl9iY19jbCxrPTIwMCkNCg0KDQojIyBhY2Nlc3MgbW9kZWwgcGVyZm9ybWFuY2UNCg0KdGFibGUoYmNfbW9kZWxfNSx0ZXN0X2JjX2NsKQ0KDQoNCnRhYmxlKGJjX21vZGVsXzEwLHRlc3RfYmNfY2wpDQoNCnRhYmxlKGJjX21vZGVsXzEwMCx0ZXN0X2JjX2NsKQ0KDQp0YWJsZShiY19tb2RlbF8yMDAsdGVzdF9iY19jbCkNCmBgYA0KIyBpdCBzZWVtcyB0aGF0IGs9MTAwIGlzIGEgYmV0dGVyIG1vZGVsIGFtb25nIHRob3NlIGsgdmFyaWF0aW9ucy4gDQoNCg0KIA0KDQoNCg==