HW2

B. Load the data set PimaIndiansDiabetes

library(mlbench)
# Bring the PimaIndiansDiabetes data set into the workspace and keep a
# working copy under a shorter name.
data("PimaIndiansDiabetes")
Data <- PimaIndiansDiabetes

C. Explore the data set (structure/ instances…)

# Per-column overview: quartiles and mean for the numeric columns,
# level counts for the diabetes factor.
summary(object = Data)

##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
##  diabetes 
##  neg:500  
##  pos:268  
##           
##           
##           
##

# Peek at the first five rows of the raw data.
head(Data, n = 5L)

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos

D. Create a new data frame by normalizing numeric attributes

library(BBmisc)
# Standardize the numeric columns (center and scale).
# The `range` argument was dropped: BBmisc::normalize() only consults it when
# method = "range", so passing it together with "standardize" had no effect
# and wrongly suggested a 0-1 rescaling.
New_Data <- normalize(Data, method = "standardize")
# Remove any rows that contain NA, then reset the row names so they run 1..n
# again after rows may have been dropped.
New_Data <- na.omit(New_Data)
rownames(New_Data) <- NULL

E. Split the data frame randomly into two subsets, training and test, using a 70/30 split

# Fix the RNG seed so the random split (and knn's random tie-breaking in the
# steps that follow) gives the same result on every run.
# NOTE: the outputs recorded in this document came from an unseeded run, so
# re-running with the seed will print different counts/accuracies.
set.seed(123)

# Draw 70% of the row indices without replacement for training; every
# remaining row goes to the test set.
n <- nrow(New_Data)
trainIndex <- sample(seq_len(n), size = round(0.7 * n), replace = FALSE)
train <- New_Data[trainIndex, ]
test <- New_Data[-trainIndex, ]

# Class balance (neg / pos counts) of the training set.
table(train[["diabetes"]])

## 
## neg pos 
## 350 188

# Class labels for the training rows, taken directly from the data so they
# always line up row-for-row with `train`.
# The previous hand-typed vector (348 "neg" then 190 "pos") did not match the
# 350 / 188 counts reported by table(train$diabetes) and assumed the rows were
# sorted by class, which a random sample is not.
cl <- train$diabetes

G. Use the function knn to apply nearest neighbor classifier on the test set using the training set as model. Set k = 5. Compute the accuracy obtained.

library(class)
# 5-nearest-neighbour prediction for the test rows. Column 9 (the diabetes
# label) is removed from both predictor sets and supplied separately as `cl`.
k_5 <- knn(train = train[-9], test = test[-9], cl = train$diabetes, k = 5)
# Accuracy: share of test rows whose predicted label equals the true label.
mean(k_5 == test[["diabetes"]])

## [1] 0.7217391

H. Repeat step (g) with different values of k. What value of k resulted in the largest accuracy on the test set?

# Same classifier with k = 1 (label of the single closest training row).
k_1 <- knn(train = train[-9], test = test[-9], cl = train$diabetes, k = 1)
# Test-set accuracy for k = 1.
mean(k_1 == test[["diabetes"]])

## [1] 0.7086957

# Same classifier with k = 2.
k_2 <- knn(train = train[-9], test = test[-9], cl = train$diabetes, k = 2)
# Test-set accuracy for k = 2.
mean(k_2 == test[["diabetes"]])

## [1] 0.7130435

# Same classifier with k = 3.
k_3 <- knn(train = train[-9], test = test[-9], cl = train$diabetes, k = 3)
# Test-set accuracy for k = 3.
mean(k_3 == test[["diabetes"]])

## [1] 0.7130435

# Same classifier with k = 4.
k_4 <- knn(train = train[-9], test = test[-9], cl = train$diabetes, k = 4)
# Test-set accuracy for k = 4.
mean(k_4 == test[["diabetes"]])

## [1] 0.6913043

# Same classifier with k = 6.
k_6 <- knn(train = train[-9], test = test[-9], cl = train$diabetes, k = 6)
# Test-set accuracy for k = 6.
mean(k_6 == test[["diabetes"]])

## [1] 0.726087

# Same classifier with k = 7.
k_7 <- knn(train = train[-9], test = test[-9], cl = train$diabetes, k = 7)
# Test-set accuracy for k = 7.
mean(k_7 == test[["diabetes"]])

## [1] 0.7043478

Because the training/test split is drawn at random in this code, the best value of k can change from run to run. In this run, k = 6 gave the highest test-set accuracy (0.726).