##Download the LabW9 dataset from spectrum and Load the dataset and appropriate packages
# Transt xlsx to csv
labW9 <- read.csv("C:/Users/Administrator/Desktop/labW9.csv")
View(labW9)
##Conduct data exploration and checking and cleaning if necessary
library(dplyr)
## Warning: 程辑包'dplyr'是用R版本4.1.2 来建造的
##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
str(labW9)
## 'data.frame': 768 obs. of 9 variables:
## $ 锘縋regnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
colSums(is.na(labW9))
## 锘縋regnancies Glucose BloodPressure
## 0 0 0
## SkinThickness Insulin BMI
## 0 0 0
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
boxplot(labW9)
##Partition data 70/30 using any method you feel comfortable with
library(caret)
## Warning: 程辑包'caret'是用R版本4.1.2 来建造的
## 载入需要的程辑包:ggplot2
## Warning: 程辑包'ggplot2'是用R版本4.1.2 来建造的
## 载入需要的程辑包:lattice
split = 0.7
trainIndex <- createDataPartition(labW9$Outcome,p=split,list=FALSE)
df_train <- labW9[trainIndex,]
df_test <- labW9[-trainIndex,]
##Check both your training and test subsets
str(df_train)
## 'data.frame': 538 obs. of 9 variables:
## $ 锘縋regnancies : int 8 1 0 3 10 8 10 1 5 7 ...
## $ Glucose : int 183 89 137 78 115 125 139 189 166 100 ...
## $ BloodPressure : int 64 66 40 50 0 96 80 60 72 0 ...
## $ SkinThickness : int 0 23 35 32 0 0 0 23 19 0 ...
## $ Insulin : int 0 94 168 88 0 0 0 846 175 0 ...
## $ BMI : num 23.3 28.1 43.1 31 35.3 0 27.1 30.1 25.8 30 ...
## $ DiabetesPedigreeFunction: num 0.672 0.167 2.288 0.248 0.134 ...
## $ Age : int 32 21 33 26 29 54 57 59 51 32 ...
## $ Outcome : int 1 0 1 1 0 1 0 1 1 1 ...
table(df_train$Outcome)
##
## 0 1
## 347 191
str(df_test)
## 'data.frame': 230 obs. of 9 variables:
## $ 锘縋regnancies : int 6 1 5 2 4 10 0 7 3 7 ...
## $ Glucose : int 148 85 116 197 110 168 118 107 126 196 ...
## $ BloodPressure : int 72 66 74 70 92 74 84 74 88 90 ...
## $ SkinThickness : int 35 29 0 45 0 0 47 0 41 0 ...
## $ Insulin : int 0 0 0 543 0 0 230 0 235 0 ...
## $ BMI : num 33.6 26.6 25.6 30.5 37.6 38 45.8 29.6 39.3 39.8 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.201 0.158 0.191 0.537 0.551 0.254 0.704 0.451 ...
## $ Age : int 50 31 30 53 30 34 31 31 27 41 ...
## $ Outcome : int 1 0 0 1 0 1 1 1 0 1 ...
table(df_test$Outcome)
##
## 0 1
## 153 77
##Check for cross validation if the model allows for it
trControl <- trainControl(method = "cv")
##Train your test data using any model you feel is appropriate
knn <- train(Outcome~.,df_train,method="knn",trControl=trainControl(method = "cv"))
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
knn
## k-Nearest Neighbors
##
## 538 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 484, 484, 485, 484, 484, 484, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.4413380 0.1934718 0.3328465
## 7 0.4341022 0.2042671 0.3349905
## 9 0.4274715 0.2128323 0.3390749
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
##Plot your model
plot(knn)
##Predict using your test data onto your model
predict_data <- predict(knn,select(df_test,-Outcome))
predict_data
## [1] 0.4444444 0.1111111 0.3333333 0.6666667 0.4444444 0.7777778 0.3333333
## [8] 0.5555556 0.4444444 0.7777778 0.6666667 0.3333333 0.0000000 0.7777778
## [15] 0.1111111 0.3333333 0.4444444 0.5555556 1.0000000 0.2222222 0.0000000
## [22] 0.8888889 0.0000000 0.7777778 0.1111111 0.1111111 0.0000000 0.0000000
## [29] 0.0000000 0.1111111 0.0000000 0.0000000 0.6666667 0.2222222 0.0000000
## [36] 0.0000000 0.3333333 0.4444444 0.0000000 0.8888889 0.4444444 0.6666667
## [43] 0.6666667 0.6666667 0.2222222 0.2222222 0.4444444 0.7777778 0.2222222
## [50] 0.0000000 0.5555556 0.2222222 0.7777778 0.0000000 0.4444444 0.1111111
## [57] 0.4444444 0.6666667 0.0000000 0.6666667 0.4444444 0.4444444 0.4444444
## [64] 0.0000000 0.1111111 0.0000000 0.2222222 0.1111111 0.4444444 0.5555556
## [71] 0.0000000 0.0000000 0.3333333 0.1111111 0.6666667 0.3333333 0.4444444
## [78] 0.3333333 0.0000000 0.3333333 0.6666667 0.4444444 0.0000000 0.1111111
## [85] 0.1111111 0.0000000 0.6666667 0.3333333 0.0000000 0.2222222 0.0000000
## [92] 0.3333333 0.6666667 0.2222222 0.3333333 0.4444444 0.3333333 0.3333333
## [99] 0.7777778 0.3333333 0.1111111 0.0000000 0.8888889 0.2222222 0.2222222
## [106] 0.5555556 0.2222222 0.6666667 0.6666667 0.7777778 0.1111111 0.1111111
## [113] 0.5555556 0.7777778 0.4444444 0.4444444 0.0000000 0.1111111 0.5555556
## [120] 0.0000000 0.0000000 0.2222222 0.1111111 0.2222222 0.2222222 0.0000000
## [127] 0.7777778 0.6666667 0.3333333 0.2222222 0.2222222 0.2222222 0.4444444
## [134] 0.3333333 0.0000000 0.4444444 0.1111111 1.0000000 0.6666667 0.0000000
## [141] 0.2222222 0.6666667 0.0000000 0.2222222 0.5555556 0.5555556 0.7777778
## [148] 0.6666667 0.5555556 0.1111111 0.4444444 0.1111111 0.1111111 0.7777778
## [155] 0.6666667 0.2222222 0.1111111 0.6666667 0.0000000 0.1111111 0.3333333
## [162] 0.4444444 0.1111111 0.2222222 0.4444444 0.0000000 0.0000000 0.1111111
## [169] 0.0000000 0.2222222 0.3333333 0.0000000 0.6666667 0.6666667 0.2222222
## [176] 0.3333333 0.0000000 0.2222222 0.1111111 0.3333333 0.5555556 0.6666667
## [183] 0.0000000 0.7777778 0.3333333 0.1111111 0.5555556 0.2222222 0.0000000
## [190] 0.4444444 0.2222222 0.1111111 0.3333333 0.6666667 0.2222222 0.1111111
## [197] 0.0000000 0.0000000 0.1111111 0.8888889 0.5555556 0.3333333 0.3333333
## [204] 0.3333333 0.8888889 0.5555556 0.6666667 0.1111111 0.2222222 0.4444444
## [211] 0.5555556 0.5555556 0.7777778 0.1111111 0.3333333 0.4444444 0.1111111
## [218] 0.7777778 0.8888889 0.2222222 0.1111111 0.0000000 0.5555556 0.3333333
## [225] 0.6666667 0.4444444 0.5555556 0.8888889 0.3333333 0.0000000