Lab 10 Exercise

##Download the LabW9 dataset from spectrum and Load the dataset and appropriate packages

# Transt xlsx to csv
labW9 <- read.csv("C:/Users/Administrator/Desktop/labW9.csv")
View(labW9)

##Conduct data exploration and checking and cleaning if necessary

library(dplyr)

## Warning: 程辑包'dplyr'是用R版本4.1.2 来建造的

## 
## 载入程辑包：'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

str(labW9)

## 'data.frame':    768 obs. of  9 variables:
##  $ 锘縋regnancies          : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...

colSums(is.na(labW9))

##           锘縋regnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0

boxplot(labW9)

##Partition data 70/30 using any method you feel comfortable with

library(caret)

## Warning: 程辑包'caret'是用R版本4.1.2 来建造的

## 载入需要的程辑包：ggplot2

## Warning: 程辑包'ggplot2'是用R版本4.1.2 来建造的

## 载入需要的程辑包：lattice

split = 0.7
trainIndex <- createDataPartition(labW9$Outcome,p=split,list=FALSE)
df_train <- labW9[trainIndex,]
df_test <- labW9[-trainIndex,]

##Check both your training and test subsets

str(df_train)

## 'data.frame':    538 obs. of  9 variables:
##  $ 锘縋regnancies          : int  8 1 0 3 10 8 10 1 5 7 ...
##  $ Glucose                 : int  183 89 137 78 115 125 139 189 166 100 ...
##  $ BloodPressure           : int  64 66 40 50 0 96 80 60 72 0 ...
##  $ SkinThickness           : int  0 23 35 32 0 0 0 23 19 0 ...
##  $ Insulin                 : int  0 94 168 88 0 0 0 846 175 0 ...
##  $ BMI                     : num  23.3 28.1 43.1 31 35.3 0 27.1 30.1 25.8 30 ...
##  $ DiabetesPedigreeFunction: num  0.672 0.167 2.288 0.248 0.134 ...
##  $ Age                     : int  32 21 33 26 29 54 57 59 51 32 ...
##  $ Outcome                 : int  1 0 1 1 0 1 0 1 1 1 ...

table(df_train$Outcome)

## 
##   0   1 
## 347 191

str(df_test)

## 'data.frame':    230 obs. of  9 variables:
##  $ 锘縋regnancies          : int  6 1 5 2 4 10 0 7 3 7 ...
##  $ Glucose                 : int  148 85 116 197 110 168 118 107 126 196 ...
##  $ BloodPressure           : int  72 66 74 70 92 74 84 74 88 90 ...
##  $ SkinThickness           : int  35 29 0 45 0 0 47 0 41 0 ...
##  $ Insulin                 : int  0 0 0 543 0 0 230 0 235 0 ...
##  $ BMI                     : num  33.6 26.6 25.6 30.5 37.6 38 45.8 29.6 39.3 39.8 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.201 0.158 0.191 0.537 0.551 0.254 0.704 0.451 ...
##  $ Age                     : int  50 31 30 53 30 34 31 31 27 41 ...
##  $ Outcome                 : int  1 0 0 1 0 1 1 1 0 1 ...

table(df_test$Outcome)

## 
##   0   1 
## 153  77

##Check for cross validation if the model allows for it

trControl <- trainControl(method = "cv")

##Train your test data using any model you feel is appropriate

knn <- train(Outcome~.,df_train,method="knn",trControl=trainControl(method = "cv"))

## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.

knn

## k-Nearest Neighbors 
## 
## 538 samples
##   8 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 484, 484, 485, 484, 484, 484, ... 
## Resampling results across tuning parameters:
## 
##   k  RMSE       Rsquared   MAE      
##   5  0.4413380  0.1934718  0.3328465
##   7  0.4341022  0.2042671  0.3349905
##   9  0.4274715  0.2128323  0.3390749
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.

##Plot your model

plot(knn)

##Predict using your test data onto your model

predict_data <- predict(knn,select(df_test,-Outcome))
predict_data

##   [1] 0.4444444 0.1111111 0.3333333 0.6666667 0.4444444 0.7777778 0.3333333
##   [8] 0.5555556 0.4444444 0.7777778 0.6666667 0.3333333 0.0000000 0.7777778
##  [15] 0.1111111 0.3333333 0.4444444 0.5555556 1.0000000 0.2222222 0.0000000
##  [22] 0.8888889 0.0000000 0.7777778 0.1111111 0.1111111 0.0000000 0.0000000
##  [29] 0.0000000 0.1111111 0.0000000 0.0000000 0.6666667 0.2222222 0.0000000
##  [36] 0.0000000 0.3333333 0.4444444 0.0000000 0.8888889 0.4444444 0.6666667
##  [43] 0.6666667 0.6666667 0.2222222 0.2222222 0.4444444 0.7777778 0.2222222
##  [50] 0.0000000 0.5555556 0.2222222 0.7777778 0.0000000 0.4444444 0.1111111
##  [57] 0.4444444 0.6666667 0.0000000 0.6666667 0.4444444 0.4444444 0.4444444
##  [64] 0.0000000 0.1111111 0.0000000 0.2222222 0.1111111 0.4444444 0.5555556
##  [71] 0.0000000 0.0000000 0.3333333 0.1111111 0.6666667 0.3333333 0.4444444
##  [78] 0.3333333 0.0000000 0.3333333 0.6666667 0.4444444 0.0000000 0.1111111
##  [85] 0.1111111 0.0000000 0.6666667 0.3333333 0.0000000 0.2222222 0.0000000
##  [92] 0.3333333 0.6666667 0.2222222 0.3333333 0.4444444 0.3333333 0.3333333
##  [99] 0.7777778 0.3333333 0.1111111 0.0000000 0.8888889 0.2222222 0.2222222
## [106] 0.5555556 0.2222222 0.6666667 0.6666667 0.7777778 0.1111111 0.1111111
## [113] 0.5555556 0.7777778 0.4444444 0.4444444 0.0000000 0.1111111 0.5555556
## [120] 0.0000000 0.0000000 0.2222222 0.1111111 0.2222222 0.2222222 0.0000000
## [127] 0.7777778 0.6666667 0.3333333 0.2222222 0.2222222 0.2222222 0.4444444
## [134] 0.3333333 0.0000000 0.4444444 0.1111111 1.0000000 0.6666667 0.0000000
## [141] 0.2222222 0.6666667 0.0000000 0.2222222 0.5555556 0.5555556 0.7777778
## [148] 0.6666667 0.5555556 0.1111111 0.4444444 0.1111111 0.1111111 0.7777778
## [155] 0.6666667 0.2222222 0.1111111 0.6666667 0.0000000 0.1111111 0.3333333
## [162] 0.4444444 0.1111111 0.2222222 0.4444444 0.0000000 0.0000000 0.1111111
## [169] 0.0000000 0.2222222 0.3333333 0.0000000 0.6666667 0.6666667 0.2222222
## [176] 0.3333333 0.0000000 0.2222222 0.1111111 0.3333333 0.5555556 0.6666667
## [183] 0.0000000 0.7777778 0.3333333 0.1111111 0.5555556 0.2222222 0.0000000
## [190] 0.4444444 0.2222222 0.1111111 0.3333333 0.6666667 0.2222222 0.1111111
## [197] 0.0000000 0.0000000 0.1111111 0.8888889 0.5555556 0.3333333 0.3333333
## [204] 0.3333333 0.8888889 0.5555556 0.6666667 0.1111111 0.2222222 0.4444444
## [211] 0.5555556 0.5555556 0.7777778 0.1111111 0.3333333 0.4444444 0.1111111
## [218] 0.7777778 0.8888889 0.2222222 0.1111111 0.0000000 0.5555556 0.3333333
## [225] 0.6666667 0.4444444 0.5555556 0.8888889 0.3333333 0.0000000

Lab 10 Exercise

Zhao ZiKun

2022/1/6