library(mlbench)
# Load the Pima Indians diabetes data shipped with mlbench and preview the
# first 20 observations.
data(PimaIndiansDiabetes)
head(PimaIndiansDiabetes, n = 20L)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
## 7 3 78 50 32 88 31.0 0.248 26 pos
## 8 10 115 0 0 0 35.3 0.134 29 neg
## 9 2 197 70 45 543 30.5 0.158 53 pos
## 10 8 125 96 0 0 0.0 0.232 54 pos
## 11 4 110 92 0 0 37.6 0.191 30 neg
## 12 10 168 74 0 0 38.0 0.537 34 pos
## 13 10 139 80 0 0 27.1 1.441 57 neg
## 14 1 189 60 23 846 30.1 0.398 59 pos
## 15 5 166 72 19 175 25.8 0.587 51 pos
## 16 7 100 0 0 0 30.0 0.484 32 pos
## 17 0 118 84 47 230 45.8 0.551 31 pos
## 18 7 107 74 0 0 29.6 0.254 31 pos
## 19 1 103 30 38 83 43.3 0.183 33 neg
## 20 1 115 70 30 96 34.6 0.529 32 pos
Summary: The first 20 rows of the Pima Indians Diabetes dataset are shown above.
# Number of rows and columns in the dataset.
dim(PimaIndiansDiabetes)
## [1] 768 9
# Class balance of the outcome: absolute count and percentage share of each
# level of the diabetes column.
y <- PimaIndiansDiabetes[["diabetes"]]
class_counts <- table(y)
cbind(freq = class_counts, percentage = 100 * prop.table(class_counts))
## freq percentage
## neg 500 65.10417
## pos 268 34.89583
# Per-column summary: quartiles, mean and range for the numeric attributes,
# and level counts for the diabetes factor.
summary(PimaIndiansDiabetes)
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age diabetes
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00 neg:500
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00 pos:268
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
Above is a summary of each column in the dataset.
# Histograms of the first four attributes, drawn side by side in one row.
par(mfrow = c(1, 4))
for (attr_name in names(PimaIndiansDiabetes)[1:4]) {
  # Label the x-axis with the attribute name; by default hist() would use the
  # text of the indexing expression passed to it, which is uninformative.
  hist(PimaIndiansDiabetes[[attr_name]], main = attr_name, xlab = attr_name)
}
#### Density Plots
library(lattice)
# Kernel density estimate of each of the first four attributes, laid out in
# a single row of four panels.
par(mfrow = c(1, 4))
for (attr_name in names(PimaIndiansDiabetes)[1:4]) {
  attr_density <- density(PimaIndiansDiabetes[[attr_name]])
  plot(attr_density, main = attr_name)
}
#### Box and Whisker Plots
# One box-and-whisker plot per attribute for the first four columns.
par(mfrow = c(1, 4))
first_four <- PimaIndiansDiabetes[, 1:4]
for (attr_name in names(first_four)) {
  boxplot(first_four[[attr_name]], main = attr_name)
}
#### Bar Plots
# Bar plot of how often each distinct value occurs, for the first four
# attributes.
par(mfrow = c(1, 4))
for (attr_name in names(PimaIndiansDiabetes)[1:4]) {
  value_counts <- table(PimaIndiansDiabetes[[attr_name]])
  barplot(value_counts, main = attr_name)
}
Summary: - Blue represents positive correlation and red represents negative correlation. - The larger the dot, the stronger the correlation.
library(corrplot)
## corrplot 0.92 loaded
# Pairwise correlations between the eight numeric predictors, drawn as a
# matrix of circles.
numeric_predictors <- PimaIndiansDiabetes[, 1:8]
correlations <- cor(numeric_predictors)
corrplot(correlations, method = "circle")
#### Density Plots By Class
library(caret)
## Loading required package: ggplot2
# Predictors to plot: the first four numeric attributes.
x <- PimaIndiansDiabetes[, 1:4]
# Group by the diabetes class label. featurePlot() only draws "density" and
# "box" plots when `y` is a factor; with a numeric `y` (such as column 5,
# insulin) it returns NULL and draws nothing.
y <- PimaIndiansDiabetes$diabetes
# Free scales so every feature panel gets its own axis ranges.
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
featurePlot(x = x, y = y, plot = "density", scales = scales)
# Box-and-whisker plots of the same predictors, split by class.
featurePlot(x = x, y = y, plot = "box")
# Summarize the eight predictor columns (the diabetes factor in column 9 is
# left out) as a baseline before any transformation.
summary(PimaIndiansDiabetes[,1:8])
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
# Estimate the scaling parameters for the eight predictor columns.
preprocessParams <- preProcess(PimaIndiansDiabetes[, 1:8], method = "scale")
# Sanity check: print the standard deviation of glucose so it can be compared
# with the scaled values in the transformed summary.
cat("The standard deviation of glucose is: ", "\n")
## The standard deviation of glucose is:
sd(PimaIndiansDiabetes[["glucose"]])
## [1] 31.97262
# Show which transformations were recorded.
print(preprocessParams)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - ignored (0)
## - scaled (8)
# Apply the fitted scaling to the eight predictor columns.
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[, 1:8])
# Inspect the scaled attributes.
summary(transformed)
## pregnant glucose pressure triceps
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.2968 1st Qu.:3.096 1st Qu.:3.203 1st Qu.:0.000
## Median :0.8903 Median :3.659 Median :3.720 Median :1.442
## Mean :1.1411 Mean :3.781 Mean :3.570 Mean :1.287
## 3rd Qu.:1.7806 3rd Qu.:4.387 3rd Qu.:4.133 3rd Qu.:2.006
## Max. :5.0451 Max. :6.224 Max. :6.303 Max. :6.206
## insulin mass pedigree age
## Min. :0.0000 Min. :0.000 Min. :0.2354 Min. :1.786
## 1st Qu.:0.0000 1st Qu.:3.463 1st Qu.:0.7357 1st Qu.:2.041
## Median :0.2647 Median :4.059 Median :1.1243 Median :2.466
## Mean :0.6924 Mean :4.058 Mean :1.4242 Mean :2.827
## 3rd Qu.:1.1042 3rd Qu.:4.642 3rd Qu.:1.8901 3rd Qu.:3.486
## Max. :7.3409 Max. :8.511 Max. :7.3039 Max. :6.888
# Estimate the centering parameters for the eight predictor columns.
preprocessParams <- preProcess(PimaIndiansDiabetes[, 1:8], method = "center")
# Sanity check: print the mean of glucose so it can be compared with the
# shift seen in the centered values of the transformed summary.
cat("The mean of glucose is: ", "\n")
## The mean of glucose is:
mean(PimaIndiansDiabetes[["glucose"]])
## [1] 120.8945
# Show which transformations were recorded.
print(preprocessParams)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - centered (8)
## - ignored (0)
# Apply the fitted centering to the eight predictor columns.
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[, 1:8])
# Inspect the centered attributes; every column mean should now be zero.
summary(transformed)
## pregnant glucose pressure triceps
## Min. :-3.8451 Min. :-120.895 Min. :-69.105 Min. :-20.536
## 1st Qu.:-2.8451 1st Qu.: -21.895 1st Qu.: -7.105 1st Qu.:-20.536
## Median :-0.8451 Median : -3.895 Median : 2.895 Median : 2.464
## Mean : 0.0000 Mean : 0.000 Mean : 0.000 Mean : 0.000
## 3rd Qu.: 2.1549 3rd Qu.: 19.355 3rd Qu.: 10.895 3rd Qu.: 11.464
## Max. :13.1549 Max. : 78.105 Max. : 52.895 Max. : 78.464
## insulin mass pedigree age
## Min. :-79.80 Min. :-31.99258 Min. :-0.39388 Min. :-12.241
## 1st Qu.:-79.80 1st Qu.: -4.69258 1st Qu.:-0.22813 1st Qu.: -9.241
## Median :-49.30 Median : 0.00742 Median :-0.09938 Median : -4.241
## Mean : 0.00 Mean : 0.00000 Mean : 0.00000 Mean : 0.000
## 3rd Qu.: 47.45 3rd Qu.: 4.60742 3rd Qu.: 0.15437 3rd Qu.: 7.759
## Max. :766.20 Max. : 35.10742 Max. : 1.94812 Max. : 47.759
# Calculate the pre-process parameters from the dataset
# Combine centering and scaling so each predictor is standardized.
preprocessParams <- preProcess(
  PimaIndiansDiabetes[, 1:8],
  method = c("center", "scale")
)
print(preprocessParams)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - centered (8)
## - ignored (0)
## - scaled (8)
# Apply the fitted centering and scaling to the eight predictor columns.
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[, 1:8])
# Inspect the standardized attributes.
summary(transformed)
## pregnant glucose pressure triceps
## Min. :-1.1411 Min. :-3.7812 Min. :-3.5703 Min. :-1.2874
## 1st Qu.:-0.8443 1st Qu.:-0.6848 1st Qu.:-0.3671 1st Qu.:-1.2874
## Median :-0.2508 Median :-0.1218 Median : 0.1495 Median : 0.1544
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6395 3rd Qu.: 0.6054 3rd Qu.: 0.5629 3rd Qu.: 0.7186
## Max. : 3.9040 Max. : 2.4429 Max. : 2.7327 Max. : 4.9187
## insulin mass pedigree age
## Min. :-0.6924 Min. :-4.057829 Min. :-1.1888 Min. :-1.0409
## 1st Qu.:-0.6924 1st Qu.:-0.595191 1st Qu.:-0.6885 1st Qu.:-0.7858
## Median :-0.4278 Median : 0.000941 Median :-0.2999 Median :-0.3606
## Mean : 0.0000 Mean : 0.000000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4117 3rd Qu.: 0.584390 3rd Qu.: 0.4659 3rd Qu.: 0.6598
## Max. : 6.6485 Max. : 4.452906 Max. : 5.8797 Max. : 4.0611
# Calculate the pre-process parameters from the dataset
# Estimate the per-column minimum and maximum used to rescale each
# predictor to the [0, 1] interval.
preprocessParams <- preProcess(PimaIndiansDiabetes[, 1:8], method = "range")
# Show which transformations were recorded.
print(preprocessParams)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - ignored (0)
## - re-scaling to [0, 1] (8)
# Apply the fitted range rescaling to the eight predictor columns.
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[, 1:8])
# Inspect the rescaled attributes; each column should now span 0 to 1.
summary(transformed)
## pregnant glucose pressure triceps
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.05882 1st Qu.:0.4975 1st Qu.:0.5082 1st Qu.:0.0000
## Median :0.17647 Median :0.5879 Median :0.5902 Median :0.2323
## Mean :0.22618 Mean :0.6075 Mean :0.5664 Mean :0.2074
## 3rd Qu.:0.35294 3rd Qu.:0.7048 3rd Qu.:0.6557 3rd Qu.:0.3232
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## insulin mass pedigree age
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.4069 1st Qu.:0.07077 1st Qu.:0.0500
## Median :0.03605 Median :0.4769 Median :0.12575 Median :0.1333
## Mean :0.09433 Mean :0.4768 Mean :0.16818 Mean :0.2040
## 3rd Qu.:0.15041 3rd Qu.:0.5455 3rd Qu.:0.23409 3rd Qu.:0.3333
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.0000
# Untransformed summary of the eight predictors, for comparison with the
# transformed output.
summary(PimaIndiansDiabetes[,1:8])
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
# Estimate Box-Cox power-transform parameters. Box-Cox is only defined for
# strictly positive data, so columns that contain zeros are left untouched
# (the printed summary reports how many columns were actually transformed).
preprocessParams <- preProcess(PimaIndiansDiabetes[, 1:8], method = "BoxCox")
# Show the recorded transformations and the estimated lambdas.
print(preprocessParams)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - Box-Cox transformation (2)
## - ignored (0)
##
## Lambda estimates for Box-Cox transformation:
## -0.1, -1.1
# Apply the fitted Box-Cox transform to the eight predictor columns.
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[, 1:8])
# Inspect the result; only the columns that were eligible for Box-Cox change.
summary(transformed)
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age
## Min. : 0.0 Min. : 0.00 Min. :-2.5510 Min. :0.8772
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:-1.4116 1st Qu.:0.8815
## Median : 30.5 Median :32.00 Median :-0.9875 Median :0.8867
## Mean : 79.8 Mean :31.99 Mean :-0.9599 Mean :0.8874
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:-0.4680 3rd Qu.:0.8938
## Max. :846.0 Max. :67.10 Max. : 0.8838 Max. :0.9019
# Estimate Yeo-Johnson power-transform parameters for the eight predictors.
preprocessParams <- preProcess(
  PimaIndiansDiabetes[, 1:8],
  method = "YeoJohnson"
)
# Show the recorded transformations and the estimated lambdas.
print(preprocessParams)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - ignored (0)
## - Yeo-Johnson transformation (8)
##
## Lambda estimates for Yeo-Johnson transformation:
## 0.17, 0.97, 1.61, 0.51, -0.03, 1.28, -2.25, -1.15
# Apply the fitted Yeo-Johnson transform to the eight predictor columns.
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[, 1:8])
# Inspect the transformed attributes.
summary(transformed)
## pregnant glucose pressure triceps
## Min. :0.0000 Min. : 0.00 Min. : 0.0 Min. : 0.000
## 1st Qu.:0.7363 1st Qu.: 87.61 1st Qu.: 483.5 1st Qu.: 0.000
## Median :1.5663 Median :102.99 Median : 612.8 Median : 7.980
## Mean :1.5344 Mean :106.17 Mean : 598.3 Mean : 6.367
## 3rd Qu.:2.3129 3rd Qu.:122.73 3rd Qu.: 724.4 3rd Qu.: 9.738
## Max. :3.7485 Max. :172.17 Max. :1417.8 Max. :18.662
## insulin mass pedigree age
## Min. :0.000 Min. : 0.00 Min. :0.0691 Min. :0.8450
## 1st Qu.:0.000 1st Qu.: 55.10 1st Qu.:0.1724 1st Qu.:0.8484
## Median :3.264 Median : 67.21 Median :0.2265 Median :0.8524
## Mean :2.286 Mean : 67.94 Mean :0.2317 Mean :0.8530
## 3rd Qu.:4.493 3rd Qu.: 79.53 3rd Qu.:0.2956 3rd Qu.:0.8580
## Max. :6.058 Max. :170.65 Max. :0.4164 Max. :0.8644
# Estimate centering, scaling and PCA parameters. The whole data frame is
# passed here, including the non-numeric diabetes column, which preProcess()
# reports as ignored.
preprocessParams <- preProcess(
  PimaIndiansDiabetes,
  method = c("center", "scale", "pca")
)
# Show the recorded transformations and the number of components kept.
print(preprocessParams)
## Created from 768 samples and 9 variables
##
## Pre-processing:
## - centered (8)
## - ignored (1)
## - principal component signal extraction (8)
## - scaled (8)
##
## PCA needed 8 components to capture 95 percent of the variance
# Project the eight predictor columns onto the fitted principal components.
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[, 1:8])
# Inspect the principal-component scores.
summary(transformed)
## PC1 PC2 PC3 PC4
## Min. :-5.7234 Min. :-2.5884 Min. :-3.0881 Min. :-3.3193
## 1st Qu.:-0.8836 1st Qu.:-1.0341 1st Qu.:-0.6399 1st Qu.:-0.5607
## Median : 0.1187 Median :-0.2817 Median :-0.1248 Median :-0.1216
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.9679 3rd Qu.: 0.9630 3rd Qu.: 0.5221 3rd Qu.: 0.4842
## Max. : 5.1249 Max. : 3.6624 Max. : 4.9516 Max. : 4.5255
## PC5 PC6 PC7 PC8
## Min. :-2.53987 Min. :-3.2331 Min. :-3.35229 Min. :-3.07473
## 1st Qu.:-0.57205 1st Qu.:-0.4797 1st Qu.:-0.29719 1st Qu.:-0.36180
## Median :-0.06336 Median :-0.0770 Median : 0.04523 Median :-0.01823
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.56452 3rd Qu.: 0.3787 3rd Qu.: 0.38323 3rd Qu.: 0.38259
## Max. : 2.64228 Max. : 4.6103 Max. : 1.77314 Max. : 3.53473
# ICA is fitted from a randomly initialized un-mixing matrix, so fix the RNG
# seed to make the extracted components reproducible between runs.
# NOTE(review): component summaries recorded from an earlier, unseeded run
# may differ from a seeded run in component order and sign.
set.seed(7)
# Centre and scale the eight predictors, then extract 8 independent
# components.
preprocessParams <- preProcess(
  PimaIndiansDiabetes[, 1:8],
  method = c("center", "scale", "ica"),
  n.comp = 8
)
# Show the recorded transformations and the number of components used.
print(preprocessParams)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - centered (8)
## - independent component signal extraction (8)
## - ignored (0)
## - scaled (8)
##
## ICA used 8 components
# Map the eight predictor columns onto the fitted independent components.
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[, 1:8])
# Inspect the independent-component scores.
summary(transformed)
## ICA1 ICA2 ICA3 ICA4
## Min. :-4.4301 Min. :-5.1651 Min. :-5.8051 Min. :-3.3157
## 1st Qu.:-0.3025 1st Qu.:-0.3091 1st Qu.:-0.4401 1st Qu.:-0.7405
## Median : 0.1287 Median : 0.1130 Median : 0.3015 Median : 0.2582
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5586 3rd Qu.: 0.5222 3rd Qu.: 0.6919 3rd Qu.: 0.8413
## Max. : 2.4277 Max. : 3.4113 Max. : 1.3479 Max. : 1.5428
## ICA5 ICA6 ICA7 ICA8
## Min. :-7.4430 Min. :-4.27337 Min. :-4.48455 Min. :-5.05413
## 1st Qu.:-0.2265 1st Qu.:-0.61180 1st Qu.:-0.52315 1st Qu.:-0.76960
## Median : 0.1325 Median :-0.08804 Median : 0.08481 Median :-0.08342
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.4660 3rd Qu.: 0.55630 3rd Qu.: 0.56377 3rd Qu.: 0.97576
## Max. : 3.2383 Max. : 3.15379 Max. : 4.64048 Max. : 1.92491