#Chapter 6
#Load Data from CSV file
#Define the filename
filename <- "iris.csv"
#Load the CSV file from the local directory.
#The first line of iris.csv holds the column names, so read it as a header;
#with header = FALSE it becomes data row 1 and every column is read as text
dataset <- read.csv(filename, header = TRUE)
#Preview the first 5 rows (head() alone would print 6)
head(dataset, n = 5)
## sepal.length sepal.width petal.length petal.width variety
## 1 5.1 3.5 1.4 0.2 Setosa
## 2 4.9 3.0 1.4 0.2 Setosa
## 3 4.7 3.2 1.3 0.2 Setosa
## 4 4.6 3.1 1.5 0.2 Setosa
## 5 5.0 3.6 1.4 0.2 Setosa
#Load Data from CSV URL
#Load the package
library(RCurl)
#Specify the URL for the Iris data CSV
urlfile <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
#Download the file. Certificate verification is no longer switched off, so
#the server's identity is checked before its data is trusted
downloaded <- getURL(urlfile)
#Parse the downloaded text as CSV. Passing it through the text argument
#avoids opening a text connection by hand that is then never closed
dataset <- read.csv(text = downloaded, header = FALSE)
#Preview the first six rows
head(dataset)
## V1 V2 V3 V4 V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
#Chapter 7
#Peek at your Data
#Load the package
library(mlbench)
#Bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
#Show rows 1 through 20
PimaIndiansDiabetes[seq_len(20), ]
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
## 7 3 78 50 32 88 31.0 0.248 26 pos
## 8 10 115 0 0 0 35.3 0.134 29 neg
## 9 2 197 70 45 543 30.5 0.158 53 pos
## 10 8 125 96 0 0 0.0 0.232 54 pos
## 11 4 110 92 0 0 37.6 0.191 30 neg
## 12 10 168 74 0 0 38.0 0.537 34 pos
## 13 10 139 80 0 0 27.1 1.441 57 neg
## 14 1 189 60 23 846 30.1 0.398 59 pos
## 15 5 166 72 19 175 25.8 0.587 51 pos
## 16 7 100 0 0 0 30.0 0.484 32 pos
## 17 0 118 84 47 230 45.8 0.551 31 pos
## 18 7 107 74 0 0 29.6 0.254 31 pos
## 19 1 103 30 38 83 43.3 0.183 33 neg
## 20 1 115 70 30 96 34.6 0.529 32 pos
#Dimensions of your data
#Load the package
library(mlbench)
#Bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
#Report the number of rows followed by the number of columns
c(nrow(PimaIndiansDiabetes), ncol(PimaIndiansDiabetes))
## [1] 768 9
#Data Types
#Load the package
library(mlbench)
#Bring the Boston Housing data frame into the workspace
data("BostonHousing")
#Report the class of every column as a named character vector
vapply(BostonHousing, class, character(1))
## crim zn indus chas nox rm age dis
## "numeric" "numeric" "numeric" "factor" "numeric" "numeric" "numeric" "numeric"
## rad tax ptratio b lstat medv
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
#Class Distribution
#Load the package
library(mlbench)
#Bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
#Tabulate the class column: raw counts next to their share in percent
y <- PimaIndiansDiabetes[["diabetes"]]
cbind(
  freq = table(y),
  percentage = 100 * prop.table(table(y))
)
## freq percentage
## neg 500 65.10417
## pos 268 34.89583
#Data Summary
#Bring the built-in iris data frame into the workspace
data("iris")
#Print per-column summary statistics
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
#Standard Deviations
#Load the package
library(mlbench)
#Bring the Pima Indians Diabetes data frame into the workspace
data(PimaIndiansDiabetes)
#Standard deviation of each of the first eight columns, one number per column
vapply(PimaIndiansDiabetes[1:8], sd, numeric(1))
## pregnant glucose pressure triceps insulin mass
## 3.3695781 31.9726182 19.3558072 15.9522176 115.2440024 7.8841603
## pedigree age
## 0.3313286 11.7602315
#Skewness
#Library
library(mlbench)
library(e1071)
#Dataset
data("PimaIndiansDiabetes")
#Calc skewness for each of the first eight columns.
#vapply() walks the data frame column by column, so nothing is coerced to a
#matrix first, and it insists that every column yields exactly one number
skew <- vapply(PimaIndiansDiabetes[, 1:8], skewness, numeric(1))
#Display
print(skew)
## pregnant glucose pressure triceps insulin mass pedigree
## 0.8981549 0.1730754 -1.8364126 0.1089456 2.2633826 -0.4273073 1.9124179
## age
## 1.1251880
#Correlations
#Library
library(mlbench)
#Bring the Pima Indians Diabetes data frame into the workspace
data(PimaIndiansDiabetes)
#Pairwise correlations between the first eight columns
correlations <- cor(PimaIndiansDiabetes[1:8])
#Show the resulting matrix
print(correlations)
## pregnant glucose pressure triceps insulin mass
## pregnant 1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461 0.01768309
## glucose 0.12945867 1.00000000 0.15258959 0.05732789 0.33135711 0.22107107
## pressure 0.14128198 0.15258959 1.00000000 0.20737054 0.08893338 0.28180529
## triceps -0.08167177 0.05732789 0.20737054 1.00000000 0.43678257 0.39257320
## insulin -0.07353461 0.33135711 0.08893338 0.43678257 1.00000000 0.19785906
## mass 0.01768309 0.22107107 0.28180529 0.39257320 0.19785906 1.00000000
## pedigree -0.03352267 0.13733730 0.04126495 0.18392757 0.18507093 0.14064695
## age 0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295 0.03624187
## pedigree age
## pregnant -0.03352267 0.54434123
## glucose 0.13733730 0.26351432
## pressure 0.04126495 0.23952795
## triceps 0.18392757 -0.11397026
## insulin 0.18507093 -0.04216295
## mass 0.14064695 0.03624187
## pedigree 1.00000000 0.03356131
## age 0.03356131 1.00000000
#Chapter 8
#Histograms
#Load data
data("iris")
#Create one histogram per measurement column, side by side.
#par() returns the settings it replaces, so keep them and put them back
#afterwards; otherwise the 1 x 4 layout leaks into every later plot
old_par <- par(mfrow = c(1, 4))
for (i in 1:4) {
  hist(iris[, i], main = names(iris)[i])
}
par(old_par)

#Density plots
#Package
library(lattice)
#Dataset
data("iris")
#Create panel of density plots, one per measurement column.
#par() returns the settings it replaces, so keep them and put them back
#afterwards; otherwise the 1 x 4 layout leaks into every later plot
old_par <- par(mfrow = c(1, 4))
for (i in 1:4) {
  plot(density(iris[, i]), main = names(iris)[i])
}
par(old_par)

#Box and Whisker plots
#Load data
data("iris")
#Create box plot for each attribute, side by side.
#par() returns the settings it replaces, so keep them and put them back
#afterwards; otherwise the 1 x 4 layout leaks into every later plot
old_par <- par(mfrow = c(1, 4))
for (i in 1:4) {
  boxplot(iris[, i], main = names(iris)[i])
}
par(old_par)

#Bar plots
#Package
library(mlbench)
#Dataset
data("BreastCancer")
#Create one bar plot for each of columns 2 to 9 on a 2 x 4 grid.
#par() returns the settings it replaces, so keep them and put them back
#afterwards; otherwise the grid stays active and the base-graphics plots
#drawn later in this script are squeezed into a single cell each
old_par <- par(mfrow = c(2, 4))
for (i in 2:9) {
  counts <- table(BreastCancer[, i])
  name <- names(BreastCancer)[i]
  barplot(counts, main = name)
}
par(old_par)

#Missing Plot
#Library
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mlbench)
#Bring the Soybean data frame into the workspace
data("Soybean")
#Draw the missingness map in black and grey with the legend switched off
missmap(
  Soybean,
  legend = FALSE,
  col = c("black", "grey")
)

#Correlation plot
#Package
library(corrplot)
## corrplot 0.92 loaded
#Bring the built-in iris data frame into the workspace
data(iris)
#Pairwise correlations between the four measurement columns
correlations <- cor(iris[1:4])
#Draw the correlation matrix using circles
corrplot(correlations, method = "circle")

#Scatterplot matrix
#Bring the built-in iris data frame into the workspace
data("iris")
#Plot every column against every other column
pairs(x = iris)

#Scatterplot by class
#Bring the built-in iris data frame into the workspace
data("iris")
#Pair-wise scatterplot with the points coloured by species
pairs(
  Species ~ .,
  data = iris,
  col = iris[["Species"]]
)

#Density by class
#Package
library(caret)
## Loading required package: ggplot2
library(ggplot2)
#Bring the built-in iris data frame into the workspace
data(iris)
#Split the data into the four measurement columns and the class column
x <- iris[1:4]
y <- iris[["Species"]]
#Let each panel choose its own x and y axis ranges
scales <- setNames(rep(list(list(relation = "free")), 2), c("x", "y"))
#Density plot for each attribute, broken down by class
featurePlot(x = x, y = y, plot = "density", scales = scales)

#Box and Whisker by class
#Package
library(caret)
#Bring the built-in iris data frame into the workspace
data(iris)
#Split the data into the four measurement columns and the class column
x <- iris[1:4]
y <- iris[["Species"]]
#Box and whisker plot for each attribute, broken down by class
featurePlot(x = x, y = y, plot = "box")

#Chapter 9
#Scale Data
#Packages
library(caret)
#Bring the built-in iris data frame into the workspace
data("iris")
#Summary statistics of the four measurement columns before transforming
summary(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
#Estimate the scaling parameters from the measurement columns
preprocessParams <- preProcess(x = iris[1:4], method = "scale")
#Report which transforms were fitted
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - scaled (4)
#Apply the fitted scaling to the same columns
transformed <- predict(preprocessParams, newdata = iris[1:4])
#Summary statistics after scaling
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :5.193 Min. : 4.589 Min. :0.5665 Min. :0.1312
## 1st Qu.:6.159 1st Qu.: 6.424 1st Qu.:0.9064 1st Qu.:0.3936
## Median :7.004 Median : 6.883 Median :2.4642 Median :1.7055
## Mean :7.057 Mean : 7.014 Mean :2.1288 Mean :1.5734
## 3rd Qu.:7.729 3rd Qu.: 7.571 3rd Qu.:2.8890 3rd Qu.:2.3615
## Max. :9.540 Max. :10.095 Max. :3.9087 Max. :3.2798
#Center Data
#Packages
library(caret)
#Bring the built-in iris data frame into the workspace
data("iris")
#Summary statistics of the four measurement columns before transforming
summary(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
#Estimate the centering parameters from the measurement columns
preprocessParams <- preProcess(x = iris[1:4], method = "center")
#Report which transforms were fitted
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
#Apply the fitted centering to the same columns
transformed <- predict(preprocessParams, newdata = iris[1:4])
#Summary statistics after centering
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.54333 Min. :-1.05733 Min. :-2.758 Min. :-1.0993
## 1st Qu.:-0.74333 1st Qu.:-0.25733 1st Qu.:-2.158 1st Qu.:-0.8993
## Median :-0.04333 Median :-0.05733 Median : 0.592 Median : 0.1007
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.: 0.55667 3rd Qu.: 0.24267 3rd Qu.: 1.342 3rd Qu.: 0.6007
## Max. : 2.05667 Max. : 1.34267 Max. : 3.142 Max. : 1.3007
#Standardize data
#Packages
library(caret)
#Bring the built-in iris data frame into the workspace
data("iris")
#Summary statistics of the four measurement columns before transforming
summary(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
#Estimate centering and scaling parameters from the measurement columns
preprocessParams <- preProcess(
  x = iris[1:4],
  method = c("center", "scale")
)
#Report which transforms were fitted
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - scaled (4)
#Apply the fitted centering and scaling to the same columns
transformed <- predict(preprocessParams, newdata = iris[1:4])
#Summary statistics after standardizing
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
#Normalize data
#Packages
library(caret)
#Bring the built-in iris data frame into the workspace
data("iris")
#Summary statistics of the four measurement columns before transforming
summary(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
#Estimate the range-rescaling parameters from the measurement columns
preprocessParams <- preProcess(x = iris[1:4], method = "range")
#Report which transforms were fitted
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - re-scaling to [0, 1] (4)
#Apply the fitted rescaling to the same columns
transformed <- predict(preprocessParams, newdata = iris[1:4])
#Summary statistics after rescaling
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2222 1st Qu.:0.3333 1st Qu.:0.1017 1st Qu.:0.08333
## Median :0.4167 Median :0.4167 Median :0.5678 Median :0.50000
## Mean :0.4287 Mean :0.4406 Mean :0.4675 Mean :0.45806
## 3rd Qu.:0.5833 3rd Qu.:0.5417 3rd Qu.:0.6949 3rd Qu.:0.70833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
#Box-Cox Transform
#Packages
library(mlbench)
library(caret)
#Bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
#Summary statistics of the pedigree and age columns before transforming
summary(PimaIndiansDiabetes[c("pedigree", "age")])
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
#Estimate the Box-Cox parameters from those two columns
preprocessParams <- preProcess(
  x = PimaIndiansDiabetes[c("pedigree", "age")],
  method = "BoxCox"
)
#Report which transforms were fitted
print(preprocessParams)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - Box-Cox transformation (2)
## - ignored (0)
##
## Lambda estimates for Box-Cox transformation:
## -0.1, -1.1
#Apply the fitted transform to the same two columns
transformed <- predict(
  preprocessParams,
  newdata = PimaIndiansDiabetes[c("pedigree", "age")]
)
#Summary statistics after the transform
summary(transformed)
## pedigree age
## Min. :-2.5510 Min. :0.8772
## 1st Qu.:-1.4116 1st Qu.:0.8815
## Median :-0.9875 Median :0.8867
## Mean :-0.9599 Mean :0.8874
## 3rd Qu.:-0.4680 3rd Qu.:0.8938
## Max. : 0.8838 Max. :0.9019
#Yeo-Johnson transform
#Packages
library(mlbench)
library(caret)
#Bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
#Summary statistics of the pedigree and age columns before transforming
summary(PimaIndiansDiabetes[c("pedigree", "age")])
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
#Estimate the Yeo-Johnson parameters from those two columns
preprocessParams <- preProcess(
  x = PimaIndiansDiabetes[c("pedigree", "age")],
  method = "YeoJohnson"
)
#Report which transforms were fitted
print(preprocessParams)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - ignored (0)
## - Yeo-Johnson transformation (2)
##
## Lambda estimates for Yeo-Johnson transformation:
## -2.25, -1.15
#Apply the fitted transform to the same two columns
transformed <- predict(
  preprocessParams,
  newdata = PimaIndiansDiabetes[c("pedigree", "age")]
)
#Summary statistics after the transform (note pedigree and age)
summary(transformed)
## pedigree age
## Min. :0.0691 Min. :0.8450
## 1st Qu.:0.1724 1st Qu.:0.8484
## Median :0.2265 Median :0.8524
## Mean :0.2317 Mean :0.8530
## 3rd Qu.:0.2956 3rd Qu.:0.8580
## Max. :0.4164 Max. :0.8644
#Principal Component Analysis Transform
#Packages
library(mlbench)
#caret supplies preProcess(); load it here, as the other sections of this
#chapter do, so that this section also runs on its own
library(caret)
#Dataset
data(iris)
#Summarize data
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
#Calc pre process parameters from dataset
#(the whole data frame is passed in; the report below lists one variable
#as ignored and four as centered, scaled and used for PCA)
preprocessParams <- preProcess(iris, method = c("center", "scale", "pca"))
#Summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 5 variables
##
## Pre-processing:
## - centered (4)
## - ignored (1)
## - principal component signal extraction (4)
## - scaled (4)
##
## PCA needed 2 components to capture 95 percent of the variance
#Transform dataset using parameters
transformed <- predict(preprocessParams, iris)
#Summarize transformed dataset
summary(transformed)
## Species PC1 PC2
## setosa :50 Min. :-2.7651 Min. :-2.67732
## versicolor:50 1st Qu.:-2.0957 1st Qu.:-0.59205
## virginica :50 Median : 0.4169 Median :-0.01744
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.3385 3rd Qu.: 0.59649
## Max. : 3.2996 Max. : 2.64521
#Independent Component Analysis Transform
#Packages
library(mlbench)
#caret supplies preProcess(); load it here, as the other sections of this
#chapter do, so that this section also runs on its own
library(caret)
#Dataset
data(PimaIndiansDiabetes)
#Summarize data
summary(PimaIndiansDiabetes[,1:8])
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
#Calc pre process parameters from dataset, asking for 5 components
#NOTE(review): ICA fitting presumably starts from random values, so the
#figures pasted below may not reproduce exactly; consider calling set.seed()
#first - confirm
preprocessParams <- preProcess(PimaIndiansDiabetes[,1:8], method = c("center", "scale", "ica"), n.comp = 5)
#Summarize transform parameters
print(preprocessParams)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - centered (8)
## - independent component signal extraction (8)
## - ignored (0)
## - scaled (8)
##
## ICA used 5 components
#Transform dataset using parameters
transformed <- predict(preprocessParams, PimaIndiansDiabetes[,1:8])
#Summarize transformed dataset
summary(transformed)
## ICA1 ICA2 ICA3 ICA4
## Min. :-4.89611 Min. :-2.1765 Min. :-1.3614 Min. :-1.9059
## 1st Qu.:-0.48097 1st Qu.:-0.6853 1st Qu.:-0.6963 1st Qu.:-0.8265
## Median : 0.04968 Median :-0.1826 Median :-0.2983 Median :-0.2394
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.56587 3rd Qu.: 0.4900 3rd Qu.: 0.4671 3rd Qu.: 0.7065
## Max. : 4.25493 Max. : 5.7207 Max. : 6.0287 Max. : 2.9733
## ICA5
## Min. :-3.757140
## 1st Qu.:-0.631258
## Median :-0.005211
## Mean : 0.000000
## 3rd Qu.: 0.644065
## Max. : 2.573035