library(RCurl)
# specify the URL for the Iris data CSV
urlfile <- "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# download the file as a single character string; peer certificate
# verification is deliberately NOT switched off, so a spoofed or tampered
# HTTPS response is rejected instead of being parsed silently
downloaded <- getURL(urlfile)
# treat the text data as a stream so we can read from it
connection <- textConnection(downloaded)
# parse the downloaded data as CSV (no header row, so columns become V1..V5)
dataset <- read.csv(connection, header = FALSE)
# textConnection() hands back an already-open connection that read.csv()
# does not close for us, so release it explicitly
close(connection)
# preview the first 6 rows (the default for head())
head(dataset)
## V1 V2 V3 V4 V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
# attach mlbench, the package that ships the example data
library(mlbench)
# bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
# peek at the leading 20 observations
head(PimaIndiansDiabetes, n = 20L)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
## 7 3 78 50 32 88 31.0 0.248 26 pos
## 8 10 115 0 0 0 35.3 0.134 29 neg
## 9 2 197 70 45 543 30.5 0.158 53 pos
## 10 8 125 96 0 0 0.0 0.232 54 pos
## 11 4 110 92 0 0 37.6 0.191 30 neg
## 12 10 168 74 0 0 38.0 0.537 34 pos
## 13 10 139 80 0 0 27.1 1.441 57 neg
## 14 1 189 60 23 846 30.1 0.398 59 pos
## 15 5 166 72 19 175 25.8 0.587 51 pos
## 16 7 100 0 0 0 30.0 0.484 32 pos
## 17 0 118 84 47 230 45.8 0.551 31 pos
## 18 7 107 74 0 0 29.6 0.254 31 pos
## 19 1 103 30 38 83 43.3 0.183 33 neg
## 20 1 115 70 30 96 34.6 0.529 32 pos
# attach mlbench, the package that ships the example data
library(mlbench)
# bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
# report the number of rows followed by the number of columns
c(nrow(PimaIndiansDiabetes), ncol(PimaIndiansDiabetes))
## [1] 768 9
# attach mlbench, the package that ships the example data
library(mlbench)
# bring the Boston Housing data frame into the workspace
data("BostonHousing")
# report the class of every column as a named character vector
# (each column here carries exactly one class, so one string per column)
vapply(BostonHousing, class, character(1))
## crim zn indus chas nox rm age dis
## "numeric" "numeric" "numeric" "factor" "numeric" "numeric" "numeric" "numeric"
## rad tax ptratio b lstat medv
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# attach mlbench, the package that ships the example data
library(mlbench)
# bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
# tabulate the class label: absolute counts next to their share in percent
y <- PimaIndiansDiabetes[["diabetes"]]
cbind(
  freq = table(y),
  percentage = 100 * prop.table(table(y))
)
## freq percentage
## neg 500 65.10417
## pos 268 34.89583
# make the built-in iris data frame available
data("iris")
# per-column summary: quartiles/mean for numeric columns, counts for Species
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# attach mlbench, the package that ships the example data
library(mlbench)
# bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
# standard deviation of each of the first 8 columns (everything except the
# diabetes label in column 9), returned as a named numeric vector
vapply(PimaIndiansDiabetes[1:8], sd, numeric(1))
## pregnant glucose pressure triceps insulin mass
## 3.3695781 31.9726182 19.3558072 15.9522176 115.2440024 7.8841603
## pedigree age
## 0.3313286 11.7602315
# load packages (e1071 provides skewness())
library(mlbench)
library(e1071)
# load the dataset
data(PimaIndiansDiabetes)
# calculate skewness for each of the first 8 columns (all but the diabetes
# label); iterating over the data frame's columns directly avoids the
# data-frame-to-matrix conversion that apply() would perform first
skew <- sapply(PimaIndiansDiabetes[, 1:8], skewness)
# display skewness, larger/smaller deviations from 0 show more skew
print(skew)
## pregnant glucose pressure triceps insulin mass pedigree
## 0.8981549 0.1730754 -1.8364126 0.1089456 2.2633826 -0.4273073 1.9124179
## age
## 1.1251880
# attach mlbench, the package that ships the example data
library(mlbench)
# bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
# pairwise correlations between the first 8 columns (the diabetes label in
# column 9 is left out)
correlations <- cor(x = PimaIndiansDiabetes[1:8])
# show the resulting 8 x 8 matrix
print(correlations)
## pregnant glucose pressure triceps insulin mass
## pregnant 1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461 0.01768309
## glucose 0.12945867 1.00000000 0.15258959 0.05732789 0.33135711 0.22107107
## pressure 0.14128198 0.15258959 1.00000000 0.20737054 0.08893338 0.28180529
## triceps -0.08167177 0.05732789 0.20737054 1.00000000 0.43678257 0.39257320
## insulin -0.07353461 0.33135711 0.08893338 0.43678257 1.00000000 0.19785906
## mass 0.01768309 0.22107107 0.28180529 0.39257320 0.19785906 1.00000000
## pedigree -0.03352267 0.13733730 0.04126495 0.18392757 0.18507093 0.14064695
## age 0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295 0.03624187
## pedigree age
## pregnant -0.03352267 0.54434123
## glucose 0.13733730 0.26351432
## pressure 0.04126495 0.23952795
## triceps 0.18392757 -0.11397026
## insulin 0.18507093 -0.04216295
## mass 0.14064695 0.03624187
## pedigree 1.00000000 0.03356131
## age 0.03356131 1.00000000
# Scale Data
# attach caret for preProcess()
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# make the built-in iris data frame available
data("iris")
# summarize the four measurement columns before transforming them
summary(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# estimate the "scale" parameters from the measurement columns
preprocessParams <- preProcess(x = iris[1:4], method = "scale")
# report which transforms the parameter object holds
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - scaled (4)
# apply the estimated parameters to the same columns
transformed <- predict(preprocessParams, newdata = iris[1:4])
# summarize the scaled columns
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :5.193 Min. : 4.589 Min. :0.5665 Min. :0.1312
## 1st Qu.:6.159 1st Qu.: 6.424 1st Qu.:0.9064 1st Qu.:0.3936
## Median :7.004 Median : 6.883 Median :2.4642 Median :1.7055
## Mean :7.057 Mean : 7.014 Mean :2.1288 Mean :1.5734
## 3rd Qu.:7.729 3rd Qu.: 7.571 3rd Qu.:2.8890 3rd Qu.:2.3615
## Max. :9.540 Max. :10.095 Max. :3.9087 Max. :3.2798
# attach caret for preProcess()
library(caret)
# make the built-in iris data frame available
data("iris")
# summarize the four measurement columns before transforming them
summary(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# estimate centering and scaling parameters from the measurement columns
preprocessParams <- preProcess(
  x = iris[1:4],
  method = c("center", "scale")
)
# report which transforms the parameter object holds
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - scaled (4)
# apply the estimated parameters to the same columns
transformed <- predict(preprocessParams, newdata = iris[1:4])
# summarize the standardized columns (means come out as 0)
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
# attach caret for preProcess()
library(caret)
# make the built-in iris data frame available
data("iris")
# summarize the four measurement columns before transforming them
summary(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# estimate the "range" re-scaling parameters from the measurement columns
preprocessParams <- preProcess(x = iris[1:4], method = "range")
# report which transforms the parameter object holds
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - re-scaling to [0, 1] (4)
# apply the estimated parameters to the same columns
transformed <- predict(preprocessParams, newdata = iris[1:4])
# summarize the re-scaled columns (each now spans 0 to 1)
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2222 1st Qu.:0.3333 1st Qu.:0.1017 1st Qu.:0.08333
## Median :0.4167 Median :0.4167 Median :0.5678 Median :0.50000
## Mean :0.4287 Mean :0.4406 Mean :0.4675 Mean :0.45806
## 3rd Qu.:0.5833 3rd Qu.:0.5417 3rd Qu.:0.6949 3rd Qu.:0.70833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
# attach mlbench for the data and caret for preProcess()
library(mlbench)
library(caret)
# bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
# summarize columns 7 and 8 (pedigree and age) before transforming them
summary(PimaIndiansDiabetes[7:8])
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
# estimate Box-Cox parameters for the two columns
preprocessParams <- preProcess(
  x = PimaIndiansDiabetes[7:8],
  method = "BoxCox"
)
# report the transforms and the estimated lambdas
print(preprocessParams)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - Box-Cox transformation (2)
## - ignored (0)
##
## Lambda estimates for Box-Cox transformation:
## -0.1, -1.1
# apply the estimated parameters to the same two columns
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[7:8])
# summarize the transformed columns (note pedigree and age)
summary(transformed)
## pedigree age
## Min. :-2.5510 Min. :0.8772
## 1st Qu.:-1.4116 1st Qu.:0.8815
## Median :-0.9875 Median :0.8867
## Mean :-0.9599 Mean :0.8874
## 3rd Qu.:-0.4680 3rd Qu.:0.8938
## Max. : 0.8838 Max. :0.9019
# attach mlbench for the data and caret for preProcess()
library(mlbench)
library(caret)
# bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
# summarize columns 7 and 8 (pedigree and age) before transforming them
summary(PimaIndiansDiabetes[7:8])
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
# estimate Yeo-Johnson parameters for the two columns
preprocessParams <- preProcess(
  x = PimaIndiansDiabetes[7:8],
  method = "YeoJohnson"
)
# report the transforms and the estimated lambdas
print(preprocessParams)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - ignored (0)
## - Yeo-Johnson transformation (2)
##
## Lambda estimates for Yeo-Johnson transformation:
## -2.25, -1.15
# apply the estimated parameters to the same two columns
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[7:8])
# summarize the transformed columns (note pedigree and age)
summary(transformed)
## pedigree age
## Min. :0.0691 Min. :0.8450
## 1st Qu.:0.1724 1st Qu.:0.8484
## Median :0.2265 Median :0.8524
## Mean :0.2317 Mean :0.8530
## 3rd Qu.:0.2956 3rd Qu.:0.8580
## Max. :0.4164 Max. :0.8644
# load the packages; caret supplies preProcess(), so it is attached here
# explicitly instead of relying on an earlier section having loaded it
library(mlbench)
library(caret)
# load the dataset
data(iris)
# summarize dataset
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# calculate the pre-process parameters from the dataset; the whole data frame
# is passed in, and the printed report below shows one variable (the
# non-numeric Species column) being ignored
preprocessParams <- preProcess(iris, method = c("center", "scale", "pca"))
# summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 5 variables
##
## Pre-processing:
## - centered (4)
## - ignored (1)
## - principal component signal extraction (4)
## - scaled (4)
##
## PCA needed 2 components to capture 95 percent of the variance
# transform the dataset using the parameters
transformed <- predict(preprocessParams, iris)
# summarize the transformed dataset (Species plus the principal components)
summary(transformed)
## Species PC1 PC2
## setosa :50 Min. :-2.7651 Min. :-2.67732
## versicolor:50 1st Qu.:-2.0957 1st Qu.:-0.59205
## virginica :50 Median : 0.4169 Median :-0.01744
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.3385 3rd Qu.: 0.59649
## Max. : 3.2996 Max. : 2.64521
# attach mlbench for the data and caret for preProcess()
library(mlbench)
library(caret)
# bring the Pima Indians Diabetes data frame into the workspace
data("PimaIndiansDiabetes")
# summarize the first 8 columns (everything except the diabetes label)
summary(PimaIndiansDiabetes[1:8])
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
# estimate center/scale/ICA parameters, asking for 5 independent components
preprocessParams <- preProcess(
  x = PimaIndiansDiabetes[1:8],
  method = c("center", "scale", "ica"),
  n.comp = 5
)
# report which transforms the parameter object holds
print(preprocessParams)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - centered (8)
## - independent component signal extraction (8)
## - ignored (0)
## - scaled (8)
##
## ICA used 5 components
# apply the estimated parameters to the same 8 columns
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[1:8])
# summarize the resulting component columns
summary(transformed)
## ICA1 ICA2 ICA3 ICA4
## Min. :-1.4116 Min. :-4.89618 Min. :-5.5388 Min. :-2.38127
## 1st Qu.:-0.8440 1st Qu.:-0.48449 1st Qu.:-0.4700 1st Qu.:-0.73739
## Median :-0.2790 Median : 0.02406 Median : 0.1396 Median : 0.07218
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.7683 3rd Qu.: 0.59585 3rd Qu.: 0.6491 3rd Qu.: 0.72099
## Max. : 3.0662 Max. : 4.17273 Max. : 3.2236 Max. : 2.93288
## ICA5
## Min. :-1.5760
## 1st Qu.:-0.6829
## Median :-0.2597
## Mean : 0.0000
## 3rd Qu.: 0.4285
## Max. : 6.0214
# load the data
data(iris)
# create histograms for each attribute, side by side in a 1 x 4 grid;
# par() returns the previous settings, which are kept so they can be restored
old_par <- par(mfrow = c(1, 4))
for (i in 1:4) {
  hist(iris[, i], main = names(iris)[i])
}
# put the plot layout back so later figures are not drawn into this grid
par(old_par)

# load packages
library(lattice)
# load dataset
data(iris)
# create a panel of simpler density plots by attribute in a 1 x 4 grid;
# par() returns the previous settings, which are kept so they can be restored
old_par <- par(mfrow = c(1, 4))
for (i in 1:4) {
  plot(density(iris[, i]), main = names(iris)[i])
}
# put the plot layout back so later figures are not drawn into this grid
par(old_par)

# load dataset
data(iris)
# Create separate boxplots for each attribute in a 1 x 4 grid;
# par() returns the previous settings, which are kept so they can be restored
old_par <- par(mfrow = c(1, 4))
for (i in 1:4) {
  boxplot(iris[, i], main = names(iris)[i])
}
# put the plot layout back so later figures are not drawn into this grid
par(old_par)

# load the package
library(mlbench)
# load the dataset
data(BreastCancer)
# create a bar plot of each categorical attribute (columns 2 to 9) in a
# 2 x 4 grid; par() returns the previous settings so they can be restored
old_par <- par(mfrow = c(2, 4))
for (i in 2:9) {
  # frequency of each level in the current column
  counts <- table(BreastCancer[, i])
  name <- names(BreastCancer)[i]
  barplot(counts, main = name)
}
# put the plot layout back; otherwise the base-graphics figures that follow
# would each be squeezed into a single cell of this 2 x 4 grid
par(old_par)

# attach Amelia for missmap() and mlbench for the data
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mlbench)
# bring the Soybean data frame into the workspace
data("Soybean")
# draw the missingness map in black and grey, without a legend
missmap(
  Soybean,
  legend = FALSE,
  col = c("black", "grey")
)
# attach corrplot for the correlation plot
library(corrplot)
## corrplot 0.92 loaded
# make the built-in iris data frame available
data("iris")
# pairwise correlations between the four measurement columns
correlations <- cor(x = iris[1:4])
# draw the correlation matrix using circles
corrplot(correlations, method = "circle")
# make the built-in iris data frame available
data("iris")
# scatterplot matrix over every column of iris (the whole data frame is
# passed, so Species is plotted alongside the four measurements)
pairs(x = iris)


# make the built-in iris data frame available
data("iris")
# scatterplot matrix from the formula interface, with each point colored
# according to its Species value
pairs(Species ~ ., data = iris, col = iris[["Species"]])

# attach caret for featurePlot()
library(caret)
# make the built-in iris data frame available
data("iris")
# split into the four measurement columns (x) and the Species label (y)
x <- iris[1:4]
y <- iris[[5]]
# give every panel its own x and y axis range
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
# per-attribute density plots, one curve per class value
featurePlot(x = x, y = y, plot = "density", scales = scales)

# attach caret for featurePlot()
library(caret)
# make the built-in iris data frame available
data("iris")
# split into the four measurement columns (x) and the Species label (y)
x <- iris[1:4]
y <- iris[[5]]
# per-attribute box-and-whisker plots, grouped by class value
featurePlot(x = x, y = y, plot = "box")
