# Location of the local copy of the iris CSV (absolute, machine-specific path).
filename <- "C:/Users/18324/Documents/AI4OPT/R stats/iris.csv"
# The file is read without a header row, so columns get default names.
dataset <- read.csv(file = filename, header = FALSE)
# Preview the leading rows (head() shows six by default).
head(x = dataset, n = 6L)
## V1 V2 V3 V4 V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
library(RCurl)
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Download the file as a single string. TLS certificate verification stays
# enabled so the data cannot be silently tampered with in transit.
# NOTE(review): if verification fails because no CA bundle is found on this
# machine, supply one via the `cainfo` option instead of turning the check off.
downloaded <- getURL(url, ssl.verifypeer = TRUE)
# Parse the downloaded text through an in-memory connection, then close the
# connection so it is not leaked for the rest of the session.
connection <- textConnection(downloaded)
dataset <- read.csv(connection, header = FALSE)
close(connection)
# Preview the leading rows (head() shows six by default).
head(dataset)
## V1 V2 V3 V4 V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
data(iris)

# Histogram of each of the first four columns, four panels in one row.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  hist(iris[, i], main = colnames(iris)[i])
}

# Density estimate of each of the first four columns.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  plot(density(iris[, i]), main = colnames(iris)[i])
}

# Box plot of each of the first four columns.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  boxplot(iris[, i], main = colnames(iris)[i])
}

library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mlbench)
# Load the Soybean data and draw its missingness map using the two given
# colours, with the legend switched off.
data(Soybean)
missmap(Soybean, col = c("black", "grey"), legend = FALSE)
# MULTIVARIATE VISUALIZATION
library(corrplot)
## corrplot 0.92 loaded
data("iris")
# Pairwise correlations between the four numeric attributes.
correlations <- cor(iris[, 1:4])
# Draw the correlation matrix. Without this call `correlations` was computed
# and the corrplot package was loaded, but neither was ever used.
corrplot(correlations, method = "circle")
# Scatter-plot matrix of every pair of columns.
pairs(iris)

# Same scatter-plot matrix with points coloured by species.
pairs(Species ~ ., data = iris, col = iris$Species)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

# Split iris into the first four columns (inputs) and the fifth (class label).
x <- iris[, 1:4]
y <- iris[[5]]
# Axis settings passed to featurePlot: "free" relation on both x and y.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
# Density plot of each input, grouped by class.
featurePlot(x = x, y = y, plot = "density", scales = scales)

# Box plot of each input, grouped by class.
featurePlot(x = x, y = y, plot = "box")

#Section Peek At your Data
library(mlbench)
# Load the Pima Indians diabetes data and show its first 20 rows.
data(PimaIndiansDiabetes)
head(x = PimaIndiansDiabetes, n = 20L)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
## 7 3 78 50 32 88 31.0 0.248 26 pos
## 8 10 115 0 0 0 35.3 0.134 29 neg
## 9 2 197 70 45 543 30.5 0.158 53 pos
## 10 8 125 96 0 0 0.0 0.232 54 pos
## 11 4 110 92 0 0 37.6 0.191 30 neg
## 12 10 168 74 0 0 38.0 0.537 34 pos
## 13 10 139 80 0 0 27.1 1.441 57 neg
## 14 1 189 60 23 846 30.1 0.398 59 pos
## 15 5 166 72 19 175 25.8 0.587 51 pos
## 16 7 100 0 0 0 30.0 0.484 32 pos
## 17 0 118 84 47 230 45.8 0.551 31 pos
## 18 7 107 74 0 0 29.6 0.254 31 pos
## 19 1 103 30 38 83 43.3 0.183 33 neg
## 20 1 115 70 30 96 34.6 0.529 32 pos
# Number of rows and columns in the diabetes data.
dim(x = PimaIndiansDiabetes)
## [1] 768 9
# Load the Boston housing data and report the class of every column.
data(BostonHousing)
sapply(X = BostonHousing, FUN = class)
## crim zn indus chas nox rm age dis
## "numeric" "numeric" "numeric" "factor" "numeric" "numeric" "numeric" "numeric"
## rad tax ptratio b lstat medv
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# Distribution of the class variable: absolute counts and percentages.
y <- PimaIndiansDiabetes$diabetes
# Multiply AFTER normalising: prop.table() rescales its input to sum to 1,
# so scaling the counts by 100 before the call has no effect on the result.
cbind(freq = table(y), percentage = prop.table(table(y)) * 100)
## freq percentage
## neg 500 65.10417
## pos 268 34.89583
# Five-number summary (plus mean) of the glucose column.
summary(PimaIndiansDiabetes$glucose)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 99.0 117.0 120.9 140.2 199.0
#Chapter 9
library(caret)
data(iris)
print("Scale Data")
## [1] "Scale Data"
# Summary of the first four columns before any transform is applied.
summary(object = iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Estimate the "scale" pre-processing parameters from the first four columns.
preProcessParams <- preProcess(x = iris[, 1:4], method = "scale")
print(preProcessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - scaled (4)
# Apply the fitted parameters to the same columns and summarise the result.
transformed <- predict(preProcessParams, newdata = iris[, 1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :5.193 Min. : 4.589 Min. :0.5665 Min. :0.1312
## 1st Qu.:6.159 1st Qu.: 6.424 1st Qu.:0.9064 1st Qu.:0.3936
## Median :7.004 Median : 6.883 Median :2.4642 Median :1.7055
## Mean :7.057 Mean : 7.014 Mean :2.1288 Mean :1.5734
## 3rd Qu.:7.729 3rd Qu.: 7.571 3rd Qu.:2.8890 3rd Qu.:2.3615
## Max. :9.540 Max. :10.095 Max. :3.9087 Max. :3.2798
# Manual check on one attribute: the smallest Sepal.Length divided by that
# column's standard deviation.
ISd <- sd(iris[["Sepal.Length"]])
ISd
## [1] 0.8280661
min(iris[["Sepal.Length"]]) / ISd
## [1] 5.192822
print("Center Data")
## [1] "Center Data"
# Summary of the first four columns before centering.
summary(object = iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Estimate the "center" pre-processing parameters from the first four columns.
preProcessParams <- preProcess(x = iris[, 1:4], method = "center")
print(preProcessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
# Apply the fitted parameters to the same columns and summarise the result.
transformed <- predict(preProcessParams, newdata = iris[, 1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.54333 Min. :-1.05733 Min. :-2.758 Min. :-1.0993
## 1st Qu.:-0.74333 1st Qu.:-0.25733 1st Qu.:-2.158 1st Qu.:-0.8993
## Median :-0.04333 Median :-0.05733 Median : 0.592 Median : 0.1007
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.: 0.55667 3rd Qu.: 0.24267 3rd Qu.: 1.342 3rd Qu.: 0.6007
## Max. : 2.05667 Max. : 1.34267 Max. : 3.142 Max. : 1.3007
print("Standardize data")
## [1] "Standardize data"
# Summary of the first four columns before standardizing.
summary(object = iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Combine centering and scaling in a single pre-processing specification.
preProcessParams <- preProcess(
  x = iris[, 1:4],
  method = c("center", "scale")
)
print(preProcessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - scaled (4)
# Apply the fitted parameters to the same columns and summarise the result.
transformed <- predict(preProcessParams, newdata = iris[, 1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
print("Normalize Data")
## [1] "Normalize Data"
# Estimate the "range" pre-processing parameters from the first four columns.
preProcessParams <- preProcess(x = iris[, 1:4], method = "range")
print(preProcessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - re-scaling to [0, 1] (4)
# Apply the fitted parameters to the same columns and summarise the result.
transformed <- predict(preProcessParams, newdata = iris[, 1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2222 1st Qu.:0.3333 1st Qu.:0.1017 1st Qu.:0.08333
## Median :0.4167 Median :0.4167 Median :0.5678 Median :0.50000
## Mean :0.4287 Mean :0.4406 Mean :0.4675 Mean :0.45806
## 3rd Qu.:0.5833 3rd Qu.:0.5417 3rd Qu.:0.6949 3rd Qu.:0.70833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
print("Box-Cox Transform")
## [1] "Box-Cox Transform"
library(mlbench)
library(caret)
data(PimaIndiansDiabetes)
# Summary of columns 7 and 8 (pedigree, age) before the transform.
summary(object = PimaIndiansDiabetes[, 7:8])
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
# Estimate the Box-Cox parameters from columns 7 and 8.
preProcessParams <- preProcess(
  x = PimaIndiansDiabetes[, 7:8],
  method = "BoxCox"
)
print(preProcessParams)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - Box-Cox transformation (2)
## - ignored (0)
##
## Lambda estimates for Box-Cox transformation:
## -0.1, -1.1
# Apply the fitted transform to the same columns and summarise the result.
transformed <- predict(preProcessParams, newdata = PimaIndiansDiabetes[, 7:8])
summary(transformed)
## pedigree age
## Min. :-2.5510 Min. :0.8772
## 1st Qu.:-1.4116 1st Qu.:0.8815
## Median :-0.9875 Median :0.8867
## Mean :-0.9599 Mean :0.8874
## 3rd Qu.:-0.4680 3rd Qu.:0.8938
## Max. : 0.8838 Max. :0.9019
print("Yeo Johnson Transform")
## [1] "Yeo Johnson Transform"
data(PimaIndiansDiabetes)
# Summary of columns 7 and 8 (pedigree, age) before the transform.
summary(object = PimaIndiansDiabetes[, 7:8])
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
# Estimate the Yeo-Johnson parameters from columns 7 and 8.
preProcessParams <- preProcess(
  x = PimaIndiansDiabetes[, 7:8],
  method = "YeoJohnson"
)
print(preProcessParams)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - ignored (0)
## - Yeo-Johnson transformation (2)
##
## Lambda estimates for Yeo-Johnson transformation:
## -2.25, -1.15
# Apply the fitted transform to the same columns and summarise the result.
transformed <- predict(preProcessParams, newdata = PimaIndiansDiabetes[, 7:8])
summary(transformed)
## pedigree age
## Min. :0.0691 Min. :0.8450
## 1st Qu.:0.1724 1st Qu.:0.8484
## Median :0.2265 Median :0.8524
## Mean :0.2317 Mean :0.8530
## 3rd Qu.:0.2956 3rd Qu.:0.8580
## Max. :0.4164 Max. :0.8644
print("Principal Component Analysis Transform")
## [1] "Principal Component Analysis Transform"
data("iris")
# Summary of every column, including the Species factor.
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Center, scale, then extract principal components. The whole data frame is
# passed in; the printed report below lists one variable as ignored.
preProcessParams <- preProcess(
  x = iris,
  method = c("center", "scale", "pca")
)
print(preProcessParams)
## Created from 150 samples and 5 variables
##
## Pre-processing:
## - centered (4)
## - ignored (1)
## - principal component signal extraction (4)
## - scaled (4)
##
## PCA needed 2 components to capture 95 percent of the variance
# Apply the fitted transform to the full data frame and summarise the result.
transformed <- predict(preProcessParams, newdata = iris)
summary(transformed)
## Species PC1 PC2
## setosa :50 Min. :-2.7651 Min. :-2.67732
## versicolor:50 1st Qu.:-2.0957 1st Qu.:-0.59205
## virginica :50 Median : 0.4169 Median :-0.01744
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.3385 3rd Qu.: 0.59649
## Max. : 3.2996 Max. : 2.64521
print("Independent Component Analysis Transform")
## [1] "Independent Component Analysis Transform"
library(fastICA)
data(PimaIndiansDiabetes)
# Only columns 7 and 8 (pedigree, age) are previewed here, although the
# transform that follows is fitted on columns 1 through 8.
summary(object = PimaIndiansDiabetes[, 7:8])
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
# Center, scale, then extract independent components from columns 1 to 8,
# requesting five components.
# NOTE(review): no seed is set before this call; results may differ between
# runs if the ICA fit uses random initialisation - confirm.
preProcessParams <- preProcess(
  x = PimaIndiansDiabetes[, 1:8],
  method = c("center", "scale", "ica"),
  n.comp = 5
)
print(preProcessParams)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - centered (8)
## - independent component signal extraction (8)
## - ignored (0)
## - scaled (8)
##
## ICA used 5 components
# Apply the fitted transform to the same columns and summarise the result.
transformed <- predict(preProcessParams, newdata = PimaIndiansDiabetes[, 1:8])
summary(transformed)
## ICA1 ICA2 ICA3 ICA4
## Min. :-2.38087 Min. :-4.89623 Min. :-6.0212 Min. :-1.4141
## 1st Qu.:-0.73737 1st Qu.:-0.48395 1st Qu.:-0.4290 1st Qu.:-0.8414
## Median : 0.07183 Median : 0.02404 Median : 0.2595 Median :-0.2787
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.72242 3rd Qu.: 0.59486 3rd Qu.: 0.6822 3rd Qu.: 0.7691
## Max. : 2.93767 Max. : 4.17244 Max. : 1.5749 Max. : 3.0683
## ICA5
## Min. :-5.5400
## 1st Qu.:-0.4688
## Median : 0.1392
## Mean : 0.0000
## 3rd Qu.: 0.6491
## Max. : 3.2184