# Report banner followed by the heading of the first question.
print("Iris Dataset Analysis")
## [1] "Iris Dataset Analysis"
print("Q1: Iris Dataset Characteristics ")
## [1] "Q1: Iris Dataset Characteristics "
library(mlbench)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.2.1
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(corrplot)
## corrplot 0.92 loaded
library (DMwR2)
## Warning: package 'DMwR2' was built under R version 4.2.1
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library (dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the iris data set, report its dimensions (rows, columns), and
# preview the first 20 observations.
data("iris")
dim(iris)
## [1] 150 5
head(iris, n = 20)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
# Structure of the data frame: column types and leading values.
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Min / quartiles / mean / max of the four numeric measurements.
summary(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Class balance: number of observations per species ...
table(iris[["Species"]])
##
## setosa versicolor virginica
## 50 50 50
# ... and the same counts as proportions, rounded to two decimals.
round(prop.table(table(iris[["Species"]])), digits = 2)
##
## setosa versicolor virginica
## 0.33 0.33 0.33
print("Finding Dataset statistics")
## [1] "Finding Dataset statistics"
# Work on a tibble copy so the dplyr verbs print compactly.
iris_tbl <- tibble::as_tibble(iris)
# Mean and median of each measurement; one output column per
# <variable>_<statistic> pair.
iris_tbl %>%
  summarise(across(
    Sepal.Length:Petal.Width,
    list(mean = mean, median = median)
  ))
## # A tibble: 1 × 8
## Sepal.Length_mean Sepal.Leng…¹ Sepal…² Sepal…³ Petal…⁴ Petal…⁵ Petal…⁶ Petal…⁷
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5.84 5.8 3.06 3 3.76 4.35 1.20 1.3
## # … with abbreviated variable names ¹Sepal.Length_median, ²Sepal.Width_mean,
## # ³Sepal.Width_median, ⁴Petal.Length_mean, ⁵Petal.Length_median,
## # ⁶Petal.Width_mean, ⁷Petal.Width_median
# quantile() with its defaults yields five values per column, so this
# summarise() call returns five rows (see the output below).
# NOTE(review): newer dplyr releases deprecate multi-row summarise() results
# in favour of reframe() - confirm against the installed dplyr version.
iris_tbl %>%
  summarise(across(Sepal.Length:Petal.Width, quantile))
## # A tibble: 5 × 4
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 4.3 2 1 0.1
## 2 5.1 2.8 1.6 0.3
## 3 5.8 3 4.35 1.3
## 4 6.4 3.3 5.1 1.8
## 5 7.9 4.4 6.9 2.5
# Per-variable description (n, missing, distinct values, selected quantiles).
describe(iris)
## iris
##
## 5 Variables 150 Observations
## --------------------------------------------------------------------------------
## Sepal.Length
## n missing distinct Info Mean Gmd .05 .10
## 150 0 35 0.998 5.843 0.9462 4.600 4.800
## .25 .50 .75 .90 .95
## 5.100 5.800 6.400 6.900 7.255
##
## lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
## --------------------------------------------------------------------------------
## Sepal.Width
## n missing distinct Info Mean Gmd .05 .10
## 150 0 23 0.992 3.057 0.4872 2.345 2.500
## .25 .50 .75 .90 .95
## 2.800 3.000 3.300 3.610 3.800
##
## lowest : 2.0 2.2 2.3 2.4 2.5, highest: 3.9 4.0 4.1 4.2 4.4
## --------------------------------------------------------------------------------
## Petal.Length
## n missing distinct Info Mean Gmd .05 .10
## 150 0 43 0.998 3.758 1.979 1.30 1.40
## .25 .50 .75 .90 .95
## 1.60 4.35 5.10 5.80 6.10
##
## lowest : 1.0 1.1 1.2 1.3 1.4, highest: 6.3 6.4 6.6 6.7 6.9
## --------------------------------------------------------------------------------
## Petal.Width
## n missing distinct Info Mean Gmd .05 .10
## 150 0 22 0.99 1.199 0.8676 0.2 0.2
## .25 .50 .75 .90 .95
## 0.3 1.3 1.8 2.2 2.3
##
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 2.1 2.2 2.3 2.4 2.5
## --------------------------------------------------------------------------------
## Species
## n missing distinct
## 150 0 3
##
## Value setosa versicolor virginica
## Frequency 50 50 50
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
# Pairwise correlation matrix of the four numeric measurements
# (cor() with its default settings); printed by the bare name below.
correlations <- cor(iris[1:4])
correlations
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
# Most frequent value of a vector.
#
# x:     vector whose mode is wanted.
# na.rm: when TRUE, NA entries are dropped before counting; when FALSE
#        (the default) NA is counted like any other value.
#
# Returns a single element of `x`. When several values share the highest
# count, the one that appears first in `x` is returned, because
# which.max() reports the first maximum.
Mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  distinct_values <- unique(x)
  # match() maps every element to the index of its distinct value;
  # tabulate() then counts how often each index occurs.
  occurrences <- tabulate(match(x, distinct_values))
  distinct_values[which.max(occurrences)]
}
# Mode of sepal length over the whole data set.
Mode(iris$Sepal.Length)
## [1] 5
# Sepal-length quantiles within each species.
aggregate(
  x = iris$Sepal.Length,
  by = list(Species = iris$Species),
  FUN = quantile
)
## Species x.0% x.25% x.50% x.75% x.100%
## 1 setosa 4.300 4.800 5.000 5.200 5.800
## 2 versicolor 4.900 5.600 5.900 6.300 7.000
## 3 virginica 4.900 6.225 6.500 6.900 7.900
# Sepal-length mode within each species.
aggregate(
  x = iris$Sepal.Length,
  by = list(Species = iris$Species),
  FUN = Mode
)
## Species x
## 1 setosa 5.1
## 2 versicolor 5.5
## 3 virginica 6.3
print("Q2: IRIS Dataset Preprocessing")
## [1] "Q2: IRIS Dataset Preprocessing"
library(rlang)
## Warning: package 'rlang' was built under R version 4.2.1
library(caret)
## Warning: package 'caret' was built under R version 4.2.1
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
# Baseline: summary of the raw measurements before any transformation.
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Standardization: estimate centering and scaling parameters from the four
# numeric columns.
# The argument is `method` (singular). A misspelled `methods =` is not a
# parameter of preProcess(); it is absorbed by `...` and ignored, so only
# the default transform would be applied.
preprocessParams <- preProcess(iris[1:4], method = c("center", "scale"))
# Summarize the transform parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - scaled (4)
# Transform the dataset using the parameters
iris_standardized <- predict(preprocessParams, iris[, 1:4])
# Summarize the transformed dataset (every column should now have mean 0)
summary(iris_standardized)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
# Normalization: rescale each measurement with the "range" method.
# The argument is `method` (singular). With the misspelled `methods =` the
# request was ignored and the default center/scale transform ran instead.
preprocessParams <- preProcess(iris[1:4], method = c("range"))
# Summarize the transform parameters
print(preprocessParams)
# Transform the dataset using the parameters
iris_normalized <- predict(preprocessParams, iris[, 1:4])
# Summarize the transformed dataset
summary(iris_normalized)
# NOTE: the output previously recorded in this section was identical to the
# center/scale results because the "range" request never took effect.
# Re-run / re-knit the script to record the real output.
# Box-Cox transform of the four measurements.
# The argument is `method` (singular). With the misspelled `methods =` the
# request was ignored and the default center/scale transform ran instead.
preprocessParams <- preProcess(iris[1:4], method = c("BoxCox"))
# Summarize the transform parameters
print(preprocessParams)
# Transform the dataset using the parameters
iris_boxcox <- predict(preprocessParams, iris[, 1:4])
# Summarize the transformed dataset
summary(iris_boxcox)
# NOTE: the output previously recorded in this section was identical to the
# center/scale results because the "BoxCox" request never took effect.
# Re-run / re-knit the script to record the real output.
# Yeo-Johnson transform of the four measurements.
# The argument is `method` (singular). With the misspelled `methods =` the
# request was ignored and the default center/scale transform ran instead.
preprocessParams <- preProcess(iris[1:4], method = c("YeoJohnson"))
# Summarize the transform parameters
print(preprocessParams)
# Transform the dataset using the parameters
iris_YJ <- predict(preprocessParams, iris[, 1:4])
# Summarize the transformed dataset
summary(iris_YJ)
# NOTE: the output previously recorded in this section was identical to the
# center/scale results because the "YeoJohnson" request never took effect.
# Re-run / re-knit the script to record the real output.
## Finding important features
# Center, scale, then project onto principal components.
# The argument is `method` (singular). With the misspelled `methods =` the
# "pca" step was ignored and only the default center/scale transform ran.
# NOTE(review): the number of retained components is chosen by
# preProcess() (see its `thresh` / `pcaComp` arguments) - confirm the
# default is what this analysis wants.
preprocessParams <- preProcess(iris[1:4], method = c("center", "scale", "pca"))
# Summarize the transform parameters
print(preprocessParams)
# Transform the dataset using the parameters
iris_scaled_pca <- predict(preprocessParams, iris[, 1:4])
# Summarize the transformed dataset (columns are now principal components)
summary(iris_scaled_pca)
# NOTE: the output previously recorded in this section showed the four
# original columns, merely centered and scaled, because the "pca" request
# never took effect. Re-run / re-knit the script to record the real output.
# Center, scale, then extract independent components.
# The argument is `method` (singular). With the misspelled `methods =` the
# "ica" step was ignored and only the default center/scale transform ran.
# ICA is delegated to the fastICA package (must be installed), which needs
# the number of components; `n.comp` is forwarded through `...`.
# NOTE(review): n.comp = 4 keeps as many components as there are input
# columns - lower it if dimension reduction is intended.
preprocessParams <- preProcess(
  iris[1:4],
  method = c("center", "scale", "ica"),
  n.comp = 4
)
# Summarize the transform parameters
print(preprocessParams)
# Transform the dataset using the parameters
iris_scaled_ica <- predict(preprocessParams, iris[, 1:4])
# Summarize the transformed dataset (columns are now independent components)
summary(iris_scaled_ica)
# NOTE: the output previously recorded in this section showed the four
# original columns, merely centered and scaled, because the "ica" request
# never took effect. Re-run / re-knit the script to record the real output.
library(CORElearn)
## Warning: package 'CORElearn' was built under R version 4.2.1
# Principal component analysis of the four measurements (the fifth column,
# Species, is dropped), followed by the component loadings.
iris.pca <- princomp(iris[, -5])
loadings(iris.pca)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4
## Sepal.Length 0.361 0.657 0.582 0.315
## Sepal.Width 0.730 -0.598 -0.320
## Petal.Length 0.857 -0.173 -0.480
## Petal.Width 0.358 -0.546 0.754
##
## Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings 1.00 1.00 1.00 1.00
## Proportion Var 0.25 0.25 0.25 0.25
## Cumulative Var 0.25 0.50 0.75 1.00
## IRIS dataset visualization
# Petal width against petal length, points coloured by species.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, colour = Species)) +
  geom_point()

# Histograms of the four measurements, side by side in one row.
# The loop variable stays `i`: hist() builds its x-axis label from the
# expression passed as its first argument.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  hist(iris[, i], main = names(iris)[i])
}

# Kernel density estimates of the four measurements.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  plot(density(iris[, i]), main = names(iris)[i])
}

# Box plots of the four measurements.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  boxplot(iris[, i], main = names(iris)[i])
}

library(corrplot)
## corrplot 0.92 loaded
# Reload iris and recompute the correlation matrix of the measurements.
# NOTE(review): `correlations` is not used by the plots below, and corrplot
# is loaded just above without being called - possibly a
# corrplot(correlations) call was intended; confirm.
data(iris)
correlations <- cor(iris[1:4])
# Scatter-plot matrix of all five columns.
pairs(iris)

# Scatter-plot matrix through the formula interface, coloured by species.
pairs(Species ~ ., data = iris, col = iris$Species)

library(caret)
# Predictors (the four measurements) and outcome (species) for caret's
# featurePlot().
x <- iris[1:4]
y <- iris[["Species"]]
# Let every panel use its own x and y axis range in the density plot.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
# Per-feature density curves, one curve per species.
featurePlot(x = x, y = y, plot = "density", scales = scales)

# Per-feature box plots, split by species.
featurePlot(x = x, y = y, plot = "box")
