print ("Iris Dataset Analysis")
## [1] "Iris Dataset Analysis"
print("Q1: Iris  Dataset Characteristics ")
## [1] "Q1: Iris  Dataset Characteristics "
library(mlbench)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.2.1
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(corrplot)
## corrplot 0.92 loaded
library (DMwR2)
## Warning: package 'DMwR2' was built under R version 4.2.1
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library (dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data(iris)
dim(iris)
## [1] 150   5
head(iris, 20)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 12          4.8         3.4          1.6         0.2  setosa
## 13          4.8         3.0          1.4         0.1  setosa
## 14          4.3         3.0          1.1         0.1  setosa
## 15          5.8         4.0          1.2         0.2  setosa
## 16          5.7         4.4          1.5         0.4  setosa
## 17          5.4         3.9          1.3         0.4  setosa
## 18          5.1         3.5          1.4         0.3  setosa
## 19          5.7         3.8          1.7         0.3  setosa
## 20          5.1         3.8          1.5         0.3  setosa
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris[, 1:4])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50
round(prop.table(table(iris$Species)),2)
## 
##     setosa versicolor  virginica 
##       0.33       0.33       0.33
print("Finding Dataset statistics")
## [1] "Finding Dataset statistics"
iris_tbl<-tibble::as_tibble(iris)
iris_tbl%>% summarise(across(Sepal.Length:Petal.Width,list( mean=mean, median=median)))
## # A tibble: 1 × 8
##   Sepal.Length_mean Sepal.Leng…¹ Sepal…² Sepal…³ Petal…⁴ Petal…⁵ Petal…⁶ Petal…⁷
##               <dbl>        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1              5.84          5.8    3.06       3    3.76    4.35    1.20     1.3
## # … with abbreviated variable names ¹​Sepal.Length_median, ²​Sepal.Width_mean,
## #   ³​Sepal.Width_median, ⁴​Petal.Length_mean, ⁵​Petal.Length_median,
## #   ⁶​Petal.Width_mean, ⁷​Petal.Width_median
iris_tbl%>%summarise(across(Sepal.Length:Petal.Width, quantile))
## # A tibble: 5 × 4
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
##          <dbl>       <dbl>        <dbl>       <dbl>
## 1          4.3         2           1            0.1
## 2          5.1         2.8         1.6          0.3
## 3          5.8         3           4.35         1.3
## 4          6.4         3.3         5.1          1.8
## 5          7.9         4.4         6.9          2.5
describe(iris)
## iris 
## 
##  5  Variables      150  Observations
## --------------------------------------------------------------------------------
## Sepal.Length 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       35    0.998    5.843   0.9462    4.600    4.800 
##      .25      .50      .75      .90      .95 
##    5.100    5.800    6.400    6.900    7.255 
## 
## lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
## --------------------------------------------------------------------------------
## Sepal.Width 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       23    0.992    3.057   0.4872    2.345    2.500 
##      .25      .50      .75      .90      .95 
##    2.800    3.000    3.300    3.610    3.800 
## 
## lowest : 2.0 2.2 2.3 2.4 2.5, highest: 3.9 4.0 4.1 4.2 4.4
## --------------------------------------------------------------------------------
## Petal.Length 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       43    0.998    3.758    1.979     1.30     1.40 
##      .25      .50      .75      .90      .95 
##     1.60     4.35     5.10     5.80     6.10 
## 
## lowest : 1.0 1.1 1.2 1.3 1.4, highest: 6.3 6.4 6.6 6.7 6.9
## --------------------------------------------------------------------------------
## Petal.Width 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       22     0.99    1.199   0.8676      0.2      0.2 
##      .25      .50      .75      .90      .95 
##      0.3      1.3      1.8      2.2      2.3 
## 
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 2.1 2.2 2.3 2.4 2.5
## --------------------------------------------------------------------------------
## Species 
##        n  missing distinct 
##      150        0        3 
##                                            
## Value          setosa versicolor  virginica
## Frequency          50         50         50
## Proportion      0.333      0.333      0.333
## --------------------------------------------------------------------------------
correlations<-cor(iris[, 1:4])
correlations
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000
Mode<-function(x, na.rm=FALSE){
if(na.rm) x<-x[!is.na(x)] 
ux<-unique(x)
return(ux[which.max(tabulate(match(x,ux)))])
}
Mode(iris$Sepal.Length)
## [1] 5
aggregate(iris$Sepal.Length,list(Species=iris$Species), quantile)
##      Species  x.0% x.25% x.50% x.75% x.100%
## 1     setosa 4.300 4.800 5.000 5.200  5.800
## 2 versicolor 4.900 5.600 5.900 6.300  7.000
## 3  virginica 4.900 6.225 6.500 6.900  7.900
aggregate(iris$Sepal.Length,list(Species=iris$Species), Mode)
##      Species   x
## 1     setosa 5.1
## 2 versicolor 5.5
## 3  virginica 6.3



print("Q2: IRIS Dataset Preprocessing")
## [1] "Q2: IRIS Dataset Preprocessing"
library(rlang)
## Warning: package 'rlang' was built under R version 4.2.1
library(caret)
## Warning: package 'caret' was built under R version 4.2.1
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
#summarize data
summary(iris[1:4])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
#Find the prep-process parameters
preprocessParams<-preProcess(iris[1:4], methods=c("center", "scale"))
#summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)
#transform the dataset using the parameters
iris_standardized<-predict(preprocessParams, iris[, 1:4])
#summarize the transformed dataset
summary(iris_standardized)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064
preprocessParams<-preProcess(iris[1:4], methods=c("range"))
#summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)
#transform the dataset using the parameters
iris_normalized<-predict(preprocessParams, iris[, 1:4])
#summarize the transformed dataset
summary(iris_normalized)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064
preprocessParams<-preProcess(iris[1:4], methods=c("BoxCox"))
#summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)
#transform the dataset using the parameters
iris_boxcox<-predict(preprocessParams, iris[, 1:4])
#summarize the transformed dataset
summary(iris_boxcox)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064
preprocessParams<-preProcess(iris[1:4], methods=c("YeoJohnson"))
#summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)
#transform the dataset using the parameters
iris_YJ<-predict(preprocessParams, iris[, 1:4])
#summarize the transformed dataset
summary(iris_YJ)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064
## Finding important features
preprocessParams<-preProcess(iris[1:4], methods=c("center", "scale", "pca"))
#summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)
#transform the dataset using the parameters
iris_scaled_pca<-predict(preprocessParams, iris[, 1:4])
#summarize the transformed dataset
summary(iris_scaled_pca)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064
preprocessParams<-preProcess(iris[1:4], methods=c("center", "scale", "ica"))
#summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)
#transform the dataset using the parameters
iris_scaled_ica<-predict(preprocessParams, iris[, 1:4])
#summarize the transformed dataset
summary(iris_scaled_ica)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064
library(CORElearn)
## Warning: package 'CORElearn' was built under R version 4.2.1
iris.pca<-princomp(iris[-5])
loadings(iris.pca)
## 
## Loadings:
##              Comp.1 Comp.2 Comp.3 Comp.4
## Sepal.Length  0.361  0.657  0.582  0.315
## Sepal.Width          0.730 -0.598 -0.320
## Petal.Length  0.857 -0.173        -0.480
## Petal.Width   0.358        -0.546  0.754
## 
##                Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings      1.00   1.00   1.00   1.00
## Proportion Var   0.25   0.25   0.25   0.25
## Cumulative Var   0.25   0.50   0.75   1.00
## IRIS dataset visualization

ggplot(data=iris)+ geom_point(mapping =aes(x=Petal.Length, y=Petal.Width, color=Species))

par(mfrow=c(1,4))
for(i in 1:4){
  hist(iris[,i], main=names(iris)[i])}

par(mfrow=c(1,4))
for(i in 1:4){ plot(density(iris[,i]), main=names(iris)[i])}

par(mfrow=c(1,4))
for(i in 1:4){boxplot(iris[,i], main=names(iris)[i])}

library(corrplot)
## corrplot 0.92 loaded
data("iris")
correlations<-cor(iris[, 1:4])
pairs(iris)

pairs(Species~., data=iris, col=iris$Species)

library(caret)
x<-iris[, 1:4]
y<-iris[,5]
scales<-list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=x,y=y, plot="density", scales=scales)

featurePlot(x=x,y=y, plot="box")