This is an exploratory statistical analysis of the iris data set: a demo for the Week 5 lab session, CSE (AI & ML), Institute of Aeronautical Engineering.

# Bind the built-in iris data set to `dat`; all later steps work on `dat`.
dat <- iris

# Per-column summary: min, quartiles, median, mean and max for the numeric
# columns, and the number of rows per level for the Species factor.
summary(dat)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
head(dat) # preview the first rows of the data
str(dat) # structure of the data: dimensions, column types, leading values
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Smallest observed sepal length.
with(dat, min(Sepal.Length))
## [1] 4.3
# Largest observed sepal length.
with(dat, max(Sepal.Length))
## [1] 7.9
# Range: distance between the two extremes.
with(dat, max(Sepal.Length) - min(Sepal.Length))
## [1] 3.6
# Range (spread) of a vector: the largest value minus the smallest value.
#
# Arguments:
#   x      Numeric vector.
#   na.rm  If TRUE, missing values are dropped before the extremes are
#          computed. Defaults to FALSE, in which case any NA in `x` makes
#          the result NA.
#
# Returns: a single value, max(x) - min(x).
range2 <- function(x, na.rm = FALSE) {
  # Computed directly rather than stored in a local called `range`, which
  # would mask base::range() inside the function.
  max(x, na.rm = na.rm) - min(x, na.rm = na.rm)
}

# Range again, this time through the range2() helper.
range2(dat[["Sepal.Length"]])
## [1] 3.6
# Central tendency.
mean(dat[["Sepal.Length"]])
## [1] 5.843333
median(dat[["Sepal.Length"]])
## [1] 5.8
# First quartile.
quantile(dat[["Sepal.Length"]], probs = 0.25)
## 25% 
## 5.1
# Third quartile.
quantile(dat[["Sepal.Length"]], probs = 0.75)
## 75% 
## 6.4
# 4th decile.
quantile(dat[["Sepal.Length"]], probs = 0.4)
## 40% 
## 5.6
# Interquartile range.
IQR(dat[["Sepal.Length"]])
## [1] 1.3
# Standard deviation.
sd(dat[["Sepal.Length"]])
## [1] 0.8280661
# Variance.
var(dat[["Sepal.Length"]])
## [1] 0.6856935
# Standard deviation of each of the first four columns, returned as a named
# list with one element per column.
lapply(X = dat[1:4], FUN = sd)
## $Sepal.Length
## [1] 0.8280661
## 
## $Sepal.Width
## [1] 0.4358663
## 
## $Petal.Length
## [1] 1.765298
## 
## $Petal.Width
## [1] 0.7622377
# Frequency table: how many times each distinct sepal length occurs.
tab <- table(dat[["Sepal.Length"]])
# Show the most frequent values first.
sort(tab, decreasing = TRUE)
## 
##   5 5.1 6.3 5.7 6.7 5.5 5.8 6.4 4.9 5.4 5.6   6 6.1 4.8 6.5 4.6 5.2 6.2 6.9 7.7 
##  10   9   9   8   8   7   7   7   6   6   6   6   6   5   5   4   4   4   4   4 
## 4.4 5.9 6.8 7.2 4.7 6.6 4.3 4.5 5.3   7 7.1 7.3 7.4 7.6 7.9 
##   3   3   3   3   2   2   1   1   1   1   1   1   1   1   1
# Label a flower "small" when its sepal is shorter than the median sepal
# length and "big" otherwise (a value equal to the median counts as "big").
dat[["size"]] <- with(
  dat,
  ifelse(Sepal.Length < median(Sepal.Length), "small", "big")
)
# Cross-tabulate species (rows) against size class (columns).
table(dat[["Species"]], dat[["size"]])
##             
##              big small
##   setosa       1    49
##   versicolor  29    21
##   virginica   47     3
library(corrplot) #correlogram
## corrplot 0.90 loaded
# Pairwise correlations between the first three measurement columns of iris.
# NOTE(review): Petal.Width (column 4) is not included; confirm this is
# intentional.
corr_matrix <- cor(iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length")])
# Print the matrix.
corr_matrix
##              Sepal.Length Sepal.Width Petal.Length
## Sepal.Length    1.0000000  -0.1175698    0.8717538
## Sepal.Width    -0.1175698   1.0000000   -0.4284401
## Petal.Length    0.8717538  -0.4284401    1.0000000

Including Plots

You can also embed plots, for example:

library(ggplot2) #data visualization
# Correlogram drawn with circles (corrplot's default method, spelled out).
corrplot(corr_matrix, method = "circle")

# Correlogram showing the coefficients as numbers, lower triangle only.
corrplot(corr_matrix, method = "number", type = "lower")

# Number of flowers in each size class.
barplot(table(dat$size))

# Distribution of sepal length.
hist(dat$Sepal.Length)

# Box plot of sepal length over all flowers.
boxplot(dat$Sepal.Length)

# Compare the length of the sepal across the different species. The formula
# plus `data =` form labels the axes with the plain column names instead of
# "dat$...".
boxplot(Sepal.Length ~ Species, data = dat)

# Scatter plot of petal length (y) against sepal length (x), again using the
# formula interface for clean axis labels.
plot(Petal.Length ~ Sepal.Length, data = dat)

# Sepal length against row index; reads from `dat` like every other plot in
# this section rather than from `iris` directly.
plot(dat$Sepal.Length)

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.