This is an exploratory statistical analysis of the iris data set: a demo for the Week 5 lab session, CSE (AI & ML), Institute of Aeronautical Engineering.
# Work on a copy of the built-in iris data set.
dat <- iris
# Per-column overview: quartiles and mean for the four measurements,
# and the number of observations for each species.
summary(dat)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
head(dat) # preview data
str(dat) # structure of data
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Smallest and largest sepal length, then the distance between them.
min(dat$Sepal.Length)
## [1] 4.3
max(dat$Sepal.Length)
## [1] 7.9
diff(range(dat$Sepal.Length)) # range = max - min
## [1] 3.6
#' Spread of a numeric vector: its largest value minus its smallest value.
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, missing values are dropped before the extremes are
#'   computed. The default `FALSE` keeps the previous behaviour, where any
#'   `NA` in `x` makes the result `NA`.
#' @return A single number, `max(x) - min(x)`.
range2 <- function(x, na.rm = FALSE) {
  # Return the expression directly; no local variable named `range`, so
  # base::range() is not shadowed inside the function.
  max(x, na.rm = na.rm) - min(x, na.rm = na.rm)
}
# Spread of sepal length via the helper defined above.
range2(dat[["Sepal.Length"]])
## [1] 3.6
# Central tendency.
mean(dat[["Sepal.Length"]])
## [1] 5.843333
median(dat[["Sepal.Length"]])
## [1] 5.8
# Selected quantiles.
quantile(dat[["Sepal.Length"]], probs = 0.25) # first quartile
## 25%
## 5.1
quantile(dat[["Sepal.Length"]], probs = 0.75) # third quartile
## 75%
## 6.4
quantile(dat[["Sepal.Length"]], probs = 0.4) # 4th decile
## 40%
## 5.6
# Dispersion: interquartile range, standard deviation, variance.
IQR(dat[["Sepal.Length"]])
## [1] 1.3
sd(dat[["Sepal.Length"]]) # standard deviation
## [1] 0.8280661
var(dat[["Sepal.Length"]]) # variance
## [1] 0.6856935
# Standard deviation of each of the first four columns, returned as a list.
lapply(dat[1:4], sd)
## $Sepal.Length
## [1] 0.8280661
##
## $Sepal.Width
## [1] 0.4358663
##
## $Petal.Length
## [1] 1.765298
##
## $Petal.Width
## [1] 0.7622377
# Count how often each distinct sepal length occurs ...
tab <- table(dat[["Sepal.Length"]])
# ... and list the most frequent values first.
sort(tab, decreasing = TRUE)
##
## 5 5.1 6.3 5.7 6.7 5.5 5.8 6.4 4.9 5.4 5.6 6 6.1 4.8 6.5 4.6 5.2 6.2 6.9 7.7
## 10 9 9 8 8 7 7 7 6 6 6 6 6 5 5 4 4 4 4 4
## 4.4 5.9 6.8 7.2 4.7 6.6 4.3 4.5 5.3 7 7.1 7.3 7.4 7.6 7.9
## 3 3 3 3 2 2 1 1 1 1 1 1 1 1 1
# Label each flower "small" when its sepal length is strictly below the
# median and "big" otherwise, then cross-tabulate that label by species.
dat[["size"]] <- ifelse(
  dat[["Sepal.Length"]] < median(dat[["Sepal.Length"]]),
  "small",
  "big"
)
table(dat[["Species"]], dat[["size"]])
##
## big small
## setosa 1 49
## versicolor 29 21
## virginica 47 3
library(corrplot) #correlogram
## corrplot 0.90 loaded
# Pairwise correlations between the first three measurements of iris
# (Petal.Width, the fourth numeric column, is not included here).
corr_matrix <- cor(
  iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length")]
)
corr_matrix
## Sepal.Length Sepal.Width Petal.Length
## Sepal.Length 1.0000000 -0.1175698 0.8717538
## Sepal.Width -0.1175698 1.0000000 -0.4284401
## Petal.Length 0.8717538 -0.4284401 1.0000000
You can also embed plots, for example:
library(ggplot2) #data visualization
# Correlogram drawn with corrplot's default glyphs (circles).
corrplot(corr_matrix)
# Same matrix, lower triangle only, with the coefficients printed as numbers.
corrplot(corr_matrix, method = "number", type = "lower")
# How many flowers fall into each size category.
barplot(table(dat$size))
# Distribution of sepal length: histogram, then a single boxplot.
hist(dat$Sepal.Length)
boxplot(dat$Sepal.Length)
# Compare the length of the sepal across the different species. The formula
# with `data =` keeps the axis labels as plain column names.
boxplot(Sepal.Length ~ Species, data = dat)
# Sepal length against petal length.
plot(dat$Sepal.Length, dat$Petal.Length)
# Sepal length in row order; reads from `dat` like every other plot here.
plot(dat$Sepal.Length)
Note that adding the `echo = FALSE` parameter to a code chunk prevents printing of the R code that generated the plot.