Load and explore iris data

data(iris)
#help(iris)
str(iris) # Give us information about the data that we are interested in analyzing
'data.frame':   150 obs. of  5 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
levels(iris$Species) # Display all the levels of Species
[1] "setosa"     "versicolor" "virginica" 
table(iris$Species) # Split data according to different levels

    setosa versicolor  virginica 
        50         50         50 
head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
tail(iris)
    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
145          6.7         3.3          5.7         2.5 virginica
146          6.7         3.0          5.2         2.3 virginica
147          6.3         2.5          5.0         1.9 virginica
148          6.5         3.0          5.2         2.0 virginica
149          6.2         3.4          5.4         2.3 virginica
150          5.9         3.0          5.1         1.8 virginica

Exploratory data analysis

summary(iris) # Five point summary statistics
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
 Median :5.800   Median :3.000   Median :4.350   Median :1.300  
 Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
       Species  
 setosa    :50  
 versicolor:50  
 virginica :50  
                
                
                
library(psych)
describe(iris[, c(1:4)])
             vars   n mean   sd median trimmed  mad min max range  skew
Sepal.Length    1 150 5.84 0.83   5.80    5.81 1.04 4.3 7.9   3.6  0.31
Sepal.Width     2 150 3.06 0.44   3.00    3.04 0.44 2.0 4.4   2.4  0.31
Petal.Length    3 150 3.76 1.77   4.35    3.76 1.85 1.0 6.9   5.9 -0.27
Petal.Width     4 150 1.20 0.76   1.30    1.18 1.04 0.1 2.5   2.4 -0.10
             kurtosis   se
Sepal.Length    -0.61 0.07
Sepal.Width      0.14 0.04
Petal.Length    -1.42 0.14
Petal.Width     -1.36 0.06

Calculate summary statistics for multivariate data

# Mean vector
sapply(iris[, c(1:4)], mean)
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    5.843333     3.057333     3.758000     1.199333 
# Mean of a variable
mean(iris$Sepal.Length)
[1] 5.843333
# Variance vector
sapply(iris[, c(1:4)], var)
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
   0.6856935    0.1899794    3.1162779    0.5810063 
# Standard deviation vector
sapply(iris[, c(1:4)], sd)
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
   0.8280661    0.4358663    1.7652982    0.7622377 
# Variance-covariance matrix
cov(iris[, c(1:4)])
             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length    0.6856935  -0.0424340    1.2743154   0.5162707
Sepal.Width    -0.0424340   0.1899794   -0.3296564  -0.1216394
Petal.Length    1.2743154  -0.3296564    3.1162779   1.2956094
Petal.Width     0.5162707  -0.1216394    1.2956094   0.5810063
# Correlation matrix
cor(iris[, c(1:4)])
             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000

Test for association/correlation between two variables

cor.test(iris$Petal.Length, iris$Petal.Width)

    Pearson's product-moment correlation

data:  iris$Petal.Length and iris$Petal.Width
t = 43.387, df = 148, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9490525 0.9729853
sample estimates:
      cor 
0.9628654 

Standardizing variables

# Standardize a variable
stdSL <- scale(iris$Sepal.Length)
mean(stdSL)
[1] -4.484318e-16
sd(stdSL)
[1] 1

Create a new data frame of all the standardized variables (numerical)

stddf <- as.data.frame(scale(iris[, c(1:4)]))
head(stddf)
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   -0.8976739  1.01560199    -1.335752   -1.311052
2   -1.1392005 -0.13153881    -1.335752   -1.311052
3   -1.3807271  0.32731751    -1.392399   -1.311052
4   -1.5014904  0.09788935    -1.279104   -1.311052
5   -1.0184372  1.24503015    -1.335752   -1.311052
6   -0.5353840  1.93331463    -1.165809   -1.048667

Data shows that Petal.Length of the class setosa is shorter than the Petal.Length of other classes. Is that true?

# Get column "Species" for all flowers where Petal.Length < 2
subset(iris, Petal.Length < 2)[, "Species"]
 [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[11] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[21] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[31] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[41] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
Levels: setosa versicolor virginica

Cool, we have a first result that helps explaining a part of our data.

Plots to understand how the distribution of a variable varies by each class

#?boxplot
boxplot(iris[, c(1:4)])

This gives a rough estimate of the distribution of each variable. But maybe it makes more sense to see the distribution of each variable by each class.

boxplot(iris$Sepal.Length~iris$Species)

irisset <- subset(iris, Species=="setosa")
irisver <- subset(iris, Species=="versicolor")
irisvir <- subset(iris, Species=="virginica")

par(mfrow=c(1,3), mar=c(6,3,2,1)) # mar: more space to labels
boxplot(irisset[, 1:4], main="Setosa", ylim=c(0,8), las=2)
boxplot(irisver[, 1:4], main="Versicolor", ylim=c(0,8), las=2)
boxplot(irisvir[, 1:4], main="Virginica", ylim=c(0,8), las=2)

#dev.off()

Histogram per variable

hist(iris$Sepal.Length)

hist(iris$Sepal.Length, breaks=20)

hist(iris$Sepal.Length, breaks=20, main="Histogram of Sepal.Length", xlab="Sepal Length (cm)")

hist(iris$Sepal.Length, breaks=20, probability=T, xlim=c(4,8), ylim=c(0.0,0.8), main="Density curve of Sepal.Length", xlab="Sepal Length (cm)")
lines(density(iris$Sepal.Length), col="red", lwd=2) # Fit the density curve using the density() function. To do this, we need to set the prob=TRUE. This option converts the y-axis from frequency to probability.

Histograms of one particular variable for each class

par(mfrow=c(1,3))
#?par
hist(irisver$Petal.Length, breaks=seq(0,8,l=17), xlim=c(0,8), ylim=c(0,40))
hist(irisset$Petal.Length, breaks=seq(0,8,l=17), xlim=c(0,8), ylim=c(0,40))
hist(irisvir$Petal.Length, breaks=seq(0,8,l=17), xlim=c(0,8), ylim=c(0,40))

These show that the distributions of Petal.Length are different for each class. Thus, we may use Petal.Length measures to identify the class of each flower.

Correlations between variables

corr <- cor(iris[, c(1:4)])
round(corr, 3)
             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length        1.000      -0.118        0.872       0.818
Sepal.Width        -0.118       1.000       -0.428      -0.366
Petal.Length        0.872      -0.428        1.000       0.963
Petal.Width         0.818      -0.366        0.963       1.000

? Try to see the correlation between the variables for each class


Scatterplot matrix

pairs(iris[, c(1:4)], pch=19)

pairs(iris[, c(1:4)], col=as.numeric(iris$Species), pch=19, main="Anderson's Iris Data with 3 Species", oma=c(8,8,8,8))
# oma: allow plotting of the legend outside the figure region (i.e., within the space left by making the margins big)

par(xpd=TRUE)
legend("bottom", col=unique(iris$Species), pch=19, legend=levels(iris$Species), bty="n", horiz=TRUE)

library(car)
scatterplotMatrix(iris[, c(1:4)], pch=19)

Pairwise scatter plot

plot(iris$Sepal.Length, iris$Sepal.Width, pch=19)

Parallel coordinates plot

library(MASS)
parcoord(iris[, c(1:4)], col=as.numeric(iris$Species), var.label=TRUE)
legend(x=1.5, y=-0.15, col=unique(iris$Species), lty=1, legend=levels(iris$Species), bty="n", horiz=TRUE)

parcoord(iris[, c(1,3,4,2)], col=as.numeric(iris$Species), var.label=TRUE)
legend(x=1.5, y=-0.15, col=unique(iris$Species), lty=1, legend=levels(iris$Species), bty="n", horiz=TRUE)