#install.packages("psych")
#install.packages("corrplot")
library(corrplot)
## corrplot 0.84 loaded
library (psych)
library(MASS)
# Load the built-in iris data set into the workspace.
data(iris)
# Preview the first six rows.
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Compact overview of the column types and the first few values.
str(object = iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
Our dataset contains 150 observations and 5 variables (4 numeric and 1 factor). All numeric variables are metric independent variables, whereas Species is the nonmetric (categorical) dependent variable.
# Per-column summary: quartiles and mean for numeric columns, counts for Species.
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Descriptive statistics of sepal length, computed separately for each species.
describeBy(x = iris[["Sepal.Length"]], group = iris[["Species"]])
##
## Descriptive statistics by group
## group: setosa
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 5.01 0.35 5 5 0.3 4.3 5.8 1.5 0.11 -0.45 0.05
## ------------------------------------------------------------
## group: versicolor
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 5.94 0.52 5.9 5.94 0.52 4.9 7 2.1 0.1 -0.69 0.07
## ------------------------------------------------------------
## group: virginica
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 6.59 0.64 6.5 6.57 0.59 4.9 7.9 3 0.11 -0.2 0.09
# Descriptive statistics of sepal width, computed separately for each species.
describeBy(x = iris[["Sepal.Width"]], group = iris[["Species"]])
##
## Descriptive statistics by group
## group: setosa
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 3.43 0.38 3.4 3.42 0.37 2.3 4.4 2.1 0.04 0.6 0.05
## ------------------------------------------------------------
## group: versicolor
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 2.77 0.31 2.8 2.78 0.3 2 3.4 1.4 -0.34 -0.55 0.04
## ------------------------------------------------------------
## group: virginica
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 2.97 0.32 3 2.96 0.3 2.2 3.8 1.6 0.34 0.38 0.05
# Descriptive statistics of petal length, computed separately for each species.
describeBy(x = iris[["Petal.Length"]], group = iris[["Species"]])
##
## Descriptive statistics by group
## group: setosa
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 1.46 0.17 1.5 1.46 0.15 1 1.9 0.9 0.1 0.65 0.02
## ------------------------------------------------------------
## group: versicolor
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 4.26 0.47 4.35 4.29 0.52 3 5.1 2.1 -0.57 -0.19 0.07
## ------------------------------------------------------------
## group: virginica
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 5.55 0.55 5.55 5.51 0.67 4.5 6.9 2.4 0.52 -0.37 0.08
# Descriptive statistics of petal width, computed separately for each species.
describeBy(x = iris[["Petal.Width"]], group = iris[["Species"]])
##
## Descriptive statistics by group
## group: setosa
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 0.25 0.11 0.2 0.24 0 0.1 0.6 0.5 1.18 1.26 0.01
## ------------------------------------------------------------
## group: versicolor
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 1.33 0.2 1.3 1.32 0.22 1 1.8 0.8 -0.03 -0.59 0.03
## ------------------------------------------------------------
## group: virginica
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 2.03 0.27 2 2.03 0.3 1.4 2.5 1.1 -0.12 -0.75 0.04
# A scatter plot cannot be drawn for the factor column, so only the first
# four (numeric) columns are plotted. Points are filled by species: the
# factor's integer codes pick the fill colour for each row.
pairs.panels(
  iris[, 1:4],
  gap = 0,
  bg = c("red", "green", "blue")[as.integer(iris$Species)],
  pch = 21
)
The graph above shows a histogram for each numerical variable and a scatterplot for each pair of variables. The correlation coefficients are shown in the upper triangle. The scatterplots show that the four variables help, to some extent, to separate the three species: the red, green, and blue points are clearly separated in some panels but overlap in others. We therefore carry out LDA, which finds the linear combinations of these four variables that give the best possible separation among the three species.
Note that within each row or column of the panel one variable is held fixed (for example, sepal length in the first) and the other variables are compared against it. Also, petal width displays the most separation.
# Fit a linear discriminant analysis that predicts Species from the four
# flower measurements; the data frame is passed by name for readability.
LDA <- lda(
  Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = iris
)
# Show the fitted model (priors, group means, discriminant coefficients).
print(LDA)
## Call:
## lda(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
## data = iris)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.006 3.428 1.462 0.246
## versicolor 5.936 2.770 4.260 1.326
## virginica 6.588 2.974 5.552 2.026
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length 0.8293776 0.02410215
## Sepal.Width 1.5344731 2.16452123
## Petal.Length -2.2012117 -0.93192121
## Petal.Width -2.8104603 2.83918785
##
## Proportion of trace:
## LD1 LD2
## 0.9912 0.0088
# Plot the observations on the discriminant axes, coloured by the integer
# code of each row's species.
plot(
  LDA,
  col = as.integer(iris[["Species"]])
)
As you can see, there are three distinct groups with some overlap between virginica and versicolor.
# Correlation matrix of the four measurement columns; Species (the factor
# column) is excluded by name.
M <- cor(iris[, setdiff(names(iris), "Species")])
# Draw the lower triangle of the correlation matrix as circles, with small
# red variable labels rotated by 45 degrees.
corrplot(
  M,
  method = "circle",
  type = "lower",
  tl.cex = 0.50,
  tl.srt = 45,
  tl.col = "red"
)
This corrplot displays the lower triangle of the correlation matrix of the four numeric variables.