data(iris)
#help(iris)
str(iris) # Give us information about the data that we are interested in analyzing
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
levels(iris$Species) # Display all the levels of Species
[1] "setosa" "versicolor" "virginica"
table(iris$Species) # Split data according to different levels
setosa versicolor virginica
50 50 50
head(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
tail(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
145 6.7 3.3 5.7 2.5 virginica
146 6.7 3.0 5.2 2.3 virginica
147 6.3 2.5 5.0 1.9 virginica
148 6.5 3.0 5.2 2.0 virginica
149 6.2 3.4 5.4 2.3 virginica
150 5.9 3.0 5.1 1.8 virginica
summary(iris) # Five point summary statistics
Sepal.Length Sepal.Width Petal.Length Petal.Width
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
Median :5.800 Median :3.000 Median :4.350 Median :1.300
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
Species
setosa :50
versicolor:50
virginica :50
library(psych)
describe(iris[, c(1:4)])
vars n mean sd median trimmed mad min max range skew
Sepal.Length 1 150 5.84 0.83 5.80 5.81 1.04 4.3 7.9 3.6 0.31
Sepal.Width 2 150 3.06 0.44 3.00 3.04 0.44 2.0 4.4 2.4 0.31
Petal.Length 3 150 3.76 1.77 4.35 3.76 1.85 1.0 6.9 5.9 -0.27
Petal.Width 4 150 1.20 0.76 1.30 1.18 1.04 0.1 2.5 2.4 -0.10
kurtosis se
Sepal.Length -0.61 0.07
Sepal.Width 0.14 0.04
Petal.Length -1.42 0.14
Petal.Width -1.36 0.06
# Mean vector
sapply(iris[, c(1:4)], mean)
Sepal.Length Sepal.Width Petal.Length Petal.Width
5.843333 3.057333 3.758000 1.199333
# Mean of a variable
mean(iris$Sepal.Length)
[1] 5.843333
# Variance vector
sapply(iris[, c(1:4)], var)
Sepal.Length Sepal.Width Petal.Length Petal.Width
0.6856935 0.1899794 3.1162779 0.5810063
# Standard deviation vector
sapply(iris[, c(1:4)], sd)
Sepal.Length Sepal.Width Petal.Length Petal.Width
0.8280661 0.4358663 1.7652982 0.7622377
# Variance-covariance matrix
cov(iris[, c(1:4)])
Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 0.6856935 -0.0424340 1.2743154 0.5162707
Sepal.Width -0.0424340 0.1899794 -0.3296564 -0.1216394
Petal.Length 1.2743154 -0.3296564 3.1162779 1.2956094
Petal.Width 0.5162707 -0.1216394 1.2956094 0.5810063
# Correlation matrix
cor(iris[, c(1:4)])
Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
cor.test(iris$Petal.Length, iris$Petal.Width)
Pearson's product-moment correlation
data: iris$Petal.Length and iris$Petal.Width
t = 43.387, df = 148, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9490525 0.9729853
sample estimates:
cor
0.9628654
# Standardize a variable
stdSL <- scale(iris$Sepal.Length)
mean(stdSL)
[1] -4.484318e-16
sd(stdSL)
[1] 1
stddf <- as.data.frame(scale(iris[, c(1:4)]))
head(stddf)
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 -0.8976739 1.01560199 -1.335752 -1.311052
2 -1.1392005 -0.13153881 -1.335752 -1.311052
3 -1.3807271 0.32731751 -1.392399 -1.311052
4 -1.5014904 0.09788935 -1.279104 -1.311052
5 -1.0184372 1.24503015 -1.335752 -1.311052
6 -0.5353840 1.93331463 -1.165809 -1.048667
# Get column "Species" for all flowers where Petal.Length < 2
subset(iris, Petal.Length < 2)[, "Species"]
[1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[11] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[21] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[31] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[41] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
Levels: setosa versicolor virginica
Cool, we have a first result that helps explaining a part of our data.
#?boxplot
boxplot(iris[, c(1:4)])
This gives a rough estimate of the distribution of each variable. But maybe it makes more sense to see the distribution of each variable by each class.
boxplot(iris$Sepal.Length~iris$Species)
irisset <- subset(iris, Species=="setosa")
irisver <- subset(iris, Species=="versicolor")
irisvir <- subset(iris, Species=="virginica")
par(mfrow=c(1,3), mar=c(6,3,2,1)) # mar: more space to labels
boxplot(irisset[, 1:4], main="Setosa", ylim=c(0,8), las=2)
boxplot(irisver[, 1:4], main="Versicolor", ylim=c(0,8), las=2)
boxplot(irisvir[, 1:4], main="Virginica", ylim=c(0,8), las=2)
#dev.off()
hist(iris$Sepal.Length)
hist(iris$Sepal.Length, breaks=20)
hist(iris$Sepal.Length, breaks=20, main="Histogram of Sepal.Length", xlab="Sepal Length (cm)")
hist(iris$Sepal.Length, breaks=20, probability=T, xlim=c(4,8), ylim=c(0.0,0.8), main="Density curve of Sepal.Length", xlab="Sepal Length (cm)")
lines(density(iris$Sepal.Length), col="red", lwd=2) # Fit the density curve using the density() function. To do this, we need to set the prob=TRUE. This option converts the y-axis from frequency to probability.
par(mfrow=c(1,3))
#?par
hist(irisver$Petal.Length, breaks=seq(0,8,l=17), xlim=c(0,8), ylim=c(0,40))
hist(irisset$Petal.Length, breaks=seq(0,8,l=17), xlim=c(0,8), ylim=c(0,40))
hist(irisvir$Petal.Length, breaks=seq(0,8,l=17), xlim=c(0,8), ylim=c(0,40))
These show that the distributions of Petal.Length are different for each class. Thus, we may use Petal.Length measures to identify the class of each flower.
corr <- cor(iris[, c(1:4)])
round(corr, 3)
Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 1.000 -0.118 0.872 0.818
Sepal.Width -0.118 1.000 -0.428 -0.366
Petal.Length 0.872 -0.428 1.000 0.963
Petal.Width 0.818 -0.366 0.963 1.000
pairs(iris[, c(1:4)], pch=19)
pairs(iris[, c(1:4)], col=as.numeric(iris$Species), pch=19, main="Anderson's Iris Data with 3 Species", oma=c(8,8,8,8))
# oma: allow plotting of the legend outside the figure region (i.e., within the space left by making the margins big)
par(xpd=TRUE)
legend("bottom", col=unique(iris$Species), pch=19, legend=levels(iris$Species), bty="n", horiz=TRUE)
library(car)
scatterplotMatrix(iris[, c(1:4)], pch=19)
plot(iris$Sepal.Length, iris$Sepal.Width, pch=19)
library(MASS)
parcoord(iris[, c(1:4)], col=as.numeric(iris$Species), var.label=TRUE)
legend(x=1.5, y=-0.15, col=unique(iris$Species), lty=1, legend=levels(iris$Species), bty="n", horiz=TRUE)
parcoord(iris[, c(1,3,4,2)], col=as.numeric(iris$Species), var.label=TRUE)
legend(x=1.5, y=-0.15, col=unique(iris$Species), lty=1, legend=levels(iris$Species), bty="n", horiz=TRUE)