## Warning: package 'pacman' was built under R version 3.4.3
# Show the compact structure of the built-in iris data frame
# (dimensions, column types, and the first few values of each column).
str(object = iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
There are four continuous variables: sepal length, sepal width, petal length, and petal width. The only categorical variable is species.
# Scatter-plot matrix of the four numeric measurements.
# upper.panel = NULL leaves the upper triangle of the matrix blank.
pairs(
  ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = iris,
  upper.panel = NULL
)
# Make plyr available through pacman.
pacman::p_load(plyr)

# Per-species summary of sepal length: the mean (rounded to 1 decimal) and
# the standard error of the mean, sd / sqrt(n) (rounded to 2 decimals).
ddply(
  .data = iris,
  .variables = .(Species),
  .fun = summarise,
  mean = round(mean(Sepal.Length), digits = 1),
  sem = round(sd(Sepal.Length) / sqrt(length(Sepal.Length)), digits = 2)
)
The data in a summary table is convenient but not always easy to interpret. It tells, rather than shows, what the data says.
# Default box plot of sepal width, one box per species.
boxplot(Sepal.Width ~ Species, data = iris)
{
  # Customised box plot of sepal width by species.
  # xaxt = "n" suppresses the default x axis so that it can be redrawn below
  # with abbreviated species codes as tick labels.
  boxplot(
    Sepal.Width ~ Species,
    data = iris,
    las = 1,
    xaxt = "n",
    lwd = 2,
    xlab = "Species",
    ylab = "Sepal Width",
    cex.lab = 1.3,
    cex.axis = 1.3,
    col = c("blue", "orange", "green")
  )

  # Custom x axis: one tick per box. Tick-label size is governed by cex.axis
  # (cex.lab has no effect on axis() tick labels), so cex.axis is used here.
  axis(
    side = 1,
    at = c(1.0, 2.0, 3.0),
    labels = c("IRSE", "IRVE", "IRVI"),
    cex.axis = 1.0
  )

  # Legend mapping each fill colour to the full species name.
  legend(
    "top",
    legend = c("I. setosa", "I. versicolor", "I. virginica"),
    title = "Species",
    fill = c("blue", "orange", "green"),
    cex = 0.8,
    bty = "n"
  )
}
This box plot shows that I. setosa has the largest median sepal width, I. versicolor has the smallest, and I. virginica appears to have the widest range of sepal widths.
# Basic scatter plot: sepal length (y) against sepal width (x), all species pooled.
plot(Sepal.Length ~ Sepal.Width, data = iris)
This scatter plot doesn’t show any strong trend, though there may be a weak one in which sepal length increases as sepal width increases.
{
  # Pooled scatter plot with the fitted linear-model line from lm() overlaid.
  plot(Sepal.Length ~ Sepal.Width, data = iris)
  abline(reg = lm(Sepal.Length ~ Sepal.Width, data = iris))
}
This trend line shows that my above interpretation was incorrect, and in fact the trend is a weakly negative relationship: as sepal width increases, sepal length decreases.
# List the factor levels (species names) present in the data.
levels(x = iris$Species)
## [1] "setosa" "versicolor" "virginica"
By adding the categorical variable and grouping the points by species, you can now see a different relationship between sepal width and sepal length. Within each of the three species, as sepal width increases, sepal length also increases.
# Set up empty axes (type = "n" draws no points), then add each species
# separately so that each one gets its own plotting symbol.
plot(Sepal.Length ~ Sepal.Width, data = iris, las = 1, type = "n")
points(Sepal.Length ~ Sepal.Width,
       data = iris[iris$Species == "setosa", ],
       pch = 1)
points(Sepal.Length ~ Sepal.Width,
       data = iris[iris$Species == "versicolor", ],
       pch = 10)
points(Sepal.Length ~ Sepal.Width,
       data = iris[iris$Species == "virginica", ],
       pch = 19)
# One data frame per species, used below to fit a separate regression line
# for each. (pch is a plotting parameter, not a subset() argument, so it is
# not passed here.)
set.iris <- subset(iris, Species == "setosa")
ver.iris <- subset(iris, Species == "versicolor")
vir.iris <- subset(iris, Species == "virginica")
{
  # Scatter plot of sepal length against sepal width, with each species drawn
  # in its own colour and symbol and given its own fitted regression line.
  #
  # The axes are set up empty (type = "n") so that no point is drawn twice,
  # and the default axis labels are blanked so that title() below does not
  # overprint them.
  plot(Sepal.Length ~ Sepal.Width, iris, type = "n", xlab = "", ylab = "")
  par(las = 0)

  # Points: one call per species, each with its own symbol and colour.
  points(Sepal.Length ~ Sepal.Width, subset(iris, Species == "setosa"),
         pch = 1, col = "blue")
  points(Sepal.Length ~ Sepal.Width, subset(iris, Species == "versicolor"),
         pch = 10, col = "orange")
  points(Sepal.Length ~ Sepal.Width, subset(iris, Species == "virginica"),
         pch = 19, col = "green")

  # Per-species linear fits, distinguished by line type as well as colour.
  abline(lm(Sepal.Length ~ Sepal.Width, set.iris), lty = 1, col = "blue")
  abline(lm(Sepal.Length ~ Sepal.Width, ver.iris), lty = 2, col = "orange")
  abline(lm(Sepal.Length ~ Sepal.Width, vir.iris), lty = 3, col = "green")

  # Axis labels, drawn once, in red.
  title(xlab = "Sepal Width", ylab = "Sepal Length", col.lab = "red")

  # Legend showing the same symbol, line type, and colour used for each species.
  legend(
    "topleft",
    legend = c("I. setosa", "I. versicolor", "I. virginica"),
    title = "Species",
    col = c("blue", "orange", "green"),
    pch = c(1, 10, 19),
    lty = c(1, 2, 3),
    cex = 0.8,
    bty = "n"
  )
}