## Warning: package 'pacman' was built under R version 3.4.3

Data structure

# Compact overview of the iris data frame: dimensions, column types, first values.
str(object = iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

There are four continuous variables: sepal length, sepal width, petal length, and petal width. The only categorical variable is species.

Scatterplot matrix

# Scatterplot matrix of the four numeric measurements; the upper triangle is
# left blank because it would only mirror the lower one.
pairs(
  iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
  upper.panel = NULL
)

Summary table

# Load plyr (pacman installs it first if it is missing).
pacman::p_load(plyr)

# Per-species mean and standard error of the mean (sd / sqrt(n)) of sepal length.
ddply(
  iris, "Species", summarise,
  mean = round(mean(Sepal.Length), 1),
  sem = round(sd(Sepal.Length) / sqrt(length(Sepal.Length)), 2)
)

The data in a summary table is convenient but not always easy to interpret. It tells, rather than shows, what the data says.

Boxplot

# Default boxplot: one box of sepal width per species.
boxplot(formula = Sepal.Width ~ Species, data = iris)

{
  # Customized boxplot of sepal width by species. The automatic x-axis is
  # suppressed (xaxt = "n") so that short species codes can be drawn by hand.
  boxplot(Sepal.Width ~ Species, data = iris, las = 1,
          xaxt = "n", lwd = 2,
          xlab = "Species",
          ylab = "Sepal Width",
          cex.lab = 1.3,
          cex.axis = 1.3,
          col = c("blue", "orange", "green"))
  # Tick labels are sized by cex.axis; cex.lab applies to axis titles only, so
  # the size of these labels is requested through cex.axis.
  axis(side = 1, cex.axis = 1.0,
       at = c(1.0, 2.0, 3.0),
       labels = c("IRSE", "IRVE", "IRVI"))
  # The legend maps each fill colour back to the full species name.
  legend("top",
         legend = c("I. setosa", "I. versicolor", "I. virginica"),
         title = "Species",
         fill = c("blue", "orange", "green"),
         cex = 0.8, bty = "n")
}

This box plot shows that I. setosa typically has the widest sepals, I. versicolor typically has the narrowest, and I. virginica appears to have the widest range of sepal widths.

Scatterplot

Simple defaults

# Default scatterplot of sepal length (y) against sepal width (x), all species pooled.
plot(formula = Sepal.Length ~ Sepal.Width, data = iris)

This scatter plot doesn’t show a strong trend, though there may be a weak one in which sepal length increases with sepal width.

With trend line

{
  # Same pooled scatterplot as before ...
  plot(Sepal.Length ~ Sepal.Width, data = iris)
  # ... with the least-squares line of sepal length on sepal width overlaid.
  abline(reg = lm(Sepal.Length ~ Sepal.Width, data = iris))
}

This trend line shows that my above interpretation was incorrect, and in fact the trend is a weakly negative relationship: as sepal width increases, sepal length decreases.

Customized plot

# List the species names, i.e. the levels of the grouping factor.
levels(iris[["Species"]])
## [1] "setosa"     "versicolor" "virginica"

By adding the categorical variable and grouping the points by species, you can now see a different relationship between sepal width and sepal length. Within each of the three species, as sepal width increases, sepal length also increases.

# Set up the axes only (type = "n" draws no points), then add each species
# with its own plotting symbol.
plot(Sepal.Length ~ Sepal.Width, data = iris, las = 1, type = "n")
points(Sepal.Length ~ Sepal.Width, data = iris,
       subset = Species == "setosa", pch = 1)
points(Sepal.Length ~ Sepal.Width, data = iris,
       subset = Species == "versicolor", pch = 10)
points(Sepal.Length ~ Sepal.Width, data = iris,
       subset = Species == "virginica", pch = 19)

  # Per-species subsets of iris, used for the per-species regression lines.
  # subset() has no `pch` argument (an extra one is silently ignored), so
  # plotting symbols are specified in the points() calls instead.
  set.iris <- subset(iris, Species == "setosa")
  ver.iris <- subset(iris, Species == "versicolor")
  vir.iris <- subset(iris, Species == "virginica")
  
  
  {
    # Draw the axes only: type = "n" avoids plotting every point twice, and
    # the empty xlab/ylab stop the default axis titles from being printed
    # underneath the custom titles added by title() below.
    plot(Sepal.Length ~ Sepal.Width, data = iris,
         type = "n", xlab = "", ylab = "")

    # One symbol and colour per species.
    points(Sepal.Length ~ Sepal.Width, subset(iris, Species == "setosa"),
           pch = 1, col = "blue")
    points(Sepal.Length ~ Sepal.Width, subset(iris, Species == "versicolor"),
           pch = 10, col = "orange")
    points(Sepal.Length ~ Sepal.Width, subset(iris, Species == "virginica"),
           pch = 19, col = "green")

    # Separate least-squares line per species, fitted to the per-species
    # subsets created earlier (set.iris, ver.iris, vir.iris).
    abline(lm(Sepal.Length ~ Sepal.Width, set.iris), lty = 1, col = "blue")
    abline(lm(Sepal.Length ~ Sepal.Width, ver.iris), lty = 2, col = "orange")
    abline(lm(Sepal.Length ~ Sepal.Width, vir.iris), lty = 3, col = "green")

    title(xlab = "Sepal Width", ylab = "Sepal Length", col.lab = "red")

    # The legend shows the same symbol, line type and colour used for each
    # species in the plot.
    legend("topleft",
           legend = c("I. setosa", "I. versicolor", "I. virginica"),
           title = "Species",
           col = c("blue", "orange", "green"),
           pch = c(1, 10, 19),
           lty = c(1, 2, 3),
           cex = 0.8, bty = "n")
  }