library(tidyverse)
data(iris)
iris %>% head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
select <- dplyr::select # ensure select() refers to dplyr's version in case another package masks it
pacman::p_load(plotly)
Iris_summary <- iris %>% # the names of the new data frame and the data frame to be summarised
group_by(Species) %>% # the grouping variable
summarise(mean_PL = mean(Petal.Length), # calculates the mean of each group
sd_PL = sd(Petal.Length), # calculates the standard deviation of each group
n_PL = n(), # calculates the sample size per group
SE_PL = sd(Petal.Length)/sqrt(n())) # calculates the standard error of each group
Iris_summary
## # A tibble: 3 x 5
## Species mean_PL sd_PL n_PL SE_PL
## <fct> <dbl> <dbl> <int> <dbl>
## 1 setosa 1.46 0.174 50 0.0246
## 2 versicolor 4.26 0.470 50 0.0665
## 3 virginica 5.55 0.552 50 0.0780
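The summarise_* scoped verbs used below are superseded in dplyr 1.0+; the same table can be produced with across(). A minimal sketch, assuming a recent dplyr:
iris %>%
  group_by(Species) %>%
  summarise(
    across(Petal.Length, list(mean = mean, sd = sd), .names = "{.fn}_PL"), # mean_PL, sd_PL
    n_PL = n(),
    SE_PL = sd_PL / sqrt(n_PL) # summarise() can reference columns created just above
  )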
Iris_summary1 <- iris %>% # the new data frame and the data frame to be summarised
  group_by(Species) %>% # the grouping variable
  summarise_at("Petal.Length", c(sd, mean)) # unnamed functions produce the unwieldy column names below
# summarise_at("Petal.Length", fivenum) # fails here: fivenum() returns five values per group
Iris_summary1
## # A tibble: 3 x 3
## Species `function (x, na.rm = FALSE) ...` `function (x, ...) ...`
## <fct> <dbl> <dbl>
## 1 setosa 0.174 1.46
## 2 versicolor 0.470 4.26
## 3 virginica 0.552 5.55
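Naming the functions avoids the auto-generated column names shown above; a small variant (same numbers, readable headers):
iris %>%
  group_by(Species) %>%
  summarise_at("Petal.Length", c(sd = sd, mean = mean)) # columns come out as sd and mean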
Iris_summary2 <- iris %>% # the new data frame and the data frame to be summarised
  group_by(Species) %>% # the grouping variable
  summarise_at("Petal.Length",
               funs(min, max, mean(., na.rm = TRUE), sd, n(), SE_PL = sd(.) / sqrt(n()))) # funs() is deprecated in dplyr 0.8+ in favour of list(~ ...)
Iris_summary2
## # A tibble: 3 x 7
## Species min max mean sd n SE_PL
## <fct> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 setosa 1.00 1.90 1.46 0.174 50 0.0246
## 2 versicolor 3.00 5.10 4.26 0.470 50 0.0665
## 3 virginica 4.50 6.90 5.55 0.552 50 0.0780
Iris_summary2 <- iris %>% # note: this only groups the data
  group_by(Species) # the grouping variable
# %>% summarise_at("Petal.Length", funs(fivenum(., na.rm = TRUE))) # commented out: fivenum() returns five values per group, which summarise_at() cannot reduce to one row
Iris_summary2
## # A tibble: 150 x 5
## # Groups: Species [3]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.10 3.50 1.40 0.200 setosa
## 2 4.90 3.00 1.40 0.200 setosa
## 3 4.70 3.20 1.30 0.200 setosa
## 4 4.60 3.10 1.50 0.200 setosa
## 5 5.00 3.60 1.40 0.200 setosa
## 6 5.40 3.90 1.70 0.400 setosa
## 7 4.60 3.40 1.40 0.300 setosa
## 8 5.00 3.40 1.50 0.200 setosa
## 9 4.40 2.90 1.40 0.200 setosa
## 10 4.90 3.10 1.50 0.100 setosa
## # ... with 140 more rows
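The five-number summary was commented out above because summarise_at() expects one value per group while fivenum() returns five. On dplyr 1.1+, reframe() lifts that restriction; a hedged sketch (the stat labels are our own):
iris %>%
  group_by(Species) %>%
  reframe(stat = c("min", "lower", "median", "upper", "max"),
          Petal.Length = fivenum(Petal.Length))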
tapply(iris$Petal.Length, iris$Species, mean)
## setosa versicolor virginica
## 1.462 4.260 5.552
tapply(iris$Petal.Length, iris$Species, fivenum)
## $setosa
## [1] 1.0 1.4 1.5 1.6 1.9
##
## $versicolor
## [1] 3.00 4.00 4.35 4.60 5.10
##
## $virginica
## [1] 4.50 5.10 5.55 5.90 6.90
by_mean <- by(iris[, "Petal.Length"], iris[, "Species"], mean) # avoid naming this "c": it would mask base::c()
str(by_mean)
## by [1:3(1d)] 1.46 4.26 5.55
## - attr(*, "dimnames")=List of 1
## ..$ iris[, "Species"]: chr [1:3] "setosa" "versicolor" "virginica"
## - attr(*, "call")= language by.default(data = iris[, "Petal.Length"], INDICES = iris[, "Species"], FUN = mean)
by_mean
## iris[, "Species"]: setosa
## [1] 1.462
## --------------------------------------------------------
## iris[, "Species"]: versicolor
## [1] 4.26
## --------------------------------------------------------
## iris[, "Species"]: virginica
## [1] 5.552
d <- by(iris[, "Petal.Length"], iris[, "Species"], fivenum)
str(d)
## List of 3
## $ setosa : num [1:5] 1 1.4 1.5 1.6 1.9
## $ versicolor: num [1:5] 3 4 4.35 4.6 5.1
## $ virginica : num [1:5] 4.5 5.1 5.55 5.9 6.9
## - attr(*, "dim")= int 3
## - attr(*, "dimnames")=List of 1
## ..$ iris[, "Species"]: chr [1:3] "setosa" "versicolor" "virginica"
## - attr(*, "call")= language by.default(data = iris[, "Petal.Length"], INDICES = iris[, "Species"], FUN = fivenum)
## - attr(*, "class")= chr "by"
names(d)
## [1] "setosa" "versicolor" "virginica"
#data_frame(d[[1]],d[[2]],d[[3]])
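One base-R way to tidy the by() result d into a single data frame (a sketch; the stat labels are our own):
do.call(rbind, lapply(names(d), function(sp)
  data.frame(Species = sp,
             stat = c("min", "lower", "median", "upper", "max"),
             Petal.Length = d[[sp]])))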
#require(stats)
head(warpbreaks)
## breaks wool tension
## 1 26 A L
## 2 30 A L
## 3 54 A L
## 4 25 A L
## 5 70 A L
## 6 52 A L
e <- by(warpbreaks[, 1:2], warpbreaks[, "tension"], summary)
e[[1]]
## breaks wool
## Min. :14.00 A:9
## 1st Qu.:26.00 B:9
## Median :29.50
## Mean :36.39
## 3rd Qu.:49.25
## Max. :70.00
IrisPlot <- ggplot(Iris_summary, aes(Species, mean_PL)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = mean_PL - sd_PL, ymax = mean_PL + sd_PL), width = 0.2)
print(IrisPlot + labs(y = "Petal length (cm) ± s.d.", x = "Species") + theme_classic())
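Since Iris_summary already carries SE_PL, switching the bars from standard deviations to standard errors is a one-line change; a sketch:
IrisPlotSE <- ggplot(Iris_summary, aes(Species, mean_PL)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = mean_PL - SE_PL, ymax = mean_PL + SE_PL), width = 0.2) +
  labs(y = "Petal length (cm) ± s.e.", x = "Species") +
  theme_classic()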
df <- ToothGrowth
df$dose <- as.factor(df$dose)
head(df)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
## 5 6.4 VC 0.5
## 6 10.0 VC 0.5
str(df)
## 'data.frame': 60 obs. of 3 variables:
## $ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
## $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
## $ dose: Factor w/ 3 levels "0.5","1","2": 1 1 1 1 1 1 1 1 1 1 ...
df2 <- df %>%
  group_by(supp, dose) %>%
  summarise_at(vars(len), funs(mean(., na.rm = TRUE), sd))
df2 <- df2 %>% rename(len = mean)
df2
## # A tibble: 6 x 4
## # Groups: supp [2]
## supp dose len sd
## <fct> <fct> <dbl> <dbl>
## 1 OJ 0.5 13.2 4.46
## 2 OJ 1 22.7 3.91
## 3 OJ 2 26.1 2.66
## 4 VC 0.5 7.98 2.75
## 5 VC 1 16.8 2.52
## 6 VC 2 26.1 4.80
# Default bar plot
library(gridExtra)
p<- ggplot(df2, aes(x=dose, y=len, fill=supp)) +
geom_bar(stat="identity", color="black",
position=position_dodge()) +
geom_errorbar(aes(ymin=len-sd, ymax=len+sd), width=.2,
position=position_dodge(.9))
print(p)
# Finished bar plot
p2 <- p + labs(title = "Tooth length per dose", x = "Dose (mg)", y = "Length") +
  theme_classic() +
  scale_fill_manual(values = c('#999999', '#E69F00'))
grid.arrange( p, p2, ncol=2)
# Keep only upper error bars
ggplot(df2, aes(x=dose, y=len, fill=supp)) +
geom_bar(stat="identity", color="black", position=position_dodge()) +
geom_errorbar(aes(ymin=len, ymax=len+sd), width=.2,
position=position_dodge(.9))
library(devtools)
# install_github("kassambara/easyGgplot2") # GitHub-only package; modern install_github() syntax
library(easyGgplot2)
# Default line plot
p3<- ggplot(df2, aes(x=dose, y=len, group=supp, color=supp)) +
geom_line() +
geom_point()+
geom_errorbar(aes(ymin=len-sd, ymax=len+sd), width=.2,
position=position_dodge(0.05))+
theme_dark()
#print(p)
# Finished line plot
p4 <- p3 + labs(title = "Tooth length per dose", x = "Dose (mg)", y = "Length") +
  theme_classic() +
  scale_color_manual(values = c('#999999', '#E69F00'))
#multiplot(p3,p4)
# Multiple graphs on the same page
ggplot2.multiplot(p3,p4, cols=2)
theme_set(theme_gray())
# Use geom_pointrange
p5=ggplot(df2, aes(x=dose, y=len, group=supp, color=supp)) +
geom_pointrange(aes(ymin=len-sd, ymax=len+sd))
# Use geom_line()+geom_pointrange()
p6=ggplot(df2, aes(x=dose, y=len, group=supp, color=supp)) +
geom_line()+
geom_pointrange(aes(ymin=len-sd, ymax=len+sd))
ggplot2.multiplot(p5,p6)
Dot plot with mean point and error bars
The functions geom_dotplot() and stat_summary() are used:
The mean +/- SD can be added as a crossbar, an error bar, or a pointrange:
p7 <- ggplot(df, aes(x=dose, y=len)) +
geom_dotplot(binaxis='y', stackdir='center',binwidth = 1,method="histodot")
# use geom_crossbar()
p8=p7 + stat_summary(fun.data="mean_sdl", fun.args = list(mult=1),
geom="crossbar", width=0.5)
# Use geom_errorbar()
p9=p8 + stat_summary(fun.data=mean_sdl, fun.args = list(mult=1),
geom="errorbar", color="red", width=0.2) +
stat_summary(fun.y=mean, geom="point", color="red") # fun.y was renamed fun in ggplot2 3.3+
# Use geom_pointrange()
p10=p9 + stat_summary(fun.data=mean_sdl, fun.args = list(mult=1),
geom="pointrange", color="red")
ggplot2.multiplot(p7,p8,p9,p10,cols = 2)
Multi-page arrangement
set.seed(123)
pl <- lapply(1:11, function(.x)
qplot(1:10, rnorm(10), main=paste("plot", .x)))
ml <- marrangeGrob(pl, nrow=2, ncol=2)
## non-interactive use, multipage pdf
## ggsave("multipage.pdf", ml)
## interactive use; calling `dev.new` multiple times
ml
Title and/or annotations
library(grid)
library(lattice)
gs <- lapply(1:9, function(ii)
grobTree(rectGrob(gp=gpar(fill=ii, alpha=0.5)), textGrob(ii)))
grid.arrange(grobs=gs, ncol=4,
top="top label", bottom="bottom\nlabel",
left="left label", right="right label")
grid.rect(gp=gpar(fill=NA))
library(gtable)
g2 <- ggplotGrob(p)
g3 <- ggplotGrob(p2)
g <- rbind(g2, g3, size="first")
g$widths <- unit.pmax(g2$widths, g3$widths)
grid.newpage()
grid.draw(g)
multiplot
ggplot2.multiplot(p,p2, cols = 2)
library(easyGgplot2)
# library(devtools)
# install_github("easyGgplot2", "kassambara")
# library(easyGgplot2)
# # Multiple graphs on the same page
# ggplot2.multiplot(p,p2, cols=2)
#install.packages("cowplot")
require(cowplot)
plot.mpg<-plot_grid(p, p2, align='h', labels=c('A', 'B'))
plot.mpg
# use save_plot() instead of ggsave() when using cowplot
#save_plot("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/ggplot2/mpg.pdf", plot.mpg,base_aspect_ratio = 1.3 )# make room for figure legend)
Finally, let’s discuss how we can use cowplot to create more unusual plot designs. For example, let’s take the mpg image from the previous section, label it with an A in the top-left corner, and mark it as a draft:
ggdraw(plot.mpg) +
draw_plot_label("A", size = 14) +
draw_label("DRAFT!", angle = 45, size = 80, alpha = .2)
The function ggdraw() sets up the drawing layer, and functions that are meant to operate on this drawing layer all start with draw_. The resulting object is again a standard ggplot2 object, and you can do with it whatever you might do with a regular ggplot2 plot, such as save it with ggsave(). [However, as mentioned before, I recommend using save_plot() instead.]
In fact, because ggdraw() produces a standard ggplot2 object, we can draw on it with standard geoms if we want to. For example:
t <- (0:1000)/1000
spiral <- data.frame(x = .45+.55*t*cos(t*15), y = .55-.55*t*sin(t*15), t)
ggdraw(plot.mpg) +
geom_path(data = spiral, aes(x = x, y = y, colour = t), size = 6, alpha = .4)
plot.iris <- ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
geom_point() + facet_grid(. ~ Species) + stat_smooth(method = "lm") +
background_grid(major = 'y', minor = "none") + # add thin horizontal lines
panel_border() # and a border around each panel
# plot.mpg and plot.diamonds were defined earlier
ggdraw() +
draw_plot(plot.iris, 0, .5, 1, .5) +
draw_plot(plot.mpg, 0, 0, 1, .5) +
#draw_plot(plot.diamonds, .5, 0, .5, .5) +
draw_plot_label(c("A", ""), c(0, 0), c(1, 0.5), size = 15)
df %>% group_by(supp, dose) %>% summarise_at(vars(len), funs(min, max))
## # A tibble: 6 x 4
## # Groups: supp [?]
## supp dose min max
## <fct> <fct> <dbl> <dbl>
## 1 OJ 0.5 8.20 21.5
## 2 OJ 1 14.5 27.3
## 3 OJ 2 22.4 30.9
## 4 VC 0.5 4.20 11.5
## 5 VC 1 13.6 22.5
## 6 VC 2 18.5 33.9
m <- by(df[, "len"], df[, c("supp", "dose")], mean)
s.d <- by(df[, "len"], df[, c("supp", "dose")], sd)
m
## supp: OJ
## dose: 0.5
## [1] 13.23
## --------------------------------------------------------
## supp: VC
## dose: 0.5
## [1] 7.98
## --------------------------------------------------------
## supp: OJ
## dose: 1
## [1] 22.7
## --------------------------------------------------------
## supp: VC
## dose: 1
## [1] 16.77
## --------------------------------------------------------
## supp: OJ
## dose: 2
## [1] 26.06
## --------------------------------------------------------
## supp: VC
## dose: 2
## [1] 26.14
s.d
## supp: OJ
## dose: 0.5
## [1] 4.459709
## --------------------------------------------------------
## supp: VC
## dose: 0.5
## [1] 2.746634
## --------------------------------------------------------
## supp: OJ
## dose: 1
## [1] 3.910953
## --------------------------------------------------------
## supp: VC
## dose: 1
## [1] 2.515309
## --------------------------------------------------------
## supp: OJ
## dose: 2
## [1] 2.655058
## --------------------------------------------------------
## supp: VC
## dose: 2
## [1] 4.797731
ms <- function(x) {
  data_frame(m = mean(x), sd = sd(x)) # data_frame() is soft-deprecated; tibble() is the modern equivalent
}
x=rnorm(10)
#data_frame(m=mean(x),sd=sd(x))
ms(x)
## # A tibble: 1 x 2
## m sd
## <dbl> <dbl>
## 1 0.000478 0.775
mss <- function(x) {
  list(m = mean(x), sd = sd(x))
}
mss(x)
## $m
## [1] 0.0004784766
##
## $sd
## [1] 0.7749196
mssout <- by(df[, "len"], df[, c("supp", "dose")], mss)
mssout
## supp: OJ
## dose: 0.5
## $m
## [1] 13.23
##
## $sd
## [1] 4.459709
##
## --------------------------------------------------------
## supp: VC
## dose: 0.5
## $m
## [1] 7.98
##
## $sd
## [1] 2.746634
##
## --------------------------------------------------------
## supp: OJ
## dose: 1
## $m
## [1] 22.7
##
## $sd
## [1] 3.910953
##
## --------------------------------------------------------
## supp: VC
## dose: 1
## $m
## [1] 16.77
##
## $sd
## [1] 2.515309
##
## --------------------------------------------------------
## supp: OJ
## dose: 2
## $m
## [1] 26.06
##
## $sd
## [1] 2.655058
##
## --------------------------------------------------------
## supp: VC
## dose: 2
## $m
## [1] 26.14
##
## $sd
## [1] 4.797731
rbind_list(mssout) # rbind_list() is deprecated; dplyr::bind_rows() is the modern replacement
## # A tibble: 6 x 2
## m sd
## <dbl> <dbl>
## 1 13.2 4.46
## 2 7.98 2.75
## 3 22.7 3.91
## 4 16.8 2.52
## 5 26.1 2.66
## 6 26.1 4.80
do.call(rbind,mssout)
## m sd
## [1,] 13.23 4.459709
## [2,] 7.98 2.746634
## [3,] 22.7 3.910953
## [4,] 16.77 2.515309
## [5,] 26.06 2.655058
## [6,] 26.14 4.797731
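do.call(rbind, ...) drops the supp/dose labels. One hedged way to keep them, relying on by() varying the first grouping factor fastest (as the printed output above confirms):
labels <- expand.grid(supp = levels(df$supp), dose = levels(df$dose))
stats <- dplyr::bind_rows(lapply(mssout, tibble::as_tibble)) # one-row tibble per group
dplyr::bind_cols(labels, stats)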
# Multiplot helper (widely circulated, e.g. in the Cookbook for R):
# lays out ggplot objects in a grid defined by cols or an explicit layout matrix.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  require(grid)
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)
  # if no layout is supplied, fill a matrix column-wise with plot indices
  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
                     ncol = cols, nrow = ceiling(numPlots / cols))
  }
  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    # print each plot into the viewport cell(s) matching its index in layout
    for (i in 1:numPlots) {
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
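Usage mirrors the earlier ggplot2.multiplot() calls; for example:
multiplot(p, p2, cols = 2) # same side-by-side layout without easyGgplot2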
library(tidyverse)
library(caret)
library(GGally)
library(treemap)
Overall, the histograms consistently show the most common sale price to be right around $130,000. We can also find the most frequent bin by combining ggplot2::cut_width (ggplot2::cut_interval and ggplot2::cut_number are additional options) with dplyr::count. We see that the most frequent bin when using increments of $5,000 is $128,000 - $132,000.
pacman::p_load(AmesHousing)
ames <- AmesHousing::make_ames()
ames %>%
  count(cut_width(Sale_Price, width = 5000)) %>%
  arrange(desc(n)) %>%
  head()
## # A tibble: 6 x 2
## `cut_width(Sale_Price, width = 5000)` n
## <fct> <int>
## 1 (1.28e+05,1.32e+05] 137
## 2 (1.42e+05,1.48e+05] 130
## 3 (1.32e+05,1.38e+05] 125
## 4 (1.38e+05,1.42e+05] 125
## 5 (1.22e+05,1.28e+05] 118
## 6 (1.52e+05,1.58e+05] 103
ggplot(ames, aes(Sale_Price)) +
geom_histogram()
By default, geom_histogram will divide your data into 30 equal bins or intervals. Since sale prices range from $12,789 to $755,000, dividing this range into 30 equal bins means the bin width is about $24,740. So the first bar will represent the frequency of Sale_Price values from about $12,500 to about $37,500, the second bar the price range from about $37,500 to $62,300, and so on.
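A quick check of that arithmetic:
diff(range(ames$Sale_Price)) / 30 # width of the 30 default bins, roughly $24,740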
However, we can control this with the binwidth argument in geom_histogram. Changing the bin width during exploratory analysis gives a more detailed picture of the relative densities of the distribution. For instance, in the default histogram there was a bin of $136,000 - $161,000 values that had the highest frequency, but as the histograms that follow show, we can gather more information as we adjust the binning.
p1 <- ggplot(ames, aes(Sale_Price)) +
geom_histogram(binwidth = 100000) +
ggtitle("Bin width = $100,000")
p2 <- ggplot(ames, aes(Sale_Price)) +
geom_histogram(binwidth = 50000) +
ggtitle("Bin width = $50,000")
p3 <- ggplot(ames, aes(Sale_Price)) +
geom_histogram(binwidth = 5000) +
ggtitle("Bin width = $5,000")
p4 <- ggplot(ames, aes(Sale_Price)) +
geom_histogram(binwidth = 1000) +
ggtitle("Bin width = $1,000")
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
ggplot(ames, aes(Sale_Price)) +
geom_histogram(bins = 100) +
geom_vline(xintercept = c(150000, 170000), color = "red", lty = "dashed") +
scale_x_log10(
labels = scales::dollar,
breaks = c(50000, 125000, 300000)
)
g1 <- ggplot(ames, aes("var", Sale_Price)) +
geom_boxplot(outlier.alpha = .25) +
scale_y_log10(
labels = scales::dollar,
breaks = quantile(ames$Sale_Price)
)
g2 <- ggplot(ames, aes("var", Sale_Price)) +
geom_point() +
geom_violin() +
scale_y_log10(
labels = scales::dollar,
breaks = quantile(ames$Sale_Price)
)
gridExtra::grid.arrange(g1, g2, ncol = 2)
g1 <- ggplotly(g1) # convert the ggplots into interactive plotly widgets
g2 <- ggplotly(g2)
p <- subplot(g1, g2, nrows = 1)
p
The boxplot starts to answer the question of what potential outliers exist in our data. Outliers in data can distort predictions and affect their accuracy. Consequently, it's important to understand if outliers are present and, if so, which observations are considered outliers. Boxplots provide a visual assessment of potential outliers while the outliers package provides a number of useful functions to systematically extract them. The most useful is the scores function, which computes normal, t, chi-squared, IQR and MAD scores of the given data, which you can use to find observations that lie beyond a given threshold.
Here, I use the outliers::scores function to extract those observations beyond the whiskers in our boxplot and then use a stem-and-leaf plot to assess them. A stem-and-leaf plot is a special table where each data value is split into a "stem" (the first digit or digits) and a "leaf" (usually the second digit). Since the decimal point is located 5 digits to the right of the "|", the last stem of "7" and first leaf of "5" means an outlier exists at around $750,000, and its second leaf of "6" means another exists at around $760,000. This is a concise way to see approximately where our outliers are. In fact, I can now see that I have 28 lower-end outliers ranging from $10,000-$60,000 and 32 upper-end outliers ranging from $450,000-$760,000.
pacman::p_load(outliers)
outliers <- outliers::scores(log(ames$Sale_Price), type = "iqr", lim = 1.5)
stem(ames$Sale_Price[outliers])
##
## The decimal point is 5 digit(s) to the right of the |
##
## 0 | 1134444445555555666666666666
## 1 |
## 2 |
## 3 |
## 4 | 56666777788899
## 5 | 000445566889
## 6 | 1123
## 7 | 56
p1 <- ggplot(ames, aes(Sale_Price)) +
geom_density()
p2 <- ggplot(ames, aes(Sale_Price)) +
geom_density() +
geom_rug()
gridExtra::grid.arrange(p1, p2, nrow = 1)
ggplot(ames, aes(Sale_Price)) +
geom_histogram(aes(y = ..density..),
binwidth = 5000, color = "grey30", fill = "white") +
geom_density(alpha = .2, fill = "antiquewhite3")
p1 <- ggplot(mtcars, aes(x = mpg)) +
geom_dotplot(method = "histodot", binwidth = 1) +
ggtitle("dotplot")
p2 <- ggplot(mtcars, aes(x = mpg)) +
geom_histogram(binwidth = 1) +
ggtitle("histogram")
gridExtra::grid.arrange(p1, p2, nrow = 1)
# total count
p1 <- ames %>%
count(MS_Zoning) %>%
ggplot(aes(reorder(MS_Zoning, n), n)) +
geom_col() +
coord_flip() +
ggtitle("Total count")
# percent of whole
p2 <- ames %>%
count(MS_Zoning) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(reorder(MS_Zoning, pct), pct)) +
geom_col() +
coord_flip() +
ggtitle("Percent of whole")
gridExtra::grid.arrange(p1, p2, nrow = 1)
Now we can see that properties zoned as residential low density make up nearly 80% of all observations. We also see that properties zoned as agricultural (A_agr), industrial (I_all), commercial (C_all), and residential high density make up a very small share of observations. In fact, below we see that these imbalanced category levels each make up less than 1% of all observations.
ames %>%
count(MS_Zoning) %>%
mutate(pct = n / sum(n)) %>%
arrange(pct)
## # A tibble: 7 x 3
## MS_Zoning n pct
## <fct> <int> <dbl>
## 1 A_agr 2 0.000683
## 2 I_all 2 0.000683
## 3 C_all 25 0.00853
## 4 Residential_High_Density 27 0.00922
## 5 Floating_Village_Residential 139 0.0474
## 6 Residential_Medium_Density 462 0.158
## 7 Residential_Low_Density 2273 0.776
This imbalanced nature can cause problems in future analytic models so it may make sense to combine these infrequent levels into an "other" category. An easy way to do that is to use fct_lump(). Here we use n = 2 to retain the top 2 levels in our variable and condense the remaining into an "other" category. You can see that this combined category still represents less than 10% of all observations.
ames %>%
mutate(MS_Zoning = fct_lump(MS_Zoning, n = 2)) %>%
count(MS_Zoning) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(reorder(MS_Zoning, pct), pct)) +
geom_col() +
coord_flip()
Basic bar charts such as these are great when the number of category levels is small. However, as the number of levels increases, the thick nature of the bars can be distracting. Cleveland dot plots and lollipop charts are useful for assessing the frequency or proportion of many levels while minimizing the amount of ink on the graphic.
For example, if we assess the frequencies and proportions of home sales across the 38 different neighborhoods, a dot plot simplifies the chart.
ames %>%
count(Neighborhood) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(pct, reorder(Neighborhood, pct))) +
geom_point()
Similar to the Cleveland dot plot, a lollipop chart minimizes the visual ink but uses a line to draw the reader's attention to the specific x-axis value achieved by each category. In the lollipop chart we use geom_segment to plot the lines, and we explicitly state that we want the lines to start at x = 0 and extend to the neighborhood value with xend = pct. We simply need to include y = Neighborhood and yend = Neighborhood to tell R the lines are horizontally attached to each neighborhood.
ames %>%
count(Neighborhood) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(pct, reorder(Neighborhood, pct))) +
geom_point() +
geom_segment(aes(x = 0, xend = pct, y = Neighborhood, yend = Neighborhood), size = .15)
ggplot(ames, aes(Kitchen_Qual)) +
geom_bar()
ames %>%
mutate(Kitchen_Qual = fct_relevel(Kitchen_Qual, "Poor", "Fair", "Typical", "Good")) %>%
ggplot(aes(Kitchen_Qual)) +
geom_bar()
Bivariate Relationships and Associations
Having a solid understanding of univariate distributions is important; however, most analyses want to take the next step to understand associations and relationships between variables. Features we are generally interested in include: associations, outliers, clusters, gaps, barriers, and change points.
p1 <- ggplot(ames, aes(x = Gr_Liv_Area, y = Sale_Price)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm", se = FALSE, color = "red", lty = "dashed") +
geom_smooth(se = FALSE, lty = "dashed") +
ggtitle("Non-transformed variables")
p2 <- ggplot(ames, aes(x = Gr_Liv_Area, y = Sale_Price)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm", se = FALSE, color = "red", lty = "dashed") +
geom_smooth(se = FALSE, lty = "dashed") +
scale_x_log10() +
scale_y_log10() +
ggtitle("log-transformed variables")
gridExtra::grid.arrange(p1, p2, nrow = 1)
p1 <- ggplot(ames, aes(x = Garage_Area, y = Sale_Price)) +
geom_point(alpha = .2)
p2 <- ggplot(ames, aes(x = Garage_Area, y = Sale_Price)) +
geom_point(alpha = .2) +
geom_density2d()
p3 <- ggplot(ames, aes(x = Garage_Area, y = Sale_Price)) +
geom_hex(bins = 50, show.legend = FALSE)
gridExtra::grid.arrange(p1, p2, p3, nrow = 1)
p1 <- ggplot(ames, aes(x = factor(Bedroom_AbvGr), y = Sale_Price)) +
geom_point(alpha = .2)
p2 <- ggplot(ames, aes(x = factor(Bedroom_AbvGr), y = Sale_Price)) +
geom_jitter(alpha = .2, width = .2)
p3 <- ggplot(ames, aes(x = factor(Bedroom_AbvGr), y = Sale_Price)) +
geom_boxplot()
p4 <- ggplot(ames, aes(x = factor(Bedroom_AbvGr), y = Sale_Price)) +
geom_violin()
gridExtra::grid.arrange(p1, p2, p3, p4, nrow = 2)
p1 <- ggplot(ames, aes(x = Sale_Price, color = Overall_Qual)) +
geom_freqpoly() +
scale_x_log10(breaks = c(50, 150, 400, 750) * 1000, labels = scales::dollar)
p2 <- ggplot(ames, aes(x = Sale_Price, color = Overall_Qual, fill = Overall_Qual)) +
geom_density(alpha = .15) +
scale_x_log10(breaks = c(50, 150, 400, 750) * 1000, labels = scales::dollar)
gridExtra::grid.arrange(p1, p2, nrow = 2)
ggplot(ames, aes(x = Sale_Price, y = Overall_Qual)) +
ggridges::geom_density_ridges() +
scale_x_continuous(labels = scales::dollar)
ames %>%
mutate(
Above_Avg = ifelse(Sale_Price > mean(Sale_Price), "Above", "Below"),
Kitchen_Qual = fct_relevel(Kitchen_Qual, "Poor", "Fair", "Typical", "Good")
) %>%
group_by(Neighborhood, Above_Avg, Kitchen_Qual) %>%
tally() %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(Kitchen_Qual, pct)) +
geom_col() +
facet_grid(Neighborhood ~ Above_Avg) +
theme(strip.text.y = element_text(angle = 0, hjust = 0))
Multivariate Relationships
In most analyses, data are usually multivariate by nature, and the analytics are designed to capture and measure multivariate relationships. Visual exploration should therefore also incorporate this important aspect. We can extend these basic principles and add features such as color, shape, or size to encode additional variables. For example, here we compare the sale price to the above-ground square footage of homes with and without central air conditioning. We can see that there are far more homes with central air and that homes without it tend to have less square footage and sell for lower prices.
ggplot(ames, aes(x = Gr_Liv_Area, y = Sale_Price, color = Central_Air, shape = Central_Air)) +
geom_point(alpha = .3) +
scale_x_log10() +
scale_y_log10()
ggplot(ames, aes(x = Gr_Liv_Area, y = Sale_Price)) +
geom_point(alpha = .3) +
scale_x_log10() +
scale_y_log10(labels = scales::dollar) +
facet_wrap(~ House_Style, nrow = 2) +
theme_bw()
ggplot(ames, aes(x = Gr_Liv_Area, y = Sale_Price, color = Central_Air, shape = Central_Air)) +
geom_point(alpha = .3) +
geom_density2d(alpha = .5) +
geom_smooth(method = "lm", se = FALSE) +
scale_x_log10() +
scale_y_log10(labels = scales::dollar) +
facet_wrap(~ House_Style, nrow = 2) +
ggtitle("Sale Price vs. Above Ground Sq.Ft",
subtitle = "How does central air and house style influence this relationship?") +
theme_bw()
Parallel coordinate plots (PCPs) are also a great way to visualize continuous variables across multiple variables. In these plots, a vertical axis is drawn for each variable, and each observation is represented by a line that connects its values on the different axes, creating a multivariate profile. To create a PCP, we can use ggparcoord from the GGally package. By default, ggparcoord standardizes the variables based on a Z-score distribution.
variables <- c("Sale_Price", "Year_Built", "Year_Remod_Add", "Overall_Qual")
ames[, variables] %>%
  # alternatively: ames %>% select(!!variables) %>%
  ggparcoord(alpha = .05, scale = "center")
The darker bands in the above plot illustrate several features. The observations with higher sale prices tend to be built in more recent years, remodeled in recent years, and categorized in the top half of the overall quality measures. In contrast, homes with lower sale prices tend to be more outdated (based on older build and remodel dates) and have lower quality ratings. We also see some homes with exceptionally old build dates that have much newer remodel dates but still have just average quality ratings.
We can make this more explicit by adding a new variable to indicate if a sale price is above average. We can then tell ggparcoord to group by this new variable. Now we clearly see that above-average sale prices are related to much newer homes.
ames[, variables] %>%
mutate(Above_Avg = Sale_Price > mean(Sale_Price)) %>%
ggparcoord(
alpha = .05,
scale = "center",
columns = 1:4,
groupColumn = "Above_Avg"
)
Mosaic plots are a graphical method for visualizing data from two or more qualitative variables. The graphics area is divided into rectangles whose sizes are proportional to the counts of the combinations they represent.
ames2 <- ames %>%
mutate(
Above_Avg = Sale_Price > mean(Sale_Price),
Garage_Type = abbreviate(Garage_Type),
Garage_Qual = abbreviate(Garage_Qual)
)
par(mfrow = c(1, 2))
mosaicplot(Above_Avg ~ Garage_Type, data = ames2, las = 1)
mosaicplot(Above_Avg ~ Garage_Type + Garage_Cars, data = ames2, las = 1)
Treemaps are also a useful visualization aimed at assessing the hierarchical structure of data. Treemaps are primarily used to assess a numeric value across multiple categories. They can be useful for assessing the counts or proportions of a categorical variable nested within other categorical variables. For example, we can use a treemap to visualize the above-right mosaic plot, which illustrates the number of homes sold above and below the average sale price with different garage characteristics. We can see in the treemap that houses with above-average prices tend to have attached 2- and 3-car garages. Houses sold below the average price have more attached 1-car garages and also far more detached garages.
ames %>%
mutate(Above_Below = ifelse(Sale_Price > mean(Sale_Price), "Above Avg", "Below Avg")) %>%
count(Garage_Type, Garage_Cars, Above_Below) %>%
treemap(
index = c("Above_Below", "Garage_Type", "Garage_Cars"),
vSize = "n"
)
pacman::p_load(treemapify)
ames %>%
mutate(Above_Below = ifelse(Sale_Price > mean(Sale_Price), "Above Avg", "Below Avg")) %>%
count(Garage_Type, Garage_Cars, Above_Below)%>%
ggplot(aes(area = n, fill = Garage_Type, label = Above_Below, subgroup = Garage_Cars)) +
  geom_treemap() +
  geom_treemap_text(fontface = "italic", colour = "white", place = "centre", grow = TRUE) +
  geom_treemap_subgroup_border() +
  geom_treemap_subgroup_text(place = "top", grow = TRUE, alpha = 0.7,
                             colour = "black", fontface = "italic", min.size = 0) +
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "bottom")
ames %>%
mutate(Above_Below = ifelse(Sale_Price > mean(Sale_Price), "Above Avg", "Below Avg")) %>%
count(Garage_Type, Garage_Cars, Above_Below)%>%
ggplot(aes(area = n, fill = Garage_Type, label = Garage_Cars)) +
  geom_treemap() +
  geom_treemap_text(grow = TRUE, reflow = TRUE, colour = "black") +
  facet_wrap(~ Above_Below) +
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "bottom") +
  labs(
    title = "AMES Housing Data",
    caption = "Tile area is proportional to the number of homes with each garage type and car capacity",
    fill = "Garage Type"
  )
A heatmap is a graphical display of numerical data where color denotes a cell's value relative to the other values in its column. Heatmaps can be extremely useful for identifying clusters of strongly correlated values. We can select all numeric variables in our ames data set, compute the correlation matrix, and visualize it with a heatmap. Locations in dark red represent smaller correlations while lighter cells represent larger values. Looking at Sale_Price (3rd row from top) you can see that the smaller values are clustered to the left of the plot, suggesting weaker linear relationships with variables such as BsmtFin_Sf_1, Bsmt_Unf_SF, Longitude, Enclosed_Porch, etc. The larger correlation values for Sale_Price align with variables to the right of the plot such as Garage_Cars, Garage_Area, First_Flr_SF, etc.
ames %>%
select_if(is.numeric) %>%
cor() %>%
heatmap()
Below I select Sale_Price and all variables that contain "sf" (the square footage variables), scale all variables, and then visualize the scatter plots and correlation values with GGally::ggpairs:
ames %>%
dplyr::select(Sale_Price, contains("sf")) %>%
map_df(scale) %>%
ggpairs()
Data Quality
Graphical displays can also assist in summarizing certain data quality features. In the previous sections we illustrated how to identify outliers, but missing data also represent an important characteristic of our data. This tutorial has been using the processed version of the Ames housing data set. However, if we use the raw data we see that there are 13,997 missing values. It is important to understand how these missing values are dispersed across a data set, as that can determine whether you need to eliminate a variable or whether you can impute.
sum(is.na(AmesHousing::ames_raw))
## [1] 13997
Heatmaps have a nice alternative use case for visualizing missing values across a data set. Here, is.na(AmesHousing::ames_raw) returns a Boolean (TRUE/FALSE) matrix indicating the location of missing values, and multiplying by 1 converts it to a 0/1 binary matrix that can be plotted. All yellow locations in our heatmap represent missing values. This allows us to see those variables where the majority of the observations are missing (i.e. Alley, Fireplace Qual, Pool QC, Fence, and Misc Feature). Due to their high frequency of missingness, these variables would likely need to be removed from future analytic approaches. However, we can also spot other unique features of missingness. For example, missing values appear to occur across all garage variables for the same observations.
heatmap(1 * is.na(AmesHousing::ames_raw), Rowv = NA, Colv = NA)
If we dig a little deeper into these variables we notice that Garage Cars and Garage Area both contain the value 0 for every observation where the other Garage_xx variables have missing values. This is because the raw Ames housing data set had no option to identify houses with no garage, so all houses without a garage were recorded by leaving these fields empty. This would be an opportunity to create a new categorical level ("None") for these garage variables.
AmesHousing::ames_raw %>%
filter(is.na(`Garage Type`)) %>%
select(contains("garage")) %>% tail()
## # A tibble: 6 x 7
## `Garage Type` `Garage Yr Blt` `Garage Finish` `Garage Cars`
## <chr> <int> <chr> <int>
## 1 <NA> NA <NA> 0
## 2 <NA> NA <NA> 0
## 3 <NA> NA <NA> 0
## 4 <NA> NA <NA> 0
## 5 <NA> NA <NA> 0
## 6 <NA> NA <NA> 0
## # ... with 3 more variables: `Garage Area` <int>, `Garage Qual` <chr>,
## # `Garage Cond` <chr>
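A hedged sketch of that recoding for one of the character-valued garage variables (the raw column names contain spaces, hence the backticks; the other garage character columns can be treated the same way):
ames_fixed <- AmesHousing::ames_raw %>%
  mutate(`Garage Type` = tidyr::replace_na(`Garage Type`, "None")) # "None" is our own label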
An alternative approach is to use extracat::visna, which allows us to visualize missing patterns. The columns represent the 82 variables and the rows the missing patterns. The cells for the variables with missing values in a pattern are drawn in blue. The variables and patterns have been ordered by the number of missing values on both rows and columns (sort = "b"). The bars beneath the columns show the proportions of missing values by variable, and the bars on the right show the relative frequencies of the patterns.
pacman::p_load(extracat)
extracat::visna(AmesHousing::ames_raw, sort = "b")
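extracat has since been archived on CRAN; if it is unavailable, the visdat package offers a comparable missingness overview (an alternative plot, not a drop-in replacement):
pacman::p_load(visdat)
vis_miss(AmesHousing::ames_raw, sort_miss = TRUE) # sorts columns by proportion missing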