Ensemble Graphics

These examples are modified from Chapter 12, Ensemble Graphics and Case Studies, in Graphical Data Analysis with R, by Antony Unwin.

Motivating Example 1: Coffee

#install.packages("pgmm")
library(pgmm)
data("coffee")
head(coffee)
##   Variety  Country     Water Bean Weight Extract Yield ph Value Free Acid
## 1       1   mexico  8.939999       156.6          33.5     5.80      32.7
## 2       1   mexico  7.400000       157.3          32.1     5.81      30.8
## 3       1 guatemal  9.740000       152.9          33.1     5.26      36.7
## 4       1 honduras 10.400000       174.0          31.5     5.61      34.2
## 5       1 salvador 10.540000       145.1          35.2     5.77      31.8
## 6       1 salvador 10.000000       156.4          34.5     5.83      32.6
##   Mineral Content  Fat Caffine Trigonelline Chlorogenic Acid
## 1            3.80 15.2    1.13         1.03             5.38
## 2            3.71 15.0    1.25         1.01             5.13
## 3            4.15 16.1    1.21         1.05             5.94
## 4            3.94 15.8    1.06         0.94             5.87
## 5            4.09 15.2    1.11         0.99             5.09
## 6            3.88 15.4    1.20         0.81             5.30
##   Neochlorogenic Acid Isochlorogenic Acid
## 1                0.40                0.79
## 2                0.32                0.97
## 3                0.24                0.76
## 4                0.39                0.59
## 5                0.49                0.72
## 6                0.43                0.69
names(coffee)
##  [1] "Variety"             "Country"             "Water"              
##  [4] "Bean Weight"         "Extract Yield"       "ph Value"           
##  [7] "Free Acid"           "Mineral Content"     "Fat"                
## [10] "Caffine"             "Trigonelline"        "Chlorogenic Acid"   
## [13] "Neochlorogenic Acid" "Isochlorogenic Acid"

1.1 Exploration:

Is there any missing data?

A unicorn!

#install.packages("naniar")
library(naniar)
## Warning: package 'naniar' was built under R version 3.6.2
vis_miss(coffee)

1.1.1 Visualize One Categorical Variable: Bar Charts

What is the distribution of coffee varieties?
library(tidyverse)

# bar chart of a categorical variable: one bar per coffee variety
# (factor() makes ggplot treat the Variety code as discrete categories)
ggplot(coffee, aes(factor(Variety)))+
  geom_bar()

# more arabica than robusta
What is the distribution of coffee origin countries?
length(unique(coffee$Country))
## [1] 29
# 29 different countries (with odd spellings)

# horizontal bar chart: coord_flip() swaps the axes, so the country names
# run down the vertical axis
ggplot(coffee, aes(Country))+
  geom_bar()+
  coord_flip()

1.1.2 Visualize Multiple Categorical Variables: Mosaic plots

What is the distribution of coffee varieties across country?
# mosaic plot for categorical variables
#install.packages("vcd")
library(vcd)
## Warning: package 'vcd' was built under R version 3.6.2
## Loading required package: grid
mosaic(Variety~Country, data=coffee)

# only one country has both varieties

Here’s another example of a mosaic plot that is built into this package:

#example
mosaic(HairEyeColor, shade=TRUE, legend=TRUE)

1.1.3 Visualize Continuous Variables: Standardized boxplots

Look for outliers

# distribution of the continuous variables (columns 3 to 14)
boxplot(scale(coffee[,3:14]))

# scale() standardizes each column (centered, then divided by its standard
# deviation), so all variables can be compared on one axis

1.1.4 Visualize Relationships between Two Continuous Variables: Pairs plots

# scatterplot matrix of the continuous variables (columns 3 to 14);
# pch=16 draws filled circles
pairs(coffee[,3:14], pch=16)

It can be hard to see a pairs plot with too many variables. Let’s try fewer.

pairs(coffee[,3:7], pch=16)

I also really like the GGally package for pairs plots.

#install.packages("GGally")
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(coffee[,3:7])

1.2 Polishing/Explanation

  • Labels
  • Names of Varieties
  • Names of Variables
  • Consistent colors
1.2.1 Names of Varieties
# Add a readable label for the variety code: 1 is Arabica, any other value
# is Robusta. The new Type column is appended after the existing columns.
coffee$Type <- ifelse(coffee$Variety == 1, "Arabica", "Robusta")
head(coffee)
##   Variety  Country     Water Bean Weight Extract Yield ph Value Free Acid
## 1       1   mexico  8.939999       156.6          33.5     5.80      32.7
## 2       1   mexico  7.400000       157.3          32.1     5.81      30.8
## 3       1 guatemal  9.740000       152.9          33.1     5.26      36.7
## 4       1 honduras 10.400000       174.0          31.5     5.61      34.2
## 5       1 salvador 10.540000       145.1          35.2     5.77      31.8
## 6       1 salvador 10.000000       156.4          34.5     5.83      32.6
##   Mineral Content  Fat Caffine Trigonelline Chlorogenic Acid
## 1            3.80 15.2    1.13         1.03             5.38
## 2            3.71 15.0    1.25         1.01             5.13
## 3            4.15 16.1    1.21         1.05             5.94
## 4            3.94 15.8    1.06         0.94             5.87
## 5            4.09 15.2    1.11         0.99             5.09
## 6            3.88 15.4    1.20         0.81             5.30
##   Neochlorogenic Acid Isochlorogenic Acid    Type
## 1                0.40                0.79 Arabica
## 2                0.32                0.97 Arabica
## 3                0.24                0.76 Arabica
## 4                0.39                0.59 Arabica
## 5                0.49                0.72 Arabica
## 6                0.43                0.69 Arabica
1.2.2 Names of Variables
# Abbreviate the column names (minimum abbreviation length 8), presumably to
# keep axis labels compact in the plots that follow
names(coffee) <- abbreviate(names(coffee), minlength = 8)
names(coffee)
##  [1] "Variety"  "Country"  "Water"    "BeanWght" "ExtrctYl" "ph Value"
##  [7] "FreeAcid" "MnrlCntn" "Fat"      "Caffine"  "Trignlln" "ChlrgncA"
## [13] "NchlrgnA" "IschlrgA" "Type"

1.3 Graphics

1.3.1 Bargraph
# Bar chart of the number of coffees per type.
# The unnamed fill values are matched to the Type levels in order, so
# Arabica is drawn in grey and Robusta in red.
a <- ggplot(coffee, aes(x = Type)) +
  geom_bar(aes(fill = Type)) +
  scale_fill_manual(values = c("grey70", "red")) +
  # guides(fill = FALSE) is deprecated; "none" hides the fill legend, which
  # is redundant because the x axis already names the types
  guides(fill = "none") +
  ylab("")
a

1.3.2 Scatterplot
# Scatterplot of the Fat and Caffine columns, one point per coffee, colored
# by Type with the same grey70/red values used for the bar chart fill
b<-ggplot(coffee, aes(x=Fat, y=Caffine, colour=Type))+
  geom_point(size=3)+
  scale_colour_manual(values=c("grey70", "red"))
b

1.3.3 Parallel Coordinate Plot
#install.packages("GGally")
library(GGally)

# Parallel coordinate plot of columns 3 to 14, one line per coffee, colored
# by Type. scale="uniminmax" rescales each variable to run from 0 to 1.
# Rows are sorted by Type before plotting (presumably so the red Robusta
# lines are drawn last, on top of the grey ones -- TODO confirm).
c<-ggparcoord(coffee[order(coffee$Type),], columns=3:14,
              groupColumn="Type", scale="uniminmax",
              mapping=aes(size=1))+
  xlab("") + ylab("")+
  # no legend here; the line colors follow the same grey/red scheme
  theme(legend.position="none")+
  scale_colour_manual(values=c("grey", "red"))+
  # hide the y axis ticks and tick labels
  theme(axis.ticks.y=element_blank(),
        axis.text.y=element_blank())
c

1.4 Ensemble Graphic

#install.packages("gridExtra")
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Ensemble layout: bar chart and scatterplot side by side in the top row
# (the scatterplot twice as wide), parallel coordinate plot in the row below
grid.arrange(arrangeGrob(a, b, ncol=2, widths=c(1, 2)),
             c, nrow=2)

Case Study: Fertility

Mothers in this dataset had two or more children.

### Fertility dataset
#install.packages("AER")
library(AER)
data("Fertility")

head(Fertility)
##   morekids gender1 gender2 age afam hispanic other work
## 1       no    male  female  27   no       no    no    0
## 2       no  female    male  30   no       no    no   30
## 3       no    male  female  27   no       no    no    0
## 4       no    male  female  35  yes       no    no    0
## 5       no  female  female  30   no       no    no   22
## 6       no    male  female  26   no       no    no   40

2.1 Exploration

2.1.1 What is the distribution of age?

“As expected, the numbers of women with two or more children rise with age, although it is perhaps surprising that the pattern is so regular before dropping off for the two oldest cohorts”

# one bar per age value; bar height is the number of rows (mothers) at that age
ggplot(Fertility, aes(x=age))+
  geom_bar()

2.1.2 Is the distribution of age different across demographic groups?

The same pattern can be seen across all race groups.

# Age distribution within each combination of the three race indicators.
# geom_bar() counts the rows at each distinct age and has no `binwidth`
# parameter (it was ignored with a warning), so none is passed.
# scales = "free_y" gives every panel its own count axis.
ggplot(Fertility, aes(x = age)) +
  geom_bar() +
  facet_wrap(~ afam + hispanic + other, scales = "free_y")

2.1.3 Does whether or not the mother worked affect number of children?

In these data, most of the women either didn’t work at all (0 weeks) or worked the whole year (52 weeks), with the rest spread across the values in between.

We can look at the proportions:

# proportion of rows at each value of work (table() counts, prop.table()
# divides by the total)
with(Fertility, prop.table(table(work)))
## work
##            0            1            2            3            4            5 
## 0.4717813190 0.0056036819 0.0066796516 0.0053366529 0.0084428283 0.0040800459 
##            6            7            8            9           10           11 
## 0.0078420131 0.0022854540 0.0116707375 0.0025446292 0.0086352463 0.0014254636 
##           12           13           14           15           16           17 
## 0.0195363120 0.0029844416 0.0046847880 0.0047515452 0.0152874096 0.0025485561 
##           18           19           20           21           22           23 
## 0.0043863438 0.0010170663 0.0186370526 0.0014568787 0.0031297368 0.0011702153 
##           24           25           26           27           28           29 
## 0.0092046463 0.0064911606 0.0112034368 0.0010170663 0.0059492488 0.0007539642 
##           30           31           32           33           34           35 
## 0.0128762949 0.0007461104 0.0088512256 0.0010367008 0.0022147698 0.0045041507 
##           36           37           38           39           40           41 
## 0.0180912140 0.0019673753 0.0046141038 0.0025210678 0.0289176687 0.0011741422 
##           42           43           44           45           46           47 
## 0.0054976556 0.0014686594 0.0057332695 0.0055408515 0.0047044225 0.0023011616 
##           48           49           50           51           52 
## 0.0140700715 0.0047122763 0.0261256450 0.0063694268 0.1854241441

2.2 Graphics

Since we are using the same dataset and similar geometries we can use the same “base” and then add modifiers to it.

2.2.1 Age and Weeks Worked
# Shared base plot: bars of counts with no y axis label. The x variable is
# supplied afterwards by adding aes() to the base.
# geom_bar() has no `binwidth` parameter (it was ignored with a warning),
# so none is passed.
p0 <- ggplot(Fertility) + geom_bar() + ylab("")
p1 <- p0 + aes(x = age)
p2 <- p0 + aes(x = work) + xlab("Weeks worked in 1979")

p1

p2

2.2.2 Other Variables
# Base bar chart for the categorical variables: no y axis label and a fixed
# count range (0 to 250000) shared by every chart built from it.
k <- ggplot(Fertility) +
  geom_bar() +
  labs(y = "") +
  ylim(0, 250000)

# One chart per variable: add the x aesthetic and its axis label to the base.
p3 <- k + aes(x = morekids) + labs(x = "has more children")
p4 <- k + aes(x = gender1) + labs(x = "first child")
p5 <- k + aes(x = gender2) + labs(x = "second child")
p6 <- k + aes(x = afam) + labs(x = "African American")
p7 <- k + aes(x = hispanic) + labs(x = "Hispanic")
p8 <- k + aes(x = other) + labs(x = "other race")
2.2.3 Now combine!
# Ensemble layout: p1 and p2 side by side with equal widths in the top row,
# p3 to p8 in a single row of six below; the top row is 1.25 times as tall
# as the bottom row
grid.arrange(arrangeGrob(p1, p2, ncol=2, widths = c(3,3)),
             arrangeGrob(p3, p4, p5, p6, p7, p8, ncol=6),
             nrow=2, heights=c(1.25, 1))

2.3 Spinogram plots

Note that these differ from colored bar charts because the bar widths also show proportions.

2.3.1 More than two kids by age
# Doubledecker plot of morekids (left of ~, shown by the two fill colors)
# split by age; spacing_equal(0) leaves no gap between the age bars
doubledecker(morekids~age, data=Fertility,
             gp=gpar(fill=c("grey90", "forestgreen")),
             spacing=spacing_equal(0))

2.3.2 More than two kids by genders of previous children
# Same plot, split by the genders of the first and second child instead of
# age; the default spacing is used
doubledecker(morekids~gender1+gender2, data=Fertility,
             gp=gpar(fill=c("grey90", "forestgreen")))

2.3.3 More than two kids by age and genders of previous children
# Split by age, then by the genders of the first two children.
# spacing_dimequal() takes one spacing value per variable: 0.1 for the first
# (age) and 0 for the rest, so only the age groups are visually separated
doubledecker(morekids~age+gender1+gender2, data=Fertility,
             gp=gpar(fill=c("grey90", "forestgreen")),
             spacing=spacing_dimequal(c(0.1, 0, 0, 0)))

3. Case Studies

3.1 Moral statistics of France

#install.packages("HistData")
library(HistData)
## Warning: package 'HistData' was built under R version 3.6.2
data(Guerry)
head(Guerry)
##   dept Region   Department Crime_pers Crime_prop Literacy Donations Infants
## 1    1      E          Ain      28870      15890       37      5098   33120
## 2    2      N        Aisne      26226       5521       51      8901   14572
## 3    3      C       Allier      26747       7925       13     10973   17044
## 4    4      E Basses-Alpes      12935       7289       46      2733   23018
## 5    5      E Hautes-Alpes      17488       8174       69      6962   23076
## 6    7      S      Ardeche       9474      10263       27      3188   42117
##   Suicides MainCity Wealth Commerce Clergy Crime_parents Infanticide
## 1    35039    2:Med     73       58     11            71          60
## 2    12831    2:Med     22       10     82             4          82
## 3   114121    2:Med     61       66     68            46          42
## 4    14238     1:Sm     76       49      5            70          12
## 5    16171     1:Sm     83       65     10            22          23
## 6    52547     1:Sm     84        1     28            76          47
##   Donation_clergy Lottery Desertion Instruction Prostitutes Distance Area
## 1              69      41        55          46          13  218.372 5762
## 2              36      38        82          24         327   65.945 7369
## 3              76      66        16          85          34  161.927 7340
## 4              37      80        32          29           2  351.399 6925
## 5              64      79        35           7           1  320.280 5549
## 6              67      70        19          62           1  279.413 5529
##   Pop1831
## 1  346.03
## 2  513.00
## 3  298.26
## 4  155.90
## 5  129.10
## 6  340.73

3.2 Airbags and car accidents

#install.packages("DAAG")
library(DAAG)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.2
## 
## Attaching package: 'DAAG'
## The following object is masked from 'package:survival':
## 
##     lung
## The following object is masked from 'package:car':
## 
##     vif
data(nassCDS)
head(nassCDS)
##   dvcat  weight  dead airbag seatbelt frontal sex ageOFocc yearacc yearVeh
## 1 25-39  25.069 alive   none   belted       1   f       26    1997    1990
## 2 10-24  25.069 alive airbag   belted       1   f       72    1997    1995
## 3 10-24  32.379 alive   none     none       1   f       69    1997    1988
## 4 25-39 495.444 alive airbag   belted       1   f       53    1997    1995
## 5 25-39  25.069 alive   none   belted       1   f       32    1997    1988
## 6 40-54  25.069 alive   none   belted       1   f       22    1997    1985
##     abcat occRole deploy injSeverity caseid
## 1 unavail  driver      0           3  2:3:1
## 2  deploy  driver      1           1  2:3:2
## 3 unavail  driver      0           4  2:5:1
## 4  deploy  driver      1           1 2:10:1
## 5 unavail  driver      0           3 2:11:1
## 6 unavail  driver      0           3 2:11:2

3.3 Athletes’ blood measurements

library(DAAG)
data(ais)
head(ais)
##    rcc wcc   hc   hg ferr   bmi   ssf pcBfat   lbm    ht   wt sex  sport
## 1 3.96 7.5 37.5 12.3   60 20.56 109.1  19.75 63.32 195.9 78.9   f B_Ball
## 2 4.41 8.3 38.2 12.7   68 20.67 102.8  21.30 58.55 189.7 74.4   f B_Ball
## 3 4.14 5.0 36.4 11.6   21 21.86 104.6  19.88 55.36 177.8 69.1   f B_Ball
## 4 4.11 5.3 37.3 12.6   69 21.88 126.4  23.66 57.18 185.0 74.9   f B_Ball
## 5 4.45 6.8 41.5 14.0   29 18.96  80.3  17.64 53.20 184.6 64.6   f B_Ball
## 6 4.10 4.4 37.4 12.5   42 21.04  75.2  15.58 53.77 174.0 63.7   f B_Ball

3.4 Marijuana arrests

#install.packages("effects")
library(effects)
## Warning: package 'effects' was built under R version 3.6.2
## Use the command
##     lattice::trellis.par.set(effectsTheme())
##   to customize lattice options for effects plots.
## See ?effectsTheme for details.
data("Arrests")
head(Arrests)
##   released colour year age    sex employed citizen checks
## 1      Yes  White 2002  21   Male      Yes     Yes      3
## 2       No  Black 1999  17   Male      Yes     Yes      3
## 3      Yes  White 2000  24   Male      Yes     Yes      3
## 4       No  Black 2000  46   Male      Yes     Yes      1
## 5      Yes  Black 1999  27 Female      Yes     Yes      1
## 6      Yes  Black 1998  16 Female      Yes     Yes      0

3.5 Crohn’s disease

#install.packages("robustbase")
library(robustbase)
## Warning: package 'robustbase' was built under R version 3.6.2
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:DAAG':
## 
##     milk
## The following object is masked from 'package:survival':
## 
##     heart
data("CrohnD")
head(CrohnD)
##      ID nrAdvE   BMI height country sex age weight   treat
## 1 19908      4 25.22    163      c1   F  47     67 placebo
## 2 19909      4 23.80    164      c1   F  53     64      d1
## 3 19910      1 23.05    164      c1   F  68     62 placebo
## 4 20908      1 25.71    165      c1   F  48     70      d2
## 5 20909      2 25.95    170      c1   F  67     75 placebo
## 6 20910      2 28.70    168      c1   F  54     81      d1

3.6 Footballers in the four major European leagues

#install.packages("SportsAnalytics")
library(SportsAnalytics)
data(EURO4PlayerSkillsSep11)
head(EURO4PlayerSkillsSep11)
##             Name     League                 Team Number   Position Positions
## 1  Tobias Sippel Bundesliga 1. FC Kaiserslautern      1 Goalkeeper       GK!
## 2    Kevin Trapp Bundesliga 1. FC Kaiserslautern     29 Goalkeeper       GK!
## 3    Jan Simunek Bundesliga 1. FC Kaiserslautern      2   Defender       CB!
## 4    Leon Jessen Bundesliga 1. FC Kaiserslautern      3   Defender   SB!, WB
## 5 Martin Amedick Bundesliga 1. FC Kaiserslautern      5   Defender       CB!
## 6  Matthias Abel Bundesliga 1. FC Kaiserslautern      6   Defender   CB!, SB
##       Birthday Nationality Age Height Weight InjuryTolerance Foot Side Attack
## 1         <NA>      German  NA    180     79               B    R    B     30
## 2 (08/01/1990)      German  21    189     83               B    R    B     30
## 3         <NA>       Czech  NA    189     79               B    R    B     56
## 4 (11/06/1986)      Danish  25    182     75               B    L    L     70
## 5         <NA>      German  NA    194     85               B    R    B     60
## 6 (22/06/1981)      German  30    188     82               B    R    R     63
##   Defence Balance Stamina TopSpeed Acceleration Response Agility
## 1      81      83      69       70           72       84      80
## 2      82      84      69       70           72       84      83
## 3      78      82      84       78           76       78      78
## 4      66      79      83       82           80       76      75
## 5      78      87      83       74           75       79      73
## 6      77      84      82       76           74       78      72
##   DribbleAccuracy DribbleSpeed ShortPassAccuracy ShortPassSpeed
## 1              52           58                56             65
## 2              54           58                56             65
## 3              71           76                71             70
## 4              75           78                72             74
## 5              71           73                73             73
## 6              71           72                70             73
##   LongPassAccuracy LongPassSpeed ShotAccuracy ShotPower ShotTechnique
## 1               63            69           44        83            44
## 2               63            68           44        83            44
## 3               68            70           66        80            64
## 4               76            79           64        81            65
## 5               71            71           66        81            65
## 6               71            73           64        80            65
##   FreeKickAccuracy Curling Header Jump Technique Aggression Mentality
## 1               44      48     44   83        54         64        78
## 2               44      48     44   84        53         64        82
## 3               65      66     82   80        72         67        75
## 4               70      76     72   73        76         78        76
## 5               63      67     82   80        71         66        81
## 6               63      68     80   76        71         65        77
##   KeeperSkills Teamwork ConditionFitness WeakFootAccuracy WeakFootFrequency
## 1           81       76                6                4                 4
## 2           82       79                6                4                 4
## 3           50       75                6                4                 4
## 4           50       78                6                5                 5
## 5           50       80                7                5                 5
## 6           50       77                6                4                 4

3.7 Decathlon

#install.packages("GDAdata")
library(GDAdata)
data("Decathlon")
head(Decathlon)
##   Totalpoints   DecathleteName Nationality  m100 Longjump Shotput Highjump
## 1        8559     Torsten Voss         DDR 10.66     8.00   14.73     2.06
## 2        8504     Uwe Freimuth         DDR 11.10     7.66   16.30     1.94
## 3        8440  Siegfried Wentz         BRD 11.21     7.22   15.84     2.09
## 4        8409 Aleksandr Nevski          SU 10.95     7.35   14.99     2.08
## 5        8381       John Sayre         USA 10.86     7.41   14.22     2.00
## 6        8366  Vadim Podmaryov          SU 11.09     7.56   15.28     2.08
##    m400 m110hurdles Discus Polevault Javelin  m1500 yearEvent P100m  Plj Psp
## 1 48.28       14.50  43.28       4.9   61.28 268.80      1985   938 1061 773
## 2 48.46       14.77  47.72       4.9   68.26 270.56      1985   839  975 870
## 3 47.75       14.28  45.52       4.7   69.66 278.38      1985   814  866 841
## 4 49.29       14.76  46.12       4.6   68.16 261.09      1985   872  898 789
## 5 49.98       14.84  46.08       5.3   67.68 277.07      1985   892  913 742
## 6 50.00       14.89  48.58       4.6   67.46 272.31      1985   841  950 807
##   Phj P400m P110h  Ppv Pdt Pjt P1500
## 1 859   896   911  880 732 757   752
## 2 749   887   878  880 823 863   741
## 3 887   921   939  819 778 884   691
## 4 878   848   879  790 790 861   804
## 5 803   816   869 1004 789 854   699
## 6 878   815   863  790 841 851   730