These examples are modified from Chapter 12, "Ensemble Graphics and Case Studies", of Graphical Data Analysis with R, by Antony Unwin.
# The coffee data set ships with the pgmm package.
# install.packages("pgmm")  # run once if pgmm is not installed
library("pgmm")
data(coffee)
# Show the first six rows.
head(coffee, n = 6L)
## Variety Country Water Bean Weight Extract Yield ph Value Free Acid
## 1 1 mexico 8.939999 156.6 33.5 5.80 32.7
## 2 1 mexico 7.400000 157.3 32.1 5.81 30.8
## 3 1 guatemal 9.740000 152.9 33.1 5.26 36.7
## 4 1 honduras 10.400000 174.0 31.5 5.61 34.2
## 5 1 salvador 10.540000 145.1 35.2 5.77 31.8
## 6 1 salvador 10.000000 156.4 34.5 5.83 32.6
## Mineral Content Fat Caffine Trigonelline Chlorogenic Acid
## 1 3.80 15.2 1.13 1.03 5.38
## 2 3.71 15.0 1.25 1.01 5.13
## 3 4.15 16.1 1.21 1.05 5.94
## 4 3.94 15.8 1.06 0.94 5.87
## 5 4.09 15.2 1.11 0.99 5.09
## 6 3.88 15.4 1.20 0.81 5.30
## Neochlorogenic Acid Isochlorogenic Acid
## 1 0.40 0.79
## 2 0.32 0.97
## 3 0.24 0.76
## 4 0.39 0.59
## 5 0.49 0.72
## 6 0.43 0.69
# List the column names of the coffee data frame.
colnames(coffee)
## [1] "Variety" "Country" "Water"
## [4] "Bean Weight" "Extract Yield" "ph Value"
## [7] "Free Acid" "Mineral Content" "Fat"
## [10] "Caffine" "Trigonelline" "Chlorogenic Acid"
## [13] "Neochlorogenic Acid" "Isochlorogenic Acid"
A unicorn!
# install.packages("naniar")  # run once if naniar is not installed
library("naniar")
## Warning: package 'naniar' was built under R version 3.6.2
# Visualise where values are missing in the coffee data.
vis_miss(x = coffee)
library("tidyverse")

# Bar charts for the categorical variables.
# Variety is wrapped in factor() so it is treated as discrete on the x axis.
ggplot(data = coffee, mapping = aes(x = factor(Variety))) +
  geom_bar()
# More Arabica than Robusta.

# Number of distinct values in the Country column.
coffee$Country %>%
  unique() %>%
  length()
## [1] 29
# 29 different countries (with odd spellings)

# One bar per country; the coordinates are flipped so the bars run
# horizontally.
ggplot(data = coffee, mapping = aes(x = Country)) +
  geom_bar() +
  coord_flip()
# Mosaic plot for the two categorical variables.
# install.packages("vcd")  # run once if vcd is not installed
library("vcd")
## Warning: package 'vcd' was built under R version 3.6.2
## Loading required package: grid
mosaic(formula = Variety ~ Country, data = coffee)
# Only one country has both varieties.
Here’s another example of a mosaic plot that is built into this package:
# Example: shaded mosaic plot of the HairEyeColor table, with a legend.
mosaic(HairEyeColor, legend = TRUE, shade = TRUE)
Look for outliers
# Distribution of the continuous variables (columns 3 to 14).
# scale() standardizes each column so the boxplots share one axis.
boxplot(scale(coffee[3:14]))

# Scatterplot matrix of the same continuous variables.
pairs(coffee[3:14], pch = 16)
It can be hard to see a pairs plot with too many variables. Let’s try fewer.
# Scatterplot matrix restricted to columns 3 to 7 for readability.
pairs(coffee[3:7], pch = 16)
I also really like the GGally package for pairs plots.
# install.packages("GGally")  # run once if GGally is not installed
library("GGally")
## Warning: package 'GGally' was built under R version 3.6.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairs plot of columns 3 to 7 using GGally.
ggpairs(data = coffee[3:7])
# Add a readable label for the numeric Variety code:
# 1 becomes "Arabica", anything else becomes "Robusta".
coffee$Type <- ifelse(coffee$Variety == 1, "Arabica", "Robusta")
head(coffee, n = 6L)
## Variety Country Water Bean Weight Extract Yield ph Value Free Acid
## 1 1 mexico 8.939999 156.6 33.5 5.80 32.7
## 2 1 mexico 7.400000 157.3 32.1 5.81 30.8
## 3 1 guatemal 9.740000 152.9 33.1 5.26 36.7
## 4 1 honduras 10.400000 174.0 31.5 5.61 34.2
## 5 1 salvador 10.540000 145.1 35.2 5.77 31.8
## 6 1 salvador 10.000000 156.4 34.5 5.83 32.6
## Mineral Content Fat Caffine Trigonelline Chlorogenic Acid
## 1 3.80 15.2 1.13 1.03 5.38
## 2 3.71 15.0 1.25 1.01 5.13
## 3 4.15 16.1 1.21 1.05 5.94
## 4 3.94 15.8 1.06 0.94 5.87
## 5 4.09 15.2 1.11 0.99 5.09
## 6 3.88 15.4 1.20 0.81 5.30
## Neochlorogenic Acid Isochlorogenic Acid Type
## 1 0.40 0.79 Arabica
## 2 0.32 0.97 Arabica
## 3 0.24 0.76 Arabica
## 4 0.39 0.59 Arabica
## 5 0.49 0.72 Arabica
## 6 0.43 0.69 Arabica
# Shorten every column name to at most 8 characters so axis labels fit.
# Column order is unchanged.
names(coffee) <- abbreviate(names(coffee), minlength = 8)
names(coffee)
## [1] "Variety" "Country" "Water" "BeanWght" "ExtrctYl" "ph Value"
## [7] "FreeAcid" "MnrlCntn" "Fat" "Caffine" "Trignlln" "ChlrgncA"
## [13] "NchlrgnA" "IschlrgA" "Type"
# Bar chart of the number of coffees per Type, filled with grey70 and red.
# The fill legend is suppressed because the x axis already labels each bar.
# guides(fill = "none") replaces the deprecated guides(fill = FALSE), which
# produced a deprecation warning on every run.
a <- ggplot(coffee, aes(x = Type)) +
  geom_bar(aes(fill = Type)) +
  scale_fill_manual(values = c("grey70", "red")) +
  guides(fill = "none") +
  ylab("")
a
# Scatterplot of Caffine against Fat, coloured by Type with the same
# grey70/red palette as the bar chart.
b <- ggplot(
  data = coffee,
  mapping = aes(x = Fat, y = Caffine, colour = Type)
) +
  geom_point(size = 3) +
  scale_colour_manual(values = c("grey70", "red"))
b
# install.packages("GGally")  # run once if GGally is not installed
library("GGally")
# Parallel coordinate plot of columns 3 to 14, one line per coffee, grouped
# and coloured by Type. Rows are sorted by Type first; presumably this
# controls the drawing order of the two groups (confirm).
# Legend, y-axis ticks and y-axis text are hidden, and both axis titles are
# blank.
c <- ggparcoord(
  data = coffee[order(coffee$Type), ],
  columns = 3:14,
  groupColumn = "Type",
  scale = "uniminmax",
  mapping = aes(size = 1)
) +
  labs(x = "", y = "") +
  scale_colour_manual(values = c("grey", "red")) +
  theme(
    legend.position = "none",
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank()
  )
c
# install.packages("gridExtra")  # run once if gridExtra is not installed
library("gridExtra")
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Ensemble layout: the bar chart and scatterplot share the top row
# (width ratio 1:2); the parallel coordinate plot fills the bottom row.
grid.arrange(
  arrangeGrob(a, b, ncol = 2, widths = c(1, 2)),
  c,
  nrow = 2
)
Mothers in this dataset had two or more children.
### Fertility dataset
# install.packages("AER")  # run once if AER is not installed
library("AER")
data(Fertility)
# Show the first six rows.
head(Fertility, n = 6L)
## morekids gender1 gender2 age afam hispanic other work
## 1 no male female 27 no no no 0
## 2 no female male 30 no no no 30
## 3 no male female 27 no no no 0
## 4 no male female 35 yes no no 0
## 5 no female female 30 no no no 22
## 6 no male female 26 no no no 40
“As expected, the numbers of women with two or more children rise with age, although it is perhaps surprising that the pattern is so regular before dropping off for the two oldest cohorts”
# Number of mothers at each age.
ggplot(data = Fertility, mapping = aes(x = age)) +
  geom_bar()
The same pattern can be seen across all race groups.
# Age distribution within each combination of the afam, hispanic and other
# indicators. scales = "free_y" gives every panel its own count axis.
# geom_bar() counts rows at each x value and has no `binwidth` parameter, so
# the former `binwidth = 1` argument was ignored with a warning; it is
# dropped here.
ggplot(Fertility, aes(x = age)) +
  geom_bar() +
  facet_wrap(~ afam + hispanic + other, scales = "free_y")
In these data, most of the women either did not work at all or worked all 52 weeks of the year, with relatively few in between.
We can look at the proportions:
# Proportion of mothers at each value of `work`; naming the argument keeps
# the "work" header on the printed table.
prop.table(table(work = Fertility$work))
## work
## 0 1 2 3 4 5
## 0.4717813190 0.0056036819 0.0066796516 0.0053366529 0.0084428283 0.0040800459
## 6 7 8 9 10 11
## 0.0078420131 0.0022854540 0.0116707375 0.0025446292 0.0086352463 0.0014254636
## 12 13 14 15 16 17
## 0.0195363120 0.0029844416 0.0046847880 0.0047515452 0.0152874096 0.0025485561
## 18 19 20 21 22 23
## 0.0043863438 0.0010170663 0.0186370526 0.0014568787 0.0031297368 0.0011702153
## 24 25 26 27 28 29
## 0.0092046463 0.0064911606 0.0112034368 0.0010170663 0.0059492488 0.0007539642
## 30 31 32 33 34 35
## 0.0128762949 0.0007461104 0.0088512256 0.0010367008 0.0022147698 0.0045041507
## 36 37 38 39 40 41
## 0.0180912140 0.0019673753 0.0046141038 0.0025210678 0.0289176687 0.0011741422
## 42 43 44 45 46 47
## 0.0054976556 0.0014686594 0.0057332695 0.0055408515 0.0047044225 0.0023011616
## 48 49 50 51 52
## 0.0140700715 0.0047122763 0.0261256450 0.0063694268 0.1854241441
Since we are using the same dataset and similar geometries we can use the same “base” and then add modifiers to it.
# Shared base plot: the data, the bar geometry and the blank y label are set
# once, and each plot below only adds its own x aesthetic.
# geom_bar() has no `binwidth` parameter, so the former `binwidth = 1`
# argument was ignored with a warning; it is dropped here.
p0 <- ggplot(Fertility) + geom_bar() + ylab("")
p1 <- p0 + aes(x = age)
p2 <- p0 + aes(x = work) + xlab("Weeks worked in 1979")
p1
p2
# Template for the six categorical bar charts: every panel uses the same
# fixed y range (0 to 250000) and a blank y label.
k <- ggplot(data = Fertility) +
  geom_bar() +
  labs(y = "") +
  ylim(0, 250000)

# One bar chart per categorical variable, each with its own x label.
p3 <- k + aes(x = morekids) + labs(x = "has more children")
p4 <- k + aes(x = gender1) + labs(x = "first child")
p5 <- k + aes(x = gender2) + labs(x = "second child")
p6 <- k + aes(x = afam) + labs(x = "African American")
p7 <- k + aes(x = hispanic) + labs(x = "Hispanic")
p8 <- k + aes(x = other) + labs(x = "other race")

# Ensemble: the two numeric variables on top, the six categorical variables
# in a single row underneath; the top row is 1.25 times as tall.
grid.arrange(
  arrangeGrob(p1, p2, ncol = 2, widths = c(3, 3)),
  arrangeGrob(p3, p4, p5, p6, p7, p8, ncol = 6),
  nrow = 2,
  heights = c(1.25, 1)
)
Note that the doubledecker plots below differ from colored bar charts because the bar widths also show the proportions.
# morekids by age, with no gaps between the age columns.
doubledecker(
  morekids ~ age,
  data = Fertility,
  gp = gpar(fill = c("grey90", "forestgreen")),
  spacing = spacing_equal(0)
)

# morekids by the genders of the first two children.
doubledecker(
  morekids ~ gender1 + gender2,
  data = Fertility,
  gp = gpar(fill = c("grey90", "forestgreen"))
)

# morekids by age and both genders; only the first spacing value is non-zero.
doubledecker(
  morekids ~ age + gender1 + gender2,
  data = Fertility,
  gp = gpar(fill = c("grey90", "forestgreen")),
  spacing = spacing_dimequal(c(0.1, 0, 0, 0))
)
# install.packages("HistData")  # run once if HistData is not installed
library("HistData")
## Warning: package 'HistData' was built under R version 3.6.2
data("Guerry")
# Show the first six rows.
head(Guerry, n = 6L)
## dept Region Department Crime_pers Crime_prop Literacy Donations Infants
## 1 1 E Ain 28870 15890 37 5098 33120
## 2 2 N Aisne 26226 5521 51 8901 14572
## 3 3 C Allier 26747 7925 13 10973 17044
## 4 4 E Basses-Alpes 12935 7289 46 2733 23018
## 5 5 E Hautes-Alpes 17488 8174 69 6962 23076
## 6 7 S Ardeche 9474 10263 27 3188 42117
## Suicides MainCity Wealth Commerce Clergy Crime_parents Infanticide
## 1 35039 2:Med 73 58 11 71 60
## 2 12831 2:Med 22 10 82 4 82
## 3 114121 2:Med 61 66 68 46 42
## 4 14238 1:Sm 76 49 5 70 12
## 5 16171 1:Sm 83 65 10 22 23
## 6 52547 1:Sm 84 1 28 76 47
## Donation_clergy Lottery Desertion Instruction Prostitutes Distance Area
## 1 69 41 55 46 13 218.372 5762
## 2 36 38 82 24 327 65.945 7369
## 3 76 66 16 85 34 161.927 7340
## 4 37 80 32 29 2 351.399 6925
## 5 64 79 35 7 1 320.280 5549
## 6 67 70 19 62 1 279.413 5529
## Pop1831
## 1 346.03
## 2 513.00
## 3 298.26
## 4 155.90
## 5 129.10
## 6 340.73
# install.packages("DAAG")  # run once if DAAG is not installed
library("DAAG")
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.2
##
## Attaching package: 'DAAG'
## The following object is masked from 'package:survival':
##
## lung
## The following object is masked from 'package:car':
##
## vif
data("nassCDS")
# Show the first six rows.
head(nassCDS, n = 6L)
## dvcat weight dead airbag seatbelt frontal sex ageOFocc yearacc yearVeh
## 1 25-39 25.069 alive none belted 1 f 26 1997 1990
## 2 10-24 25.069 alive airbag belted 1 f 72 1997 1995
## 3 10-24 32.379 alive none none 1 f 69 1997 1988
## 4 25-39 495.444 alive airbag belted 1 f 53 1997 1995
## 5 25-39 25.069 alive none belted 1 f 32 1997 1988
## 6 40-54 25.069 alive none belted 1 f 22 1997 1985
## abcat occRole deploy injSeverity caseid
## 1 unavail driver 0 3 2:3:1
## 2 deploy driver 1 1 2:3:2
## 3 unavail driver 0 4 2:5:1
## 4 deploy driver 1 1 2:10:1
## 5 unavail driver 0 3 2:11:1
## 6 unavail driver 0 3 2:11:2
# DAAG is attached again so that this chunk can be run on its own.
library("DAAG")
data("ais")
# Show the first six rows.
head(ais, n = 6L)
## rcc wcc hc hg ferr bmi ssf pcBfat lbm ht wt sex sport
## 1 3.96 7.5 37.5 12.3 60 20.56 109.1 19.75 63.32 195.9 78.9 f B_Ball
## 2 4.41 8.3 38.2 12.7 68 20.67 102.8 21.30 58.55 189.7 74.4 f B_Ball
## 3 4.14 5.0 36.4 11.6 21 21.86 104.6 19.88 55.36 177.8 69.1 f B_Ball
## 4 4.11 5.3 37.3 12.6 69 21.88 126.4 23.66 57.18 185.0 74.9 f B_Ball
## 5 4.45 6.8 41.5 14.0 29 18.96 80.3 17.64 53.20 184.6 64.6 f B_Ball
## 6 4.10 4.4 37.4 12.5 42 21.04 75.2 15.58 53.77 174.0 63.7 f B_Ball
# install.packages("effects")  # run once if effects is not installed
library("effects")
## Warning: package 'effects' was built under R version 3.6.2
## Use the command
## lattice::trellis.par.set(effectsTheme())
## to customize lattice options for effects plots.
## See ?effectsTheme for details.
data(Arrests)
# Show the first six rows.
head(Arrests, n = 6L)
## released colour year age sex employed citizen checks
## 1 Yes White 2002 21 Male Yes Yes 3
## 2 No Black 1999 17 Male Yes Yes 3
## 3 Yes White 2000 24 Male Yes Yes 3
## 4 No Black 2000 46 Male Yes Yes 1
## 5 Yes Black 1999 27 Female Yes Yes 1
## 6 Yes Black 1998 16 Female Yes Yes 0
# install.packages("robustbase")  # run once if robustbase is not installed
library("robustbase")
## Warning: package 'robustbase' was built under R version 3.6.2
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:DAAG':
##
## milk
## The following object is masked from 'package:survival':
##
## heart
data(CrohnD)
# Show the first six rows.
head(CrohnD, n = 6L)
## ID nrAdvE BMI height country sex age weight treat
## 1 19908 4 25.22 163 c1 F 47 67 placebo
## 2 19909 4 23.80 164 c1 F 53 64 d1
## 3 19910 1 23.05 164 c1 F 68 62 placebo
## 4 20908 1 25.71 165 c1 F 48 70 d2
## 5 20909 2 25.95 170 c1 F 67 75 placebo
## 6 20910 2 28.70 168 c1 F 54 81 d1
# install.packages("SportsAnalytics")  # run once if it is not installed
library("SportsAnalytics")
data("EURO4PlayerSkillsSep11")
# Show the first six rows.
head(EURO4PlayerSkillsSep11, n = 6L)
## Name League Team Number Position Positions
## 1 Tobias Sippel Bundesliga 1. FC Kaiserslautern 1 Goalkeeper GK!
## 2 Kevin Trapp Bundesliga 1. FC Kaiserslautern 29 Goalkeeper GK!
## 3 Jan Simunek Bundesliga 1. FC Kaiserslautern 2 Defender CB!
## 4 Leon Jessen Bundesliga 1. FC Kaiserslautern 3 Defender SB!, WB
## 5 Martin Amedick Bundesliga 1. FC Kaiserslautern 5 Defender CB!
## 6 Matthias Abel Bundesliga 1. FC Kaiserslautern 6 Defender CB!, SB
## Birthday Nationality Age Height Weight InjuryTolerance Foot Side Attack
## 1 <NA> German NA 180 79 B R B 30
## 2 (08/01/1990) German 21 189 83 B R B 30
## 3 <NA> Czech NA 189 79 B R B 56
## 4 (11/06/1986) Danish 25 182 75 B L L 70
## 5 <NA> German NA 194 85 B R B 60
## 6 (22/06/1981) German 30 188 82 B R R 63
## Defence Balance Stamina TopSpeed Acceleration Response Agility
## 1 81 83 69 70 72 84 80
## 2 82 84 69 70 72 84 83
## 3 78 82 84 78 76 78 78
## 4 66 79 83 82 80 76 75
## 5 78 87 83 74 75 79 73
## 6 77 84 82 76 74 78 72
## DribbleAccuracy DribbleSpeed ShortPassAccuracy ShortPassSpeed
## 1 52 58 56 65
## 2 54 58 56 65
## 3 71 76 71 70
## 4 75 78 72 74
## 5 71 73 73 73
## 6 71 72 70 73
## LongPassAccuracy LongPassSpeed ShotAccuracy ShotPower ShotTechnique
## 1 63 69 44 83 44
## 2 63 68 44 83 44
## 3 68 70 66 80 64
## 4 76 79 64 81 65
## 5 71 71 66 81 65
## 6 71 73 64 80 65
## FreeKickAccuracy Curling Header Jump Technique Aggression Mentality
## 1 44 48 44 83 54 64 78
## 2 44 48 44 84 53 64 82
## 3 65 66 82 80 72 67 75
## 4 70 76 72 73 76 78 76
## 5 63 67 82 80 71 66 81
## 6 63 68 80 76 71 65 77
## KeeperSkills Teamwork ConditionFitness WeakFootAccuracy WeakFootFrequency
## 1 81 76 6 4 4
## 2 82 79 6 4 4
## 3 50 75 6 4 4
## 4 50 78 6 5 5
## 5 50 80 7 5 5
## 6 50 77 6 4 4
# install.packages("GDAdata")  # run once if GDAdata is not installed
library("GDAdata")
data(Decathlon)
# Show the first six rows.
head(Decathlon, n = 6L)
## Totalpoints DecathleteName Nationality m100 Longjump Shotput Highjump
## 1 8559 Torsten Voss DDR 10.66 8.00 14.73 2.06
## 2 8504 Uwe Freimuth DDR 11.10 7.66 16.30 1.94
## 3 8440 Siegfried Wentz BRD 11.21 7.22 15.84 2.09
## 4 8409 Aleksandr Nevski SU 10.95 7.35 14.99 2.08
## 5 8381 John Sayre USA 10.86 7.41 14.22 2.00
## 6 8366 Vadim Podmaryov SU 11.09 7.56 15.28 2.08
## m400 m110hurdles Discus Polevault Javelin m1500 yearEvent P100m Plj Psp
## 1 48.28 14.50 43.28 4.9 61.28 268.80 1985 938 1061 773
## 2 48.46 14.77 47.72 4.9 68.26 270.56 1985 839 975 870
## 3 47.75 14.28 45.52 4.7 69.66 278.38 1985 814 866 841
## 4 49.29 14.76 46.12 4.6 68.16 261.09 1985 872 898 789
## 5 49.98 14.84 46.08 5.3 67.68 277.07 1985 892 913 742
## 6 50.00 14.89 48.58 4.6 67.46 272.31 1985 841 950 807
## Phj P400m P110h Ppv Pdt Pjt P1500
## 1 859 896 911 880 732 757 752
## 2 749 887 878 880 823 863 741
## 3 887 921 939 819 778 884 691
## 4 878 848 879 790 790 861 804
## 5 803 816 869 1004 789 854 699
## 6 878 815 863 790 841 851 730