| title: “DataViz: Ensemble Graphics” |
| author: “Karol Orozco” |
| date: “12/1/2021” |
| output: html_document |
# Print the compact structure (column names, types, first values) of coffee
utils::str(coffee)
## 'data.frame': 43 obs. of 14 variables:
## $ Variety : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Country : Factor w/ 29 levels "angola","brasile",..: 19 19 12 15 24 24 24 21 20 7 ...
## $ Water : num 8.94 7.4 9.74 10.4 10.54 ...
## $ Bean Weight : num 157 157 153 174 145 ...
## $ Extract Yield : num 33.5 32.1 33.1 31.5 35.2 ...
## $ ph Value : num 5.8 5.81 5.26 5.61 5.77 5.83 5.62 5.89 5.75 5.82 ...
## $ Free Acid : num 32.7 30.8 36.7 34.2 31.8 ...
## $ Mineral Content : num 3.8 3.71 4.15 3.94 4.09 3.88 3.85 3.85 4.22 4.01 ...
## $ Fat : num 15.2 15 16.1 15.8 15.2 15.4 15.6 15.1 14.3 15.1 ...
## $ Caffine : num 1.13 1.25 1.21 1.06 1.11 1.2 1.33 1.28 1.16 1.32 ...
## $ Trigonelline : num 1.03 1.01 1.05 0.94 0.99 0.81 1.16 0.98 0.97 0.97 ...
## $ Chlorogenic Acid : num 5.38 5.13 5.94 5.87 5.09 5.3 4.8 4.96 5.53 5.12 ...
## $ Neochlorogenic Acid: num 0.4 0.32 0.24 0.39 0.49 0.43 0.32 0.32 0.36 0.29 ...
## $ Isochlorogenic Acid: num 0.79 0.97 0.76 0.59 0.72 0.69 0.74 0.66 0.8 0.69 ...
# Visualise missing values across the columns of the coffee data
vis_miss(coffee)

# Bar chart: number of observations for each variety code
ggplot(data = coffee, mapping = aes(x = factor(Variety))) +
  geom_bar()

# Number of distinct countries present in the data
length(unique(coffee$Country))
## [1] 29

# Bar chart of observations per country, drawn with flipped coordinates
ggplot(data = coffee, mapping = aes(x = Country)) +
  geom_bar() +
  coord_flip()
## Mosaic plots
library(vcd)
## Warning: package 'vcd' was built under R version 4.1.2
## Loading required package: grid
# Mosaic plot of coffee variety against country
mosaic(Variety ~ Country, data = coffee)

# A second mosaic plot example, drawn from the HairEyeColor table
# with shading and a legend switched on
mosaic(
  HairEyeColor,
  shade = TRUE,
  legend = TRUE
)
1.1.3 Visualize Continuous Variables: Standardized boxplots
Look for outliers.
# Distribution of the continuous variables: standardise columns 3 to 14
# with scale() and draw one boxplot per column
coffee[, 3:14] %>%
  scale() %>%
  boxplot()
1.1.4 Visualize Relationships between Two Continuous Variables: Pairs plots
# Scatterplot matrix of columns 3 to 14, drawn with solid points (pch 16)
pairs(
  coffee[, 3:14],
  pch = 16
)
1.2 Polishing/Explanation
Things to polish: labels, names of varieties, names of variables, and consistent colors.
1.2.1 Names of Varieties
# Add a readable label for the numeric variety code:
# code 1 becomes "Arabica", any other code becomes "Robusta"
coffee$Type <- ifelse(coffee$Variety == 1, "Arabica", "Robusta")

# Preview the first rows, now including the new Type column
head(coffee)
## Variety Country Water Bean Weight Extract Yield ph Value Free Acid
## 1 1 mexico 8.939999 156.6 33.5 5.80 32.7
## 2 1 mexico 7.400000 157.3 32.1 5.81 30.8
## 3 1 guatemal 9.740000 152.9 33.1 5.26 36.7
## 4 1 honduras 10.400000 174.0 31.5 5.61 34.2
## 5 1 salvador 10.540000 145.1 35.2 5.77 31.8
## 6 1 salvador 10.000000 156.4 34.5 5.83 32.6
## Mineral Content Fat Caffine Trigonelline Chlorogenic Acid
## 1 3.80 15.2 1.13 1.03 5.38
## 2 3.71 15.0 1.25 1.01 5.13
## 3 4.15 16.1 1.21 1.05 5.94
## 4 3.94 15.8 1.06 0.94 5.87
## 5 4.09 15.2 1.11 0.99 5.09
## 6 3.88 15.4 1.20 0.81 5.30
## Neochlorogenic Acid Isochlorogenic Acid Type
## 1 0.40 0.79 Arabica
## 2 0.32 0.97 Arabica
## 3 0.24 0.76 Arabica
## 4 0.39 0.59 Arabica
## 5 0.49 0.72 Arabica
## 6 0.43 0.69 Arabica
1.2.2 Names of Variables
# Abbreviate the column names using a minimum abbreviation length of 8,
# then list the resulting names
names(coffee) <- abbreviate(names(coffee), minlength = 8)
names(coffee)
## [1] "Variety" "Country" "Water" "BeanWght" "ExtrctYl" "ph Value"
## [7] "FreeAcid" "MnrlCntn" "Fat" "Caffine" "Trignlln" "ChlrgncA"
## [13] "NchlrgnA" "IschlrgA" "Type"
1.3 Graphics 1.3.1 Bargraph
# Bar chart of the number of coffees per Type, filled with the manual
# colours grey70 and red and with an empty y-axis label.
# The fill legend is hidden with guides(fill = "none"); the previous
# guides(fill = FALSE) form is deprecated in ggplot2 and raised a warning.
a <- ggplot(coffee, aes(x = Type)) +
  geom_bar(aes(fill = Type)) +
  scale_fill_manual(values = c("grey70", "red")) +
  guides(fill = "none") +
  ylab("")
a
1.3.2 Scatterplot
# Scatterplot of Caffine against Fat, one colour per coffee Type
# (manual colours grey70 and red), drawn with size-3 points
b <- ggplot(data = coffee, mapping = aes(x = Fat, y = Caffine, colour = Type)) +
  geom_point(size = 3) +
  scale_colour_manual(values = c("grey70", "red"))
b
1.3.3 Parallel Coordinate Plot
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Parallel coordinate plot of columns 3 to 14, grouped (coloured) by Type.
# Rows are ordered by Type before plotting and the variables are rescaled
# with scale = "uniminmax". Axis titles, the legend, and the y-axis ticks
# and text are all removed.
# NOTE(review): this object is named `c`, the same name as base::c();
# calls such as c(1, 2) still resolve to the function, but a more
# descriptive name would be clearer. It is kept because later code uses `c`.
c <- ggparcoord(
  coffee[order(coffee$Type), ],
  columns = 3:14,
  groupColumn = "Type",
  scale = "uniminmax",
  mapping = aes(size = 1)
) +
  labs(x = "", y = "") +
  scale_colour_manual(values = c("grey", "red")) +
  theme(
    legend.position = "none",
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank()
  )
c
1.4 Ensemble Graphic
#install.packages("gridExtra")
# Ensemble layout: plots a and b share the top row with relative widths 1:2,
# and plot c takes the second row
grid.arrange(
  arrangeGrob(a, b, ncol = 2, widths = c(1, 2)),
  c,
  nrow = 2
)

# Alternative layout: the three plots placed directly on a two-row grid
grid.arrange(a, b, c, nrow = 2)
Case Study: Fertility
Mothers in this dataset had two or more children.
# install.packages("AER")
library(AER)
## Warning: package 'AER' was built under R version 4.1.2
## Loading required package: car
## Warning: package 'car' was built under R version 4.1.2
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
# Load the Fertility data set and preview its first rows
data(Fertility)
head(Fertility)
## morekids gender1 gender2 age afam hispanic other work
## 1 no male female 27 no no no 0
## 2 no female male 30 no no no 30
## 3 no male female 27 no no no 0
## 4 no male female 35 yes no no 0
## 5 no female female 30 no no no 22
## 6 no male female 26 no no no 40
2.1 Exploration
2.1.1 What is the distribution of age?
“As expected, the numbers of women with two or more children rise with age, although it is perhaps surprising that the pattern is so regular before dropping off for the two oldest cohorts”
# Bar chart of the number of rows (mothers) at each age
ggplot(data = Fertility, mapping = aes(x = age)) +
  geom_bar()
2.1.2 Is the distribution of age different across demographic groups? The same pattern can be seen across all race groups.
# Age distribution within each combination of the afam, hispanic and other
# indicators, with an independent y axis per panel.
# geom_bar() counts rows per x value and has no `binwidth` parameter, so the
# previous binwidth = 1 argument was ignored with a warning; it is removed.
ggplot(Fertility, aes(x = age)) +
  geom_bar() +
  facet_wrap(~ afam + hispanic + other, scales = "free_y")
2.1.3 Does whether or not the mother worked affect number of children?
In these data, most of the women either did not work at all (0 weeks) or worked the whole year (52 weeks), with comparatively few in between.
We can look at the proportions:
# Share of rows at each value of work (weeks worked)
prop.table(table(work = Fertility$work))
## work
## 0 1 2 3 4 5
## 0.4717813190 0.0056036819 0.0066796516 0.0053366529 0.0084428283 0.0040800459
## 6 7 8 9 10 11
## 0.0078420131 0.0022854540 0.0116707375 0.0025446292 0.0086352463 0.0014254636
## 12 13 14 15 16 17
## 0.0195363120 0.0029844416 0.0046847880 0.0047515452 0.0152874096 0.0025485561
## 18 19 20 21 22 23
## 0.0043863438 0.0010170663 0.0186370526 0.0014568787 0.0031297368 0.0011702153
## 24 25 26 27 28 29
## 0.0092046463 0.0064911606 0.0112034368 0.0010170663 0.0059492488 0.0007539642
## 30 31 32 33 34 35
## 0.0128762949 0.0007461104 0.0088512256 0.0010367008 0.0022147698 0.0045041507
## 36 37 38 39 40 41
## 0.0180912140 0.0019673753 0.0046141038 0.0025210678 0.0289176687 0.0011741422
## 42 43 44 45 46 47
## 0.0054976556 0.0014686594 0.0057332695 0.0055408515 0.0047044225 0.0023011616
## 48 49 50 51 52
## 0.0140700715 0.0047122763 0.0261256450 0.0063694268 0.1854241441
2.2 Graphics Since we are using the same dataset and similar geometries we can use the same “base” and then add modifiers to it.
2.2.1 Age and Weeks Worked
# Shared base plot: bar geometry on Fertility with an empty y-axis label.
# The x aesthetic is added separately for each derived plot.
# geom_bar() has no `binwidth` parameter (it was ignored with a warning),
# so the argument is no longer passed.
p0 <- ggplot(Fertility) +
  geom_bar() +
  ylab("")

# Age distribution
p1 <- p0 + aes(x = age)

# Distribution of weeks worked
p2 <- p0 + aes(x = work) + xlab("Weeks worked in 1979")

p1
p2
2.2.2 Other Variables
# Base for the categorical bar charts: empty y-axis label and a fixed
# y-axis range of 0 to 250000 applied to every derived plot
k <- ggplot(Fertility) +
  geom_bar() +
  ylab("") +
  ylim(0, 250000)

# One bar chart per categorical variable, each with its own x-axis label
p3 <- k + aes(x = morekids) + xlab("has more children")
p4 <- k + aes(x = gender1) + xlab("first child")
p5 <- k + aes(x = gender2) + xlab("second child")
p6 <- k + aes(x = afam) + xlab("African American")
p7 <- k + aes(x = hispanic) + xlab("Hispanic")
p8 <- k + aes(x = other) + xlab("other race")
2.2.3 Now combine!
# Two-row ensemble: p1 and p2 side by side with equal widths on top,
# p3 to p8 in a six-column strip below; row heights in the ratio 1.25:1
grid.arrange(
  arrangeGrob(p1, p2, ncol = 2, widths = c(3, 3)),
  arrangeGrob(p3, p4, p5, p6, p7, p8, ncol = 6),
  nrow = 2,
  heights = c(1.25, 1)
)
2.3 Spinogram plots
Note that these are different from colored bar charts because the bar widths also show the proportions.
2.3.1 More than two kids by age
# Doubledecker plot of morekids by age, filled with grey90 and forestgreen;
# spacing is set with spacing_equal(0)
doubledecker(
  morekids ~ age,
  data = Fertility,
  gp = gpar(fill = c("grey90", "forestgreen")),
  spacing = spacing_equal(0)
)
2.3.2 More than two kids by genders of previous children
# Doubledecker plot of morekids by the genders of the first two children,
# filled with grey90 and forestgreen
doubledecker(
  morekids ~ gender1 + gender2,
  data = Fertility,
  gp = gpar(fill = c("grey90", "forestgreen"))
)
2.3.3 More than two kids by age and genders of previous children
# Doubledecker plot of morekids by age and the genders of the first two
# children; per-dimension spacing is given via spacing_dimequal()
doubledecker(
  morekids ~ age + gender1 + gender2,
  data = Fertility,
  gp = gpar(fill = c("grey90", "forestgreen")),
  spacing = spacing_dimequal(c(0.1, 0, 0, 0))
)
3.1 Decathlon
# install.packages("GDAdata")
library(GDAdata)
## Warning: package 'GDAdata' was built under R version 4.1.2
# Load the Decathlon data set and preview its first rows
data(Decathlon)
head(Decathlon)
## Totalpoints DecathleteName Nationality m100 Longjump Shotput Highjump
## 1 8559 Torsten Voss DDR 10.66 8.00 14.73 2.06
## 2 8504 Uwe Freimuth DDR 11.10 7.66 16.30 1.94
## 3 8440 Siegfried Wentz BRD 11.21 7.22 15.84 2.09
## 4 8409 Aleksandr Nevski SU 10.95 7.35 14.99 2.08
## 5 8381 John Sayre USA 10.86 7.41 14.22 2.00
## 6 8366 Vadim Podmaryov SU 11.09 7.56 15.28 2.08
## m400 m110hurdles Discus Polevault Javelin m1500 yearEvent P100m Plj Psp
## 1 48.28 14.50 43.28 4.9 61.28 268.80 1985 938 1061 773
## 2 48.46 14.77 47.72 4.9 68.26 270.56 1985 839 975 870
## 3 47.75 14.28 45.52 4.7 69.66 278.38 1985 814 866 841
## 4 49.29 14.76 46.12 4.6 68.16 261.09 1985 872 898 789
## 5 49.98 14.84 46.08 5.3 67.68 277.07 1985 892 913 742
## 6 50.00 14.89 48.58 4.6 67.46 272.31 1985 841 950 807
## Phj P400m P110h Ppv Pdt Pjt P1500
## 1 859 896 911 880 732 757 752
## 2 749 887 878 880 823 863 741
## 3 887 921 939 819 778 884 691
## 4 878 848 879 790 790 861 804
## 5 803 816 869 1004 789 854 699
## 6 878 815 863 790 841 851 730
# Sum Totalpoints per Nationality and keep the five rows with the
# largest sums (rows are sorted in descending order first)
df1 <- Decathlon %>%
  group_by(Nationality) %>%
  summarise(Tp = sum(Totalpoints)) %>%
  arrange(desc(Tp)) %>%
  head(n = 5)
# Column chart of summed total points for the nationalities in df1,
# with bars ordered by Tp and filled by Nationality
p1 <- ggplot(
  data = df1,
  mapping = aes(x = reorder(Nationality, Tp), y = Tp, fill = Nationality)
) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Top 5 Countries in the Decathlon",
    x = "",
    y = "Total Points"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 8, face = "bold"),
    axis.text = element_text(size = 7, face = "bold"),
    panel.grid = element_line(color = "#e5e5e5"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank()
  )
p1
# Histogram of the Plj column (long jump points) using the default binning
p2 <- ggplot(data = Decathlon, mapping = aes(x = Plj)) +
  geom_histogram() +
  labs(
    title = "Distribution of Long Jump Points",
    x = "Points",
    y = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.50),
    axis.text = element_text(size = 10),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )
p2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# The three nationalities with the largest summed Totalpoints
df11 <- Decathlon %>%
  group_by(Nationality) %>%
  summarise(Tp = sum(Totalpoints)) %>%
  arrange(desc(Tp)) %>%
  slice(1:3)

# Keep only the Decathlon rows belonging to those nationalities.
# The column is passed to %in% directly: wrapping it in c() was redundant,
# and if Nationality is a factor, c() on R < 4.1.0 returns its integer
# codes, which would make %in% match nothing.
df2 <- Decathlon %>%
  filter(Nationality %in% df11$Nationality)
# Jittered scatterplot of long jump points (Plj) against 100 m points
# (P100m) for the nationalities in df2, coloured by Nationality, with a
# default geom_smooth() layer per colour group
p3 <- ggplot(data = df2, mapping = aes(x = P100m, y = Plj, color = Nationality)) +
  geom_jitter(alpha = 0.5) +
  geom_smooth() +
  labs(
    title = "Points for performance in a 100m vs Long Jump",
    x = "Points for 100 metres",
    y = "Points for long jump"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 8, face = "bold"),
    axis.text = element_text(size = 7, face = "bold"),
    panel.grid = element_line(color = "#e5e5e5"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank()
  )

# Interactive version of the same plot
ggplotly(p3)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Decathlon ensemble: p1 and p2 share the top row with relative widths 10:8,
# and p3 takes the second row
grid.arrange(
  arrangeGrob(p1, p2, ncol = 2, widths = c(10, 8)),
  p3,
  nrow = 2
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'