title: "DataViz: Ensemble Graphics"
author: "Karol Orozco"
date: "12/1/2021"
output: html_document
# Print the structure of coffee: one line per column with its type and first values.
str(coffee)
## 'data.frame':    43 obs. of  14 variables:
##  $ Variety            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Country            : Factor w/ 29 levels "angola","brasile",..: 19 19 12 15 24 24 24 21 20 7 ...
##  $ Water              : num  8.94 7.4 9.74 10.4 10.54 ...
##  $ Bean Weight        : num  157 157 153 174 145 ...
##  $ Extract Yield      : num  33.5 32.1 33.1 31.5 35.2 ...
##  $ ph Value           : num  5.8 5.81 5.26 5.61 5.77 5.83 5.62 5.89 5.75 5.82 ...
##  $ Free Acid          : num  32.7 30.8 36.7 34.2 31.8 ...
##  $ Mineral Content    : num  3.8 3.71 4.15 3.94 4.09 3.88 3.85 3.85 4.22 4.01 ...
##  $ Fat                : num  15.2 15 16.1 15.8 15.2 15.4 15.6 15.1 14.3 15.1 ...
##  $ Caffine            : num  1.13 1.25 1.21 1.06 1.11 1.2 1.33 1.28 1.16 1.32 ...
##  $ Trigonelline       : num  1.03 1.01 1.05 0.94 0.99 0.81 1.16 0.98 0.97 0.97 ...
##  $ Chlorogenic Acid   : num  5.38 5.13 5.94 5.87 5.09 5.3 4.8 4.96 5.53 5.12 ...
##  $ Neochlorogenic Acid: num  0.4 0.32 0.24 0.39 0.49 0.43 0.32 0.32 0.36 0.29 ...
##  $ Isochlorogenic Acid: num  0.79 0.97 0.76 0.59 0.72 0.69 0.74 0.66 0.8 0.69 ...
# Visual check for missing values in coffee.
# NOTE(review): vis_miss is presumably visdat::vis_miss, loaded outside this excerpt - confirm.
vis_miss(coffee)

Visualize one categorical variable: bar charts

# Count of observations per variety code; factor() makes the numeric code
# discrete so each code gets its own bar.
ggplot(data = coffee, mapping = aes(x = factor(Variety))) +
  geom_bar()

Coffee countries

# Number of distinct countries in the data.
length(unique(coffee[["Country"]]))
## [1] 29
# Observations per country, flipped so the country labels sit on the vertical axis.
ggplot(data = coffee, mapping = aes(x = Country)) +
  geom_bar() +
  coord_flip()

## Mosaic plots

library(vcd)
## Warning: package 'vcd' was built under R version 4.1.2
## Loading required package: grid
# Mosaic plot of variety against country.
mosaic(Variety ~ Country, data = coffee)

# Here’s another example of a mosaic plot that is built into this package:

# Example: shaded mosaic plot of the built-in HairEyeColor table, with legend.
mosaic(HairEyeColor, shade = TRUE, legend = TRUE)

1.1.3 Visualize Continuous Variables: Standardized boxplots. Look for outliers.

# Distribution of the continuous variables (columns 3 to 14), each passed
# through scale() first so all boxplots share one axis.
boxplot(scale(coffee[, 3:14]))

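`scale()` standardizes each variable (centered to mean 0, scaled to standard deviation 1) so the boxplots can share a common axis.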

1.1.4 Visualize Relationships between Two Continuous Variables: Pairs plots

Scatterplot matrix of the continuous variables.

# Pairwise scatterplots of columns 3 to 14, drawn with solid points (pch 16).
pairs(coffee[, 3:14], pch = 16)

1.2 Polishing/Explanation

Labels, names of varieties, names of variables, consistent colors.

1.2.1 Names of Varieties

# Add a readable label for the variety code: 1 becomes "Arabica", any other
# value "Robusta". The new column is appended after the existing ones.
coffee$Type <- ifelse(coffee$Variety == 1, "Arabica", "Robusta")
head(coffee)
##   Variety  Country     Water Bean Weight Extract Yield ph Value Free Acid
## 1       1   mexico  8.939999       156.6          33.5     5.80      32.7
## 2       1   mexico  7.400000       157.3          32.1     5.81      30.8
## 3       1 guatemal  9.740000       152.9          33.1     5.26      36.7
## 4       1 honduras 10.400000       174.0          31.5     5.61      34.2
## 5       1 salvador 10.540000       145.1          35.2     5.77      31.8
## 6       1 salvador 10.000000       156.4          34.5     5.83      32.6
##   Mineral Content  Fat Caffine Trigonelline Chlorogenic Acid
## 1            3.80 15.2    1.13         1.03             5.38
## 2            3.71 15.0    1.25         1.01             5.13
## 3            4.15 16.1    1.21         1.05             5.94
## 4            3.94 15.8    1.06         0.94             5.87
## 5            4.09 15.2    1.11         0.99             5.09
## 6            3.88 15.4    1.20         0.81             5.30
##   Neochlorogenic Acid Isochlorogenic Acid    Type
## 1                0.40                0.79 Arabica
## 2                0.32                0.97 Arabica
## 3                0.24                0.76 Arabica
## 4                0.39                0.59 Arabica
## 5                0.49                0.72 Arabica
## 6                0.43                0.69 Arabica

1.2.2 Names of Variables

# Shorten the column names to abbreviations of at least 8 characters, then
# show the result.
names(coffee) <- abbreviate(names(coffee), minlength = 8)
names(coffee)
##  [1] "Variety"  "Country"  "Water"    "BeanWght" "ExtrctYl" "ph Value"
##  [7] "FreeAcid" "MnrlCntn" "Fat"      "Caffine"  "Trignlln" "ChlrgncA"
## [13] "NchlrgnA" "IschlrgA" "Type"

1.3 Graphics 1.3.1 Bargraph

# Bar chart of the number of observations per coffee type, one fill colour
# per type and an empty y-axis label.
a <- ggplot(coffee, aes(x = Type)) +
  geom_bar(aes(fill = Type)) +
  scale_fill_manual(values = c("grey70", "red")) +
  # guides(fill = FALSE) is deprecated in ggplot2; "none" is the supported
  # way to hide the fill legend.
  guides(fill = "none") +
  ylab("")
a

1.3.2 Scatterplot

# Fat against Caffine, points coloured by type with the same grey/red
# palette as the bar chart.
b <- ggplot(
  data = coffee,
  mapping = aes(x = Fat, y = Caffine, colour = Type)
) +
  geom_point(size = 3) +
  scale_colour_manual(values = c("grey70", "red"))
b

1.3.3 Parallel Coordinate Plot

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Parallel coordinate plot of columns 3 to 14, each rescaled to its own
# min-max range and coloured by type. Rows are sorted by Type before plotting.
# Legend, axis titles and y-axis ticks/labels are all removed.
# NOTE: the variable name `c` shadows base::c() as an object; calls to c()
# still work because R skips non-function objects when looking up a function.
c <- ggparcoord(
  coffee[order(coffee$Type), ],
  columns = 3:14,
  groupColumn = "Type",
  scale = "uniminmax",
  mapping = aes(size = 1)
) +
  xlab("") +
  ylab("") +
  scale_colour_manual(values = c("grey", "red")) +
  theme(
    legend.position = "none",
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank()
  )
c

1.4 Ensemble Graphic

# install.packages("gridExtra")

# Top row: bar chart and scatterplot side by side with a 1:2 width ratio.
# Bottom row: the parallel coordinate plot.
grid.arrange(
  arrangeGrob(a, b, ncol = 2, widths = c(1, 2)),
  c,
  nrow = 2
)

# The same three plots placed directly by grid.arrange() on two rows.
grid.arrange(a, b, c, nrow = 2)

Case Study: Fertility. Mothers in this dataset had two or more children.

Fertility dataset

# install.packages("AER")
library(AER)
## Warning: package 'AER' was built under R version 4.1.2
## Loading required package: car
## Warning: package 'car' was built under R version 4.1.2
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
# Load the Fertility data set into the workspace (AER is attached above).
data("Fertility")

# Preview the first rows.
head(Fertility)
##   morekids gender1 gender2 age afam hispanic other work
## 1       no    male  female  27   no       no    no    0
## 2       no  female    male  30   no       no    no   30
## 3       no    male  female  27   no       no    no    0
## 4       no    male  female  35  yes       no    no    0
## 5       no  female  female  30   no       no    no   22
## 6       no    male  female  26   no       no    no   40

2.1 Exploration 2.1.1 What is the distribution of age? “As expected, the numbers of women with two or more children rise with age, although it is perhaps surprising that the pattern is so regular before dropping off for the two oldest cohorts”

# Number of mothers at each age.
ggplot(data = Fertility, mapping = aes(x = age)) +
  geom_bar()

2.1.2 Is the distribution of age different across demographic groups? The same pattern can be seen across all race groups.

# Age distribution within each combination of the afam, hispanic and other
# flags, with a free y scale per panel.
# geom_bar() counts rows per x value and has no binwidth parameter (ggplot2
# ignored it with a warning), so the argument is dropped.
ggplot(Fertility, aes(x = age)) +
  geom_bar() +
  facet_wrap(~ afam + hispanic + other, scales = "free_y")

2.1.3 Does whether or not the mother worked affect number of children? In these data, most women either did not work at all (0 weeks, about 47%) or worked the full year (52 weeks, about 19%), with the remainder spread thinly in between.

We can look at the proportions:

# Share of mothers at each value of work; the table dimension is named
# "work" so the printed header matches.
prop.table(table(work = Fertility$work))
## work
##            0            1            2            3            4            5 
## 0.4717813190 0.0056036819 0.0066796516 0.0053366529 0.0084428283 0.0040800459 
##            6            7            8            9           10           11 
## 0.0078420131 0.0022854540 0.0116707375 0.0025446292 0.0086352463 0.0014254636 
##           12           13           14           15           16           17 
## 0.0195363120 0.0029844416 0.0046847880 0.0047515452 0.0152874096 0.0025485561 
##           18           19           20           21           22           23 
## 0.0043863438 0.0010170663 0.0186370526 0.0014568787 0.0031297368 0.0011702153 
##           24           25           26           27           28           29 
## 0.0092046463 0.0064911606 0.0112034368 0.0010170663 0.0059492488 0.0007539642 
##           30           31           32           33           34           35 
## 0.0128762949 0.0007461104 0.0088512256 0.0010367008 0.0022147698 0.0045041507 
##           36           37           38           39           40           41 
## 0.0180912140 0.0019673753 0.0046141038 0.0025210678 0.0289176687 0.0011741422 
##           42           43           44           45           46           47 
## 0.0054976556 0.0014686594 0.0057332695 0.0055408515 0.0047044225 0.0023011616 
##           48           49           50           51           52 
## 0.0140700715 0.0047122763 0.0261256450 0.0063694268 0.1854241441

2.2 Graphics Since we are using the same dataset and similar geometries we can use the same “base” and then add modifiers to it.

2.2.1 Age and Weeks Worked

# Shared base plot: a count bar chart with an empty y-axis label. The x
# aesthetic is added per variable below.
# geom_bar() has no binwidth parameter (ggplot2 ignored it with a warning),
# so the argument is dropped.
p0 <- ggplot(Fertility) + geom_bar() + ylab("")
p1 <- p0 + aes(x = age)
p2 <- p0 + aes(x = work) + xlab("Weeks worked in 1979")

p1

p2

2.2.2 Other Variables

# Base for the categorical bar charts: counts with an empty y-axis label and
# the same 0 to 250000 y range for all six panels.
k <- ggplot(Fertility) +
  geom_bar() +
  ylab("") +
  ylim(0, 250000)

# One panel per categorical variable, each with its own x-axis label.
p3 <- k + aes(x = morekids) + xlab("has more children")
p4 <- k + aes(x = gender1) + xlab("first child")
p5 <- k + aes(x = gender2) + xlab("second child")
p6 <- k + aes(x = afam) + xlab("African American")
p7 <- k + aes(x = hispanic) + xlab("Hispanic")
p8 <- k + aes(x = other) + xlab("other race")

2.2.3 Now combine!

# Top row: age and weeks-worked charts at equal width.
# Bottom row: the six categorical charts. Row heights are 1.25 : 1.
grid.arrange(
  arrangeGrob(p1, p2, ncol = 2, widths = c(3, 3)),
  arrangeGrob(p3, p4, p5, p6, p7, p8, ncol = 6),
  nrow = 2,
  heights = c(1.25, 1)
)

2.3 Spinogram plots Note that these are different from colored bar charts because the widths also show the proportions

2.3.1 More than two kids by age

# morekids by age, with no spacing between the age columns.
doubledecker(
  morekids ~ age,
  data = Fertility,
  gp = gpar(fill = c("grey90", "forestgreen")),
  spacing = spacing_equal(0)
)

2.3.2 More than two kids by genders of previous children

# morekids by the genders of the first and second child.
doubledecker(
  morekids ~ gender1 + gender2,
  data = Fertility,
  gp = gpar(fill = c("grey90", "forestgreen"))
)

2.3.3 More than two kids by age and genders of previous children

# morekids by age and the genders of both previous children, with spacing
# set per dimension: 0.1 for the first and 0 for the others.
doubledecker(
  morekids ~ age + gender1 + gender2,
  data = Fertility,
  gp = gpar(fill = c("grey90", "forestgreen")),
  spacing = spacing_dimequal(c(0.1, 0, 0, 0))
)

  3. Case Studies

3.1 Decathlon

# install.packages("GDAdata")
library(GDAdata)
## Warning: package 'GDAdata' was built under R version 4.1.2
# Load the Decathlon data set (GDAdata is attached just above) and preview
# the first rows.
data("Decathlon")
head(Decathlon)
##   Totalpoints   DecathleteName Nationality  m100 Longjump Shotput Highjump
## 1        8559     Torsten Voss         DDR 10.66     8.00   14.73     2.06
## 2        8504     Uwe Freimuth         DDR 11.10     7.66   16.30     1.94
## 3        8440  Siegfried Wentz         BRD 11.21     7.22   15.84     2.09
## 4        8409 Aleksandr Nevski          SU 10.95     7.35   14.99     2.08
## 5        8381       John Sayre         USA 10.86     7.41   14.22     2.00
## 6        8366  Vadim Podmaryov          SU 11.09     7.56   15.28     2.08
##    m400 m110hurdles Discus Polevault Javelin  m1500 yearEvent P100m  Plj Psp
## 1 48.28       14.50  43.28       4.9   61.28 268.80      1985   938 1061 773
## 2 48.46       14.77  47.72       4.9   68.26 270.56      1985   839  975 870
## 3 47.75       14.28  45.52       4.7   69.66 278.38      1985   814  866 841
## 4 49.29       14.76  46.12       4.6   68.16 261.09      1985   872  898 789
## 5 49.98       14.84  46.08       5.3   67.68 277.07      1985   892  913 742
## 6 50.00       14.89  48.58       4.6   67.46 272.31      1985   841  950 807
##   Phj P400m P110h  Ppv Pdt Pjt P1500
## 1 859   896   911  880 732 757   752
## 2 749   887   878  880 823 863   741
## 3 887   921   939  819 778 884   691
## 4 878   848   879  790 790 861   804
## 5 803   816   869 1004 789 854   699
## 6 878   815   863  790 841 851   730

Nationality vs Points

 # Total points per nationality, keeping the five largest totals.
 df1 <- Decathlon %>%
   group_by(Nationality) %>%
   summarise(Tp = sum(Totalpoints)) %>%
   arrange(desc(Tp)) %>%
   slice(seq_len(5))

# Column chart of the top five nationalities by total points. Bars are
# ordered by total and the y axis uses comma-formatted labels.
p1 <- ggplot(
  data = df1,
  mapping = aes(x = reorder(Nationality, Tp), y = Tp, fill = Nationality)
) +
  geom_col() +
  labs(
    title = "Top 5 Countries in the Decathlon",
    x = "",
    y = "Total Points"
  ) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 7, face = "bold"),
    plot.title = element_text(hjust = 0.5, size = 8, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid = element_line(color = "#e5e5e5")
  )

p1

# Histogram of long jump points (Plj) with vertical grid lines removed.
p2 <- ggplot(data = Decathlon, mapping = aes(x = Plj)) +
  geom_histogram() +
  labs(
    x = "Points",
    y = "",
    title = "Distribution of Long Jump Points"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 10),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )
p2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 # Total points per nationality, keeping the three largest totals.
 df11 <- Decathlon %>%
   group_by(Nationality) %>%
   summarise(Tp = sum(Totalpoints)) %>%
   arrange(desc(Tp)) %>%
   slice(1:3)

# Keep only the rows for those three nationalities.
# The vector is passed to %in% directly: wrapping it in c() is redundant, and
# on R < 4.1 c() turns a factor into its integer codes, which would make the
# filter match nothing if Nationality is a factor.
df2 <- Decathlon %>%
  filter(Nationality %in% df11$Nationality)

# 100 m points against long jump points for the selected nationalities:
# semi-transparent jittered points plus a smoother per nationality.
p3 <- ggplot(
  data = df2,
  mapping = aes(x = P100m, y = Plj, color = Nationality)
) +
  geom_jitter(alpha = 0.5) +
  geom_smooth() +
  labs(
    title = "Points for performance in a 100m vs Long Jump",
    x = "Points for 100 metres",
    y = "Points for long jump"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 7, face = "bold"),
    plot.title = element_text(hjust = 0.5, size = 8, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid = element_line(color = "#e5e5e5")
  )

# Interactive version of the same plot.
ggplotly(p3)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Top row: country totals and long jump histogram at a 10:8 width ratio.
# Bottom row: the 100 m vs long jump scatterplot.
grid.arrange(
  arrangeGrob(p1, p2, ncol = 2, widths = c(10, 8)),
  p3,
  nrow = 2
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'