ewm <- get_cspp_data(vars=c("pid","ideo","statemin"),years=seq(1976,2017)) %>%
#select(-stateno,-state_fips,-state_icpsr) %>%
filter(!st%in%c("AK","HI"))
ewm
## # A tibble: 2,058 × 9
## st stateno state state_fips state_icpsr year pid ideo statemin
## <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AL 1 Alabama 1 41 1976 0.227 -0.215 NA
## 2 AL 1 Alabama 1 41 1977 0.245 -0.0779 NA
## 3 AL 1 Alabama 1 41 1978 0.442 -0.175 NA
## 4 AL 1 Alabama 1 41 1979 0.380 -0.205 NA
## 5 AL 1 Alabama 1 41 1980 0.376 -0.228 3.1
## 6 AL 1 Alabama 1 41 1981 0.255 -0.314 3.35
## 7 AL 1 Alabama 1 41 1982 0.293 -0.256 3.35
## 8 AL 1 Alabama 1 41 1983 0.288 -0.270 3.35
## 9 AL 1 Alabama 1 41 1984 0.177 -0.146 3.35
## 10 AL 1 Alabama 1 41 1985 -0.0556 -0.229 3.35
## # ℹ 2,048 more rows
ewm.trend <- ewm %>%
filter(year<2012) %>%
mutate(year=as.Date(ISOdate(year, 1, 1))) %>%
group_by(year) %>%
summarize(pid=mean(pid), ideo=mean(ideo))
ewm.trend
## # A tibble: 36 × 3
## year pid ideo
## <date> <dbl> <dbl>
## 1 1976-01-01 0.0473 -0.140
## 2 1977-01-01 0.172 -0.0838
## 3 1978-01-01 0.205 -0.102
## 4 1979-01-01 0.133 -0.153
## 5 1980-01-01 0.133 -0.130
## 6 1981-01-01 0.0630 -0.147
## 7 1982-01-01 0.0815 -0.176
## 8 1983-01-01 0.0805 -0.140
## 9 1984-01-01 0.0602 -0.136
## 10 1985-01-01 0.0187 -0.145
## # ℹ 26 more rows
ggplot(data=ewm.trend, aes(x=year,y=pid)) +
scale_x_date(date_breaks = "2 year",date_labels="%y") +
geom_line(linewidth=1.25) +
geom_hline(yintercept = 0, linetype="dashed", color="grey50") +
labs(x="",y="Mean Partisanship") +
scale_y_continuous(labels = scales::label_percent()) +
theme(legend.position = "none", legend.title = element_blank())
ggplot(data=ewm.trend, aes(x=year,y=ideo)) +
scale_x_date(date_breaks = "4 year",date_labels="%Y") +
geom_line(linewidth=1.25) +
geom_hline(yintercept = 0, linetype="dashed", color="grey50") +
labs(x="",y="Mean Ideology") +
scale_y_continuous(labels = scales::label_percent()) +
theme(legend.position = "none", legend.title = element_blank())
We switch to a new dataset, organdata
which is a
country-year data set of organ donations in 17 OECD countries with a
bunch of missing data. As always we take a look at it first.
organdata
## # A tibble: 238 × 21
## country year donors pop pop_dens gdp gdp_lag health health_lag
## <chr> <date> <dbl> <int> <dbl> <int> <int> <dbl> <dbl>
## 1 Australia NA NA 17065 0.220 16774 16591 1300 1224
## 2 Australia 1991-01-01 12.1 17284 0.223 17171 16774 1379 1300
## 3 Australia 1992-01-01 12.4 17495 0.226 17914 17171 1455 1379
## 4 Australia 1993-01-01 12.5 17667 0.228 18883 17914 1540 1455
## 5 Australia 1994-01-01 10.2 17855 0.231 19849 18883 1626 1540
## 6 Australia 1995-01-01 10.2 18072 0.233 21079 19849 1737 1626
## 7 Australia 1996-01-01 10.6 18311 0.237 21923 21079 1846 1737
## 8 Australia 1997-01-01 10.3 18518 0.239 22961 21923 1948 1846
## 9 Australia 1998-01-01 10.5 18711 0.242 24148 22961 2077 1948
## 10 Australia 1999-01-01 8.67 18926 0.244 25445 24148 2231 2077
## # ℹ 228 more rows
## # ℹ 12 more variables: pubhealth <dbl>, roads <dbl>, cerebvas <int>,
## # assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
## # consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>
Let’s select the first 6 variables and sample 10 observations from the data.
organdata %>% select(1:6) %>% sample_n(size = 10)
## # A tibble: 10 × 6
## country year donors pop pop_dens gdp
## <chr> <date> <dbl> <int> <dbl> <int>
## 1 Italy 1997-01-01 11.6 57512 19.1 22030
## 2 Germany 1991-01-01 13.3 80014 22.4 17511
## 3 United Kingdom 1992-01-01 14.4 57582 23.7 17110
## 4 Australia 1999-01-01 8.67 18926 0.244 25445
## 5 Italy 1991-01-01 5.2 56751 18.8 18209
## 6 Austria 1994-01-01 21.4 7936 9.46 21940
## 7 Finland 1995-01-01 19.4 5108 1.51 19031
## 8 Norway 1991-01-01 15.2 4262 1.32 19134
## 9 Finland 1997-01-01 16.3 5140 1.52 21691
## 10 Belgium 1993-01-01 21 10085 30.5 19733
Let’s try a scatterplot of donors vs year.
p <- ggplot(data = organdata,
mapping = aes(x = year, y = donors))
p + geom_point()
## Warning: Removed 34 rows containing missing values (`geom_point()`).
That’s not very informative. Let’s switch to–you guessed it–small
multiples with geom_line
instead.
p + geom_line(aes(group = country)) + facet_wrap(~ country)
Let’s switch to a new geom, the geom_boxplot
which will
give us a boxplot showing us the distribution of donors within country
(eg over time).
p <- ggplot(data = organdata,
mapping = aes(x = country, y = donors))
p + geom_boxplot()
## Warning: Removed 34 rows containing non-finite values (`stat_boxplot()`).
That’s ok, except for the horrible x axis, where the labels are all overlapping. Let’s flip the coordinates.
p + geom_boxplot() + coord_flip()
## Warning: Removed 34 rows containing non-finite values (`stat_boxplot()`).
This is better but the order is just reverse alphabetical. Why should we accept that as the default? Ordering is often helpful as a communications tool.
Let’s reorder by donation rate. We can do this by the
reorder
function within the x
argument (why x
and not y?! Because of the coord_flip!) The first argument is the thing
we want to reorder (country
), the second argument is the
thing we want to reorder by (donors
). There’s a third
argument possible too, the statistic you want to reorder by. If not set,
the default is the mean of the second argument. The problem is that in
R, mean
will fail if you ask for an average with missing
values. So we remove the missing values with
na.rm=TRUE
.
We set the x label to NULL because it’s obvious we’re talking about countries.
p <- ggplot(data = organdata,
mapping = aes(x = reorder(country, donors, na.rm=TRUE),
y = donors))
p + geom_boxplot() +
labs(x=NULL) +
coord_flip()
## Warning: Removed 34 rows containing non-finite values (`stat_boxplot()`).
Remember we can set the fill argument of the mapping to whatever we
want, in this case world
.
Let’s also try the violin plot, a variety of boxplot using shape as way to display distribution.
p <- ggplot(data = organdata,
mapping = aes(x = reorder(country, donors, na.rm=TRUE),
y = donors, fill = world))
p + geom_violin() + labs(x=NULL) +
coord_flip() + theme(legend.position = "top")
## Warning: Removed 34 rows containing non-finite values (`stat_ydensity()`).
Let’s reorganize the data again, grouping by consent type and country, and creating a bunch of summary statistics. The summary command will work on the innermost grouping level, in this case, within-country averages and standard deviations.
by_country <- organdata %>% group_by(consent_law, country) %>%
summarize(donors_mean= mean(donors, na.rm = TRUE),
donors_sd = sd(donors, na.rm = TRUE),
gdp_mean = mean(gdp, na.rm = TRUE),
health_mean = mean(health, na.rm = TRUE),
roads_mean = mean(roads, na.rm = TRUE),
cerebvas_mean = mean(cerebvas, na.rm = TRUE))
## `summarise()` has grouped output by 'consent_law'. You can override using the
## `.groups` argument.
by_country
## # A tibble: 17 × 8
## # Groups: consent_law [2]
## consent_law country donors_mean donors_sd gdp_mean health_mean roads_mean
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Informed Australia 10.6 1.14 22179. 1958. 105.
## 2 Informed Canada 14.0 0.751 23711. 2272. 109.
## 3 Informed Denmark 13.1 1.47 23722. 2054. 102.
## 4 Informed Germany 13.0 0.611 22163. 2349. 113.
## 5 Informed Ireland 19.8 2.48 20824. 1480. 118.
## 6 Informed Netherlands 13.7 1.55 23013. 1993. 76.1
## 7 Informed United Kin… 13.5 0.775 21359. 1561. 67.9
## 8 Informed United Sta… 20.0 1.33 29212. 3988. 155.
## 9 Presumed Austria 23.5 2.42 23876. 1875. 150.
## 10 Presumed Belgium 21.9 1.94 22500. 1958. 155.
## 11 Presumed Finland 18.4 1.53 21019. 1615. 93.6
## 12 Presumed France 16.8 1.60 22603. 2160. 156.
## 13 Presumed Italy 11.1 4.28 21554. 1757 122.
## 14 Presumed Norway 15.4 1.11 26448. 2217. 70.0
## 15 Presumed Spain 28.1 4.96 16933 1289. 161.
## 16 Presumed Sweden 13.1 1.75 22415. 1951. 72.3
## 17 Presumed Switzerland 14.2 1.71 27233 2776. 96.4
## # ℹ 1 more variable: cerebvas_mean <dbl>
This works, but is there a faster way than just repeating the same thing over and over? Yes!
We use the summarize_if
command to only target the
numeric variables. If they are, generate summaries named “_mean” and
“_sd”, with the caution about missing values.
# note funs() was used in the book, but is now deprecated in the latest dplyr
by_country <- organdata %>% group_by(consent_law, country) %>%
summarize_if(is.numeric, list(mean=mean, sd=sd), na.rm = TRUE)
by_country
## # A tibble: 17 × 28
## # Groups: consent_law [2]
## consent_law country donors_mean pop_mean pop_dens_mean gdp_mean gdp_lag_mean
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Informed Austral… 10.6 18318. 0.237 22179. 21779.
## 2 Informed Canada 14.0 29608. 0.297 23711. 23353.
## 3 Informed Denmark 13.1 5257. 12.2 23722. 23275
## 4 Informed Germany 13.0 80255. 22.5 22163. 21938.
## 5 Informed Ireland 19.8 3674. 5.23 20824. 20154.
## 6 Informed Netherl… 13.7 15548. 37.4 23013. 22554.
## 7 Informed United … 13.5 58187. 24.0 21359. 20962.
## 8 Informed United … 20.0 269330. 2.80 29212. 28699.
## 9 Presumed Austria 23.5 7927. 9.45 23876. 23415.
## 10 Presumed Belgium 21.9 10153. 30.7 22500. 22096.
## 11 Presumed Finland 18.4 5112. 1.51 21019. 20763
## 12 Presumed France 16.8 58056. 10.5 22603. 22211.
## 13 Presumed Italy 11.1 57360. 19.0 21554. 21195.
## 14 Presumed Norway 15.4 4386. 1.35 26448. 25769.
## 15 Presumed Spain 28.1 39666. 7.84 16933 16584.
## 16 Presumed Sweden 13.1 8789. 1.95 22415. 22094
## 17 Presumed Switzer… 14.2 7037. 17.0 27233 26931.
## # ℹ 21 more variables: health_mean <dbl>, health_lag_mean <dbl>,
## # pubhealth_mean <dbl>, roads_mean <dbl>, cerebvas_mean <dbl>,
## # assault_mean <dbl>, external_mean <dbl>, txp_pop_mean <dbl>,
## # donors_sd <dbl>, pop_sd <dbl>, pop_dens_sd <dbl>, gdp_sd <dbl>,
## # gdp_lag_sd <dbl>, health_sd <dbl>, health_lag_sd <dbl>, pubhealth_sd <dbl>,
## # roads_sd <dbl>, cerebvas_sd <dbl>, assault_sd <dbl>, external_sd <dbl>,
## # txp_pop_sd <dbl>
Now that we have a country-level data set, let’s plot the summary points.
p <- ggplot(data = by_country,
mapping = aes(x = donors_mean, y = reorder(country, donors_mean),
color = consent_law))
p + geom_point(size=3) +
labs(x = "Donor Procurement Rate",
y = "", color = "Consent Law") +
theme(legend.position="top")
Why not try small multiples instead of coloring the points by law type?
p <- ggplot(data = by_country,
mapping = aes(x = donors_mean,
y = reorder(country, donors_mean)))
p + geom_point(size=3) +
facet_wrap(~ consent_law, ncol = 1) +
labs(x= "Donor Procurement Rate",
y= "")
Kind of what we wanted. But notice that a bunch of the points are missing. Can you figure out why?
Alternatively, we allow y scale to be free with the
scales
argument.
p <- ggplot(data = by_country,
mapping = aes(x = donors_mean,
y = reorder(country, donors_mean)))
p + geom_point(size=3) +
facet_wrap(~ consent_law, scales = "free_y", ncol = 1) +
labs(x= "Donor Procurement Rate",
y= "")
Much better.
Let’s try what’s called a “Cleveland dotplot”, with a dot
representing the mean, and a line surrounding the mean expressing a
variance. This is done with the geom_pointrange
function,
that has its own mapping.
p <- ggplot(data = by_country, mapping = aes(x = reorder(country,
donors_mean), y = donors_mean))
p + geom_pointrange(mapping = aes(ymin = donors_mean - donors_sd,
ymax = donors_mean + donors_sd)) +
labs(x= "", y= "Donor Procurement Rate") + coord_flip()