library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(broom)
library(countrycode)
library(stringr)
library(data.table)
votes <- readRDS('data/votes.rds')
descriptions <- readRDS('data/descriptions.rds')
# Load the dplyr package
# Print the votes dataset
votes
## # A tibble: 508,929 x 4
## rcid session vote ccode
## <dbl> <dbl> <dbl> <int>
## 1 46 2 1 2
## 2 46 2 1 20
## 3 46 2 9 31
## 4 46 2 1 40
## 5 46 2 1 41
## 6 46 2 1 42
## 7 46 2 9 51
## 8 46 2 9 52
## 9 46 2 9 53
## 10 46 2 9 54
## # ... with 508,919 more rows
# Filter for votes that are "yes", "abstain", or "no"
votes %>%
filter(vote <= 3)
## # A tibble: 353,547 x 4
## rcid session vote ccode
## <dbl> <dbl> <dbl> <int>
## 1 46 2 1 2
## 2 46 2 1 20
## 3 46 2 1 40
## 4 46 2 1 41
## 5 46 2 1 42
## 6 46 2 1 70
## 7 46 2 1 90
## 8 46 2 1 91
## 9 46 2 1 92
## 10 46 2 1 93
## # ... with 353,537 more rows
year = session + 1945
# Add another %>% step to add a year column
votes %>%
filter(vote <= 3) %>%
mutate(year = session + 1945)
## # A tibble: 353,547 x 5
## rcid session vote ccode year
## <dbl> <dbl> <dbl> <int> <dbl>
## 1 46 2 1 2 1947
## 2 46 2 1 20 1947
## 3 46 2 1 40 1947
## 4 46 2 1 41 1947
## 5 46 2 1 42 1947
## 6 46 2 1 70 1947
## 7 46 2 1 90 1947
## 8 46 2 1 91 1947
## 9 46 2 1 92 1947
## 10 46 2 1 93 1947
## # ... with 353,537 more rows
# Load the countrycode package
# Convert country code 100
countrycode(100, "cown", "country.name")
## [1] "Colombia"
# Add a country column within the mutate: votes_processed
votes_processed <- votes %>%
filter(vote <= 3) %>%
mutate(
year = session + 1945,
country = countrycode(ccode, "cown", "country.name"),
country = recode(country,
'United States of America' = 'United States',
'United Kingdom of Great Britain and Northern Ireland' = 'United Kingdom')
)
# Print votes_processed
votes_processed
## # A tibble: 353,547 x 6
## rcid session vote ccode year country
## <dbl> <dbl> <dbl> <int> <dbl> <chr>
## 1 46 2 1 2 1947 United States
## 2 46 2 1 20 1947 Canada
## 3 46 2 1 40 1947 Cuba
## 4 46 2 1 41 1947 Haiti
## 5 46 2 1 42 1947 Dominican Republic
## 6 46 2 1 70 1947 Mexico
## 7 46 2 1 90 1947 Guatemala
## 8 46 2 1 91 1947 Honduras
## 9 46 2 1 92 1947 El Salvador
## 10 46 2 1 93 1947 Nicaragua
## # ... with 353,537 more rows
# Find total and fraction of "yes" votes
votes_processed %>%
summarise(
total = n(),
percent_yes = mean(vote == 1)
)
## # A tibble: 1 x 2
## total percent_yes
## <int> <dbl>
## 1 353547 0.7999248
# Change this code to summarize by year
votes_processed %>%
group_by(year) %>%
summarize(total = n(),
percent_yes = mean(vote == 1))
## # A tibble: 34 x 3
## year total percent_yes
## <dbl> <int> <dbl>
## 1 1947 2039 0.5693968
## 2 1949 3469 0.4375901
## 3 1951 1434 0.5850767
## 4 1953 1537 0.6317502
## 5 1955 2169 0.6947902
## 6 1957 2708 0.6085672
## 7 1959 4326 0.5880721
## 8 1961 7482 0.5729751
## 9 1963 3308 0.7294438
## 10 1965 4382 0.7078959
## # ... with 24 more rows
# Summarize by country: by_country
by_country <- votes_processed %>%
group_by(country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1))
by_country
## # A tibble: 200 x 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 Afghanistan 2373 0.8592499
## 2 Albania 1695 0.7174041
## 3 Algeria 2213 0.8992318
## 4 Andorra 719 0.6383866
## 5 Angola 1431 0.9238295
## 6 Antigua and Barbuda 1302 0.9124424
## 7 Argentina 2553 0.7677242
## 8 Armenia 758 0.7467018
## 9 Australia 2575 0.5565049
## 10 Austria 2389 0.6224362
## # ... with 190 more rows
# You have the votes summarized by country
by_country <- votes_processed %>%
group_by(country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1))
# Print the by_country dataset
by_country
## # A tibble: 200 x 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 Afghanistan 2373 0.8592499
## 2 Albania 1695 0.7174041
## 3 Algeria 2213 0.8992318
## 4 Andorra 719 0.6383866
## 5 Angola 1431 0.9238295
## 6 Antigua and Barbuda 1302 0.9124424
## 7 Argentina 2553 0.7677242
## 8 Armenia 758 0.7467018
## 9 Australia 2575 0.5565049
## 10 Austria 2389 0.6224362
## # ... with 190 more rows
# Sort in ascending order of percent_yes
by_country %>%
arrange(percent_yes)
## # A tibble: 200 x 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 Zanzibar 2 0.0000000
## 2 United States 2568 0.2694704
## 3 Palau 369 0.3387534
## 4 Israel 2380 0.3407563
## 5 Federal Republic of Germany 1075 0.3972093
## 6 United Kingdom 2558 0.4167318
## 7 France 2527 0.4265928
## 8 Micronesia (Federated States of) 724 0.4419890
## 9 Marshall Islands 757 0.4914135
## 10 Belgium 2568 0.4922118
## # ... with 190 more rows
# Now sort in descending order
by_country %>%
arrange(desc(percent_yes))
## # A tibble: 200 x 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 Sao Tome and Principe 1091 0.9761687
## 2 Seychelles 881 0.9750284
## 3 Djibouti 1598 0.9612015
## 4 Guinea Bissau 1538 0.9603381
## 5 Timor-Leste 326 0.9570552
## 6 Mauritius 1831 0.9497542
## 7 Zimbabwe 1361 0.9493020
## 8 Comoros 1133 0.9470432
## 9 United Arab Emirates 1934 0.9467425
## 10 Mozambique 1701 0.9465021
## # ... with 190 more rows
# Filter out countries with fewer than 100 votes
by_country %>%
arrange(percent_yes) %>%
filter(total >= 100)
## # A tibble: 197 x 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 United States 2568 0.2694704
## 2 Palau 369 0.3387534
## 3 Israel 2380 0.3407563
## 4 Federal Republic of Germany 1075 0.3972093
## 5 United Kingdom 2558 0.4167318
## 6 France 2527 0.4265928
## 7 Micronesia (Federated States of) 724 0.4419890
## 8 Marshall Islands 757 0.4914135
## 9 Belgium 2568 0.4922118
## 10 Canada 2576 0.5081522
## # ... with 187 more rows
# Define by_year
by_year <- votes_processed %>%
group_by(year) %>%
summarize(total = n(),
percent_yes = mean(vote == 1))
# Load the ggplot2 package
library(ggplot2)
# Create line plot
ggplot(by_year, aes(x = year, y = percent_yes)) +
geom_line()
# Change to scatter plot and add smoothing curve
ggplot(by_year, aes(year, percent_yes)) +
geom_point() +
geom_smooth()
# Group by year and country: by_year_country
by_year_country <- votes_processed %>%
group_by(year, country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1))
# Start with by_year_country dataset
by_year_country <- votes_processed %>%
group_by(year, country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1))
# Print by_year_country
by_year_country
## # A tibble: 4,744 x 4
## # Groups: year [?]
## year country total percent_yes
## <dbl> <chr> <int> <dbl>
## 1 1947 Afghanistan 34 0.3823529
## 2 1947 Argentina 38 0.5789474
## 3 1947 Australia 38 0.5526316
## 4 1947 Belarus 38 0.5000000
## 5 1947 Belgium 38 0.6052632
## 6 1947 Bolivia (Plurinational State of) 37 0.5945946
## 7 1947 Brazil 38 0.6578947
## 8 1947 Canada 38 0.6052632
## 9 1947 Chile 38 0.6578947
## 10 1947 Colombia 35 0.5428571
## # ... with 4,734 more rows
# Create a filtered version: UK_by_year
UK_by_year <- by_year_country %>%
filter(country == 'United Kingdom')
# Line plot of percent_yes over time for UK only
ggplot(UK_by_year, aes(x = year, y = percent_yes)) +
geom_line()
# Vector of four countries to examine
countries <- c("United States", "United Kingdom",
"France", "India")
# Filter by_year_country: filtered_4_countries
filtered_4_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes in four countries
ggplot(filtered_4_countries, aes(x = year, y = percent_yes, color = country)) +
geom_line()
# Vector of six countries to examine
countries <- c("United States", "United Kingdom",
"France", "Japan", "Brazil", "India")
# Filtered by_year_country: filtered_6_countries
filtered_6_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes over time faceted by country
ggplot(filtered_6_countries, aes(x = year, y = percent_yes)) +
geom_line() +
facet_wrap(~ country)
# Vector of six countries to examine
countries <- c("United States", "United Kingdom",
"France", "Japan", "Brazil", "India")
# Filtered by_year_country: filtered_6_countries
filtered_6_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes over time faceted by country
ggplot(filtered_6_countries, aes(year, percent_yes)) +
geom_line() +
facet_wrap(~ country, scales = "free_y")
# Add three more countries to this list
countries <- c("United States", "United Kingdom",
"France", "Germany", "China", "India", "Argentina", "Canada","Uruguay")
# Filtered by_year_country: filtered_countries
filtered_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes over time faceted by country
ggplot(filtered_countries, aes(year, percent_yes)) +
geom_line() +
facet_wrap(~ country)
Visualization can surprise you, but it doesn’t scale well. Modeling scales well, but it can’t surprise you. - Hadley Wickham
# Percentage of yes votes from the US by year: US_by_year
US_by_year <- by_year_country %>%
filter(country == "United States")
# Print the US_by_year data
US_by_year
## # A tibble: 34 x 4
## # Groups: year [34]
## year country total percent_yes
## <dbl> <chr> <int> <dbl>
## 1 1947 United States 38 0.7105263
## 2 1949 United States 64 0.2812500
## 3 1951 United States 25 0.4000000
## 4 1953 United States 26 0.5000000
## 5 1955 United States 37 0.6216216
## 6 1957 United States 34 0.6470588
## 7 1959 United States 54 0.4259259
## 8 1961 United States 75 0.5066667
## 9 1963 United States 32 0.5000000
## 10 1965 United States 41 0.3658537
## # ... with 24 more rows
# Perform a linear regression of percent_yes by year: US_fit
US_fit <- lm(percent_yes ~ year, data = US_by_year)
# Perform summary() on the US_fit object
summary(US_fit)
##
## Call:
## lm(formula = percent_yes ~ year, data = US_by_year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.222491 -0.080635 -0.008661 0.081948 0.194307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.6641455 1.8379743 6.890 8.48e-08 ***
## year -0.0062393 0.0009282 -6.722 1.37e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1062 on 32 degrees of freedom
## Multiple R-squared: 0.5854, Adjusted R-squared: 0.5724
## F-statistic: 45.18 on 1 and 32 DF, p-value: 1.367e-07
# Load the broom package
# Call the tidy() function on the US_fit object
tidy(US_fit)
## term estimate std.error statistic p.value
## 1 (Intercept) 12.664145512 1.8379742715 6.890274 8.477089e-08
## 2 year -0.006239305 0.0009282243 -6.721764 1.366904e-07
# Linear regression of percent_yes by year for US
US_by_year <- by_year_country %>%
filter(country == "United States")
US_fit <- lm(percent_yes ~ year, US_by_year)
# Fit model for the United Kingdom
UK_by_year <- by_year_country %>%
filter(country == "United Kingdom")
by_year_country %>%
filter(country %like% c('United'))
## # A tibble: 117 x 4
## # Groups: year [34]
## year country total percent_yes
## <dbl> <chr> <int> <dbl>
## 1 1947 United Kingdom 38 0.5789474
## 2 1947 United States 38 0.7105263
## 3 1949 United Kingdom 64 0.2031250
## 4 1949 United States 64 0.2812500
## 5 1951 United Kingdom 25 0.2800000
## 6 1951 United States 25 0.4000000
## 7 1953 United Kingdom 26 0.2307692
## 8 1953 United States 26 0.5000000
## 9 1955 United Kingdom 37 0.6486486
## 10 1955 United States 37 0.6216216
## # ... with 107 more rows
UK_fit <- lm(percent_yes ~ year, UK_by_year)
# Create US_tidied and UK_tidied
US_tidied <- tidy(US_fit)
UK_tidied <- tidy(UK_fit)
# Combine the two tidied models
bind_rows(US_tidied, UK_tidied)
## term estimate std.error statistic p.value
## 1 (Intercept) 12.664145512 1.8379742715 6.890274 8.477089e-08
## 2 year -0.006239305 0.0009282243 -6.721764 1.366904e-07
## 3 (Intercept) -3.266547873 1.9577739504 -1.668501 1.049736e-01
## 4 year 0.001869434 0.0009887262 1.890750 6.774177e-02
# Load the tidyr package
library(tidyr)
# Nest all columns besides country
by_year_country %>%
nest(-country)
## # A tibble: 34 x 2
## year data
## <dbl> <list>
## 1 1947 <tibble [57 x 2]>
## 2 1949 <tibble [59 x 2]>
## 3 1951 <tibble [60 x 2]>
## 4 1953 <tibble [60 x 2]>
## 5 1955 <tibble [65 x 2]>
## 6 1957 <tibble [82 x 2]>
## 7 1959 <tibble [82 x 2]>
## 8 1961 <tibble [104 x 2]>
## 9 1963 <tibble [113 x 2]>
## 10 1965 <tibble [117 x 2]>
## # ... with 24 more rows
# All countries are nested besides country
nested <- by_year_country %>%
nest(-country)
nested$data[[1]]
## # A tibble: 57 x 2
## total percent_yes
## <int> <dbl>
## 1 34 0.3823529
## 2 38 0.5789474
## 3 38 0.5526316
## 4 38 0.5000000
## 5 38 0.6052632
## 6 37 0.5945946
## 7 38 0.6578947
## 8 38 0.6052632
## 9 38 0.6578947
## 10 35 0.5428571
## # ... with 47 more rows
nested$data[[1]]$percent_yes
## [1] 0.3823529 0.5789474 0.5526316 0.5000000 0.6052632 0.5945946 0.6578947
## [8] 0.6052632 0.6578947 0.5428571 0.6969697 0.6578947 0.4736842 0.6315789
## [15] 0.6052632 0.6000000 0.5000000 0.6000000 0.4473684 0.7368421 0.4324324
## [22] 0.6052632 0.6571429 0.5882353 0.7272727 0.4054054 0.6216216 0.4473684
## [29] 0.4473684 0.7027027 0.6388889 0.6315789 0.6052632 0.6315789 0.6756757
## [36] 0.6578947 0.4193548 0.7222222 0.6400000 0.6052632 0.5588235 0.4242424
## [43] 0.5263158 0.2812500 0.5000000 0.6578947 0.3793103 0.6315789 0.3529412
## [50] 0.3684211 0.5263158 0.5789474 0.7105263 0.6857143 0.6578947 0.3157895
## [57] 0.5000000
# Print the nested data for Brazil
nested$data[nested$country == 'Brazil']
## list()
# All countries are nested besides country
nested <- by_year_country %>%
nest(-country)
# Unnest the data column to return it to its original form
nested %>%
unnest()
## # A tibble: 4,744 x 3
## year total percent_yes
## <dbl> <int> <dbl>
## 1 1947 34 0.3823529
## 2 1947 38 0.5789474
## 3 1947 38 0.5526316
## 4 1947 38 0.5000000
## 5 1947 38 0.6052632
## 6 1947 37 0.5945946
## 7 1947 38 0.6578947
## 8 1947 38 0.6052632
## 9 1947 38 0.6578947
## 10 1947 35 0.5428571
## # ... with 4,734 more rows
v <- list(1,2,3)
map(v, ~ . * 10)
## [[1]]
## [1] 10
##
## [[2]]
## [1] 20
##
## [[3]]
## [1] 30
nest(-country)
the year column was not in the tibble# Load tidyr and purrr
library(tidyr)
library(purrr)
# Perform a linear regression on each item in the data column
by_year_country %>%
group_by(country) %>%
nest() %>%
mutate(
model = map(data, ~ lm(percent_yes ~ year, data = .))
)
## # A tibble: 200 x 3
## country data model
## <chr> <list> <list>
## 1 Afghanistan <tibble [34 x 3]> <S3: lm>
## 2 Argentina <tibble [34 x 3]> <S3: lm>
## 3 Australia <tibble [34 x 3]> <S3: lm>
## 4 Belarus <tibble [34 x 3]> <S3: lm>
## 5 Belgium <tibble [34 x 3]> <S3: lm>
## 6 Bolivia (Plurinational State of) <tibble [34 x 3]> <S3: lm>
## 7 Brazil <tibble [34 x 3]> <S3: lm>
## 8 Canada <tibble [34 x 3]> <S3: lm>
## 9 Chile <tibble [34 x 3]> <S3: lm>
## 10 Colombia <tibble [34 x 3]> <S3: lm>
## # ... with 190 more rows
# Load the broom package
# Add another mutate that applies tidy() to each model
by_year_country %>%
group_by(country) %>%
nest() %>%
mutate(
model = map(data, ~ lm(percent_yes ~ year, data = .)),
tidied = map(model, tidy)
)
## # A tibble: 200 x 4
## country data model
## <chr> <list> <list>
## 1 Afghanistan <tibble [34 x 3]> <S3: lm>
## 2 Argentina <tibble [34 x 3]> <S3: lm>
## 3 Australia <tibble [34 x 3]> <S3: lm>
## 4 Belarus <tibble [34 x 3]> <S3: lm>
## 5 Belgium <tibble [34 x 3]> <S3: lm>
## 6 Bolivia (Plurinational State of) <tibble [34 x 3]> <S3: lm>
## 7 Brazil <tibble [34 x 3]> <S3: lm>
## 8 Canada <tibble [34 x 3]> <S3: lm>
## 9 Chile <tibble [34 x 3]> <S3: lm>
## 10 Colombia <tibble [34 x 3]> <S3: lm>
## # ... with 190 more rows, and 1 more variables: tidied <list>
# Add one more step that unnests the tidied column
country_coefficients <- by_year_country %>%
group_by(country) %>%
nest() %>%
mutate(model = map(data, ~ lm(percent_yes ~ year, data = .)),
tidied = map(model, tidy)) %>%
unnest(tidied)
# Print the resulting country_coefficients variable
country_coefficients
## # A tibble: 399 x 6
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan (Intercept) -11.063084650 1.4705189228 -7.523252
## 2 Afghanistan year 0.006009299 0.0007426499 8.091698
## 3 Argentina (Intercept) -9.464512565 2.1008982371 -4.504984
## 4 Argentina year 0.005148829 0.0010610076 4.852773
## 5 Australia (Intercept) -4.545492536 2.1479916283 -2.116159
## 6 Australia year 0.002567161 0.0010847910 2.366503
## 7 Belarus (Intercept) -7.000692717 1.5024232546 -4.659601
## 8 Belarus year 0.003907557 0.0007587624 5.149908
## 9 Belgium (Intercept) -5.845534016 1.5153390521 -3.857575
## 10 Belgium year 0.003203234 0.0007652852 4.185673
## # ... with 389 more rows, and 1 more variables: p.value <dbl>
p.adjust
that can do this# Print the country_coefficients dataset
country_coefficients
## # A tibble: 399 x 6
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan (Intercept) -11.063084650 1.4705189228 -7.523252
## 2 Afghanistan year 0.006009299 0.0007426499 8.091698
## 3 Argentina (Intercept) -9.464512565 2.1008982371 -4.504984
## 4 Argentina year 0.005148829 0.0010610076 4.852773
## 5 Australia (Intercept) -4.545492536 2.1479916283 -2.116159
## 6 Australia year 0.002567161 0.0010847910 2.366503
## 7 Belarus (Intercept) -7.000692717 1.5024232546 -4.659601
## 8 Belarus year 0.003907557 0.0007587624 5.149908
## 9 Belgium (Intercept) -5.845534016 1.5153390521 -3.857575
## 10 Belgium year 0.003203234 0.0007652852 4.185673
## # ... with 389 more rows, and 1 more variables: p.value <dbl>
# Filter for only the slope terms
country_coefficients %>%
filter(term == 'year')
## # A tibble: 199 x 6
## country term estimate std.error
## <chr> <chr> <dbl> <dbl>
## 1 Afghanistan year 0.006009299 0.0007426499
## 2 Argentina year 0.005148829 0.0010610076
## 3 Australia year 0.002567161 0.0010847910
## 4 Belarus year 0.003907557 0.0007587624
## 5 Belgium year 0.003203234 0.0007652852
## 6 Bolivia (Plurinational State of) year 0.005802864 0.0009657515
## 7 Brazil year 0.006107151 0.0008167736
## 8 Canada year 0.001515867 0.0009552118
## 9 Chile year 0.006775560 0.0008220463
## 10 Colombia year 0.006157755 0.0009645084
## # ... with 189 more rows, and 2 more variables: statistic <dbl>,
## # p.value <dbl>
# Filter for only the slope terms
slope_terms <- country_coefficients %>%
filter(term == "year")
# Add p.adjusted column, then filter
slope_terms %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
data.frame() %>% head()
## country term estimate std.error statistic
## 1 Afghanistan year 0.006009299 0.0007426499 8.091698
## 2 Argentina year 0.005148829 0.0010610076 4.852773
## 3 Australia year 0.002567161 0.0010847910 2.366503
## 4 Belarus year 0.003907557 0.0007587624 5.149908
## 5 Belgium year 0.003203234 0.0007652852 4.185673
## 6 Bolivia (Plurinational State of) year 0.005802864 0.0009657515 6.008651
## p.value p.adjusted
## 1 3.064797e-09 5.945706e-07
## 2 3.047078e-05 4.814383e-03
## 3 2.417617e-02 1.000000e+00
## 4 1.284924e-05 2.081577e-03
## 5 2.072981e-04 3.005823e-02
## 6 1.058595e-06 1.884300e-04
slope_terms %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
filter(p.adjusted < 0.05)
## # A tibble: 61 x 7
## country term estimate std.error
## <chr> <chr> <dbl> <dbl>
## 1 Afghanistan year 0.006009299 0.0007426499
## 2 Argentina year 0.005148829 0.0010610076
## 3 Belarus year 0.003907557 0.0007587624
## 4 Belgium year 0.003203234 0.0007652852
## 5 Bolivia (Plurinational State of) year 0.005802864 0.0009657515
## 6 Brazil year 0.006107151 0.0008167736
## 7 Chile year 0.006775560 0.0008220463
## 8 Colombia year 0.006157755 0.0009645084
## 9 Costa Rica year 0.006539273 0.0008119113
## 10 Cuba year 0.004610867 0.0007205029
## # ... with 51 more rows, and 3 more variables: statistic <dbl>,
## # p.value <dbl>, p.adjusted <dbl>
# Filter by adjusted p-values
filtered_countries <- country_coefficients %>%
filter(term == "year") %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
filter(p.adjusted < .05)
# Sort for the countries increasing most quickly
filtered_countries %>%
arrange(desc(estimate))
## # A tibble: 61 x 7
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 South Africa year 0.011858333 0.0014003768 8.467959
## 2 Kazakhstan year 0.010955741 0.0019482401 5.623404
## 3 Yemen Arab Republic year 0.010854882 0.0015869058 6.840281
## 4 Kyrgyzstan year 0.009725462 0.0009884060 9.839541
## 5 Malawi year 0.009084873 0.0018111087 5.016194
## 6 Dominican Republic year 0.008055482 0.0009138578 8.814809
## 7 Portugal year 0.008020046 0.0017124482 4.683380
## 8 Honduras year 0.007717977 0.0009214260 8.376123
## 9 Peru year 0.007299813 0.0009764019 7.476238
## 10 Nicaragua year 0.007075848 0.0010716402 6.602820
## # ... with 51 more rows, and 2 more variables: p.value <dbl>,
## # p.adjusted <dbl>
# Sort for the countries decreasing most quickly
filtered_countries %>%
arrange(estimate)
## # A tibble: 61 x 7
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Republic of Korea year -0.009209912 0.0015453128 -5.959901
## 2 Israel year -0.006852921 0.0011718657 -5.847873
## 3 United States year -0.006239305 0.0009282243 -6.721764
## 4 Belgium year 0.003203234 0.0007652852 4.185673
## 5 Guinea year 0.003621508 0.0008326598 4.349325
## 6 Morocco year 0.003798641 0.0008603064 4.415451
## 7 Belarus year 0.003907557 0.0007587624 5.149908
## 8 Iran (Islamic Republic of) year 0.003911100 0.0008558952 4.569602
## 9 Congo year 0.003967778 0.0009220262 4.303324
## 10 Sudan year 0.003989394 0.0009613894 4.149613
## # ... with 51 more rows, and 2 more variables: p.value <dbl>,
## # p.adjusted <dbl>
# Load dplyr package
# Print the votes_processed dataset
votes_processed
## # A tibble: 353,547 x 6
## rcid session vote ccode year country
## <dbl> <dbl> <dbl> <int> <dbl> <chr>
## 1 46 2 1 2 1947 United States
## 2 46 2 1 20 1947 Canada
## 3 46 2 1 40 1947 Cuba
## 4 46 2 1 41 1947 Haiti
## 5 46 2 1 42 1947 Dominican Republic
## 6 46 2 1 70 1947 Mexico
## 7 46 2 1 90 1947 Guatemala
## 8 46 2 1 91 1947 Honduras
## 9 46 2 1 92 1947 El Salvador
## 10 46 2 1 93 1947 Nicaragua
## # ... with 353,537 more rows
# Print the descriptions dataset
descriptions
## # A tibble: 2,589 x 10
## rcid session date unres me nu di hr co ec
## <dbl> <dbl> <dttm> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 46 2 1947-09-04 R/2/299 0 0 0 0 0 0
## 2 47 2 1947-10-05 R/2/355 0 0 0 1 0 0
## 3 48 2 1947-10-06 R/2/461 0 0 0 0 0 0
## 4 49 2 1947-10-06 R/2/463 0 0 0 0 0 0
## 5 50 2 1947-10-06 R/2/465 0 0 0 0 0 0
## 6 51 2 1947-10-02 R/2/561 0 0 0 0 1 0
## 7 52 2 1947-11-06 R/2/650 0 0 0 0 1 0
## 8 53 2 1947-11-06 R/2/651 0 0 0 0 1 0
## 9 54 2 1947-11-06 R/2/651 0 0 0 0 1 0
## 10 55 2 1947-11-06 R/2/667 0 0 0 0 1 0
## # ... with 2,579 more rows
# Join them together based on the "rcid" and "session" columns
votes_joined <- inner_join(votes_processed, descriptions, by = c("rcid", "session"))
# Filter for votes related to colonialism
votes_joined %>%
filter(co == 1)
## # A tibble: 60,962 x 14
## rcid session vote ccode year country date unres
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <dttm> <chr>
## 1 51 2 3 2 1947 United States 1947-10-02 R/2/561
## 2 51 2 3 20 1947 Canada 1947-10-02 R/2/561
## 3 51 2 2 40 1947 Cuba 1947-10-02 R/2/561
## 4 51 2 1 41 1947 Haiti 1947-10-02 R/2/561
## 5 51 2 3 42 1947 Dominican Republic 1947-10-02 R/2/561
## 6 51 2 2 70 1947 Mexico 1947-10-02 R/2/561
## 7 51 2 2 90 1947 Guatemala 1947-10-02 R/2/561
## 8 51 2 2 92 1947 El Salvador 1947-10-02 R/2/561
## 9 51 2 3 93 1947 Nicaragua 1947-10-02 R/2/561
## 10 51 2 2 95 1947 Panama 1947-10-02 R/2/561
## # ... with 60,952 more rows, and 6 more variables: me <dbl>, nu <dbl>,
## # di <dbl>, hr <dbl>, co <dbl>, ec <dbl>
# Load the ggplot2 package
library(ggplot2)
# Filter, then summarize by year: US_co_by_year
US_co_by_year <- votes_joined %>%
filter(
co == 1,
country == "United States"
) %>%
group_by(year) %>%
summarize(percent_yes = mean(vote == 1))
# Graph the % of "yes" votes over time
ggplot(US_co_by_year, aes(x = year, y = percent_yes)) +
geom_line()
# Load the tidyr package
library(tidyr)
# Gather the six me/nu/di/hr/co/ec columns
votes_joined %>%
gather(topic, has_topic, one_of("me","nu","di","hr","co","ec"))
## # A tibble: 2,121,282 x 10
## rcid session vote ccode year country date unres
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <dttm> <chr>
## 1 46 2 1 2 1947 United States 1947-09-04 R/2/299
## 2 46 2 1 20 1947 Canada 1947-09-04 R/2/299
## 3 46 2 1 40 1947 Cuba 1947-09-04 R/2/299
## 4 46 2 1 41 1947 Haiti 1947-09-04 R/2/299
## 5 46 2 1 42 1947 Dominican Republic 1947-09-04 R/2/299
## 6 46 2 1 70 1947 Mexico 1947-09-04 R/2/299
## 7 46 2 1 90 1947 Guatemala 1947-09-04 R/2/299
## 8 46 2 1 91 1947 Honduras 1947-09-04 R/2/299
## 9 46 2 1 92 1947 El Salvador 1947-09-04 R/2/299
## 10 46 2 1 93 1947 Nicaragua 1947-09-04 R/2/299
## # ... with 2,121,272 more rows, and 2 more variables: topic <chr>,
## # has_topic <dbl>
# Perform gather again, then filter
votes_gathered <- votes_joined %>%
gather(topic, has_topic, one_of("me","nu","di","hr","co","ec")) %>%
filter(has_topic == 1)
recode
is awesomeexample <- c("apple", "banana", "apple", "orange")
recode(example,
apple = "plum",
banana = "grape")
## [1] "plum" "grape" "plum" "orange"
# Replace the two-letter codes in topic: votes_tidied
votes_tidied <- votes_gathered %>%
mutate(topic = recode(topic,
me = "Palestinian conflict",
nu = "Nuclear weapons and nuclear material",
di = "Arms control and disarmament",
hr = "Human rights",
co = "Colonialism",
ec = "Economic development"))
# Print votes_tidied
votes_tidied
## # A tibble: 350,032 x 10
## rcid session vote ccode year country date unres
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <dttm> <chr>
## 1 77 2 1 2 1947 United States 1947-11-06 R/2/1424
## 2 77 2 1 20 1947 Canada 1947-11-06 R/2/1424
## 3 77 2 3 40 1947 Cuba 1947-11-06 R/2/1424
## 4 77 2 1 41 1947 Haiti 1947-11-06 R/2/1424
## 5 77 2 1 42 1947 Dominican Republic 1947-11-06 R/2/1424
## 6 77 2 2 70 1947 Mexico 1947-11-06 R/2/1424
## 7 77 2 1 90 1947 Guatemala 1947-11-06 R/2/1424
## 8 77 2 2 91 1947 Honduras 1947-11-06 R/2/1424
## 9 77 2 2 92 1947 El Salvador 1947-11-06 R/2/1424
## 10 77 2 1 93 1947 Nicaragua 1947-11-06 R/2/1424
## # ... with 350,022 more rows, and 2 more variables: topic <chr>,
## # has_topic <dbl>
# Summarize the percentage "yes" per country-year-topic
by_country_year_topic <- votes_tidied %>%
group_by(country, year, topic) %>%
summarize(
total = n(),
percent_yes = mean(vote == 1)
) %>%
ungroup()
# Print by_country_year_topic
by_country_year_topic
## # A tibble: 26,968 x 5
## country year topic total
## <chr> <dbl> <chr> <int>
## 1 Afghanistan 1947 Colonialism 8
## 2 Afghanistan 1947 Economic development 1
## 3 Afghanistan 1947 Human rights 1
## 4 Afghanistan 1947 Palestinian conflict 6
## 5 Afghanistan 1949 Arms control and disarmament 3
## 6 Afghanistan 1949 Colonialism 22
## 7 Afghanistan 1949 Economic development 1
## 8 Afghanistan 1949 Human rights 3
## 9 Afghanistan 1949 Nuclear weapons and nuclear material 3
## 10 Afghanistan 1949 Palestinian conflict 11
## # ... with 26,958 more rows, and 1 more variables: percent_yes <dbl>
# Load the ggplot2 package
library(ggplot2)
# Filter by_country_year_topic for just the US
US_by_country_year_topic <- by_country_year_topic %>%
filter(country == "United States")
# Plot % yes over time for the US, faceting by topic
ggplot(US_by_country_year_topic, aes(x = year, y = percent_yes)) +
geom_line() +
facet_wrap(~ topic)
# Load purrr, tidyr, and broom
library(purrr)
library(tidyr)
library(broom)
# Print by_country_year_topic
by_country_year_topic
## # A tibble: 26,968 x 5
## country year topic total
## <chr> <dbl> <chr> <int>
## 1 Afghanistan 1947 Colonialism 8
## 2 Afghanistan 1947 Economic development 1
## 3 Afghanistan 1947 Human rights 1
## 4 Afghanistan 1947 Palestinian conflict 6
## 5 Afghanistan 1949 Arms control and disarmament 3
## 6 Afghanistan 1949 Colonialism 22
## 7 Afghanistan 1949 Economic development 1
## 8 Afghanistan 1949 Human rights 3
## 9 Afghanistan 1949 Nuclear weapons and nuclear material 3
## 10 Afghanistan 1949 Palestinian conflict 11
## # ... with 26,958 more rows, and 1 more variables: percent_yes <dbl>
# Fit model on the by_country_year_topic dataset
country_topic_coefficients <- by_country_year_topic %>%
nest(-country, -topic) %>%
mutate(
model = map(data, ~ lm(percent_yes ~ year, data = .)),
tidied = map(model, tidy)
) %>%
unnest(tidied)
# Print country_topic_coefficients
country_topic_coefficients
## # A tibble: 2,383 x 7
## country topic term estimate
## <chr> <chr> <chr> <dbl>
## 1 Afghanistan Colonialism (Intercept) -9.196506325
## 2 Afghanistan Colonialism year 0.005106200
## 3 Afghanistan Economic development (Intercept) -11.476390441
## 4 Afghanistan Economic development year 0.006239157
## 5 Afghanistan Human rights (Intercept) -7.265379964
## 6 Afghanistan Human rights year 0.004075877
## 7 Afghanistan Palestinian conflict (Intercept) -13.313363338
## 8 Afghanistan Palestinian conflict year 0.007167675
## 9 Afghanistan Arms control and disarmament (Intercept) -13.759624843
## 10 Afghanistan Arms control and disarmament year 0.007369733
## # ... with 2,373 more rows, and 3 more variables: std.error <dbl>,
## # statistic <dbl>, p.value <dbl>
# Create country_topic_filtered
country_topic_filtered <- country_topic_coefficients %>%
filter(term == 'year') %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
filter(p.adjusted < 0.05)
country_topic_filtered
## # A tibble: 56 x 8
## country topic term estimate
## <chr> <chr> <chr> <dbl>
## 1 Afghanistan Colonialism year 0.005106200
## 2 Austria Colonialism year 0.008353305
## 3 Barbados Colonialism year 0.006690915
## 4 Barbados Palestinian conflict year 0.016588442
## 5 Belgium Colonialism year 0.007313000
## 6 Bolivia (Plurinational State of) Colonialism year 0.008524197
## 7 Brazil Colonialism year 0.008271821
## 8 Chile Colonialism year 0.009323302
## 9 Colombia Colonialism year 0.007145088
## 10 Costa Rica Colonialism year 0.007380721
## # ... with 46 more rows, and 4 more variables: std.error <dbl>,
## # statistic <dbl>, p.value <dbl>, p.adjusted <dbl>
country_topic_filtered %>% arrange(estimate)
## # A tibble: 56 x 8
## country topic term estimate
## <chr> <chr> <chr> <dbl>
## 1 Vanuatu Palestinian conflict year -0.032694090
## 2 Vanuatu Colonialism year -0.017919993
## 3 Malta Nuclear weapons and nuclear material year -0.011190617
## 4 Cyprus Human rights year -0.010769662
## 5 United States Palestinian conflict year -0.010686324
## 6 Cyprus Nuclear weapons and nuclear material year -0.010658643
## 7 Israel Colonialism year -0.009529337
## 8 Romania Human rights year -0.009445343
## 9 Malta Arms control and disarmament year -0.009299679
## 10 Cyprus Arms control and disarmament year -0.008784289
## # ... with 46 more rows, and 4 more variables: std.error <dbl>,
## # statistic <dbl>, p.value <dbl>, p.adjusted <dbl>
# Create vanuatu_by_country_year_topic
vanuatu_by_country_year_topic <- by_country_year_topic %>%
filter(country == "Vanuatu")
# Plot of percentage "yes" over time, faceted by topic
ggplot(vanuatu_by_country_year_topic, aes(x = year, y = percent_yes)) +
geom_line() +
facet_wrap(~ topic)
filtered_by_country_year_topic <- by_country_year_topic %>%
filter(country %in% c("United States", "United Kingdom", "France", "Germany"))
# Plot of percentage "yes" over time, faceted by topic
ggplot(filtered_by_country_year_topic, aes(x = year, y = percent_yes, color = country)) +
geom_point() +
geom_smooth() +
facet_wrap(~ topic)