The following project was completed as a part of Datacamp’s R courses. Below is the original project description:
Once you’ve started learning tools for data manipulation and visualization like dplyr and ggplot2, this course gives you a chance to use them in action on a real dataset. You’ll explore the historical voting of the United Nations General Assembly, including analyzing differences in voting between countries, across time, and among international issues. In the process you’ll gain more practice with the dplyr and ggplot2 packages, learn about the broom package for tidying model output, and experience the kind of start-to-finish exploratory analysis common in data science.
# Filter for votes that are "yes", "abstain", or "no"
# Add another %>% step to add a year column
votes %>%
filter(vote <= 3) %>%
mutate(year=session+1945)
## # A tibble: 353,547 x 6
## X rcid session vote ccode year
## <int> <int> <int> <int> <int> <dbl>
## 1 1 46 2 1 2 1947
## 2 2 46 2 1 20 1947
## 3 4 46 2 1 40 1947
## 4 5 46 2 1 41 1947
## 5 6 46 2 1 42 1947
## 6 16 46 2 1 70 1947
## 7 18 46 2 1 90 1947
## 8 19 46 2 1 91 1947
## 9 20 46 2 1 92 1947
## 10 21 46 2 1 93 1947
## # … with 353,537 more rows
# Load the countrycode package
#library(countrycode)
# Convert country code 100
countrycode(100,"cown","country.name")
## [1] "Colombia"
# Add a country column within the mutate: votes_processed
votes_processed <- votes %>%
filter(vote <= 3) %>%
mutate(year = session + 1945,
country=countrycode(ccode,"cown","country.name"))
## Warning in countrycode(ccode, "cown", "country.name"): Some values were not matched unambiguously: 260
# Find total and fraction of "yes" votes
votes_processed %>% summarize(
total=n(),
percent_yes=mean(vote==1))
## # A tibble: 1 x 2
## total percent_yes
## <int> <dbl>
## 1 353547 0.800
# Summarize by country: by_country
by_country <- votes_processed %>%
group_by(country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1))
# Filter out countries with fewer than 100 votes
by_country %>%
arrange(percent_yes) %>%
filter(total>=100)
## # A tibble: 197 x 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 United States 2568 0.269
## 2 Palau 369 0.339
## 3 Israel 2380 0.341
## 4 <NA> 1075 0.397
## 5 United Kingdom 2558 0.417
## 6 France 2527 0.427
## 7 Micronesia (Federated States of) 724 0.442
## 8 Marshall Islands 757 0.491
## 9 Belgium 2568 0.492
## 10 Canada 2576 0.508
## # … with 187 more rows
# Define by_year
by_year <- votes_processed %>%
group_by(year) %>%
summarize(total = n(),
percent_yes = mean(vote == 1))
# Load the ggplot2 package
#library(ggplot2)
# Create line plot
ggplot(by_year, aes(x=year, y=percent_yes)) +
geom_line()
# Change to scatter plot and add smoothing curve
ggplot(by_year, aes(year, percent_yes)) +
geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Group by year and country: by_year_country
by_year_country <- votes_processed %>%
group_by(year, country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1))
# Start with by_year_country dataset
by_year_country <- votes_processed %>%
group_by(year, country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1))
# Print by_year_country
by_year_country
## # A tibble: 4,744 x 4
## # Groups: year [34]
## year country total percent_yes
## <dbl> <chr> <int> <dbl>
## 1 1947 Afghanistan 34 0.382
## 2 1947 Argentina 38 0.579
## 3 1947 Australia 38 0.553
## 4 1947 Belarus 38 0.5
## 5 1947 Belgium 38 0.605
## 6 1947 Bolivia 37 0.595
## 7 1947 Brazil 38 0.658
## 8 1947 Canada 38 0.605
## 9 1947 Chile 38 0.658
## 10 1947 Colombia 35 0.543
## # … with 4,734 more rows
# Create a filtered version: UK_by_year
UK_by_year <- filter(by_year_country, country=="United Kingdom")
# Line plot of percent_yes over time for UK only
ggplot(UK_by_year, aes(x=year,y=percent_yes)) +
geom_line()
# Vector of four countries to examine
countries <- c("United States", "United Kingdom",
"France", "India")
# Filter by_year_country: filtered_4_countries
filtered_4_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes in four countries
ggplot(filtered_4_countries, aes(x=year, y=percent_yes, color=country)) +
geom_line()
# Vector of six countries to examine
countries <- c("United States", "United Kingdom",
"France", "Japan", "Brazil", "India")
# Filtered by_year_country: filtered_6_countries
filtered_6_countries <- by_year_country %>% filter (country %in% countries)
# Line plot of % yes over time faceted by country
ggplot(filtered_6_countries, aes(x=year, y=percent_yes)) +
geom_line() +
facet_wrap(~country)
# Line plot of % yes over time faceted by country and a free y-scale
ggplot(filtered_6_countries, aes(year, percent_yes)) +
geom_line() +
facet_wrap(~ country, scales="free_y")
# Add three more countries to this list
countries <- c("United States", "United Kingdom",
"France", "Japan", "Brazil", "India",
"Russian Federation", "Pakistan", "Kazakhstan")
# Filtered by_year_country: filtered_countries
filtered_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes over time faceted by country
ggplot(filtered_countries, aes(year, percent_yes)) +
geom_line() +
facet_wrap(~ country, scales = "free_y")
# Percentage of yes votes from the US by year: US_by_year
US_by_year <- by_year_country %>%
filter(country == "United States")
# Print the US_by_year data
US_by_year
## # A tibble: 34 x 4
## # Groups: year [34]
## year country total percent_yes
## <dbl> <chr> <int> <dbl>
## 1 1947 United States 38 0.711
## 2 1949 United States 64 0.281
## 3 1951 United States 25 0.4
## 4 1953 United States 26 0.5
## 5 1955 United States 37 0.622
## 6 1957 United States 34 0.647
## 7 1959 United States 54 0.426
## 8 1961 United States 75 0.507
## 9 1963 United States 32 0.5
## 10 1965 United States 41 0.366
## # … with 24 more rows
# Perform a linear regression of percent_yes by year: US_fit
US_fit <- lm(formula=percent_yes ~ year, data=US_by_year)
# Perform summary() on the US_fit object
summary(US_fit)
##
## Call:
## lm(formula = percent_yes ~ year, data = US_by_year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.222491 -0.080635 -0.008661 0.081948 0.194307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.6641455 1.8379743 6.890 8.48e-08 ***
## year -0.0062393 0.0009282 -6.722 1.37e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1062 on 32 degrees of freedom
## Multiple R-squared: 0.5854, Adjusted R-squared: 0.5724
## F-statistic: 45.18 on 1 and 32 DF, p-value: 1.367e-07
# Load the broom package
#library(broom)
# Call the tidy() function on the US_fit object
tidy(US_fit)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 12.7 1.84 6.89 0.0000000848
## 2 year -0.00624 0.000928 -6.72 0.000000137
# Linear regression of percent_yes by year for US
US_by_year <- by_year_country %>%
filter(country == "United States")
US_fit <- lm(percent_yes ~ year, US_by_year)
# Fit model for the United Kingdom
UK_by_year <- by_year_country %>%
filter(country=="United Kingdom")
UK_fit <- lm(percent_yes ~ year, UK_by_year)
# Create US_tidied and UK_tidied
US_tidied <- tidy(US_fit)
UK_tidied <- tidy(UK_fit)
# Combine the two tidied models
bind_rows(US_tidied, UK_tidied)
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 12.7 1.84 6.89 0.0000000848
## 2 year -0.00624 0.000928 -6.72 0.000000137
## 3 (Intercept) -3.27 1.96 -1.67 0.105
## 4 year 0.00187 0.000989 1.89 0.0677
# Load the tidyr package
#library(tidyr)
# Nest all columns besides country
by_year_country %>% nest(-c(country))
## Warning: All elements of `...` must be named.
## Did you want `data = c(year, total, percent_yes)`?
## # A tibble: 200 x 2
## country data
## <chr> <list<df[,3]>>
## 1 Afghanistan [34 × 3]
## 2 Argentina [34 × 3]
## 3 Australia [34 × 3]
## 4 Belarus [34 × 3]
## 5 Belgium [34 × 3]
## 6 Bolivia [34 × 3]
## 7 Brazil [34 × 3]
## 8 Canada [34 × 3]
## 9 Chile [34 × 3]
## 10 Colombia [34 × 3]
## # … with 190 more rows
# All countries are nested besides country
nested <- by_year_country %>%
nest(-country)
## Warning: All elements of `...` must be named.
## Did you want `data = c(year, total, percent_yes)`?
# Print the nested data for Brazil
nested$data[nested$country=="Brazil"]
## <list_of<
## grouped_df<
## year : double
## total : integer
## percent_yes: double
## >
## >[2]>
## [[1]]
## # A tibble: 34 x 3
## # Groups: year [34]
## year total percent_yes
## * <dbl> <int> <dbl>
## 1 1947 38 0.658
## 2 1949 64 0.469
## 3 1951 25 0.64
## 4 1953 26 0.731
## 5 1955 37 0.730
## 6 1957 34 0.735
## 7 1959 54 0.537
## 8 1961 76 0.553
## 9 1963 32 0.781
## 10 1965 41 0.610
## # … with 24 more rows
##
## [[2]]
## NULL
# Unnest the data column to return it to its original form
nested %>% unnest(data)
## # A tibble: 4,744 x 4
## country year total percent_yes
## <chr> <dbl> <int> <dbl>
## 1 Afghanistan 1947 34 0.382
## 2 Afghanistan 1949 51 0.608
## 3 Afghanistan 1951 25 0.76
## 4 Afghanistan 1953 26 0.769
## 5 Afghanistan 1955 37 0.730
## 6 Afghanistan 1957 34 0.529
## 7 Afghanistan 1959 54 0.611
## 8 Afghanistan 1961 76 0.605
## 9 Afghanistan 1963 32 0.781
## 10 Afghanistan 1965 40 0.85
## # … with 4,734 more rows
# IMPORTANT: Ungroup the data set before nesting it.
by_year_country <- by_year_country %>% ungroup()
# Perform a linear regression on each item in the data column
by_year_country %>%
nest(-country) %>% mutate(model=map(data, ~ lm(percent_yes ~ year, .)))
## Warning: All elements of `...` must be named.
## Did you want `data = c(year, total, percent_yes)`?
## # A tibble: 200 x 3
## country data model
## <chr> <list<df[,3]>> <list>
## 1 Afghanistan [34 × 3] <lm>
## 2 Argentina [34 × 3] <lm>
## 3 Australia [34 × 3] <lm>
## 4 Belarus [34 × 3] <lm>
## 5 Belgium [34 × 3] <lm>
## 6 Bolivia [34 × 3] <lm>
## 7 Brazil [34 × 3] <lm>
## 8 Canada [34 × 3] <lm>
## 9 Chile [34 × 3] <lm>
## 10 Colombia [34 × 3] <lm>
## # … with 190 more rows
# Load the broom package
#library(broom)
# Add another mutate that applies tidy() to each model
by_year_country %>%
nest(-country) %>%
mutate(model = map(data, ~ lm(percent_yes ~ year, data = .))) %>%
mutate(tidied = map(model, ~tidy(.)))
## Warning: All elements of `...` must be named.
## Did you want `data = c(year, total, percent_yes)`?
## # A tibble: 200 x 4
## country data model tidied
## <chr> <list<df[,3]>> <list> <list>
## 1 Afghanistan [34 × 3] <lm> <tibble [2 × 5]>
## 2 Argentina [34 × 3] <lm> <tibble [2 × 5]>
## 3 Australia [34 × 3] <lm> <tibble [2 × 5]>
## 4 Belarus [34 × 3] <lm> <tibble [2 × 5]>
## 5 Belgium [34 × 3] <lm> <tibble [2 × 5]>
## 6 Bolivia [34 × 3] <lm> <tibble [2 × 5]>
## 7 Brazil [34 × 3] <lm> <tibble [2 × 5]>
## 8 Canada [34 × 3] <lm> <tibble [2 × 5]>
## 9 Chile [34 × 3] <lm> <tibble [2 × 5]>
## 10 Colombia [34 × 3] <lm> <tibble [2 × 5]>
## # … with 190 more rows
# Add one more step that unnests the tidied column
country_coefficients <- by_year_country %>%
nest(-country) %>%
mutate(model = map(data, ~ lm(percent_yes ~ year, data = .)),
tidied = map(model, ~tidy(.))) %>%
unnest(tidied)
## Warning: All elements of `...` must be named.
## Did you want `data = c(year, total, percent_yes)`?
# Print the resulting country_coefficients variable
country_coefficients
## # A tibble: 399 x 8
## country data model term estimate std.error statistic p.value
## <chr> <list<df[,> <list> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanis… [34 × 3] <lm> (Interc… -1.11e+1 1.47 -7.52 1.44e-8
## 2 Afghanis… [34 × 3] <lm> year 6.01e-3 0.000743 8.09 3.06e-9
## 3 Argentina [34 × 3] <lm> (Interc… -9.46e+0 2.10 -4.50 8.32e-5
## 4 Argentina [34 × 3] <lm> year 5.15e-3 0.00106 4.85 3.05e-5
## 5 Australia [34 × 3] <lm> (Interc… -4.55e+0 2.15 -2.12 4.22e-2
## 6 Australia [34 × 3] <lm> year 2.57e-3 0.00108 2.37 2.42e-2
## 7 Belarus [34 × 3] <lm> (Interc… -7.00e+0 1.50 -4.66 5.33e-5
## 8 Belarus [34 × 3] <lm> year 3.91e-3 0.000759 5.15 1.28e-5
## 9 Belgium [34 × 3] <lm> (Interc… -5.85e+0 1.52 -3.86 5.22e-4
## 10 Belgium [34 × 3] <lm> year 3.20e-3 0.000765 4.19 2.07e-4
## # … with 389 more rows
### Working with many tidy models ###
# Print the country_coefficients dataset
country_coefficients
## # A tibble: 399 x 8
## country data model term estimate std.error statistic p.value
## <chr> <list<df[,> <list> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanis… [34 × 3] <lm> (Interc… -1.11e+1 1.47 -7.52 1.44e-8
## 2 Afghanis… [34 × 3] <lm> year 6.01e-3 0.000743 8.09 3.06e-9
## 3 Argentina [34 × 3] <lm> (Interc… -9.46e+0 2.10 -4.50 8.32e-5
## 4 Argentina [34 × 3] <lm> year 5.15e-3 0.00106 4.85 3.05e-5
## 5 Australia [34 × 3] <lm> (Interc… -4.55e+0 2.15 -2.12 4.22e-2
## 6 Australia [34 × 3] <lm> year 2.57e-3 0.00108 2.37 2.42e-2
## 7 Belarus [34 × 3] <lm> (Interc… -7.00e+0 1.50 -4.66 5.33e-5
## 8 Belarus [34 × 3] <lm> year 3.91e-3 0.000759 5.15 1.28e-5
## 9 Belgium [34 × 3] <lm> (Interc… -5.85e+0 1.52 -3.86 5.22e-4
## 10 Belgium [34 × 3] <lm> year 3.20e-3 0.000765 4.19 2.07e-4
## # … with 389 more rows
# Filter for only the slope terms
filter(country_coefficients, term=="year")
## # A tibble: 199 x 8
## country data model term estimate std.error statistic p.value
## <chr> <list<df[,3]> <lis> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanist… [34 × 3] <lm> year 0.00601 0.000743 8.09 3.06e-9
## 2 Argentina [34 × 3] <lm> year 0.00515 0.00106 4.85 3.05e-5
## 3 Australia [34 × 3] <lm> year 0.00257 0.00108 2.37 2.42e-2
## 4 Belarus [34 × 3] <lm> year 0.00391 0.000759 5.15 1.28e-5
## 5 Belgium [34 × 3] <lm> year 0.00320 0.000765 4.19 2.07e-4
## 6 Bolivia [34 × 3] <lm> year 0.00580 0.000966 6.01 1.06e-6
## 7 Brazil [34 × 3] <lm> year 0.00611 0.000817 7.48 1.64e-8
## 8 Canada [34 × 3] <lm> year 0.00152 0.000955 1.59 1.22e-1
## 9 Chile [34 × 3] <lm> year 0.00678 0.000822 8.24 2.05e-9
## 10 Colombia [34 × 3] <lm> year 0.00616 0.000965 6.38 3.58e-7
## # … with 189 more rows
# Filter for only the slope terms
slope_terms <- country_coefficients %>%
filter(term == "year")
# Add p.adjusted column, then filter
slope_terms %>% mutate (p.adjusted=p.adjust(p.value)) %>%
filter(p.adjusted <0.5)
## # A tibble: 90 x 9
## country data model term estimate std.error statistic p.value p.adjusted
## <chr> <list<d> <lis> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghani… [34 × 3] <lm> year 0.00601 0.000743 8.09 3.06e-9 5.95e-7
## 2 Argenti… [34 × 3] <lm> year 0.00515 0.00106 4.85 3.05e-5 4.81e-3
## 3 Belarus [34 × 3] <lm> year 0.00391 0.000759 5.15 1.28e-5 2.08e-3
## 4 Belgium [34 × 3] <lm> year 0.00320 0.000765 4.19 2.07e-4 3.01e-2
## 5 Bolivia [34 × 3] <lm> year 0.00580 0.000966 6.01 1.06e-6 1.88e-4
## 6 Brazil [34 × 3] <lm> year 0.00611 0.000817 7.48 1.64e-8 3.12e-6
## 7 Chile [34 × 3] <lm> year 0.00678 0.000822 8.24 2.05e-9 3.99e-7
## 8 Colombia [34 × 3] <lm> year 0.00616 0.000965 6.38 3.58e-7 6.56e-5
## 9 Costa R… [34 × 3] <lm> year 0.00654 0.000812 8.05 3.39e-9 6.54e-7
## 10 Cuba [34 × 3] <lm> year 0.00461 0.000721 6.40 3.43e-7 6.31e-5
## # … with 80 more rows
# Filter by adjusted p-values
filtered_countries <- country_coefficients %>%
filter(term == "year") %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
filter(p.adjusted < .05)
# Sort for the countries increasing most quickly
arrange(filtered_countries, estimate)
## # A tibble: 61 x 9
## country data model term estimate std.error statistic p.value p.adjusted
## <chr> <list<d> <lis> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South K… [12 × 3] <lm> year -0.00921 0.00155 -5.96 1.39e-4 0.0209
## 2 Israel [33 × 3] <lm> year -0.00685 0.00117 -5.85 1.89e-6 0.000331
## 3 United … [34 × 3] <lm> year -0.00624 0.000928 -6.72 1.37e-7 0.0000254
## 4 Belgium [34 × 3] <lm> year 0.00320 0.000765 4.19 2.07e-4 0.0301
## 5 Guinea [28 × 3] <lm> year 0.00362 0.000833 4.35 1.87e-4 0.0275
## 6 Morocco [29 × 3] <lm> year 0.00380 0.000860 4.42 1.46e-4 0.0218
## 7 Belarus [34 × 3] <lm> year 0.00391 0.000759 5.15 1.28e-5 0.00208
## 8 Iran [34 × 3] <lm> year 0.00391 0.000856 4.57 6.91e-5 0.0107
## 9 Congo -… [27 × 3] <lm> year 0.00397 0.000922 4.30 2.27e-4 0.0326
## 10 Sudan [29 × 3] <lm> year 0.00399 0.000961 4.15 2.98e-4 0.0420
## # … with 51 more rows
# Sort for the countries decreasing most quickly
arrange(filtered_countries, desc(estimate))
## # A tibble: 61 x 9
## country data model term estimate std.error statistic p.value p.adjusted
## <chr> <list<d> <lis> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South … [25 × 3] <lm> year 0.0119 0.00140 8.47 1.60e- 8 3.05e-6
## 2 Kazakh… [11 × 3] <lm> year 0.0110 0.00195 5.62 3.24e- 4 4.51e-2
## 3 Yemen … [22 × 3] <lm> year 0.0109 0.00159 6.84 1.20e- 6 2.11e-4
## 4 Kyrgyz… [9 × 3] <lm> year 0.00973 0.000988 9.84 2.38e- 5 3.78e-3
## 5 Malawi [25 × 3] <lm> year 0.00908 0.00181 5.02 4.48e- 5 7.03e-3
## 6 Domini… [33 × 3] <lm> year 0.00806 0.000914 8.81 5.96e-10 1.17e-7
## 7 Portug… [29 × 3] <lm> year 0.00802 0.00171 4.68 7.13e- 5 1.10e-2
## 8 Hondur… [34 × 3] <lm> year 0.00772 0.000921 8.38 1.43e- 9 2.81e-7
## 9 Peru [34 × 3] <lm> year 0.00730 0.000976 7.48 1.65e- 8 3.12e-6
## 10 Nicara… [34 × 3] <lm> year 0.00708 0.00107 6.60 1.92e- 7 3.55e-5
## # … with 51 more rows
# Load dplyr package
#library(dplyr)
# Print the votes_processed dataset
votes_processed
## # A tibble: 353,547 x 7
## X rcid session vote ccode year country
## <int> <int> <int> <int> <int> <dbl> <chr>
## 1 1 46 2 1 2 1947 United States
## 2 2 46 2 1 20 1947 Canada
## 3 4 46 2 1 40 1947 Cuba
## 4 5 46 2 1 41 1947 Haiti
## 5 6 46 2 1 42 1947 Dominican Republic
## 6 16 46 2 1 70 1947 Mexico
## 7 18 46 2 1 90 1947 Guatemala
## 8 19 46 2 1 91 1947 Honduras
## 9 20 46 2 1 92 1947 El Salvador
## 10 21 46 2 1 93 1947 Nicaragua
## # … with 353,537 more rows
# Print the descriptions dataset
descriptions
## # A tibble: 2,589 x 11
## X rcid session date unres me nu di hr co ec
## <int> <int> <int> <fct> <fct> <int> <int> <int> <int> <int> <int>
## 1 1 46 2 1947-09-04 R/2/299 0 0 0 0 0 0
## 2 2 47 2 1947-10-05 R/2/355 0 0 0 1 0 0
## 3 3 48 2 1947-10-06 R/2/461 0 0 0 0 0 0
## 4 4 49 2 1947-10-06 R/2/463 0 0 0 0 0 0
## 5 5 50 2 1947-10-06 R/2/465 0 0 0 0 0 0
## 6 6 51 2 1947-10-02 R/2/561 0 0 0 0 1 0
## 7 7 52 2 1947-11-06 R/2/650 0 0 0 0 1 0
## 8 8 53 2 1947-11-06 R/2/651 0 0 0 0 1 0
## 9 9 54 2 1947-11-06 R/2/651 0 0 0 0 1 0
## 10 10 55 2 1947-11-06 R/2/667 0 0 0 0 1 0
## # … with 2,579 more rows
# Join them together based on the "rcid" and "session" columns
votes_joined <- inner_join(votes_processed, descriptions,
by=c("rcid", "session"))
# Filter for votes related to colonialism
votes_joined %>% filter(co==1)
## # A tibble: 60,962 x 16
## X.x rcid session vote ccode year country X.y date unres me nu
## <int> <int> <int> <int> <int> <dbl> <chr> <int> <fct> <fct> <int> <int>
## 1 986 51 2 3 2 1947 United… 6 1947… R/2/… 0 0
## 2 987 51 2 3 20 1947 Canada 6 1947… R/2/… 0 0
## 3 989 51 2 2 40 1947 Cuba 6 1947… R/2/… 0 0
## 4 990 51 2 1 41 1947 Haiti 6 1947… R/2/… 0 0
## 5 991 51 2 3 42 1947 Domini… 6 1947… R/2/… 0 0
## 6 1001 51 2 2 70 1947 Mexico 6 1947… R/2/… 0 0
## 7 1003 51 2 2 90 1947 Guatem… 6 1947… R/2/… 0 0
## 8 1005 51 2 2 92 1947 El Sal… 6 1947… R/2/… 0 0
## 9 1006 51 2 3 93 1947 Nicara… 6 1947… R/2/… 0 0
## 10 1008 51 2 2 95 1947 Panama 6 1947… R/2/… 0 0
## # … with 60,952 more rows, and 4 more variables: di <int>, hr <int>, co <int>,
## # ec <int>
# Load the ggplot2 package
#library(ggplot2)
# Filter, then summarize by year: US_co_by_year
US_co_by_year <- votes_joined %>% filter(co==1, country=="United States") %>% group_by(year) %>% summarize(percent_yes=mean(vote==1, na.rm=TRUE))
# Graph the % of "yes" votes over time
ggplot(US_co_by_year, aes(x=year, y=percent_yes)) +
geom_line()
# Load the tidyr package
#library(tidyr)
# Gather the six me/nu/di/hr/co/ec columns
votes_joined %>%
gather(topic, has_topic, me:ec)
## # A tibble: 2,121,282 x 12
## X.x rcid session vote ccode year country X.y date unres topic
## <int> <int> <int> <int> <int> <dbl> <chr> <int> <fct> <fct> <chr>
## 1 1 46 2 1 2 1947 United… 1 1947… R/2/… me
## 2 2 46 2 1 20 1947 Canada 1 1947… R/2/… me
## 3 4 46 2 1 40 1947 Cuba 1 1947… R/2/… me
## 4 5 46 2 1 41 1947 Haiti 1 1947… R/2/… me
## 5 6 46 2 1 42 1947 Domini… 1 1947… R/2/… me
## 6 16 46 2 1 70 1947 Mexico 1 1947… R/2/… me
## 7 18 46 2 1 90 1947 Guatem… 1 1947… R/2/… me
## 8 19 46 2 1 91 1947 Hondur… 1 1947… R/2/… me
## 9 20 46 2 1 92 1947 El Sal… 1 1947… R/2/… me
## 10 21 46 2 1 93 1947 Nicara… 1 1947… R/2/… me
## # … with 2,121,272 more rows, and 1 more variable: has_topic <int>
# Perform gather again, then filter
votes_gathered <- votes_joined %>%
gather(topic, has_topic, me:ec) %>%
filter(has_topic == 1)
# Replace the two-letter codes in topic: votes_tidied
votes_tidied <- votes_gathered %>%
mutate(topic = recode(topic,
me = "Palestinian conflict",
nu = "Nuclear weapons and nuclear material",
di = "Arms control and disarmament",
hr = "Human rights",
co = "Colonialism",
ec = "Economic development"))
# Print votes_tidied
votes_tidied
## # A tibble: 350,032 x 12
## X.x rcid session vote ccode year country X.y date unres topic
## <int> <int> <int> <int> <int> <dbl> <chr> <int> <fct> <fct> <chr>
## 1 6108 77 2 1 2 1947 United… 32 1947… R/2/… Pale…
## 2 6109 77 2 1 20 1947 Canada 32 1947… R/2/… Pale…
## 3 6111 77 2 3 40 1947 Cuba 32 1947… R/2/… Pale…
## 4 6112 77 2 1 41 1947 Haiti 32 1947… R/2/… Pale…
## 5 6113 77 2 1 42 1947 Domini… 32 1947… R/2/… Pale…
## 6 6123 77 2 2 70 1947 Mexico 32 1947… R/2/… Pale…
## 7 6125 77 2 1 90 1947 Guatem… 32 1947… R/2/… Pale…
## 8 6126 77 2 2 91 1947 Hondur… 32 1947… R/2/… Pale…
## 9 6127 77 2 2 92 1947 El Sal… 32 1947… R/2/… Pale…
## 10 6128 77 2 1 93 1947 Nicara… 32 1947… R/2/… Pale…
## # … with 350,022 more rows, and 1 more variable: has_topic <int>
# Summarize the percentage "yes" per country-year-topic
by_country_year_topic <- votes_tidied %>% group_by(country, year, topic) %>%
summarize(total=n(), percent_yes=mean(vote==1, na.rm=TRUE)) %>% ungroup()
# Print by_country_year_topic
by_country_year_topic
## # A tibble: 26,968 x 5
## country year topic total percent_yes
## <chr> <dbl> <chr> <int> <dbl>
## 1 Afghanistan 1947 Colonialism 8 0.5
## 2 Afghanistan 1947 Economic development 1 0
## 3 Afghanistan 1947 Human rights 1 0
## 4 Afghanistan 1947 Palestinian conflict 6 0
## 5 Afghanistan 1949 Arms control and disarmament 3 0
## 6 Afghanistan 1949 Colonialism 22 0.864
## 7 Afghanistan 1949 Economic development 1 1
## 8 Afghanistan 1949 Human rights 3 0
## 9 Afghanistan 1949 Nuclear weapons and nuclear material 3 0
## 10 Afghanistan 1949 Palestinian conflict 11 0.818
## # … with 26,958 more rows
# Load the ggplot2 package
#library(ggplot2)
# Filter by_country_year_topic for just the US
US_by_country_year_topic <- by_country_year_topic %>% filter (country=="United States")
# Plot % yes over time for the US, faceting by topic
ggplot (US_by_country_year_topic, aes (x=year, y=percent_yes)) + geom_line() +
facet_wrap(~topic)
# Print by_country_year_topic
by_country_year_topic
## # A tibble: 26,968 x 5
## country year topic total percent_yes
## <chr> <dbl> <chr> <int> <dbl>
## 1 Afghanistan 1947 Colonialism 8 0.5
## 2 Afghanistan 1947 Economic development 1 0
## 3 Afghanistan 1947 Human rights 1 0
## 4 Afghanistan 1947 Palestinian conflict 6 0
## 5 Afghanistan 1949 Arms control and disarmament 3 0
## 6 Afghanistan 1949 Colonialism 22 0.864
## 7 Afghanistan 1949 Economic development 1 1
## 8 Afghanistan 1949 Human rights 3 0
## 9 Afghanistan 1949 Nuclear weapons and nuclear material 3 0
## 10 Afghanistan 1949 Palestinian conflict 11 0.818
## # … with 26,958 more rows
# Fit model on the by_country_year_topic dataset
country_topic_coefficients <- by_country_year_topic %>%
nest(-country, -topic) %>%
mutate(model=map(data, ~lm(percent_yes ~ year, data= .)),
tidied=map(model, tidy)) %>%
unnest(tidied)
# Print country_topic_coefficients
country_topic_coefficients
## # A tibble: 2,383 x 9
## country topic data model term estimate std.error statistic p.value
## <chr> <chr> <list<df> <lis> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghani… Colonial… [34 × 3] <lm> (Int… -9.20e+0 1.96 -4.70 4.76e-5
## 2 Afghani… Colonial… [34 × 3] <lm> year 5.11e-3 0.000989 5.17 1.23e-5
## 3 Afghani… Economic… [32 × 3] <lm> (Int… -1.15e+1 3.62 -3.17 3.49e-3
## 4 Afghani… Economic… [32 × 3] <lm> year 6.24e-3 0.00183 3.42 1.85e-3
## 5 Afghani… Human ri… [34 × 3] <lm> (Int… -7.27e+0 4.37 -1.66 1.06e-1
## 6 Afghani… Human ri… [34 × 3] <lm> year 4.08e-3 0.00221 1.85 7.43e-2
## 7 Afghani… Palestin… [30 × 3] <lm> (Int… -1.33e+1 3.57 -3.73 8.66e-4
## 8 Afghani… Palestin… [30 × 3] <lm> year 7.17e-3 0.00180 3.98 4.42e-4
## 9 Afghani… Arms con… [29 × 3] <lm> (Int… -1.38e+1 4.13 -3.33 2.53e-3
## 10 Afghani… Arms con… [29 × 3] <lm> year 7.37e-3 0.00208 3.54 1.49e-3
## # … with 2,373 more rows
# Create country_topic_filtered
country_topic_filtered <- country_topic_coefficients %>%
filter(term=="year")
country_topic_filtered <- country_topic_filtered %>% mutate(p.adjusted=p.adjust(p.value)) %>%
filter(p.adjusted <0.05)
# Create vanuatu_by_country_year_topic
vanuatu_by_country_year_topic <- by_country_year_topic %>%
filter (country=="Vanuatu")
# Plot of percentage "yes" over time, faceted by topic
ggplot(vanuatu_by_country_year_topic, aes(x=year, y=percent_yes)) +
geom_line() +
facet_wrap (~topic)