DataCamp - EDA Case Study

Exploratory Data Analysis

The following project was completed as a part of Datacamp’s R courses. Below is the original project description:
Once you’ve started learning tools for data manipulation and visualization like dplyr and ggplot2, this course gives you a chance to use them in action on a real dataset. You’ll explore the historical voting of the United Nations General Assembly, including analyzing differences in voting between countries, across time, and among international issues. In the process you’ll gain more practice with the dplyr and ggplot2 packages, learn about the broom package for tidying model output, and experience the kind of start-to-finish exploratory analysis common in data science.

1 Data cleaning and summarizing with dplyr

# Filter for votes that are "yes", "abstain", or "no"
# Add another %>% step to add a year column
votes %>%
  filter(vote <= 3) %>%
  mutate(year=session+1945)

## # A tibble: 353,547 x 6
##        X  rcid session  vote ccode  year
##    <int> <int>   <int> <int> <int> <dbl>
##  1     1    46       2     1     2  1947
##  2     2    46       2     1    20  1947
##  3     4    46       2     1    40  1947
##  4     5    46       2     1    41  1947
##  5     6    46       2     1    42  1947
##  6    16    46       2     1    70  1947
##  7    18    46       2     1    90  1947
##  8    19    46       2     1    91  1947
##  9    20    46       2     1    92  1947
## 10    21    46       2     1    93  1947
## # … with 353,537 more rows

# Load the countrycode package
#library(countrycode)

# Convert country code 100
countrycode(100,"cown","country.name")

## [1] "Colombia"

# Add a country column within the mutate: votes_processed
votes_processed <- votes %>%
  filter(vote <= 3) %>%
  mutate(year = session + 1945, 
  country=countrycode(ccode,"cown","country.name"))

## Warning in countrycode(ccode, "cown", "country.name"): Some values were not matched unambiguously: 260

# Find total and fraction of "yes" votes
votes_processed %>% summarize(
total=n(),
percent_yes=mean(vote==1))

## # A tibble: 1 x 2
##    total percent_yes
##    <int>       <dbl>
## 1 353547       0.800

# Summarize by country: by_country
by_country <- votes_processed %>%
  group_by(country) %>%
  summarize(total = n(),
            percent_yes = mean(vote == 1))

# Filter out countries with fewer than 100 votes
by_country %>%
  arrange(percent_yes) %>%
  filter(total>=100)

## # A tibble: 197 x 3
##    country                          total percent_yes
##    <chr>                            <int>       <dbl>
##  1 United States                     2568       0.269
##  2 Palau                              369       0.339
##  3 Israel                            2380       0.341
##  4 <NA>                              1075       0.397
##  5 United Kingdom                    2558       0.417
##  6 France                            2527       0.427
##  7 Micronesia (Federated States of)   724       0.442
##  8 Marshall Islands                   757       0.491
##  9 Belgium                           2568       0.492
## 10 Canada                            2576       0.508
## # … with 187 more rows

2 Visualizing data with ggplot2

# Define by_year
by_year <- votes_processed %>%
  group_by(year) %>%
  summarize(total = n(),
            percent_yes = mean(vote == 1))

# Load the ggplot2 package
#library(ggplot2)

# Create line plot
ggplot(by_year, aes(x=year, y=percent_yes)) +
  geom_line()

# Change to scatter plot and add smoothing curve
ggplot(by_year, aes(year, percent_yes)) +
  geom_point() + geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Group by year and country: by_year_country
by_year_country <- votes_processed %>%
  group_by(year, country) %>%
  summarize(total = n(),
            percent_yes = mean(vote == 1))

# Start with by_year_country dataset
by_year_country <- votes_processed %>%
  group_by(year, country) %>%
  summarize(total = n(),
            percent_yes = mean(vote == 1))

# Print by_year_country
by_year_country

## # A tibble: 4,744 x 4
## # Groups:   year [34]
##     year country     total percent_yes
##    <dbl> <chr>       <int>       <dbl>
##  1  1947 Afghanistan    34       0.382
##  2  1947 Argentina      38       0.579
##  3  1947 Australia      38       0.553
##  4  1947 Belarus        38       0.5  
##  5  1947 Belgium        38       0.605
##  6  1947 Bolivia        37       0.595
##  7  1947 Brazil         38       0.658
##  8  1947 Canada         38       0.605
##  9  1947 Chile          38       0.658
## 10  1947 Colombia       35       0.543
## # … with 4,734 more rows

# Create a filtered version: UK_by_year
UK_by_year <- filter(by_year_country, country=="United Kingdom")

# Line plot of percent_yes over time for UK only
ggplot(UK_by_year, aes(x=year,y=percent_yes)) +
  geom_line()

# Vector of four countries to examine
countries <- c("United States", "United Kingdom",
               "France", "India")

# Filter by_year_country: filtered_4_countries
filtered_4_countries <- by_year_country %>%
filter(country %in% countries)

# Line plot of % yes in four countries
ggplot(filtered_4_countries, aes(x=year, y=percent_yes, color=country)) +
  geom_line()

# Vector of six countries to examine
countries <- c("United States", "United Kingdom",
               "France", "Japan", "Brazil", "India")

# Filtered by_year_country: filtered_6_countries
filtered_6_countries <- by_year_country %>% filter (country %in% countries)

# Line plot of % yes over time faceted by country
ggplot(filtered_6_countries, aes(x=year, y=percent_yes)) +
geom_line() +
  facet_wrap(~country)

# Line plot of % yes over time faceted by country and a free y-scale
ggplot(filtered_6_countries, aes(year, percent_yes)) +
  geom_line() +
  facet_wrap(~ country, scales="free_y")

# Add three more countries to this list
countries <- c("United States", "United Kingdom",
               "France", "Japan", "Brazil", "India", 
               "Russian Federation", "Pakistan", "Kazakhstan")

# Filtered by_year_country: filtered_countries
filtered_countries <- by_year_country %>%
  filter(country %in% countries)

# Line plot of % yes over time faceted by country
ggplot(filtered_countries, aes(year, percent_yes)) +
  geom_line() +
  facet_wrap(~ country, scales = "free_y")

3 Tidy modeling with broom

# Percentage of yes votes from the US by year: US_by_year
US_by_year <- by_year_country %>%
  filter(country == "United States")

# Print the US_by_year data
US_by_year

## # A tibble: 34 x 4
## # Groups:   year [34]
##     year country       total percent_yes
##    <dbl> <chr>         <int>       <dbl>
##  1  1947 United States    38       0.711
##  2  1949 United States    64       0.281
##  3  1951 United States    25       0.4  
##  4  1953 United States    26       0.5  
##  5  1955 United States    37       0.622
##  6  1957 United States    34       0.647
##  7  1959 United States    54       0.426
##  8  1961 United States    75       0.507
##  9  1963 United States    32       0.5  
## 10  1965 United States    41       0.366
## # … with 24 more rows

# Perform a linear regression of percent_yes by year: US_fit
US_fit <- lm(formula=percent_yes ~ year, data=US_by_year)

# Perform summary() on the US_fit object
summary(US_fit)

## 
## Call:
## lm(formula = percent_yes ~ year, data = US_by_year)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.222491 -0.080635 -0.008661  0.081948  0.194307 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.6641455  1.8379743   6.890 8.48e-08 ***
## year        -0.0062393  0.0009282  -6.722 1.37e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1062 on 32 degrees of freedom
## Multiple R-squared:  0.5854, Adjusted R-squared:  0.5724 
## F-statistic: 45.18 on 1 and 32 DF,  p-value: 1.367e-07

# Load the broom package
#library(broom)

# Call the tidy() function on the US_fit object
tidy(US_fit)

## # A tibble: 2 x 5
##   term        estimate std.error statistic      p.value
##   <chr>          <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept) 12.7      1.84          6.89 0.0000000848
## 2 year        -0.00624  0.000928     -6.72 0.000000137

# Linear regression of percent_yes by year for US
US_by_year <- by_year_country %>%
  filter(country == "United States")
US_fit <- lm(percent_yes ~ year, US_by_year)

# Fit model for the United Kingdom
UK_by_year <- by_year_country %>%
filter(country=="United Kingdom")
UK_fit <- lm(percent_yes ~ year, UK_by_year)

# Create US_tidied and UK_tidied
US_tidied <- tidy(US_fit)
UK_tidied <- tidy(UK_fit)

# Combine the two tidied models
bind_rows(US_tidied, UK_tidied)

## # A tibble: 4 x 5
##   term        estimate std.error statistic      p.value
##   <chr>          <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept) 12.7      1.84          6.89 0.0000000848
## 2 year        -0.00624  0.000928     -6.72 0.000000137 
## 3 (Intercept) -3.27     1.96         -1.67 0.105       
## 4 year         0.00187  0.000989      1.89 0.0677

Nesting the data

# Load the tidyr package
#library(tidyr)

# Nest all columns besides country
by_year_country %>% nest(-c(country))

## Warning: All elements of `...` must be named.
## Did you want `data = c(year, total, percent_yes)`?

## # A tibble: 200 x 2
##    country               data
##    <chr>       <list<df[,3]>>
##  1 Afghanistan       [34 × 3]
##  2 Argentina         [34 × 3]
##  3 Australia         [34 × 3]
##  4 Belarus           [34 × 3]
##  5 Belgium           [34 × 3]
##  6 Bolivia           [34 × 3]
##  7 Brazil            [34 × 3]
##  8 Canada            [34 × 3]
##  9 Chile             [34 × 3]
## 10 Colombia          [34 × 3]
## # … with 190 more rows

# All countries are nested besides country
nested <- by_year_country %>%
  nest(-country)

## Warning: All elements of `...` must be named.
## Did you want `data = c(year, total, percent_yes)`?

# Print the nested data for Brazil
nested$data[nested$country=="Brazil"]

## <list_of<
##   grouped_df<
##     year       : double
##     total      : integer
##     percent_yes: double
##   >
## >[2]>
## [[1]]
## # A tibble: 34 x 3
## # Groups:   year [34]
##     year total percent_yes
##  * <dbl> <int>       <dbl>
##  1  1947    38       0.658
##  2  1949    64       0.469
##  3  1951    25       0.64 
##  4  1953    26       0.731
##  5  1955    37       0.730
##  6  1957    34       0.735
##  7  1959    54       0.537
##  8  1961    76       0.553
##  9  1963    32       0.781
## 10  1965    41       0.610
## # … with 24 more rows
## 
## [[2]]
## NULL

# Unnest the data column to return it to its original form
nested %>% unnest(data)

## # A tibble: 4,744 x 4
##    country      year total percent_yes
##    <chr>       <dbl> <int>       <dbl>
##  1 Afghanistan  1947    34       0.382
##  2 Afghanistan  1949    51       0.608
##  3 Afghanistan  1951    25       0.76 
##  4 Afghanistan  1953    26       0.769
##  5 Afghanistan  1955    37       0.730
##  6 Afghanistan  1957    34       0.529
##  7 Afghanistan  1959    54       0.611
##  8 Afghanistan  1961    76       0.605
##  9 Afghanistan  1963    32       0.781
## 10 Afghanistan  1965    40       0.85 
## # … with 4,734 more rows

# IMPORTANT: Ungroup the data set before nesting it.
by_year_country <- by_year_country %>% ungroup()


# Perform a linear regression on each item in the data column
by_year_country %>%
  nest(-country) %>% mutate(model=map(data, ~ lm(percent_yes ~ year, .)))

## Warning: All elements of `...` must be named.
## Did you want `data = c(year, total, percent_yes)`?

## # A tibble: 200 x 3
##    country               data model 
##    <chr>       <list<df[,3]>> <list>
##  1 Afghanistan       [34 × 3] <lm>  
##  2 Argentina         [34 × 3] <lm>  
##  3 Australia         [34 × 3] <lm>  
##  4 Belarus           [34 × 3] <lm>  
##  5 Belgium           [34 × 3] <lm>  
##  6 Bolivia           [34 × 3] <lm>  
##  7 Brazil            [34 × 3] <lm>  
##  8 Canada            [34 × 3] <lm>  
##  9 Chile             [34 × 3] <lm>  
## 10 Colombia          [34 × 3] <lm>  
## # … with 190 more rows

# Load the broom package
#library(broom)

# Add another mutate that applies tidy() to each model
by_year_country %>%
  nest(-country) %>%
  mutate(model = map(data, ~ lm(percent_yes ~ year, data = .))) %>%
  mutate(tidied = map(model, ~tidy(.)))

## Warning: All elements of `...` must be named.
## Did you want `data = c(year, total, percent_yes)`?

## # A tibble: 200 x 4
##    country               data model  tidied          
##    <chr>       <list<df[,3]>> <list> <list>          
##  1 Afghanistan       [34 × 3] <lm>   <tibble [2 × 5]>
##  2 Argentina         [34 × 3] <lm>   <tibble [2 × 5]>
##  3 Australia         [34 × 3] <lm>   <tibble [2 × 5]>
##  4 Belarus           [34 × 3] <lm>   <tibble [2 × 5]>
##  5 Belgium           [34 × 3] <lm>   <tibble [2 × 5]>
##  6 Bolivia           [34 × 3] <lm>   <tibble [2 × 5]>
##  7 Brazil            [34 × 3] <lm>   <tibble [2 × 5]>
##  8 Canada            [34 × 3] <lm>   <tibble [2 × 5]>
##  9 Chile             [34 × 3] <lm>   <tibble [2 × 5]>
## 10 Colombia          [34 × 3] <lm>   <tibble [2 × 5]>
## # … with 190 more rows

# Add one more step that unnests the tidied column
country_coefficients <- by_year_country %>%
  nest(-country) %>%
  mutate(model = map(data, ~ lm(percent_yes ~ year, data = .)),
         tidied = map(model, ~tidy(.))) %>%
         unnest(tidied)

## Warning: All elements of `...` must be named.
## Did you want `data = c(year, total, percent_yes)`?

# Print the resulting country_coefficients variable
country_coefficients

## # A tibble: 399 x 8
##    country          data model  term     estimate std.error statistic    p.value
##    <chr>     <list<df[,> <list> <chr>       <dbl>     <dbl>     <dbl>      <dbl>
##  1 Afghanis…    [34 × 3] <lm>   (Interc… -1.11e+1  1.47         -7.52    1.44e-8
##  2 Afghanis…    [34 × 3] <lm>   year      6.01e-3  0.000743      8.09    3.06e-9
##  3 Argentina    [34 × 3] <lm>   (Interc… -9.46e+0  2.10         -4.50    8.32e-5
##  4 Argentina    [34 × 3] <lm>   year      5.15e-3  0.00106       4.85    3.05e-5
##  5 Australia    [34 × 3] <lm>   (Interc… -4.55e+0  2.15         -2.12    4.22e-2
##  6 Australia    [34 × 3] <lm>   year      2.57e-3  0.00108       2.37    2.42e-2
##  7 Belarus      [34 × 3] <lm>   (Interc… -7.00e+0  1.50         -4.66    5.33e-5
##  8 Belarus      [34 × 3] <lm>   year      3.91e-3  0.000759      5.15    1.28e-5
##  9 Belgium      [34 × 3] <lm>   (Interc… -5.85e+0  1.52         -3.86    5.22e-4
## 10 Belgium      [34 × 3] <lm>   year      3.20e-3  0.000765      4.19    2.07e-4
## # … with 389 more rows

### Working with many tidy models ###
# Print the country_coefficients dataset
country_coefficients

## # A tibble: 399 x 8
##    country          data model  term     estimate std.error statistic    p.value
##    <chr>     <list<df[,> <list> <chr>       <dbl>     <dbl>     <dbl>      <dbl>
##  1 Afghanis…    [34 × 3] <lm>   (Interc… -1.11e+1  1.47         -7.52    1.44e-8
##  2 Afghanis…    [34 × 3] <lm>   year      6.01e-3  0.000743      8.09    3.06e-9
##  3 Argentina    [34 × 3] <lm>   (Interc… -9.46e+0  2.10         -4.50    8.32e-5
##  4 Argentina    [34 × 3] <lm>   year      5.15e-3  0.00106       4.85    3.05e-5
##  5 Australia    [34 × 3] <lm>   (Interc… -4.55e+0  2.15         -2.12    4.22e-2
##  6 Australia    [34 × 3] <lm>   year      2.57e-3  0.00108       2.37    2.42e-2
##  7 Belarus      [34 × 3] <lm>   (Interc… -7.00e+0  1.50         -4.66    5.33e-5
##  8 Belarus      [34 × 3] <lm>   year      3.91e-3  0.000759      5.15    1.28e-5
##  9 Belgium      [34 × 3] <lm>   (Interc… -5.85e+0  1.52         -3.86    5.22e-4
## 10 Belgium      [34 × 3] <lm>   year      3.20e-3  0.000765      4.19    2.07e-4
## # … with 389 more rows

# Filter for only the slope terms
filter(country_coefficients, term=="year")

## # A tibble: 199 x 8
##    country             data model term  estimate std.error statistic     p.value
##    <chr>      <list<df[,3]> <lis> <chr>    <dbl>     <dbl>     <dbl>       <dbl>
##  1 Afghanist…      [34 × 3] <lm>  year   0.00601  0.000743      8.09     3.06e-9
##  2 Argentina       [34 × 3] <lm>  year   0.00515  0.00106       4.85     3.05e-5
##  3 Australia       [34 × 3] <lm>  year   0.00257  0.00108       2.37     2.42e-2
##  4 Belarus         [34 × 3] <lm>  year   0.00391  0.000759      5.15     1.28e-5
##  5 Belgium         [34 × 3] <lm>  year   0.00320  0.000765      4.19     2.07e-4
##  6 Bolivia         [34 × 3] <lm>  year   0.00580  0.000966      6.01     1.06e-6
##  7 Brazil          [34 × 3] <lm>  year   0.00611  0.000817      7.48     1.64e-8
##  8 Canada          [34 × 3] <lm>  year   0.00152  0.000955      1.59     1.22e-1
##  9 Chile           [34 × 3] <lm>  year   0.00678  0.000822      8.24     2.05e-9
## 10 Colombia        [34 × 3] <lm>  year   0.00616  0.000965      6.38     3.58e-7
## # … with 189 more rows

# Filter for only the slope terms
slope_terms <- country_coefficients %>%
  filter(term == "year")

# Add p.adjusted column, then filter
slope_terms %>% mutate (p.adjusted=p.adjust(p.value)) %>%
filter(p.adjusted <0.5)

## # A tibble: 90 x 9
##    country      data model term  estimate std.error statistic p.value p.adjusted
##    <chr>    <list<d> <lis> <chr>    <dbl>     <dbl>     <dbl>   <dbl>      <dbl>
##  1 Afghani… [34 × 3] <lm>  year   0.00601  0.000743      8.09 3.06e-9    5.95e-7
##  2 Argenti… [34 × 3] <lm>  year   0.00515  0.00106       4.85 3.05e-5    4.81e-3
##  3 Belarus  [34 × 3] <lm>  year   0.00391  0.000759      5.15 1.28e-5    2.08e-3
##  4 Belgium  [34 × 3] <lm>  year   0.00320  0.000765      4.19 2.07e-4    3.01e-2
##  5 Bolivia  [34 × 3] <lm>  year   0.00580  0.000966      6.01 1.06e-6    1.88e-4
##  6 Brazil   [34 × 3] <lm>  year   0.00611  0.000817      7.48 1.64e-8    3.12e-6
##  7 Chile    [34 × 3] <lm>  year   0.00678  0.000822      8.24 2.05e-9    3.99e-7
##  8 Colombia [34 × 3] <lm>  year   0.00616  0.000965      6.38 3.58e-7    6.56e-5
##  9 Costa R… [34 × 3] <lm>  year   0.00654  0.000812      8.05 3.39e-9    6.54e-7
## 10 Cuba     [34 × 3] <lm>  year   0.00461  0.000721      6.40 3.43e-7    6.31e-5
## # … with 80 more rows

# Filter by adjusted p-values
filtered_countries <- country_coefficients %>%
  filter(term == "year") %>%
  mutate(p.adjusted = p.adjust(p.value)) %>%
  filter(p.adjusted < .05)

# Sort for the countries increasing most quickly
arrange(filtered_countries, estimate)

## # A tibble: 61 x 9
##    country      data model term  estimate std.error statistic p.value p.adjusted
##    <chr>    <list<d> <lis> <chr>    <dbl>     <dbl>     <dbl>   <dbl>      <dbl>
##  1 South K… [12 × 3] <lm>  year  -0.00921  0.00155      -5.96 1.39e-4  0.0209   
##  2 Israel   [33 × 3] <lm>  year  -0.00685  0.00117      -5.85 1.89e-6  0.000331 
##  3 United … [34 × 3] <lm>  year  -0.00624  0.000928     -6.72 1.37e-7  0.0000254
##  4 Belgium  [34 × 3] <lm>  year   0.00320  0.000765      4.19 2.07e-4  0.0301   
##  5 Guinea   [28 × 3] <lm>  year   0.00362  0.000833      4.35 1.87e-4  0.0275   
##  6 Morocco  [29 × 3] <lm>  year   0.00380  0.000860      4.42 1.46e-4  0.0218   
##  7 Belarus  [34 × 3] <lm>  year   0.00391  0.000759      5.15 1.28e-5  0.00208  
##  8 Iran     [34 × 3] <lm>  year   0.00391  0.000856      4.57 6.91e-5  0.0107   
##  9 Congo -… [27 × 3] <lm>  year   0.00397  0.000922      4.30 2.27e-4  0.0326   
## 10 Sudan    [29 × 3] <lm>  year   0.00399  0.000961      4.15 2.98e-4  0.0420   
## # … with 51 more rows

# Sort for the countries decreasing most quickly
arrange(filtered_countries, desc(estimate))

## # A tibble: 61 x 9
##    country     data model term  estimate std.error statistic  p.value p.adjusted
##    <chr>   <list<d> <lis> <chr>    <dbl>     <dbl>     <dbl>    <dbl>      <dbl>
##  1 South … [25 × 3] <lm>  year   0.0119   0.00140       8.47 1.60e- 8    3.05e-6
##  2 Kazakh… [11 × 3] <lm>  year   0.0110   0.00195       5.62 3.24e- 4    4.51e-2
##  3 Yemen … [22 × 3] <lm>  year   0.0109   0.00159       6.84 1.20e- 6    2.11e-4
##  4 Kyrgyz…  [9 × 3] <lm>  year   0.00973  0.000988      9.84 2.38e- 5    3.78e-3
##  5 Malawi  [25 × 3] <lm>  year   0.00908  0.00181       5.02 4.48e- 5    7.03e-3
##  6 Domini… [33 × 3] <lm>  year   0.00806  0.000914      8.81 5.96e-10    1.17e-7
##  7 Portug… [29 × 3] <lm>  year   0.00802  0.00171       4.68 7.13e- 5    1.10e-2
##  8 Hondur… [34 × 3] <lm>  year   0.00772  0.000921      8.38 1.43e- 9    2.81e-7
##  9 Peru    [34 × 3] <lm>  year   0.00730  0.000976      7.48 1.65e- 8    3.12e-6
## 10 Nicara… [34 × 3] <lm>  year   0.00708  0.00107       6.60 1.92e- 7    3.55e-5
## # … with 51 more rows

4 Joining data sets

# Load dplyr package
#library(dplyr)

# Print the votes_processed dataset
votes_processed

## # A tibble: 353,547 x 7
##        X  rcid session  vote ccode  year country           
##    <int> <int>   <int> <int> <int> <dbl> <chr>             
##  1     1    46       2     1     2  1947 United States     
##  2     2    46       2     1    20  1947 Canada            
##  3     4    46       2     1    40  1947 Cuba              
##  4     5    46       2     1    41  1947 Haiti             
##  5     6    46       2     1    42  1947 Dominican Republic
##  6    16    46       2     1    70  1947 Mexico            
##  7    18    46       2     1    90  1947 Guatemala         
##  8    19    46       2     1    91  1947 Honduras          
##  9    20    46       2     1    92  1947 El Salvador       
## 10    21    46       2     1    93  1947 Nicaragua         
## # … with 353,537 more rows

# Print the descriptions dataset
descriptions

## # A tibble: 2,589 x 11
##        X  rcid session date       unres      me    nu    di    hr    co    ec
##    <int> <int>   <int> <fct>      <fct>   <int> <int> <int> <int> <int> <int>
##  1     1    46       2 1947-09-04 R/2/299     0     0     0     0     0     0
##  2     2    47       2 1947-10-05 R/2/355     0     0     0     1     0     0
##  3     3    48       2 1947-10-06 R/2/461     0     0     0     0     0     0
##  4     4    49       2 1947-10-06 R/2/463     0     0     0     0     0     0
##  5     5    50       2 1947-10-06 R/2/465     0     0     0     0     0     0
##  6     6    51       2 1947-10-02 R/2/561     0     0     0     0     1     0
##  7     7    52       2 1947-11-06 R/2/650     0     0     0     0     1     0
##  8     8    53       2 1947-11-06 R/2/651     0     0     0     0     1     0
##  9     9    54       2 1947-11-06 R/2/651     0     0     0     0     1     0
## 10    10    55       2 1947-11-06 R/2/667     0     0     0     0     1     0
## # … with 2,579 more rows

# Join them together based on the "rcid" and "session" columns
votes_joined <- inner_join(votes_processed, descriptions, 
by=c("rcid", "session"))

# Filter for votes related to colonialism
votes_joined %>% filter(co==1)

## # A tibble: 60,962 x 16
##      X.x  rcid session  vote ccode  year country   X.y date  unres    me    nu
##    <int> <int>   <int> <int> <int> <dbl> <chr>   <int> <fct> <fct> <int> <int>
##  1   986    51       2     3     2  1947 United…     6 1947… R/2/…     0     0
##  2   987    51       2     3    20  1947 Canada      6 1947… R/2/…     0     0
##  3   989    51       2     2    40  1947 Cuba        6 1947… R/2/…     0     0
##  4   990    51       2     1    41  1947 Haiti       6 1947… R/2/…     0     0
##  5   991    51       2     3    42  1947 Domini…     6 1947… R/2/…     0     0
##  6  1001    51       2     2    70  1947 Mexico      6 1947… R/2/…     0     0
##  7  1003    51       2     2    90  1947 Guatem…     6 1947… R/2/…     0     0
##  8  1005    51       2     2    92  1947 El Sal…     6 1947… R/2/…     0     0
##  9  1006    51       2     3    93  1947 Nicara…     6 1947… R/2/…     0     0
## 10  1008    51       2     2    95  1947 Panama      6 1947… R/2/…     0     0
## # … with 60,952 more rows, and 4 more variables: di <int>, hr <int>, co <int>,
## #   ec <int>

# Load the ggplot2 package
#library(ggplot2)

# Filter, then summarize by year: US_co_by_year
US_co_by_year <- votes_joined %>% filter(co==1, country=="United States") %>% group_by(year) %>% summarize(percent_yes=mean(vote==1, na.rm=TRUE)) 

# Graph the % of "yes" votes over time
ggplot(US_co_by_year, aes(x=year, y=percent_yes)) +
geom_line()

# Load the tidyr package
#library(tidyr)

# Gather the six me/nu/di/hr/co/ec columns
votes_joined %>%
  gather(topic, has_topic, me:ec)

## # A tibble: 2,121,282 x 12
##      X.x  rcid session  vote ccode  year country   X.y date  unres topic
##    <int> <int>   <int> <int> <int> <dbl> <chr>   <int> <fct> <fct> <chr>
##  1     1    46       2     1     2  1947 United…     1 1947… R/2/… me   
##  2     2    46       2     1    20  1947 Canada      1 1947… R/2/… me   
##  3     4    46       2     1    40  1947 Cuba        1 1947… R/2/… me   
##  4     5    46       2     1    41  1947 Haiti       1 1947… R/2/… me   
##  5     6    46       2     1    42  1947 Domini…     1 1947… R/2/… me   
##  6    16    46       2     1    70  1947 Mexico      1 1947… R/2/… me   
##  7    18    46       2     1    90  1947 Guatem…     1 1947… R/2/… me   
##  8    19    46       2     1    91  1947 Hondur…     1 1947… R/2/… me   
##  9    20    46       2     1    92  1947 El Sal…     1 1947… R/2/… me   
## 10    21    46       2     1    93  1947 Nicara…     1 1947… R/2/… me   
## # … with 2,121,272 more rows, and 1 more variable: has_topic <int>

# Perform gather again, then filter
votes_gathered <- votes_joined %>%
  gather(topic, has_topic, me:ec) %>%
  filter(has_topic == 1)

# Replace the two-letter codes in topic: votes_tidied
votes_tidied <- votes_gathered %>%
  mutate(topic = recode(topic,
                        me = "Palestinian conflict",
                        nu = "Nuclear weapons and nuclear material",
                        di = "Arms control and disarmament",
                        hr = "Human rights",
                        co = "Colonialism",
                        ec = "Economic development"))

# Print votes_tidied
votes_tidied

## # A tibble: 350,032 x 12
##      X.x  rcid session  vote ccode  year country   X.y date  unres topic
##    <int> <int>   <int> <int> <int> <dbl> <chr>   <int> <fct> <fct> <chr>
##  1  6108    77       2     1     2  1947 United…    32 1947… R/2/… Pale…
##  2  6109    77       2     1    20  1947 Canada     32 1947… R/2/… Pale…
##  3  6111    77       2     3    40  1947 Cuba       32 1947… R/2/… Pale…
##  4  6112    77       2     1    41  1947 Haiti      32 1947… R/2/… Pale…
##  5  6113    77       2     1    42  1947 Domini…    32 1947… R/2/… Pale…
##  6  6123    77       2     2    70  1947 Mexico     32 1947… R/2/… Pale…
##  7  6125    77       2     1    90  1947 Guatem…    32 1947… R/2/… Pale…
##  8  6126    77       2     2    91  1947 Hondur…    32 1947… R/2/… Pale…
##  9  6127    77       2     2    92  1947 El Sal…    32 1947… R/2/… Pale…
## 10  6128    77       2     1    93  1947 Nicara…    32 1947… R/2/… Pale…
## # … with 350,022 more rows, and 1 more variable: has_topic <int>

# Summarize the percentage "yes" per country-year-topic
by_country_year_topic <- votes_tidied %>% group_by(country, year, topic) %>%
summarize(total=n(), percent_yes=mean(vote==1, na.rm=TRUE)) %>% ungroup()

# Print by_country_year_topic
by_country_year_topic

## # A tibble: 26,968 x 5
##    country      year topic                                total percent_yes
##    <chr>       <dbl> <chr>                                <int>       <dbl>
##  1 Afghanistan  1947 Colonialism                              8       0.5  
##  2 Afghanistan  1947 Economic development                     1       0    
##  3 Afghanistan  1947 Human rights                             1       0    
##  4 Afghanistan  1947 Palestinian conflict                     6       0    
##  5 Afghanistan  1949 Arms control and disarmament             3       0    
##  6 Afghanistan  1949 Colonialism                             22       0.864
##  7 Afghanistan  1949 Economic development                     1       1    
##  8 Afghanistan  1949 Human rights                             3       0    
##  9 Afghanistan  1949 Nuclear weapons and nuclear material     3       0    
## 10 Afghanistan  1949 Palestinian conflict                    11       0.818
## # … with 26,958 more rows

# Load the ggplot2 package
#library(ggplot2)

# Filter by_country_year_topic for just the US
US_by_country_year_topic <- by_country_year_topic %>% filter (country=="United States")

# Plot % yes over time for the US, faceting by topic
ggplot (US_by_country_year_topic, aes (x=year, y=percent_yes)) + geom_line() +
facet_wrap(~topic)

# Print by_country_year_topic
by_country_year_topic

## # A tibble: 26,968 x 5
##    country      year topic                                total percent_yes
##    <chr>       <dbl> <chr>                                <int>       <dbl>
##  1 Afghanistan  1947 Colonialism                              8       0.5  
##  2 Afghanistan  1947 Economic development                     1       0    
##  3 Afghanistan  1947 Human rights                             1       0    
##  4 Afghanistan  1947 Palestinian conflict                     6       0    
##  5 Afghanistan  1949 Arms control and disarmament             3       0    
##  6 Afghanistan  1949 Colonialism                             22       0.864
##  7 Afghanistan  1949 Economic development                     1       1    
##  8 Afghanistan  1949 Human rights                             3       0    
##  9 Afghanistan  1949 Nuclear weapons and nuclear material     3       0    
## 10 Afghanistan  1949 Palestinian conflict                    11       0.818
## # … with 26,958 more rows

# Fit model on the by_country_year_topic dataset
country_topic_coefficients <- by_country_year_topic %>%
nest(-country, -topic) %>%
mutate(model=map(data, ~lm(percent_yes ~ year, data= .)),
tidied=map(model, tidy)) %>%
unnest(tidied)


# Print country_topic_coefficients
country_topic_coefficients

## # A tibble: 2,383 x 9
##    country  topic          data model term  estimate std.error statistic p.value
##    <chr>    <chr>     <list<df> <lis> <chr>    <dbl>     <dbl>     <dbl>   <dbl>
##  1 Afghani… Colonial…  [34 × 3] <lm>  (Int… -9.20e+0  1.96         -4.70 4.76e-5
##  2 Afghani… Colonial…  [34 × 3] <lm>  year   5.11e-3  0.000989      5.17 1.23e-5
##  3 Afghani… Economic…  [32 × 3] <lm>  (Int… -1.15e+1  3.62         -3.17 3.49e-3
##  4 Afghani… Economic…  [32 × 3] <lm>  year   6.24e-3  0.00183       3.42 1.85e-3
##  5 Afghani… Human ri…  [34 × 3] <lm>  (Int… -7.27e+0  4.37         -1.66 1.06e-1
##  6 Afghani… Human ri…  [34 × 3] <lm>  year   4.08e-3  0.00221       1.85 7.43e-2
##  7 Afghani… Palestin…  [30 × 3] <lm>  (Int… -1.33e+1  3.57         -3.73 8.66e-4
##  8 Afghani… Palestin…  [30 × 3] <lm>  year   7.17e-3  0.00180       3.98 4.42e-4
##  9 Afghani… Arms con…  [29 × 3] <lm>  (Int… -1.38e+1  4.13         -3.33 2.53e-3
## 10 Afghani… Arms con…  [29 × 3] <lm>  year   7.37e-3  0.00208       3.54 1.49e-3
## # … with 2,373 more rows

# Create country_topic_filtered
country_topic_filtered <- country_topic_coefficients %>%
filter(term=="year")

country_topic_filtered <- country_topic_filtered %>% mutate(p.adjusted=p.adjust(p.value)) %>%
filter(p.adjusted <0.05)

# Create vanuatu_by_country_year_topic
vanuatu_by_country_year_topic <- by_country_year_topic %>% 
filter (country=="Vanuatu")

# Plot of percentage "yes" over time, faceted by topic
ggplot(vanuatu_by_country_year_topic, aes(x=year, y=percent_yes)) + 
  geom_line() +
  facet_wrap (~topic)