The Task

Choose any three of the “wide” datasets identified in the Week 5 Discussion items. (You may use your own dataset; please don’t use my Sample Post dataset, since that was used in your Week 6 assignment!)

For each of the three chosen datasets:

Create a .CSV file (or optionally, a MySQL database!) that includes all of the information included in the dataset. You’re encouraged to use a “wide” structure similar to how the information appears in the discussion item, so that you can practice tidying and transformations as described below.
Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. [Most of your grade will be based on this step!]
Perform the analysis requested in the discussion item.

Load Packages

knitr::opts_chunk$set(#echo=FALSE, 
                      warning=FALSE, 
                      message=FALSE,
                      tidy=F,
                      #comment = "",
                      dev="png", 
                      dev.args=list(type="cairo"))
#https://cran.r-project.org/web/packages/prettydoc/vignettes/
#https://www.rstudio.com/wp-content/uploads/2015/03/rmarkdown-reference.pdf

load.packages <- c("stringr","prettydoc", "tidyr", "dplyr", "knitr", "janitor", "ggplot2", "data.table")

ipak <- function(pkg){
    #FUNCTION SOURCE: https://gist.github.com/stevenworthington/3178163
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg)) 
        install.packages(new.pkg, dependencies = TRUE)
    sapply(pkg, require, character.only = TRUE, quietly = TRUE, warn.conflicts = FALSE)
}

ipak(load.packages)

## Warning: package 'janitor' was built under R version 3.3.3

## -------------------------------------------------------------------------

## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!

## -------------------------------------------------------------------------

##    stringr  prettydoc      tidyr      dplyr      knitr    janitor 
##       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE 
##    ggplot2 data.table 
##       TRUE       TRUE

Dataset #1: Nkasi’s Congressional Voting Records

Load data, get headers & take a look at it

ds1 <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data", header = F, stringsAsFactors = F)

#get the header names
ds1_names <- readLines("https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.names")[33:49]
ds1_names <- str_extract(ds1_names,"[[:alpha:]-]{2,40}")
ds1_names[1] <- "party"
names(ds1) <- ds1_names

kable(head(ds1))

party	handicapped-infants	water-project-cost-sharing	adoption-of-the-budget-resolution	physician-fee-freeze	el-salvador-aid	religious-groups-in-schools	anti-satellite-test-ban	aid-to-nicaraguan-contras	mx-missile	immigration	synfuels-corporation-cutback	education-spending	superfund-right-to-sue	crime	duty-free-exports	export-administration-act-south-africa
republican	n	y	n	y	y	y	n	n	n	y	?	y	y	y	n	y
republican	n	y	n	y	y	y	n	n	n	n	n	y	y	y	n	?
democrat	?	y	y	?	y	y	n	n	n	n	y	n	y	y	n	n
democrat	n	y	y	n	?	y	n	n	n	n	y	n	y	n	n	y
democrat	y	y	y	n	y	y	n	n	n	n	y	?	y	y	y	y
democrat	n	y	y	n	y	y	n	n	n	n	n	n	y	y	y	y

glimpse(ds1)

## Observations: 435
## Variables: 17
## $ party                                  <chr> "republican", "republic...
## $ handicapped-infants                    <chr> "n", "n", "?", "n", "y"...
## $ water-project-cost-sharing             <chr> "y", "y", "y", "y", "y"...
## $ adoption-of-the-budget-resolution      <chr> "n", "n", "y", "y", "y"...
## $ physician-fee-freeze                   <chr> "y", "y", "?", "n", "n"...
## $ el-salvador-aid                        <chr> "y", "y", "y", "?", "y"...
## $ religious-groups-in-schools            <chr> "y", "y", "y", "y", "y"...
## $ anti-satellite-test-ban                <chr> "n", "n", "n", "n", "n"...
## $ aid-to-nicaraguan-contras              <chr> "n", "n", "n", "n", "n"...
## $ mx-missile                             <chr> "n", "n", "n", "n", "n"...
## $ immigration                            <chr> "y", "n", "n", "n", "n"...
## $ synfuels-corporation-cutback           <chr> "?", "n", "y", "y", "y"...
## $ education-spending                     <chr> "y", "y", "n", "n", "?"...
## $ superfund-right-to-sue                 <chr> "y", "y", "y", "y", "y"...
## $ crime                                  <chr> "y", "y", "y", "n", "y"...
## $ duty-free-exports                      <chr> "n", "n", "n", "n", "y"...
## $ export-administration-act-south-africa <chr> "y", "?", "n", "y", "y"...

Tidy & mutate the data, remove non-yea-or-nay votes

party_agreement <- ds1 %>% 
  gather("voting_issue", "vote", 2:17) %>%
  filter(vote %in% c("y", "n")) %>% 
  group_by(party, voting_issue, vote) %>% 
  summarise(vote_count = n()) %>% 
  spread(vote, vote_count) %>% 
  rename(nay_votes = n, yea_votes = y) %>% 
  mutate(total_votes = nay_votes + yea_votes, 
         agreement_rate = pmax(nay_votes, yea_votes) / total_votes, 
         most_votes = ifelse(yea_votes > nay_votes, "yea", "nay")) %>% 
  select(party, voting_issue, agreement_rate, most_votes) 
    #switch("n" = "nay_votes", "y" = "yea_votes")

kable(head(party_agreement))

party	voting_issue	agreement_rate	most_votes
democrat	adoption-of-the-budget-resolution	0.8884615	yea
democrat	aid-to-nicaraguan-contras	0.8288973	yea
democrat	anti-satellite-test-ban	0.7722008	yea
democrat	crime	0.6498054	nay
democrat	duty-free-exports	0.6374502	yea
democrat	education-spending	0.8554217	nay

Analysis that can be done:

Which issue attracted the most difference (in voting pattern) between members of congress according to affiliation?

My interpretation of the question: On which issue was each party the most divided?

party_agreement %>% 
  filter(party == "democrat") %>% 
  arrange(agreement_rate) %>% 
  head(1) %>% 
  kable()

party	voting_issue	agreement_rate	most_votes
democrat	water-project-cost-sharing	0.5020921	yea

party_agreement %>% 
  filter(party == "republican") %>% 
  arrange(agreement_rate) %>% 
  head(1) %>% 
  kable()

party	voting_issue	agreement_rate	most_votes
republican	water-project-cost-sharing	0.5067568	yea

Which issue showed the most common ground (in voting pattern) between members of congress according to affiliation?

My interpretation of the question: On which issue was each party the most unified?

party_agreement %>% 
  filter(party == "democrat") %>% 
  arrange(desc(agreement_rate)) %>% 
  head(1) %>% 
  kable()

party	voting_issue	agreement_rate	most_votes
democrat	physician-fee-freeze	0.9459459	nay

party_agreement %>% 
  filter(party == "republican") %>% 
  arrange(desc(agreement_rate)) %>% 
  head(1) %>% 
  kable()

party	voting_issue	agreement_rate	most_votes
republican	physician-fee-freeze	0.9878788	yea

Dataset #2: Joel’s GINI Index

“Measuring the wealth distribution between the people in each country has been something economists have been measuring for many years. In the GINI index, a higher GINI coefficient signifies inequality in wealth distribution, with 1 being complete inequality and 0 being complete equality. The World Bank has been maintaining this data.”

A potential analysis that could be performed is the trend in the GINI coefficient for each country.

Load data & take a look at it

gini_data <- read.csv("https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/data-sets/1966f167-824f-4105-8927-8f5a80a92c0f_Data.csv", stringsAsFactors = F)

glimpse(gini_data)

## Observations: 222
## Variables: 16
## $ ï..Series.Name <chr> "GINI index (World Bank estimate)", "GINI index...
## $ Series.Code    <chr> "SI.POV.GINI", "SI.POV.GINI", "SI.POV.GINI", "S...
## $ Country.Name   <chr> "Afghanistan", "Albania", "Algeria", "American ...
## $ Country.Code   <chr> "AFG", "ALB", "DZA", "ASM", "ADO", "AGO", "ATG"...
## $ X1990..YR1990. <chr> "..", "..", "..", "..", "..", "..", "..", "..",...
## $ X2000..YR2000. <chr> "..", "..", "..", "..", "..", "51.96", "..", "5...
## $ X2007..YR2007. <chr> "..", "..", "..", "..", "..", "..", "..", "47.3...
## $ X2008..YR2008. <chr> "..", "29.98", "..", "..", "..", "42.72", "..",...
## $ X2009..YR2009. <chr> "..", "..", "..", "..", "..", "..", "..", "45.2...
## $ X2010..YR2010. <chr> "..", "..", "..", "..", "..", "..", "..", "44.5...
## $ X2011..YR2011. <chr> "..", "..", "..", "..", "..", "..", "..", "43.5...
## $ X2012..YR2012. <chr> "..", "28.96", "..", "..", "..", "..", "..", "4...
## $ X2013..YR2013. <chr> "..", "..", "..", "..", "..", "..", "..", "42.2...
## $ X2014..YR2014. <chr> "..", "..", "..", "..", "..", "..", "..", "42.6...
## $ X2015..YR2015. <chr> "..", "..", "..", "..", "..", "..", "..", "..",...
## $ X2016..YR2016. <chr> "..", "..", "..", "..", "..", "..", "..", "..",...

Tidy & transform & filter to countries that have at least 8 observations since 2007

gini_data1 <- gini_data %>% 
  clean_names() %>% 
  gather("year_chr", "gini_chr", 5:16) %>% 
  mutate(year = as.integer(str_extract(year_chr,"[0-9]{4}"))) %>% 
  mutate(gini_coefficient = as.numeric(str_replace(gini_chr, "\\.{2}", ""))) %>% 
  select(country_name, year, gini_coefficient) %>% 
  filter(gini_coefficient >= 0 & year >= 2007)

country_8obs <- filter(count(gini_data1, country_name),n >= 8)$country_name
gini_data2 <- filter(gini_data1, country_name %in% country_8obs)

glimpse(gini_data2)

## Observations: 136
## Variables: 3
## $ country_name     <chr> "Argentina", "Armenia", "Belarus", "Costa Ric...
## $ year             <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 200...
## $ gini_coefficient <dbl> 47.37, 29.83, 29.61, 49.49, 54.33, 45.24, 40....

“A potential analysis that could be performed is the trend in the GINI coefficient for each country.”

There are appears to be an slight overall decline in inquality among these 17 countries.
Honduras and Paraguay have had more volatility in their coefficients than other countries in this group.

ggplot(gini_data2, aes(x = year, y = gini_coefficient, color = country_name)) + geom_line(size=2)

Dataset #3: World Bank: both wide and long data set (mine)

In their World Development Indicators collection, this zip folder contains a file called WDI_Data, which I believe is an example of a table that is both wide and long/narrow. In it, each year is given its own column, which is characteristic of a “wide” format. However, it also has a column that is called “Indicator Name.” It contains metric names that would typically be given their own columns instead of rows, and so this is a characteristic of narrow/long table.

It would be a good exercise to tidy this table by putting all of the years in a one column and the metrics in their own columns. Then you could use scatter plots to look for correlations between variables.

Download & upzip the large file, take a glimpse

wb_url <- "http://databank.worldbank.org/data/download/WDI_csv.zip"

# Commented out to avoid re-downloading this large file
temp <- tempfile()
download.file(wb_url,temp)
wb_data <- read.csv(unz(temp, "WDI_Data.csv"), stringsAsFactors = F)
# unlink(temp)

glimpse(wb_data)

## Observations: 383,328
## Variables: 61
## $ Country.Name   <chr> "Arab World", "Arab World", "Arab World", "Arab...
## $ Country.Code   <chr> "ARB", "ARB", "ARB", "ARB", "ARB", "ARB", "ARB"...
## $ Indicator.Name <chr> "2005 PPP conversion factor, GDP (LCU per inter...
## $ Indicator.Code <chr> "PA.NUS.PPP.05", "PA.NUS.PRVT.PP.05", "EG.ELC.A...
## $ X1960          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1961          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1962          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1963          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1964          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1965          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1966          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1967          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1968          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1969          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1970          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1971          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1972          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1973          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1974          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1975          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1976          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1977          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1978          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1979          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1980          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1981          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1982          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1983          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1984          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1985          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1986          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1987          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1988          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1989          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1990          <dbl> NA, NA, 7.546396e+01, 5.869606e+01, 9.184719e+0...
## $ X1991          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1992          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1993          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1994          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1995          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1996          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1997          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1998          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X1999          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2000          <dbl> NA, NA, 7.953486e+01, 6.586757e+01, 9.165335e+0...
## $ X2001          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2002          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2003          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2004          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2005          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2006          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2007          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2008          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2009          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2010          <dbl> NA, NA, 8.436235e+01, 7.198423e+01, 9.383966e+0...
## $ X2011          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2012          <dbl> NA, NA, 8.628035e+01, 7.391781e+01, 9.515673e+0...
## $ X2013          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2014          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2015          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ X2016          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...

Let’s tidy up this massive mess!

wb_data1 <- wb_data

#remove the Xs from the year columns
names(wb_data1)[5:61] <- str_extract(names(wb_data1)[5:61],"[0-9]{4}")

selected_indicators <- c(
  "NY.GDP.PCAP.KD.ZG",
  "NY.GDP.PCAP.PP.KD",
  "SH.MED.BEDS.ZS",
  "SE.ADT.LITR.FE.ZS"
)

#tidy & filter
wb_data2 <- wb_data1 %>%
  filter(Indicator.Code %in% selected_indicators) %>% 
  gather("year_chr", "value", 5:61) %>%
  mutate(year = as.integer(year_chr)) %>% 
  select(-Country.Code, -Indicator.Code, -year_chr) %>%
  spread(Indicator.Name, value) %>% 
  clean_names() 

glimpse(wb_data2)

## Observations: 15,048
## Variables: 6
## $ country_name                                                    <chr> ...
## $ year                                                            <int> ...
## $ gdp_per_capita_growth_annual_percent                            <dbl> ...
## $ gdp_per_capita_ppp_constant_2011_international                  <dbl> ...
## $ hospital_beds_per_1_000_people                                  <dbl> ...
## $ literacy_rate_adult_female_percent_of_females_ages_15_and_above <dbl> ...

Question 1: Does there appear to be correlation between GDP per capita & the rate of hosital beds per 1000 people?

There appears to be small amount of correlation.

ggplot(wb_data2, aes(x = gdp_per_capita_ppp_constant_2011_international, y = hospital_beds_per_1_000_people)) + geom_point(size = .5)

Question 2: Does there appear to be correlation between GDP per capita & the literacy rate of adult females?

There appears to be possibly some non-linear correlation between these 2 variables.

ggplot(wb_data2, aes(x = gdp_per_capita_ppp_constant_2011_international, y = literacy_rate_adult_female_percent_of_females_ages_15_and_above)) + geom_point(size = .5)

D607 Project 2 - Tidy Data

Kyle Gilde

Mar. 4, 2017