Coding-Camp-lab-1-read-and-manipulate-data-09082020.R

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## -- Attaching packages ------------------------------------------ tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v stringr 1.4.0
## v tidyr   1.1.1     v forcats 0.5.0
## v readr   1.3.1

## -- Conflicts --------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(readxl)

# Warm-up questions 1-6

## awesome_data <- read_dta("midwest.dta")
## We get Error in read_dta("awesome_dta.dta") : could not find function "read_dta" because we have not yet installed the 'haven' package

# install.packages("haven")
library(haven)
awesome_data <- read_dta("midwest.dta")

## remove (almost) everything in the working environment.
## You will get no warning, so don't do this unless you are really sure.
rm(list = ls())

# Working with data and scripts questions 1-7
## Getting and setting working directory
getwd()

## [1] "C:/Users/ESalgado/Desktop/Harris MPP/CodingCamp"

## "C:/Users/ESalgado/Documents"
setwd("C:/Users/ESalgado/Desktop/Harris MPP/CodingCamp")
wid_data <- read_xlsx("world_wealth_inequality.xlsx")

## New names:
## * `` -> ...5

## Display first few rows of dataset
head(wid_data)

## # A tibble: 6 x 5
##   China `shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100 `1900`  ...5
##   <chr> <chr>                                               <chr>    <dbl> <dbl>
## 1 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100   1901    NA
## 2 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100   1902    NA
## 3 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100   1903    NA
## 4 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100   1904    NA
## 5 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100   1905    NA
## 6 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100   1906    NA

## function `glimpse()` shows dimensions, column names, column types, and its rows
glimpse(wid_data)

## Rows: 4,719
## Columns: 5
## $ China                                                        <chr> "China...
## $ `shweal_p99p100_z_CN\nNet personal wealth\nTop 1% | share\n` <chr> "shwea...
## $ p99p100                                                      <chr> "p99p1...
## $ `1900`                                                       <dbl> 1901, ...
## $ ...5                                                         <dbl> NA, NA...

## names(wid_data)
## nrow(wid_data)

# Examining `wid_data` questions 1-2
wid_data_raw <-
  readxl::read_xlsx("world_wealth_inequality.xlsx",
                    col_names = c("country", "indicator", "percentile", "year", "value")) %>%
  separate(indicator, sep = "\\n", into = c("row_tag", "type", "notes"))

## Warning: Expected 3 pieces. Additional pieces discarded in 4720 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

# Manipulating world inequality data with dplyr questions 1-2, 1-5, 1
## The relevant `dplyr` verb is the select function
wid_data <- wid_data_raw %>%
  select(-notes, everything()) %>%
  select(-row_tag)

## filter the data so we only have "Net personal wealth" for France, then name the resulting data `french_data`
french_data <- wid_data %>%
  filter(type == "Net personal wealth", country == "France")
## When referring to words in the data, make sure they are in quotes "France"
## When referring to columns, do not use quotes.
french_data %>%
  ggplot(aes(y = value, x = year, color = percentile)) +
    geom_line()

## Warning: Removed 20 row(s) containing missing values (geom_path).

## We observe the lines on the chart are not continuous

## Look at `french_data` between 1960 and 1970
## We observe gaps in the data supporting our observation of non continuous lines on the previous chart
french_data %>%
  filter(year >= 1960 & year <= 1970)

## # A tibble: 44 x 6
##    country type                percentile  year  value notes         
##    <chr>   <chr>               <chr>      <dbl>  <dbl> <chr>         
##  1 France  Net personal wealth p99p100     1960  0.314 Top 1% | share
##  2 France  Net personal wealth p99p100     1961 NA     Top 1% | share
##  3 France  Net personal wealth p99p100     1962  0.320 Top 1% | share
##  4 France  Net personal wealth p99p100     1963 NA     Top 1% | share
##  5 France  Net personal wealth p99p100     1964  0.326 Top 1% | share
##  6 France  Net personal wealth p99p100     1965  0.319 Top 1% | share
##  7 France  Net personal wealth p99p100     1966  0.305 Top 1% | share
##  8 France  Net personal wealth p99p100     1967  0.292 Top 1% | share
##  9 France  Net personal wealth p99p100     1968  0.257 Top 1% | share
## 10 France  Net personal wealth p99p100     1969  0.233 Top 1% | share
## # ... with 34 more rows

## Using mutate(), create a new column called `perc_national_wealth` that equals value multiplied by 100.
french_data <- french_data %>%
  mutate(perc_national_wealth = value * 100)
## Adjust the graph code so that the y axis shows `perc_national_wealth` instead of `value`.
french_data %>%
  ggplot(aes(y = perc_national_wealth, x = year, color = percentile)) +
  geom_line()

## Warning: Removed 20 row(s) containing missing values (geom_path).

## Now following the same steps, explore data from the "Russian Federation"
russian_data <- wid_data %>%
  filter(type == "Net personal wealth", country == "Russian Federation")

russian_data %>%
  ggplot(aes(y = value, x = year, color = percentile)) +
  geom_line()

## Warning: Removed 388 row(s) containing missing values (geom_path).

russian_data %>%
  filter(year >= 1960 & year <= 1970)

## # A tibble: 44 x 6
##    country            type                percentile  year value notes         
##    <chr>              <chr>               <chr>      <dbl> <dbl> <chr>         
##  1 Russian Federation Net personal wealth p99p100     1960    NA Top 1% | share
##  2 Russian Federation Net personal wealth p99p100     1961    NA Top 1% | share
##  3 Russian Federation Net personal wealth p99p100     1962    NA Top 1% | share
##  4 Russian Federation Net personal wealth p99p100     1963    NA Top 1% | share
##  5 Russian Federation Net personal wealth p99p100     1964    NA Top 1% | share
##  6 Russian Federation Net personal wealth p99p100     1965    NA Top 1% | share
##  7 Russian Federation Net personal wealth p99p100     1966    NA Top 1% | share
##  8 Russian Federation Net personal wealth p99p100     1967    NA Top 1% | share
##  9 Russian Federation Net personal wealth p99p100     1968    NA Top 1% | share
## 10 Russian Federation Net personal wealth p99p100     1969    NA Top 1% | share
## # ... with 34 more rows

russian_data <- russian_data %>%
  mutate(perc_national_wealth = value * 100)

russian_data %>%
  ggplot(aes(y = perc_national_wealth, x = year, color = percentile)) +
  geom_line()

## Warning: Removed 388 row(s) containing missing values (geom_path).

## The data for "Russian Federation" does not start in 1900, but our y-axis does. That's because we have a bunch of NAs. Let's filter out the NAs and remake the plot. You cannot test for NA using == (Try: NA == NA). Instead we have a function called is.na(). (Try: is.na(NA) and !is.na(NA)).
russian_data <- russian_data %>%
  filter(!is.na(perc_national_wealth))

russian_data %>%
  ggplot(aes(y = perc_national_wealth, x = year, color = percentile)) +
  geom_line()

## Use two dplyr verbs to figure out what year the bottom 50 percent held the least wealth. 
## First, choose the rows that cover the bottom 50 percent and then sort the data in descending order using arrange()
russian_data %>%
  filter(percentile == 'p0p50') %>%
  arrange(value)

## # A tibble: 21 x 7
##    country      type        percentile  year  value notes      perc_national_we~
##    <chr>        <chr>       <chr>      <dbl>  <dbl> <chr>                  <dbl>
##  1 Russian Fed~ Net person~ p0p50       2011 0.0342 Bottom 50~              3.42
##  2 Russian Fed~ Net person~ p0p50       2010 0.0349 Bottom 50~              3.49
##  3 Russian Fed~ Net person~ p0p50       2015 0.0349 Bottom 50~              3.49
##  4 Russian Fed~ Net person~ p0p50       2012 0.0367 Bottom 50~              3.67
##  5 Russian Fed~ Net person~ p0p50       2013 0.0383 Bottom 50~              3.83
##  6 Russian Fed~ Net person~ p0p50       2014 0.0384 Bottom 50~              3.84
##  7 Russian Fed~ Net person~ p0p50       2008 0.0494 Bottom 50~              4.94
##  8 Russian Fed~ Net person~ p0p50       2009 0.0510 Bottom 50~              5.10
##  9 Russian Fed~ Net person~ p0p50       2004 0.0555 Bottom 50~              5.55
## 10 Russian Fed~ Net person~ p0p50       2007 0.0568 Bottom 50~              5.68
## # ... with 11 more rows

## For both the Russian Federation and French data, calculate the average proportion of wealth owned by the top 10 percent over the period from 1995 to 2010. 
## You'll have to filter and then summarize with summarize()
russian_data %>%
  filter(percentile == 'p90p100') %>%
  filter(between(year, 1995, 2010)) %>%
  summarize(top10 = mean(value))

## # A tibble: 1 x 1
##   top10
##   <dbl>
## 1 0.633

french_data %>%
  filter(percentile == 'p90p100') %>%
  filter(between(year, 1995, 2010)) %>%
  summarize(top10 = mean(value))

## # A tibble: 1 x 1
##   top10
##   <dbl>
## 1 0.544

# Manipulating midwest demographic data with dplyr questions 1-8
midwest <- read_dta("midwest.dta")
glimpse(midwest)

## Rows: 437
## Columns: 28
## $ PID                  <dbl> 561, 562, 563, 564, 565, 566, 567, 568, 569, 5...
## $ county               <chr> "ADAMS", "ALEXANDER", "BOND", "BOONE", "BROWN"...
## $ state                <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL"...
## $ area                 <dbl> 0.052, 0.014, 0.022, 0.017, 0.018, 0.050, 0.01...
## $ poptotal             <dbl> 66090, 10626, 14991, 30806, 5836, 35688, 5322,...
## $ popdensity           <dbl> 1270.9615, 759.0000, 681.4091, 1812.1176, 324....
## $ popwhite             <dbl> 63917, 7054, 14477, 29344, 5264, 35157, 5298, ...
## $ popblack             <dbl> 1702, 3496, 429, 127, 547, 50, 1, 111, 16, 165...
## $ popamerindian        <dbl> 98, 19, 35, 46, 14, 65, 8, 30, 8, 331, 51, 26,...
## $ popasian             <dbl> 249, 48, 16, 150, 5, 195, 15, 61, 23, 8033, 89...
## $ popother             <dbl> 124, 9, 34, 1139, 6, 221, 0, 84, 6, 1596, 20, ...
## $ percwhite            <dbl> 96.71206, 66.38434, 96.57128, 95.25417, 90.198...
## $ percblack            <dbl> 2.57527614, 32.90043290, 2.86171703, 0.4122573...
## $ percamerindan        <dbl> 0.14828264, 0.17880670, 0.23347342, 0.14932156...
## $ percasian            <dbl> 0.37675897, 0.45172219, 0.10673071, 0.48691813...
## $ percother            <dbl> 0.18762294, 0.08469791, 0.22680275, 3.69733169...
## $ popadults            <dbl> 43298, 6724, 9669, 19272, 3979, 23444, 3583, 1...
## $ perchsd              <dbl> 75.10740, 59.72635, 69.33499, 75.47219, 68.861...
## $ percollege           <dbl> 19.63139, 11.24331, 17.03382, 17.27895, 14.476...
## $ percprof             <dbl> 4.355859, 2.870315, 4.488572, 4.197800, 3.3676...
## $ poppovertyknown      <dbl> 63628, 10529, 14235, 30337, 4815, 35107, 5241,...
## $ percpovertyknown     <dbl> 96.27478, 99.08714, 94.95697, 98.47757, 82.505...
## $ percbelowpoverty     <dbl> 13.151443, 32.244278, 12.068844, 7.209019, 13....
## $ percchildbelowpovert <dbl> 18.011717, 45.826514, 14.036061, 11.179536, 13...
## $ percadultpoverty     <dbl> 11.009776, 27.385647, 10.852090, 5.536013, 11....
## $ percelderlypoverty   <dbl> 12.443812, 25.228976, 12.697410, 6.217047, 19....
## $ inmetro              <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1...
## $ category             <chr> "AAR", "LHR", "AAR", "ALU", "AAR", "AAR", "LAR...

midwest_pop <-
  midwest %>%
  select(county, state, starts_with("pop"))

names(midwest_pop)

##  [1] "county"          "state"           "poptotal"        "popdensity"     
##  [5] "popwhite"        "popblack"        "popamerindian"   "popasian"       
##  [9] "popother"        "popadults"       "poppovertyknown"

## From midwest_pop calculate the area of each county.
## What's the largest county in the midwest? How about in Illinois?
midwest_pop %>%
  mutate(area = poptotal/popdensity) %>% # use mutate to add a column
  arrange(desc(area)) %>%
  select(c(county, state, area))

## # A tibble: 437 x 3
##    county      state  area
##    <chr>       <chr> <dbl>
##  1 MARQUETTE   MI    110. 
##  2 MARATHON    WI     94.0
##  3 BAYFIELD    WI     89.0
##  4 MARINETTE   WI     82.0
##  5 SAWYER      WI     79.0
##  6 DOUGLAS     WI     78.0
##  7 CHIPPEWA    MI     78.0
##  8 ONTONAGON   MI     78.0
##  9 SCHOOLCRAFT MI     75.0
## 10 PRICE       WI     75  
## # ... with 427 more rows

## Largest county in the midwest: Marquette

midwest_pop %>%
  filter(state=='IL') %>%
  mutate(area = poptotal/popdensity) %>%
  arrange(desc(area)) %>%
  select(c(county, state, area))

## # A tibble: 102 x 3
##    county     state  area
##    <chr>      <chr> <dbl>
##  1 MCLEAN     IL     68.0
##  2 LA SALLE   IL     68  
##  3 IROQUOIS   IL     67.0
##  4 LIVINGSTON IL     62.0
##  5 COOK       IL     58.0
##  6 CHAMPAIGN  IL     58.0
##  7 FULTON     IL     52.0
##  8 VERMILION  IL     52  
##  9 ADAMS      IL     52.0
## 10 SANGAMON   IL     51.0
## # ... with 92 more rows

## Largest county in IL: Mclean and La Salle

## From midwest_pop calculate percentage adults for each county. What county in the midwest has the highest proportion of adults? What's county in the midwest has the lowest proportion of adults?
midwest_pop %>%
  mutate(perc_adults = popadults/poptotal) %>%
  arrange(desc(perc_adults)) %>%
  select(c(county, state, perc_adults))

## # A tibble: 437 x 3
##    county      state perc_adults
##    <chr>       <chr>       <dbl>
##  1 KEWEENAW    MI          0.757
##  2 ROSCOMMON   MI          0.730
##  3 IRON        MI          0.728
##  4 ALCONA      MI          0.726
##  5 ADAMS       WI          0.726
##  6 VILAS       WI          0.724
##  7 IRON        WI          0.723
##  8 MONTMORENCY MI          0.703
##  9 ONTONAGON   MI          0.700
## 10 ONEIDA      WI          0.699
## # ... with 427 more rows

## Highest proportion of adults: Keweenaw

midwest_pop %>%
  mutate(perc_adults = popadults/poptotal) %>%
  arrange(perc_adults) %>%
  select(c(county, state, perc_adults))

## # A tibble: 437 x 3
##    county     state perc_adults
##    <chr>      <chr>       <dbl>
##  1 ISABELLA   MI          0.485
##  2 MENOMINEE  WI          0.494
##  3 ATHENS     OH          0.507
##  4 MECOSTA    MI          0.509
##  5 MONROE     IN          0.526
##  6 JACKSON    IL          0.527
##  7 TIPPECANOE IN          0.529
##  8 MCDONOUGH  IL          0.533
##  9 DE KALB    IL          0.537
## 10 HOLMES     OH          0.541
## # ... with 427 more rows

## Lowest proportion of adults: Isabella

midwest_pop %>%
  filter(state=='MI') %>%
  summarize(totalpop = sum(poptotal))

## # A tibble: 1 x 1
##   totalpop
##      <dbl>
## 1  9295297

## Total population of Michigan is 9,295,297

midwest_pop %>%
  filter(state=='IL') %>%
  mutate(area = poptotal/popdensity) %>%
  summarise(totalarea = sum(area))

## # A tibble: 1 x 1
##   totalarea
##       <dbl>
## 1     3304.

## The total area of Illinois is 3,304. 
## The units of totalarea is unclear.

Coding-Camp-lab-1-read-and-manipulate-data-09082020.R

ESalgado

2020-09-09