library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages ------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v stringr 1.4.0
## v tidyr 1.1.1 v forcats 0.5.0
## v readr 1.3.1
## -- Conflicts --------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
# Warm-up questions 1-6
## awesome_data <- read_dta("midwest.dta")
## We get Error in read_dta("awesome_dta.dta") : could not find function "read_dta" because we have not yet installed the 'haven' package
# install.packages("haven")
library(haven)
awesome_data <- read_dta("midwest.dta")
## remove (almost) everything in the working environment.
## You will get no warning, so don't do this unless you are really sure.
rm(list = ls())
# Working with data and scripts questions 1-7
## Getting and setting working directory
getwd()
## [1] "C:/Users/ESalgado/Desktop/Harris MPP/CodingCamp"
## "C:/Users/ESalgado/Documents"
setwd("C:/Users/ESalgado/Desktop/Harris MPP/CodingCamp")
wid_data <- read_xlsx("world_wealth_inequality.xlsx")
## New names:
## * `` -> ...5
## Display first few rows of dataset
head(wid_data)
## # A tibble: 6 x 5
## China `shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100 `1900` ...5
## <chr> <chr> <chr> <dbl> <dbl>
## 1 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100 1901 NA
## 2 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100 1902 NA
## 3 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100 1903 NA
## 4 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100 1904 NA
## 5 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100 1905 NA
## 6 China "shweal_p99p100_z_CN\nNet personal wealth\nTop 1% ~ p99p100 1906 NA
## function `glimpse()` shows dimensions, column names, column types, and its rows
glimpse(wid_data)
## Rows: 4,719
## Columns: 5
## $ China <chr> "China...
## $ `shweal_p99p100_z_CN\nNet personal wealth\nTop 1% | share\n` <chr> "shwea...
## $ p99p100 <chr> "p99p1...
## $ `1900` <dbl> 1901, ...
## $ ...5 <dbl> NA, NA...
## names(wid_data)
## nrow(wid_data)
# Examining `wid_data` questions 1-2
wid_data_raw <-
readxl::read_xlsx("world_wealth_inequality.xlsx",
col_names = c("country", "indicator", "percentile", "year", "value")) %>%
separate(indicator, sep = "\\n", into = c("row_tag", "type", "notes"))
## Warning: Expected 3 pieces. Additional pieces discarded in 4720 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Manipulating world inequality data with dplyr questions 1-2, 1-5, 1
## The relevant `dplyr` verb is the select function
wid_data <- wid_data_raw %>%
select(-notes, everything()) %>%
select(-row_tag)
## filter the data so we only have "Net personal wealth" for France, then name the resulting data `french_data`
french_data <- wid_data %>%
filter(type == "Net personal wealth", country == "France")
## When referring to words in the data, make sure they are in quotes "France"
## When referring to columns, do not use quotes.
french_data %>%
ggplot(aes(y = value, x = year, color = percentile)) +
geom_line()
## Warning: Removed 20 row(s) containing missing values (geom_path).

## We observe the lines on the chart are not continuous
## Look at `french_data` between 1960 and 1970
## We observe gaps in the data supporting our observation of non continuous lines on the previous chart
french_data %>%
filter(year >= 1960 & year <= 1970)
## # A tibble: 44 x 6
## country type percentile year value notes
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 France Net personal wealth p99p100 1960 0.314 Top 1% | share
## 2 France Net personal wealth p99p100 1961 NA Top 1% | share
## 3 France Net personal wealth p99p100 1962 0.320 Top 1% | share
## 4 France Net personal wealth p99p100 1963 NA Top 1% | share
## 5 France Net personal wealth p99p100 1964 0.326 Top 1% | share
## 6 France Net personal wealth p99p100 1965 0.319 Top 1% | share
## 7 France Net personal wealth p99p100 1966 0.305 Top 1% | share
## 8 France Net personal wealth p99p100 1967 0.292 Top 1% | share
## 9 France Net personal wealth p99p100 1968 0.257 Top 1% | share
## 10 France Net personal wealth p99p100 1969 0.233 Top 1% | share
## # ... with 34 more rows
## Using mutate(), create a new column called `perc_national_wealth` that equals value multiplied by 100.
french_data <- french_data %>%
mutate(perc_national_wealth = value * 100)
## Adjust the graph code so that the y axis shows `perc_national_wealth` instead of `value`.
french_data %>%
ggplot(aes(y = perc_national_wealth, x = year, color = percentile)) +
geom_line()
## Warning: Removed 20 row(s) containing missing values (geom_path).

## Now following the same steps, explore data from the "Russian Federation"
russian_data <- wid_data %>%
filter(type == "Net personal wealth", country == "Russian Federation")
russian_data %>%
ggplot(aes(y = value, x = year, color = percentile)) +
geom_line()
## Warning: Removed 388 row(s) containing missing values (geom_path).

russian_data %>%
filter(year >= 1960 & year <= 1970)
## # A tibble: 44 x 6
## country type percentile year value notes
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 Russian Federation Net personal wealth p99p100 1960 NA Top 1% | share
## 2 Russian Federation Net personal wealth p99p100 1961 NA Top 1% | share
## 3 Russian Federation Net personal wealth p99p100 1962 NA Top 1% | share
## 4 Russian Federation Net personal wealth p99p100 1963 NA Top 1% | share
## 5 Russian Federation Net personal wealth p99p100 1964 NA Top 1% | share
## 6 Russian Federation Net personal wealth p99p100 1965 NA Top 1% | share
## 7 Russian Federation Net personal wealth p99p100 1966 NA Top 1% | share
## 8 Russian Federation Net personal wealth p99p100 1967 NA Top 1% | share
## 9 Russian Federation Net personal wealth p99p100 1968 NA Top 1% | share
## 10 Russian Federation Net personal wealth p99p100 1969 NA Top 1% | share
## # ... with 34 more rows
russian_data <- russian_data %>%
mutate(perc_national_wealth = value * 100)
russian_data %>%
ggplot(aes(y = perc_national_wealth, x = year, color = percentile)) +
geom_line()
## Warning: Removed 388 row(s) containing missing values (geom_path).

## The data for "Russian Federation" does not start in 1900, but our y-axis does. That's because we have a bunch of NAs. Let's filter out the NAs and remake the plot. You cannot test for NA using == (Try: NA == NA). Instead we have a function called is.na(). (Try: is.na(NA) and !is.na(NA)).
russian_data <- russian_data %>%
filter(!is.na(perc_national_wealth))
russian_data %>%
ggplot(aes(y = perc_national_wealth, x = year, color = percentile)) +
geom_line()

## Use two dplyr verbs to figure out what year the bottom 50 percent held the least wealth.
## First, choose the rows that cover the bottom 50 percent and then sort the data in descending order using arrange()
russian_data %>%
filter(percentile == 'p0p50') %>%
arrange(value)
## # A tibble: 21 x 7
## country type percentile year value notes perc_national_we~
## <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Russian Fed~ Net person~ p0p50 2011 0.0342 Bottom 50~ 3.42
## 2 Russian Fed~ Net person~ p0p50 2010 0.0349 Bottom 50~ 3.49
## 3 Russian Fed~ Net person~ p0p50 2015 0.0349 Bottom 50~ 3.49
## 4 Russian Fed~ Net person~ p0p50 2012 0.0367 Bottom 50~ 3.67
## 5 Russian Fed~ Net person~ p0p50 2013 0.0383 Bottom 50~ 3.83
## 6 Russian Fed~ Net person~ p0p50 2014 0.0384 Bottom 50~ 3.84
## 7 Russian Fed~ Net person~ p0p50 2008 0.0494 Bottom 50~ 4.94
## 8 Russian Fed~ Net person~ p0p50 2009 0.0510 Bottom 50~ 5.10
## 9 Russian Fed~ Net person~ p0p50 2004 0.0555 Bottom 50~ 5.55
## 10 Russian Fed~ Net person~ p0p50 2007 0.0568 Bottom 50~ 5.68
## # ... with 11 more rows
## For both the Russian Federation and French data, calculate the average proportion of wealth owned by the top 10 percent over the period from 1995 to 2010.
## You'll have to filter and then summarize with summarize()
russian_data %>%
filter(percentile == 'p90p100') %>%
filter(between(year, 1995, 2010)) %>%
summarize(top10 = mean(value))
## # A tibble: 1 x 1
## top10
## <dbl>
## 1 0.633
french_data %>%
filter(percentile == 'p90p100') %>%
filter(between(year, 1995, 2010)) %>%
summarize(top10 = mean(value))
## # A tibble: 1 x 1
## top10
## <dbl>
## 1 0.544
# Manipulating midwest demographic data with dplyr questions 1-8
midwest <- read_dta("midwest.dta")
glimpse(midwest)
## Rows: 437
## Columns: 28
## $ PID <dbl> 561, 562, 563, 564, 565, 566, 567, 568, 569, 5...
## $ county <chr> "ADAMS", "ALEXANDER", "BOND", "BOONE", "BROWN"...
## $ state <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL"...
## $ area <dbl> 0.052, 0.014, 0.022, 0.017, 0.018, 0.050, 0.01...
## $ poptotal <dbl> 66090, 10626, 14991, 30806, 5836, 35688, 5322,...
## $ popdensity <dbl> 1270.9615, 759.0000, 681.4091, 1812.1176, 324....
## $ popwhite <dbl> 63917, 7054, 14477, 29344, 5264, 35157, 5298, ...
## $ popblack <dbl> 1702, 3496, 429, 127, 547, 50, 1, 111, 16, 165...
## $ popamerindian <dbl> 98, 19, 35, 46, 14, 65, 8, 30, 8, 331, 51, 26,...
## $ popasian <dbl> 249, 48, 16, 150, 5, 195, 15, 61, 23, 8033, 89...
## $ popother <dbl> 124, 9, 34, 1139, 6, 221, 0, 84, 6, 1596, 20, ...
## $ percwhite <dbl> 96.71206, 66.38434, 96.57128, 95.25417, 90.198...
## $ percblack <dbl> 2.57527614, 32.90043290, 2.86171703, 0.4122573...
## $ percamerindan <dbl> 0.14828264, 0.17880670, 0.23347342, 0.14932156...
## $ percasian <dbl> 0.37675897, 0.45172219, 0.10673071, 0.48691813...
## $ percother <dbl> 0.18762294, 0.08469791, 0.22680275, 3.69733169...
## $ popadults <dbl> 43298, 6724, 9669, 19272, 3979, 23444, 3583, 1...
## $ perchsd <dbl> 75.10740, 59.72635, 69.33499, 75.47219, 68.861...
## $ percollege <dbl> 19.63139, 11.24331, 17.03382, 17.27895, 14.476...
## $ percprof <dbl> 4.355859, 2.870315, 4.488572, 4.197800, 3.3676...
## $ poppovertyknown <dbl> 63628, 10529, 14235, 30337, 4815, 35107, 5241,...
## $ percpovertyknown <dbl> 96.27478, 99.08714, 94.95697, 98.47757, 82.505...
## $ percbelowpoverty <dbl> 13.151443, 32.244278, 12.068844, 7.209019, 13....
## $ percchildbelowpovert <dbl> 18.011717, 45.826514, 14.036061, 11.179536, 13...
## $ percadultpoverty <dbl> 11.009776, 27.385647, 10.852090, 5.536013, 11....
## $ percelderlypoverty <dbl> 12.443812, 25.228976, 12.697410, 6.217047, 19....
## $ inmetro <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1...
## $ category <chr> "AAR", "LHR", "AAR", "ALU", "AAR", "AAR", "LAR...
midwest_pop <-
midwest %>%
select(county, state, starts_with("pop"))
names(midwest_pop)
## [1] "county" "state" "poptotal" "popdensity"
## [5] "popwhite" "popblack" "popamerindian" "popasian"
## [9] "popother" "popadults" "poppovertyknown"
## From midwest_pop calculate the area of each county.
## What's the largest county in the midwest? How about in Illinois?
midwest_pop %>%
mutate(area = poptotal/popdensity) %>% # use mutate to add a column
arrange(desc(area)) %>%
select(c(county, state, area))
## # A tibble: 437 x 3
## county state area
## <chr> <chr> <dbl>
## 1 MARQUETTE MI 110.
## 2 MARATHON WI 94.0
## 3 BAYFIELD WI 89.0
## 4 MARINETTE WI 82.0
## 5 SAWYER WI 79.0
## 6 DOUGLAS WI 78.0
## 7 CHIPPEWA MI 78.0
## 8 ONTONAGON MI 78.0
## 9 SCHOOLCRAFT MI 75.0
## 10 PRICE WI 75
## # ... with 427 more rows
## Largest county in the midwest: Marquette
midwest_pop %>%
filter(state=='IL') %>%
mutate(area = poptotal/popdensity) %>%
arrange(desc(area)) %>%
select(c(county, state, area))
## # A tibble: 102 x 3
## county state area
## <chr> <chr> <dbl>
## 1 MCLEAN IL 68.0
## 2 LA SALLE IL 68
## 3 IROQUOIS IL 67.0
## 4 LIVINGSTON IL 62.0
## 5 COOK IL 58.0
## 6 CHAMPAIGN IL 58.0
## 7 FULTON IL 52.0
## 8 VERMILION IL 52
## 9 ADAMS IL 52.0
## 10 SANGAMON IL 51.0
## # ... with 92 more rows
## Largest county in IL: Mclean and La Salle
## From midwest_pop calculate percentage adults for each county. What county in the midwest has the highest proportion of adults? What's county in the midwest has the lowest proportion of adults?
midwest_pop %>%
mutate(perc_adults = popadults/poptotal) %>%
arrange(desc(perc_adults)) %>%
select(c(county, state, perc_adults))
## # A tibble: 437 x 3
## county state perc_adults
## <chr> <chr> <dbl>
## 1 KEWEENAW MI 0.757
## 2 ROSCOMMON MI 0.730
## 3 IRON MI 0.728
## 4 ALCONA MI 0.726
## 5 ADAMS WI 0.726
## 6 VILAS WI 0.724
## 7 IRON WI 0.723
## 8 MONTMORENCY MI 0.703
## 9 ONTONAGON MI 0.700
## 10 ONEIDA WI 0.699
## # ... with 427 more rows
## Highest proportion of adults: Keweenaw
midwest_pop %>%
mutate(perc_adults = popadults/poptotal) %>%
arrange(perc_adults) %>%
select(c(county, state, perc_adults))
## # A tibble: 437 x 3
## county state perc_adults
## <chr> <chr> <dbl>
## 1 ISABELLA MI 0.485
## 2 MENOMINEE WI 0.494
## 3 ATHENS OH 0.507
## 4 MECOSTA MI 0.509
## 5 MONROE IN 0.526
## 6 JACKSON IL 0.527
## 7 TIPPECANOE IN 0.529
## 8 MCDONOUGH IL 0.533
## 9 DE KALB IL 0.537
## 10 HOLMES OH 0.541
## # ... with 427 more rows
## Lowest proportion of adults: Isabella
midwest_pop %>%
filter(state=='MI') %>%
summarize(totalpop = sum(poptotal))
## # A tibble: 1 x 1
## totalpop
## <dbl>
## 1 9295297
## Total population of Michigan is 9,295,297
midwest_pop %>%
filter(state=='IL') %>%
mutate(area = poptotal/popdensity) %>%
summarise(totalarea = sum(area))
## # A tibble: 1 x 1
## totalarea
## <dbl>
## 1 3304.
## The total area of Illinois is 3,304.
## The units of totalarea is unclear.