library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readxl)
library(stringr)
library(rmarkdown)
library(here)
## here() starts at C:/Users/ajb22/Documents/school/dacss_601
library(RColorBrewer)
library(patchwork)

data_path <- paste(here(), "/DACSS601Fall21/_data/", sep="")
knitr::opts_chunk$set(echo = TRUE)

Objective

This homework is focused on cleaning a data set and finding meaning within the data through descriptive statistics and visualizations. For this homework, I am going to clean the “USA Households by Total Money Income, Race, and Hispanic Origin of Householder 1967 to 2019” data set. This data set is interesting because it has many different variables that can be analyzed, such as income bracket distribution over time or race. Below is the raw data set.

# Read in the dataset
income_path <- paste(data_path, "USA Households by Total Money Income, Race, and Hispanic Origin of Householder 1967 to 2019.xlsx", sep="")
income_data <- read_excel(income_path)
## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ...
paged_table(income_data)

Cleaning the Data

First we notice that the data has dead space that should be skipped. Additionally, we should also rename the column names to more useful names for manipulation. For instance, any column that we would like to remove, we will with with “D”. This will allow us to easily search with a regular expression. Some of the columns that we are interested in removing are the ones that have redundant information or can be recalculated, such as total and mean_income_estimate.

income_data <- read_excel(income_path, skip=5,
                          col_names=c("year", "number", "d1", "lt_15k", 
                                      "15k_25k", "25k_35k", "35k_50k", "50k_75k", 
                                      "75k_100k", "100k_150k", "150k_200k", "gt_200k", 
                                      "d2", "d3", 
                                      "d4", "d5"))
paged_table(income_data)

More cleaning…

We would like to evaluate the income statistics from 1967 to 2019 for each race. This means we will need to separate the first column into to separate columns of race and year. This can be done a mutate and case_when if statement that finds the year based on a regex. Another cleaning item to do is to remove the footnotes at the bottom of the data set and delete the columns that are unnecessary.

# removing the unnecessary columns and footnotes
income_data <- income_data %>% 
  select(!starts_with("d", ignore.case = TRUE)) %>% 
  # TODO: is there a better way to find the footnotes instead of hardcoding the rows???
  slice(-(352:382))
paged_table(income_data)
# clean up the first column and separate into race and year
income_data <- income_data %>% 
  mutate(race = case_when(str_starts(year, "[a-zA-Z]") ~ year, TRUE ~ NA_character_)) %>% 
  fill(race) %>% 
  filter(!str_starts(year, "[a-zA-Z]"))

# now let us take out the spaces between the years
income_data <- income_data %>% 
  mutate(year = str_remove_all(year, " .*"))

# convert number to numeric from character
income_data <- income_data %>% 
  mutate(number = as.numeric(income_data$number))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
paged_table(income_data)

Tidy data

Now that we have clean data, we are going to transfer our data into a “Tidy” format by pivoting. We will pivot the data into longer and have a column of income brackets. This will be used to color our graphs later.

income_data <- income_data %>% 
  pivot_longer(-c(year, race, number), names_to = "incomeBracket", values_to = "incomePercent")
paged_table(income_data)

Evaluating the data

There are many ways we can decide to evaluate the data. The first variable that we will look into is the income bracket over time. To do this, we will take a subset of the data for when the race is equivalent to “ALL RACES”. Another variable we will evaluate is the income bracket distribution over the different race categories. To get a better idea of these two variables, we will first plot these subsets of data with ggplot. Initially, we look at these variables through point graphs.

Income Distribution over Time

# subset of data that contains income brackets for when race = "ALL RACES" over time
filter_all_race_data <- filter(income_data, race == "ALL RACES")

ggplot(data = filter_all_race_data, mapping = aes(x = year, y = incomePercent, color = incomeBracket)) + geom_point() + scale_x_discrete(guide = guide_axis(n.dodge = 3)) + labs(title = "Time Series of 'ALL RACES'")

# get each specific race trend: white, black, asian, hispanic
filter_white_data <- income_data %>% filter(grepl("WHITE", race))
filter_black_data <- income_data %>% filter(grepl("BLACK", race))
filter_asian_data <- income_data %>% filter(grepl("ASIAN", race))
filter_hispanic_data <- income_data %>% filter(grepl("HISPANIC", race))

p2 <- ggplot(data = filter_white_data, mapping = aes(x = year, y = incomePercent, color = incomeBracket)) + geom_point() + scale_x_discrete(guide = guide_axis(n.dodge = 3)) + labs(title = "Time Series of 'WHITE RACES'")
p2 

p3 <- ggplot(data = filter_black_data, mapping = aes(x = year, y = incomePercent, color = incomeBracket)) + geom_point() + scale_x_discrete(guide = guide_axis(n.dodge = 3)) + labs(title = "Time Series of 'BLACK RACES'")
p3

p4 <- ggplot(data = filter_asian_data, mapping = aes(x = year, y = incomePercent, color = incomeBracket)) + geom_point() + scale_x_discrete(guide = guide_axis(n.dodge = 3)) + labs(title = "Time Series of 'ASIAN RACES'")
p4

p5 <- ggplot(data = filter_hispanic_data, mapping = aes(x = year, y = incomePercent, color = incomeBracket)) + geom_point() + scale_x_discrete(guide = guide_axis(n.dodge = 3)) + labs(title = "Time Series of 'HISPANIC RACES'")
p5

We can look even deeper within the data by examining just one of the race categories.

# Go a little deeper and look at a specific race within a specific income bracket over time
filter_income_bracket <- income_data %>% 
  filter(incomeBracket == "75k_100k") %>%
  filter(grepl("ASIAN", race))
# filter_income_bracket <- filter(filter_income_bracket_raw, race == )
ggplot(data = filter_income_bracket, mapping = aes(x = year, y = incomePercent, color= race)) + geom_point() + scale_x_discrete(guide = guide_axis(n.dodge = 2)) + 
  theme(text=element_text(size=5))

Something to notice that is confusing about this data is that the categories of the race changes over the years. For instance, initially, the Asian population was just “ASIAN ALONE OR IN COMBINATION” and then later was broken down into two groups of “ASIAN ALONE 27” and “ASIAN AND PACIFIC ISLANDER 25”. This can make it hard to find a specific trend within these groups. To simplify our categories we will group similar race categories into one through mutate.

# we are going to group similar race groups to make it easier to see overall trends
group_race_data <- income_data %>% 
  mutate(race = case_when(str_detect(race, "ALL") ~ race, TRUE ~ str_extract(race, "^([A-Za-z])+")))


# Go a little deeper and look at a specific race within a specific income bracket over time
filter_income_bracket <- group_race_data %>% 
  filter(incomeBracket == "75k_100k") %>%
  filter(grepl("ASIAN", race))

ggplot(data = filter_income_bracket, mapping = aes(x = year, y = incomePercent, color= race)) + geom_point(mapping = aes(x = year, y = incomePercent)) + scale_x_discrete(guide = guide_axis(n.dodge = 2)) 

Now that we see that the group data has a little more consistency, we can start try to find the trends that we find important for certain variables.

Income Distribution over Race

The point graph is useful for the time series variable, but we will need a different visual for the other variable, the income percentage distribution over the different race categories. The best graph to properly show this would either be the heat map or a bar chart.

Heat Map: another way to view this data is that the income percent is equivalent to an intensity that can be mapped over race and income bracket. A heat map is a good way to portray third dimensional information in a 2d graph. It can be thought of as a slice from the top view of a 3d graph.

# subset of data that contains income brackets over all races for when year = 2019
filter_year_data <- filter(group_race_data, year == "2019")

ggplot(data = filter_year_data, mapping = aes(x = race, y = incomeBracket, fill= incomePercent)) + geom_tile() + scale_x_discrete(guide = guide_axis(n.dodge = 2))

Bar Chart: A bar chart is used to display the relationship between numeric and categorical data. This fits our data of categories of races and numeric income percentages. The bar chart can be thought of as the side view slice of 3d graph.

Bar chart seems the most intuitive representation because you can see the difference between the magnitude of the different income brackets. The heat map shows the are different magnitudes, but does not show the difference in a concrete way other than two different colors.

filter_year_data <- filter(group_race_data, year == "2019")

ggplot(data = filter_year_data) + geom_bar(mapping = aes(x = race, y= incomePercent, fill = incomeBracket), stat = "identity", position=position_dodge()) + scale_fill_brewer(palette = "Set3")

Univariate Statistics

Now lets look at some numbers to investigate this data further. First, using summarise_all will show some basic values, such as mean. This is the statistics of the race and income bracket distribution over years.

group_race_data %>% group_by(race, incomeBracket) %>% summarise_at("incomePercent", list(mean=mean, median=median, min=min, max=max, sd=sd), na.rm=TRUE)
## # A tibble: 45 x 7
## # Groups:   race [5]
##    race      incomeBracket  mean median   min   max    sd
##    <chr>     <chr>         <dbl>  <dbl> <dbl> <dbl> <dbl>
##  1 ALL RACES 100k_150k     13.3    13.9   7.7  15.6 1.96 
##  2 ALL RACES 150k_200k      4.97    5.1   1.7   8.3 1.79 
##  3 ALL RACES 15k_25k       10.2    10.2   8    11.4 0.637
##  4 ALL RACES 25k_35k        9.78    9.7   8.3  11   0.570
##  5 ALL RACES 35k_50k       13.6    13.4  11.7  16.8 1.15 
##  6 ALL RACES 50k_75k       19.1    18.4  16.1  24.8 2.40 
##  7 ALL RACES 75k_100k      13.3    13.3  11.9  15.1 0.881
##  8 ALL RACES gt_200k        4.41    4.2   1.1  10.3 2.40 
##  9 ALL RACES lt_15k        11.3    11.4   9    14.8 1.22 
## 10 ASIAN     100k_150k     17.8    17.8  16.3  19.9 0.861
## # ... with 35 more rows