Challenge 6

Author

Jingyi Yang

1, Start Up

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readr)
library(readxl)
library(stringr)
library("scales")

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor
library(dplyr)
library(lubridate)
library("khroma")

2. Import the data

2.1 Import the “debt” data set

setwd("C:\\8-601\\challenge_datasets")
debt<- read_excel("debt_in_trillions.xlsx")

debt
# A tibble: 74 × 8
   `Year and Quarter` Mortgage `HE Revolving` `Auto Loan` `Credit Card`
   <chr>                 <dbl>          <dbl>       <dbl>         <dbl>
 1 03:Q1                  4.94          0.242       0.641         0.688
 2 03:Q2                  5.08          0.26        0.622         0.693
 3 03:Q3                  5.18          0.269       0.684         0.693
 4 03:Q4                  5.66          0.302       0.704         0.698
 5 04:Q1                  5.84          0.328       0.72          0.695
 6 04:Q2                  5.97          0.367       0.743         0.697
 7 04:Q3                  6.21          0.426       0.751         0.706
 8 04:Q4                  6.36          0.468       0.728         0.717
 9 05:Q1                  6.51          0.502       0.725         0.71 
10 05:Q2                  6.70          0.528       0.774         0.717
# ℹ 64 more rows
# ℹ 3 more variables: `Student Loan` <dbl>, Other <dbl>, Total <dbl>

2.2 Import the “fed_rate” data set

setwd("C:\\8-601\\challenge_datasets")
fed_rate<- read_csv("FedFundsRate.csv")

fed_rate
# A tibble: 904 × 10
    Year Month   Day `Federal Funds Target Rate` `Federal Funds Upper Target`
   <dbl> <dbl> <dbl>                       <dbl>                        <dbl>
 1  1954     7     1                          NA                           NA
 2  1954     8     1                          NA                           NA
 3  1954     9     1                          NA                           NA
 4  1954    10     1                          NA                           NA
 5  1954    11     1                          NA                           NA
 6  1954    12     1                          NA                           NA
 7  1955     1     1                          NA                           NA
 8  1955     2     1                          NA                           NA
 9  1955     3     1                          NA                           NA
10  1955     4     1                          NA                           NA
# ℹ 894 more rows
# ℹ 5 more variables: `Federal Funds Lower Target` <dbl>,
#   `Effective Federal Funds Rate` <dbl>, `Real GDP (Percent Change)` <dbl>,
#   `Unemployment Rate` <dbl>, `Inflation Rate` <dbl>

2.3 Import the “US_Household” data set

setwd("C:\\8-601\\challenge_datasets")
US_Household<- read_excel("USA Households by Total Money Income, Race, and Hispanic Origin of Householder 1967 to 2019.xlsx", skip=4,n_max = 351)

US_Household
# A tibble: 351 × 16
   ...1      ...2   Total `Under $15,000` `$15,000\r\nto\r\n$24,999`
   <chr>     <chr>  <dbl>           <dbl>                      <dbl>
 1 ALL RACES <NA>      NA            NA                         NA  
 2 2019      128451   100             9.1                        8  
 3 2018      128579   100            10.1                        8.8
 4 2017 2    127669   100            10                          9.1
 5 2017      127586   100            10.1                        9.1
 6 2016      126224   100            10.4                        9  
 7 2015      125819   100            10.6                       10  
 8 2014      124587   100            11.4                       10.5
 9 2013 3    123931   100            11.4                       10.3
10 2013 4    122952   100            11.3                       10.4
# ℹ 341 more rows
# ℹ 11 more variables: `$25,000\r\nto\r\n$34,999` <dbl>,
#   `$35,000\r\nto\r\n$49,999` <dbl>, `$50,000\r\nto\r\n$74,999` <dbl>,
#   `$75,000\r\nto\r\n$99,999` <dbl>, `$100,000\r\nto\r\n$149,999` <dbl>,
#   `$150,000\r\nto\r\n$199,999` <dbl>, `$200,000 and over` <dbl>,
#   Estimate...13 <dbl>, `Margin of error1 (±)...14` <dbl>,
#   Estimate...15 <chr>, `Margin of error1 (±)...16` <chr>

3. Clean the data

3.1 Clean the “debt” data set

The data set includes information about 1) the quarter of the year, 2) the component of debt, which involves “Mortgage,” “HE Revolving,” “Auto Loan,” “Credit Card,” “Student Loan,” “Other,” 3) The total number of the debt. To clean the data set making the “year and quarter” into the date format can make the data set cleaner and easier to understand.

debt_clean <- debt %>%
  mutate(`Year and Quarter`= yq(`Year and Quarter`))
           
debt_clean
# A tibble: 74 × 8
   `Year and Quarter` Mortgage `HE Revolving` `Auto Loan` `Credit Card`
   <date>                <dbl>          <dbl>       <dbl>         <dbl>
 1 2003-01-01             4.94          0.242       0.641         0.688
 2 2003-04-01             5.08          0.26        0.622         0.693
 3 2003-07-01             5.18          0.269       0.684         0.693
 4 2003-10-01             5.66          0.302       0.704         0.698
 5 2004-01-01             5.84          0.328       0.72          0.695
 6 2004-04-01             5.97          0.367       0.743         0.697
 7 2004-07-01             6.21          0.426       0.751         0.706
 8 2004-10-01             6.36          0.468       0.728         0.717
 9 2005-01-01             6.51          0.502       0.725         0.71 
10 2005-04-01             6.70          0.528       0.774         0.717
# ℹ 64 more rows
# ℹ 3 more variables: `Student Loan` <dbl>, Other <dbl>, Total <dbl>

3.2 Clean the “fed_rate” data set

The data set includes information 1) dates, 2) Federal Funds rates and their lower and upper levels for the interest rate range, 3)Effective Federal Funds Rate, which is calculated through a volume-weighted median of overnight federal funds transactions, 4) GDP, and 5) Unemployment Rate. To clean the data set, put the information in the “Year,” “Month,” and “Day” into one column and fit with the date format. It can make the data set more concise and more easy to understand.

fed_rate_clean<- fed_rate %>%
  mutate(date = str_c(Year, Month, Day, sep="-"),
         date = ymd(date))

fed_rate_clean
# A tibble: 904 × 11
    Year Month   Day `Federal Funds Target Rate` `Federal Funds Upper Target`
   <dbl> <dbl> <dbl>                       <dbl>                        <dbl>
 1  1954     7     1                          NA                           NA
 2  1954     8     1                          NA                           NA
 3  1954     9     1                          NA                           NA
 4  1954    10     1                          NA                           NA
 5  1954    11     1                          NA                           NA
 6  1954    12     1                          NA                           NA
 7  1955     1     1                          NA                           NA
 8  1955     2     1                          NA                           NA
 9  1955     3     1                          NA                           NA
10  1955     4     1                          NA                           NA
# ℹ 894 more rows
# ℹ 6 more variables: `Federal Funds Lower Target` <dbl>,
#   `Effective Federal Funds Rate` <dbl>, `Real GDP (Percent Change)` <dbl>,
#   `Unemployment Rate` <dbl>, `Inflation Rate` <dbl>, date <date>

3.3 Clean the “US_Household” data set

The data set involves information about 1) the year”, 2) the total number of income, 3) Percent distribution for different income levels, 4) Median income and margin of error, and 5)Mean income and margin of error. To make the data set clearer and easier to understand, it is necessary to 1) rename the column, 2) put the race and identity information into its own column, not stay with the year information, and 3) arrange the data set by year.

US_Household_clean <- US_Household %>%
  rename("Year"="...1", "Number"="...2", "Median income_Estimate"="Estimate...13", "Median income_Margin of error"="Margin of error1 (±)...14","Mean income_Estimate"="Estimate...15", "Mean income_Margin of error"="Margin of error1 (±)...16" )%>%
mutate(Race_Identity=case_when(str_detect(Year, "[[:alpha:]]")~ Year,TRUE ~ NA_character_))%>%
  fill(Race_Identity)%>%
  filter(!str_detect(Year, "[[:alpha:]]"))%>%
  separate(Year, into=c("Year", "delete"), sep=" ")%>%
   select(-delete)%>%
  mutate(Race_Identity = str_remove(`Race_Identity`,"[0-9]+"), across(any_of(c("Number", "Mean income_Estimate", "Mean income_Margin of error", "Year")), as.numeric))%>%
  mutate(Race_abbr= abbreviate(`Race_Identity`,4, dot = "TRUE", strict = "TRUE"))%>%
  arrange(Year)
  
  US_Household_clean
# A tibble: 339 × 18
    Year Number Total `Under $15,000` `$15,000\r\nto\r\n$24,999`
   <dbl>  <dbl> <dbl>           <dbl>                      <dbl>
 1  1967  60813   100            14.8                       10.2
 2  1967  54188   100            13.5                        9.4
 3  1967   5728   100            26.8                       17.7
 4  1968  62214   100            13.4                       10.1
 5  1968  55394   100            12.3                        9.3
 6  1968   5870   100            24.4                       17  
 7  1969  63401   100            13.2                        9.9
 8  1969  56248   100            12                          9.3
 9  1969   6053   100            24                         16.1
10  1970  64778   100            13.3                       10.1
# ℹ 329 more rows
# ℹ 13 more variables: `$25,000\r\nto\r\n$34,999` <dbl>,
#   `$35,000\r\nto\r\n$49,999` <dbl>, `$50,000\r\nto\r\n$74,999` <dbl>,
#   `$75,000\r\nto\r\n$99,999` <dbl>, `$100,000\r\nto\r\n$149,999` <dbl>,
#   `$150,000\r\nto\r\n$199,999` <dbl>, `$200,000 and over` <dbl>,
#   `Median income_Estimate` <dbl>, `Median income_Margin of error` <dbl>,
#   `Mean income_Estimate` <dbl>, `Mean income_Margin of error` <dbl>, …

4. Graph Including Time

4.1 “debt” data set

# Analysis of the total number of debt  

##| Using the "geom_path()" can show the change for the total number of debt over time by drawing lines across each data point. 

debt_clean %>%
  ggplot(aes(`Year and Quarter`, `Total`))+
  geom_path()+
  scale_x_date(limits=range(debt_clean$`Year and Quarter`),breaks="2 years",date_labels = "%Y")+
  labs(title="Total number of debt, 2003-2021")

# Analysis of the percentage distribution of the debt in 2021

##| The pie chart is because it divides the circle into several sectors, which shows the proportions of the whole. As 2003 to 2021 is a large data frame, focusing on one year, like 2021, can make the chart more understandable and ready for publication. 

debt_2021 <- debt_clean %>%
  group_by(`Year and Quarter`)%>%
  filter(`Year and Quarter`>="2021-01-01    ")%>%
  pivot_longer(col=Mortgage:Other, names_to = "Types", values_to = "Values") %>%
  mutate(Percentage= `Values`/`Total`) %>%
  mutate(Percentage= scales::percent(Percentage))

  debt_2021%>%
  ggplot(aes(x = "", y = `Values`, fill= Types)) +
  geom_col()+
    facet_wrap(~`Year and Quarter`, nrow = 1)+
geom_text(aes(label = `Percentage`),position = position_stack(vjust = 0.5), size=3)+
  coord_polar(theta = "y")+
    theme_void()+
   theme(legend.position = "bottom")+
    labs(title= "Percentage distribution of the debt, 2021")

# Analysis the sum of the total debt number from 2003 to 2019
  
##| Using the "geom_path()" can show the change for the sum of the debt total number over time by drawing lines across each data point.
  
debt_clean_1 <- debt_clean%>%
  separate(`Year and Quarter`, into=c("year", "month", "date"))

debt_sum <- debt_clean_1 %>%
  group_by(`year`) %>%
  summarise(sum_number_Mortgage= sum(Mortgage),
            sum_number_HE_Revolving= sum(`HE Revolving`),
            sum_number_Auto_Loan= sum(`Auto Loan`),
            sum_number_Credit_Card= sum(`Credit Card`),
            sum_number_Student_Loan= sum(`Student Loan`),
            sum_number_Other= sum(Other),
            sum_number_total= sum(Total)) %>%
  mutate(year= as.numeric(year)) 

 debt_sum %>%
  ggplot(aes(year,sum_number_total))+
  geom_path()+
   scale_x_continuous(limits=range(debt_sum$year), n.breaks=10)+
   scale_y_continuous(limits=range(debt_sum$sum_number_total), n.breaks=10)+
   labs( x="Year", y="Total Number", title = "Sum of the debt total number, 2003-2019")

 # Analysis of the percentage distribution for the sum of the total debt number in 2020
 
 ##| The pie chart is because it divides the circle into several sectors, which shows the proportions of the whole. As 2003 to 2021 is a large data frame, focusing on one year, like 2020, can make the chart more understandable and ready for publication. 
 
 debt_sum_2020 <- debt_sum %>% filter(year == 2020) %>%
   pivot_longer(cols =sum_number_Mortgage:sum_number_Other, names_to= "sum_number_types", values_to = "values" ) %>%
   mutate(Percentage = values/sum_number_total) %>%
   mutate(Percentage= scales::percent(Percentage))
  
debt_sum_2020 %>%
  ggplot(aes(x = "", y = `values`, fill= `sum_number_types`)) +
  geom_col()+
geom_text(aes(label = `Percentage`),position = position_stack(vjust = 0.5), size=3)+
  coord_polar(theta = "y")+
  theme_void()+
   theme(legend.position = "bottom")+
  scale_fill_discrete(name= "Sum Number- Types")+
  labs(title = "Percentage distribution for the sum of the total debt number, 2020")

4.2 “fed_rate” data set

#  Analysis the "Unemployment Rate" from 1954 to 2017

##| Using the "geom_path()" can show the change for the unemployment rate over time by drawing lines across each data point. 

fed_rate_clean %>%
   ggplot(aes(`date`, `Unemployment Rate`))+
  geom_path()+
  scale_x_date(breaks="5 years",date_labels = "%Y")+
  labs (x= "Date", title= "Unemployment Rate, 1954-2017")

#  Analysis the "Inflation Rate" from 1954 to 2017

##| Using the "geom_path()" can show the change for the inflation rate over time by drawing lines across each data point. 

fed_rate_clean %>%
   ggplot(aes(`date`, `Inflation Rate`))+
  geom_path()+
  scale_x_date(breaks="5 years",date_labels = "%Y")+
  labs(x= "Date", title ="Inflation Rate, 1954-2017" )

# Analysis the "Federal Funds Target Rate" from 1982 to 2008

##| Using the "geom_path()" can show the change for the inflation rate over time by drawing lines across each data point. 

fed_rate_clean %>%
  filter(date > "1982-09-01")%>%
  filter(date < "2008-12-16")%>%
  ggplot(aes(`date`, `Federal Funds Target Rate`))+
  geom_path()+
  scale_x_date(breaks="2 years",date_labels = "%Y")+
  scale_y_continuous(limits = range(fed_rate_clean$`Federal Funds Target Rate`))+
  labs(x= "Date", title = "Federal Funds Target Rate, 1982-2008")

4.3 “US_Household” data set

# Analysis of the total money income through the "All Races" category from 1967 to 2019

##| Using the "geom_path()" can show the change for the total money income in the "All Race" category over time by drawing lines across each data point. 

US_Household_clean %>%
  filter(Race_Identity=="ALL RACES") %>%
  ggplot(aes(Year, `Number`))+
  geom_path()+
scale_x_continuous(limits=range(US_Household_clean$Year),n.breaks = 15)+
  scale_y_continuous(labels=scales::label_number(suffix="T",scale=1e-0), n.breaks = 10)+
  scale_fill_discrete(name="Race abbreviation")+
   theme(axis.text.x = element_text(angle=90))+
  labs(x="Year",y="Number",
       title="Total Money Income, All Race, 1967-2019")

# Analysis of the total median income and its margin error through the "All Races" category from 1967 to 2019

##| Using the "geom_col()" function because 1) the height of the bar can represent the value in data, and 2) it can combine with the "geom_errorbar()" function can include the information about the margin error into the graphic.

US_Household_clean %>%
  filter(Race_Identity=="ALL RACES") %>%
  mutate(se_lower=`Median income_Estimate`-`Median income_Margin of error`, se_upper=`Median income_Estimate`+`Median income_Margin of error` ) %>%
  ggplot(aes(Year, `Median income_Estimate`, fill=`Year`))+
  geom_col(position = "dodge")+
  geom_errorbar(aes(ymin=se_lower,ymax=se_upper))+
  scale_x_continuous(limits=range(US_Household_clean$Year),n.breaks = 15)+
  scale_y_continuous(limits=c(0,ceiling(max(US_Household_clean$`Median income_Estimate`))),labels=scales::label_number(suffix="T",scale=1e-3))+
  labs(y="Median Income", title = "Median Income, All Race,1967-2019")+
  theme(axis.text.x = element_text(angle=90))

# Analysis of the total mean income and its margin error through the "All Races" category from 1967 to 2019

##| Using the "geom_col()" function because 1) the height of the bar can represent the value in data, and 2) it can combine with the "geom_errorbar()" function can include the information about the margin error into the graphic.

 US_Household_clean %>%
  filter(Race_Identity=="ALL RACES")%>%
  mutate(se_lower=`Mean income_Estimate`-`Mean income_Margin of error`, se_upper=`Mean income_Estimate`+`Mean income_Margin of error` ) %>%
  ggplot(aes(Year, `Mean income_Estimate`, fill=`Year`))+
  geom_col(position = "dodge")+
  geom_errorbar(aes(ymin=se_lower,ymax=se_upper))+
  scale_x_continuous(limits=range(US_Household_clean$Year),n.breaks = 15)+
  scale_y_continuous(limits=c(0,ceiling(max(US_Household_clean$`Mean income_Estimate`))),labels=scales::label_number(suffix="T",scale=1e-3))+
  labs(y="Mean Income", title = "Mean Income, All Race, 1967-2019")+
  theme(axis.text.x = element_text(angle=90))

 # Analysis of the Percent Distribution for total income in 1967
 
 ##| The pie chart is because it divides the circle into several sectors, which shows the proportions of the whole. As 1967 to 2019 is a large data frame, focusing on one year, like 1967, can make the chart more understandable and ready for publication. 
 
US_Household_distribution <- US_Household_clean %>%
reframe(across(c(`Year`,`Total`,`Under $15,000`:`$200,000 and over`, Race_Identity)))%>%
  pivot_longer(cols = `Under $15,000`:`$200,000 and over`, names_to = "percent_distribution", values_to = "Percent")%>%
 mutate(Percent= Percent/Total)%>%
  mutate(labels = scales::percent(Percent))%>%
  mutate(percent_distribution= as.factor(percent_distribution))

US_Household_distribution_1967 <-US_Household_distribution %>%
  group_by(Year)%>%
  filter(Year== "1967")

ggplot(US_Household_distribution_1967,aes(x = "", y = Percent, fill= percent_distribution)) +
  geom_col()+
geom_text(aes(label = labels),
          position = position_stack(vjust = 0.5), size=2.5)+
  facet_wrap(~Race_Identity, nrow = 1)+
  coord_polar("y")+
  theme_void()+
   theme(legend.position = "bottom")+
  scale_fill_discrete(name="Percentage Distribution")+
  labs(title = "Percent Distribution in 1967")

# Analysis of the sum of the total income in 2019

##| Using the "geom_col()" can make the portion of different race categories more obvious and avoid overlapping of labels compared to the "coord_polar()" function.

Race_2019_number <- US_Household_clean %>%
  filter(Year=="2019")%>%
  group_by(Race_Identity) %>%
  summarise(total=as.numeric(sum(Number, na.rm = TRUE)) )%>%
  ungroup()%>%
  slice(-1)%>%
 mutate(Race_abbr= abbreviate(`Race_Identity`,4, dot = "TRUE", strict = "TRUE"))

Race_2019_number1 <- Race_2019_number %>%
  mutate(Propportion=total/sum(total))
Race_2019_number1 %>%
  ggplot(aes(Race_abbr,Propportion, fill= `Race_Identity`))+
  geom_col()+
  theme(axis.text.x = element_text(angle=90))+
  theme(legend.position = "bottom") +
  labs (x= "Race", title = "Sum of the total income, All Race, 2019")