library(reshape2)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(readr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages ------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v stringr 1.4.0
## v ggplot2 3.2.1 v forcats 0.4.0
## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(RColorBrewer)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggthemes)
library(highcharter)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
df<-read.csv("household_debt.csv")
df<-na.omit(df)
names(df) <- tolower(names(df))
names(df)<-gsub(" ","",names(df))
head(df)
## period mortgage he.revolving auto.loan credit.card student.loan other
## 1 03:Q1 4.94 0.24 0.64 0.69 0.24 0.48
## 2 03:Q2 5.08 0.26 0.62 0.69 0.24 0.49
## 3 03:Q3 5.18 0.27 0.68 0.69 0.25 0.48
## 4 03:Q4 5.66 0.30 0.70 0.70 0.25 0.45
## 5 04:Q1 5.84 0.33 0.72 0.70 0.26 0.45
## 6 04:Q2 5.97 0.37 0.74 0.70 0.26 0.42
## total
## 1 7.23
## 2 7.38
## 3 7.56
## 4 8.07
## 5 8.29
## 6 8.46
Cleaning up the data by removing the na values, setting the character case to lower case and removing any extra spaces.
df <- df %>%
mutate(date = as.Date(as.yearqtr(period, format = "%y:Q%q")))
df
## period mortgage he.revolving auto.loan credit.card student.loan other
## 1 03:Q1 4.94 0.24 0.64 0.69 0.24 0.48
## 2 03:Q2 5.08 0.26 0.62 0.69 0.24 0.49
## 3 03:Q3 5.18 0.27 0.68 0.69 0.25 0.48
## 4 03:Q4 5.66 0.30 0.70 0.70 0.25 0.45
## 5 04:Q1 5.84 0.33 0.72 0.70 0.26 0.45
## 6 04:Q2 5.97 0.37 0.74 0.70 0.26 0.42
## 7 04:Q3 6.21 0.43 0.75 0.71 0.33 0.41
## 8 04:Q4 6.36 0.47 0.73 0.72 0.35 0.42
## 9 05:Q1 6.51 0.50 0.73 0.71 0.36 0.39
## 10 05:Q2 6.70 0.53 0.77 0.72 0.37 0.40
## 11 05:Q3 6.91 0.54 0.83 0.73 0.38 0.41
## 12 05:Q4 7.10 0.57 0.79 0.74 0.39 0.42
## 13 06:Q1 7.44 0.58 0.79 0.72 0.43 0.42
## 14 06:Q2 7.76 0.59 0.80 0.74 0.44 0.42
## 15 06:Q3 8.05 0.60 0.82 0.75 0.45 0.44
## 16 06:Q4 8.23 0.60 0.82 0.77 0.48 0.41
## 17 07:Q1 8.42 0.61 0.79 0.76 0.51 0.40
## 18 07:Q2 8.71 0.62 0.81 0.80 0.51 0.41
## 19 07:Q3 8.93 0.63 0.82 0.82 0.53 0.41
## 20 07:Q4 9.10 0.65 0.82 0.84 0.55 0.42
## 21 08:Q1 9.23 0.66 0.81 0.84 0.58 0.42
## 22 08:Q2 9.27 0.68 0.81 0.85 0.59 0.40
## 23 08:Q3 9.29 0.69 0.81 0.86 0.61 0.41
## 24 08:Q4 9.26 0.71 0.79 0.87 0.64 0.41
## 25 09:Q1 9.14 0.71 0.77 0.84 0.66 0.41
## 26 09:Q2 9.06 0.71 0.74 0.82 0.68 0.39
## 27 09:Q3 8.94 0.71 0.74 0.81 0.69 0.38
## 28 09:Q4 8.84 0.71 0.72 0.80 0.72 0.38
## 29 10:Q1 8.83 0.70 0.70 0.76 0.76 0.36
## 30 10:Q2 8.70 0.68 0.70 0.74 0.76 0.35
## 31 10:Q3 8.61 0.67 0.71 0.73 0.78 0.34
## 32 10:Q4 8.45 0.67 0.71 0.73 0.81 0.34
## 33 11:Q1 8.54 0.64 0.71 0.70 0.84 0.33
## 34 11:Q2 8.52 0.62 0.71 0.69 0.85 0.33
## 35 11:Q3 8.40 0.64 0.73 0.69 0.87 0.33
## 36 11:Q4 8.27 0.63 0.73 0.70 0.87 0.33
## 37 12:Q1 8.19 0.61 0.74 0.68 0.90 0.32
## 38 12:Q2 8.15 0.59 0.75 0.67 0.91 0.31
## 39 12:Q3 8.03 0.57 0.77 0.67 0.96 0.31
## 40 12:Q4 8.03 0.56 0.78 0.68 0.97 0.32
## 41 13:Q1 7.93 0.55 0.79 0.66 0.99 0.31
## 42 13:Q2 7.84 0.54 0.81 0.67 0.99 0.30
## 43 13:Q3 7.90 0.54 0.85 0.67 1.03 0.30
## 44 13:Q4 8.05 0.53 0.86 0.68 1.08 0.32
## 45 14:Q1 8.17 0.53 0.88 0.66 1.11 0.31
## 46 14:Q2 8.10 0.52 0.91 0.67 1.12 0.32
## 47 14:Q3 8.13 0.51 0.93 0.68 1.13 0.33
## 48 14:Q4 8.17 0.51 0.96 0.70 1.16 0.34
## 49 15:Q1 8.17 0.51 0.97 0.68 1.19 0.33
## 50 15:Q2 8.12 0.50 1.01 0.70 1.19 0.34
## 51 15:Q3 8.26 0.49 1.05 0.71 1.20 0.35
## 52 15:Q4 8.25 0.49 1.06 0.73 1.23 0.35
## 53 16:Q1 8.37 0.49 1.07 0.71 1.26 0.35
## 54 16:Q2 8.36 0.48 1.10 0.73 1.26 0.36
## 55 16:Q3 8.35 0.47 1.14 0.75 1.28 0.37
## 56 16:Q4 8.48 0.47 1.16 0.78 1.31 0.38
## 57 17:Q1 8.63 0.46 1.17 0.76 1.34 0.37
## 58 17:Q2 8.69 0.45 1.19 0.78 1.34 0.38
## 59 17:Q3 8.74 0.45 1.21 0.81 1.36 0.39
## 60 17:Q4 8.88 0.44 1.22 0.83 1.38 0.39
## 61 18:Q1 8.94 0.44 1.23 0.82 1.41 0.39
## 62 18:Q2 9.00 0.43 1.24 0.83 1.41 0.39
## 63 18:Q3 9.14 0.42 1.27 0.84 1.44 0.40
## 64 18:Q4 9.12 0.41 1.27 0.87 1.46 0.41
## total date
## 1 7.23 2003-01-01
## 2 7.38 2003-04-01
## 3 7.56 2003-07-01
## 4 8.07 2003-10-01
## 5 8.29 2004-01-01
## 6 8.46 2004-04-01
## 7 8.83 2004-07-01
## 8 9.04 2004-10-01
## 9 9.21 2005-01-01
## 10 9.49 2005-04-01
## 11 9.79 2005-07-01
## 12 10.00 2005-10-01
## 13 10.38 2006-01-01
## 14 10.75 2006-04-01
## 15 11.11 2006-07-01
## 16 11.31 2006-10-01
## 17 11.50 2007-01-01
## 18 11.85 2007-04-01
## 19 12.13 2007-07-01
## 20 12.37 2007-10-01
## 21 12.54 2008-01-01
## 22 12.60 2008-04-01
## 23 12.68 2008-07-01
## 24 12.67 2008-10-01
## 25 12.53 2009-01-01
## 26 12.41 2009-04-01
## 27 12.28 2009-07-01
## 28 12.17 2009-10-01
## 29 12.12 2010-01-01
## 30 11.94 2010-04-01
## 31 11.84 2010-07-01
## 32 11.71 2010-10-01
## 33 11.75 2011-01-01
## 34 11.73 2011-04-01
## 35 11.66 2011-07-01
## 36 11.54 2011-10-01
## 37 11.44 2012-01-01
## 38 11.38 2012-04-01
## 39 11.31 2012-07-01
## 40 11.34 2012-10-01
## 41 11.23 2013-01-01
## 42 11.15 2013-04-01
## 43 11.28 2013-07-01
## 44 11.52 2013-10-01
## 45 11.65 2014-01-01
## 46 11.63 2014-04-01
## 47 11.71 2014-07-01
## 48 11.83 2014-10-01
## 49 11.85 2015-01-01
## 50 11.85 2015-04-01
## 51 12.07 2015-07-01
## 52 12.12 2015-10-01
## 53 12.25 2016-01-01
## 54 12.29 2016-04-01
## 55 12.35 2016-07-01
## 56 12.58 2016-10-01
## 57 12.73 2017-01-01
## 58 12.84 2017-04-01
## 59 12.96 2017-07-01
## 60 13.15 2017-10-01
## 61 13.21 2018-01-01
## 62 13.29 2018-04-01
## 63 13.51 2018-07-01
## 64 13.54 2018-10-01
Adding the data column so that it is easier to read, rather than the period column.
summary(df$total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.23 11.21 11.74 11.39 12.36 13.54
plot2<-ggplot(df, mapping=aes(x = date, y= total)) +
xlab("Date") +
ylab("Total Amount of Debt")+
geom_boxplot(color="blue") +
theme_bw()+
ggtitle("Total Debt Between 2003 and 2018")+
coord_flip()
plot2
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
Both the summary and the box plot show the different summary statistics for the total debt for the entire data frame.
plot <- df %>% tidyr::gather("id", "value",2:7) %>%
ggplot(., aes(date, value))+
geom_point()+
aes(color = as.factor(id)) +
ylab('Amount of Debt')+
xlab('Date')+
ggtitle('All Types of Debt Per Year')+
labs(fill='Different Kinds of Debt')
plot
plot1 <- df %>%
ggplot(aes(date, total)) +
xlab("Date") +
ylab("Total Amount of Debt")+
geom_point(color="red") +
geom_smooth(method='lm') +
theme_economist() +
ggtitle("Total Debt Between 2003 and 2018")
plot1
Scatter plot that graphs the total amount of data for the different months during the years. The line is a residual line to mark the relationship between the total amount of debt and the date.
top5<- df %>%
group_by(total) %>%
arrange(desc(total)) %>%
top_n(n=5)
## Selecting by date
top5
## # A tibble: 64 x 9
## # Groups: total [59]
## period mortgage he.revolving auto.loan credit.card student.loan other
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 18:Q4 9.12 0.41 1.27 0.87 1.46 0.41
## 2 18:Q3 9.14 0.42 1.27 0.84 1.44 0.4
## 3 18:Q2 9 0.43 1.24 0.83 1.41 0.39
## 4 18:Q1 8.94 0.44 1.23 0.82 1.41 0.39
## 5 17:Q4 8.88 0.44 1.22 0.83 1.38 0.39
## 6 17:Q3 8.74 0.45 1.21 0.81 1.36 0.39
## 7 17:Q2 8.69 0.45 1.19 0.78 1.34 0.38
## 8 17:Q1 8.63 0.46 1.17 0.76 1.34 0.37
## 9 08:Q3 9.29 0.69 0.81 0.86 0.61 0.41
## 10 08:Q4 9.26 0.71 0.79 0.87 0.64 0.41
## # ... with 54 more rows, and 2 more variables: total <dbl>, date <date>
Arranging the data from the highest amount of debt to the lowest amount of debt.
plot3 <- df %>%
filter(date == "2018-10-01"| date == "2018-07-01" | date == "2018-04-01" | date == "2018-01-01" | date == "2017-10-01") %>%
ggplot() +
scale_y_continuous(limits=c(12,15),oob = rescale_none) +
geom_bar(aes(x=date, y=total), position = "dodge", stat = "identity",fill = "blue", colour = "red")+
xlab("Date") +
ylab("Total Amount of Debt")+
labs( ylab = "Price of Rentals", title = "Top 5 Months with Highest Debt") +
theme_minimal()
plot3
Plot that shows the values of the top 5 months with the highest debt. The growth increases from 2017 to 2018, with 2018 having the highest amount of debt.
Is the mean credit card debt larger than the mean student loan debt?
Ho: μc = μs
Ha: μc > μs
t.test(df$credit.card, df$student.loan, alternative="less", conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: df$credit.card and df$student.loan
## t = -1.6764, df = 66.367, p-value = 0.04919
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -0.0003952529
## sample estimates:
## mean of x mean of y
## 0.7429687 0.8234375
P-Value: 0.049 Conclusion: Reject Ho. Real-World Interpretation: There is enough evidence to show that the the mean credit card debt is larger than the mean student loan debt.
lin_model<-lm(df$total ~ df$date)
lin_model
##
## Call:
## lm(formula = df$total ~ df$date)
##
## Coefficients:
## (Intercept) df$date
## 1.0731445 0.0006911
So, we have the following model: total=0.0006911 ∗ date + 1.073. But is it statistically significant?
summary(lin_model)
##
## Call:
## lm(formula = df$total ~ df$date)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1729 -0.5852 -0.1833 0.5856 1.8893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.073e+00 1.106e+00 0.971 0.335
## df$date 6.911e-04 7.359e-05 9.392 1.59e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.993 on 62 degrees of freedom
## Multiple R-squared: 0.5872, Adjusted R-squared: 0.5806
## F-statistic: 88.21 on 1 and 62 DF, p-value: 1.592e-13
Yes! Our model is statistically significant (p-value is 1.592x10^-13). So, we have enough evidence to support Ha: there is a correlation and linear relationship between Total Debt owed and the date.
res <- residuals(lin_model)
head(res)
## 1 2 3 4 5 6
## -2.172934 -2.085133 -1.968023 -1.521604 -1.365184 -1.258074
Debt_idx <- complete.cases(df[,c('total','date')])
Debt_complete <- df$total[Debt_idx]
plot(Debt_complete,res, ylab = 'residual', xlab = 'Total Debt', title('Residual plot'))
abline(0,0)
Residual plot showing the relationship between the total debt and the date.
This data set focuses on the amount of Debt that households are in. The different types of debt include, Mortgage, Auto Loans, Credit Cards, Student Loans and Total debt. The variables that I focused on the most was the date and the total amount of debt that households had. The data was cleaned by removing any NA values and extra spaces and lowering all characters to lower cases. I chose this data set because the amount of debt that households are in are at an all-time high. So, I wanted to investigate it a little bit more and see for myself.
After looking at the data set and the plots, it is clear to see that the household debt is increasing gradually and looks like it will continue to increase. One reason why household debt is high is because auto loan balances and credit card debt increased throughout the years (Lea). Another reason is that people are making investments into houses and education, which is a good thing, as long as the economy does not crash, and they are able to pay off all the debt (Lea). The amounts of credit card debt that was considered seriously past due, more than 90 days, increase from 5% to 5.2%, which is a small increase but if it continues to grow, it will cause a problem with people being able to pay off their debt (Lea).
Looking at the overall situation, the total amount of debt that households accumulate increased from 2003 to 2018 and will continue to rise. Looking at the plots and visuals it is clear to see this. The bar plot shows the end of 2017 having the highest amount of debt and it increases as it goes to 2018, where it just continues to increase. Looking at the residual plot the amount of debt increased from what it used to be. Since there is a linear relationship between the total amount of debt and the date, there is enough evidence to state that the total amount of debt will continue to rise.
Works Cited: Lea, Brittany De. “Why Is US Household Debt Rising?” Fox Business, Fox Business, 29 Sept. 2019, https://www.foxbusiness.com/money/us-household-debt-rising.