START DATASET 1

# Using file “FRB_FOR.csv” from Raghed Mirza’s discussion post “Industry capacity utilization and recessions”

Step 1:

Reading input file “FRB_FOR.csv” into R

# libraries used
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(DescTools)
# Reading file into R using "read.table" function
input_df <- read.table("FRB_FOR.csv", sep = ",", header = T, stringsAsFactors = F)
# Deleting unwanted rows
input_df <- input_df[-(1:5),]
# Redoing the row sequence
rownames(input_df) <- 1:nrow(input_df)

#Data type for the "Series.Description" coulmn was upated and We have the input data ready in required format for further analysis
input_df[-1] <- round(sapply(input_df[-1], as.numeric),2)
input_df
##    Series.Description Debt.service.ratio..seasonally.adjusted
## 1              2014Q3                                    9.87
## 2              2014Q4                                    9.86
## 3              2015Q1                                    9.92
## 4              2015Q2                                    9.91
## 5              2015Q3                                    9.95
## 6              2015Q4                                    9.93
## 7              2016Q1                                    9.90
## 8              2016Q2                                   10.00
## 9              2016Q3                                   10.06
## 10             2016Q4                                   10.03
## 11             2017Q1                                    9.96
## 12             2017Q2                                    9.96
## 13             2017Q3                                    9.94
## 14             2017Q4                                    9.88
## 15             2018Q1                                    9.73
## 16             2018Q2                                    9.70
## 17             2018Q3                                    9.69
## 18             2018Q4                                    9.74
## 19             2019Q1                                    9.71
## 20             2019Q2                                    9.69
##    Mortgage.debt.service.ratio..seasonally.adjusted
## 1                                              4.61
## 2                                              4.56
## 3                                              4.55
## 4                                              4.49
## 5                                              4.46
## 6                                              4.48
## 7                                              4.46
## 8                                              4.46
## 9                                              4.45
## 10                                             4.38
## 11                                             4.33
## 12                                             4.31
## 13                                             4.30
## 14                                             4.25
## 15                                             4.19
## 16                                             4.18
## 17                                             4.16
## 18                                             4.18
## 19                                             4.16
## 20                                             4.13
##    Consumer.debt.service.ratio..seasonally.adjusted
## 1                                              5.26
## 2                                              5.30
## 3                                              5.37
## 4                                              5.42
## 5                                              5.49
## 6                                              5.44
## 7                                              5.44
## 8                                              5.53
## 9                                              5.61
## 10                                             5.65
## 11                                             5.63
## 12                                             5.65
## 13                                             5.64
## 14                                             5.63
## 15                                             5.55
## 16                                             5.52
## 17                                             5.53
## 18                                             5.56
## 19                                             5.55
## 20                                             5.56
##    Financial.obligations.ratio..seasonally.adjusted
## 1                                             15.16
## 2                                             15.19
## 3                                             15.34
## 4                                             15.39
## 5                                             15.48
## 6                                             15.52
## 7                                             15.52
## 8                                             15.66
## 9                                             15.73
## 10                                            15.69
## 11                                            15.57
## 12                                            15.56
## 13                                            15.52
## 14                                            15.41
## 15                                            15.18
## 16                                            15.12
## 17                                            15.09
## 18                                            15.12
## 19                                            15.07
## 20                                            15.03

step 2:

Tidying data

#Using pipe operator and adding new columns by mutate and seleting the required columns
#Year and quarter were seperated for better view of the data. Then data types were updated as required to be able to perform math operations.
clean_df <- input_df %>% mutate(year = str_extract(Series.Description, "[0-9]+"), quarter = str_extract(Series.Description, "[[:alpha:]]+[0-9]+")) %>% dplyr::select(year, quarter, 2:5)
#changing the datatype of columns
clean_df[-2] <- round(sapply(clean_df[-2], as.numeric),2)

clean_df
##    year quarter Debt.service.ratio..seasonally.adjusted
## 1  2014      Q3                                    9.87
## 2  2014      Q4                                    9.86
## 3  2015      Q1                                    9.92
## 4  2015      Q2                                    9.91
## 5  2015      Q3                                    9.95
## 6  2015      Q4                                    9.93
## 7  2016      Q1                                    9.90
## 8  2016      Q2                                   10.00
## 9  2016      Q3                                   10.06
## 10 2016      Q4                                   10.03
## 11 2017      Q1                                    9.96
## 12 2017      Q2                                    9.96
## 13 2017      Q3                                    9.94
## 14 2017      Q4                                    9.88
## 15 2018      Q1                                    9.73
## 16 2018      Q2                                    9.70
## 17 2018      Q3                                    9.69
## 18 2018      Q4                                    9.74
## 19 2019      Q1                                    9.71
## 20 2019      Q2                                    9.69
##    Mortgage.debt.service.ratio..seasonally.adjusted
## 1                                              4.61
## 2                                              4.56
## 3                                              4.55
## 4                                              4.49
## 5                                              4.46
## 6                                              4.48
## 7                                              4.46
## 8                                              4.46
## 9                                              4.45
## 10                                             4.38
## 11                                             4.33
## 12                                             4.31
## 13                                             4.30
## 14                                             4.25
## 15                                             4.19
## 16                                             4.18
## 17                                             4.16
## 18                                             4.18
## 19                                             4.16
## 20                                             4.13
##    Consumer.debt.service.ratio..seasonally.adjusted
## 1                                              5.26
## 2                                              5.30
## 3                                              5.37
## 4                                              5.42
## 5                                              5.49
## 6                                              5.44
## 7                                              5.44
## 8                                              5.53
## 9                                              5.61
## 10                                             5.65
## 11                                             5.63
## 12                                             5.65
## 13                                             5.64
## 14                                             5.63
## 15                                             5.55
## 16                                             5.52
## 17                                             5.53
## 18                                             5.56
## 19                                             5.55
## 20                                             5.56
##    Financial.obligations.ratio..seasonally.adjusted
## 1                                             15.16
## 2                                             15.19
## 3                                             15.34
## 4                                             15.39
## 5                                             15.48
## 6                                             15.52
## 7                                             15.52
## 8                                             15.66
## 9                                             15.73
## 10                                            15.69
## 11                                            15.57
## 12                                            15.56
## 13                                            15.52
## 14                                            15.41
## 15                                            15.18
## 16                                            15.12
## 17                                            15.09
## 18                                            15.12
## 19                                            15.07
## 20                                            15.03

step 3:

Analyzinig data

question1: What is the trend of Mortgage debt service ratio and consumer debt service ratio.

finding: Mortgage deb ratio is always lower than Cosumer debt ratio. And Mortgage debt ratio has a clear downward trend while the Consumer debt ratio had fluctuating upward trend. This could be indication that consumer paying less attention to consumer debt and concentrating more on Mortgage debt. This will be a better sign for mortgage lenders.

# I am using line graph in GGplot to show the trend of the Mortgage service ratio and Consumer debt service ratio.
# Initial dataset was used here
# geom_point marks each of the ratios as dots in the plot.
# geom_line draws a line through these points and arrow pointer wass used additionally to show the direction we are using in analysis
#notes: group=1 msut be used to have the proportions of each x value to be relative to all vleus of x. Otherwise the geom_line option doesn't work in this scenario

ggplot(input_df, aes(x=Series.Description, group = 1)) + 
  geom_point(aes(y= Mortgage.debt.service.ratio..seasonally.adjusted)) + 
  geom_point(aes(y= Consumer.debt.service.ratio..seasonally.adjusted)) + 
  geom_line(aes(y= Mortgage.debt.service.ratio..seasonally.adjusted, colour = "1"),arrow = arrow()) +
  geom_line(aes(y= Consumer.debt.service.ratio..seasonally.adjusted, colour = "2"),arrow = arrow()) +
  scale_color_discrete(name = "Debt Ratios", labels = c("Mortgage", "Consumer")) + 
  labs(y="Multiple Debt service Ratios") + 
  theme(axis.text.x = element_text(size=10, angle=90))

question2: what are the percentages of mortgage service ratio and consumer service ratio in the total rate service ratio and financial obligations ratio?

Answer: Consumer service ratio played a mjor role in the total ratios. Mortgage ratio had lower percentage and consumer rate ratio had higher percentage. The data below is also segregated by quarters as I wanted to check if any particulr quarter was better(had lower number). But the quarters didn’t matter. The data appears to be synched with year on year.

# 1
# using clean_df dataframe to create subset for percentages of numbers in total_debt_service_ratio
# Percentages were calculated and gather function was used to show percent of mortgage and consumer ratios per each year and quarter combination. Ans sapply was used to change datatypes
debt_service_percentages_df <- clean_df %>% 
  mutate(mortgage_debt_ratio_percent = round(clean_df$Mortgage.debt.service.ratio..seasonally.adjusted/clean_df$Debt.service.ratio..seasonally.adjusted*100,2),
                    consumer_debt_ratio_percent = round(clean_df$Consumer.debt.service.ratio..seasonally.adjusted/clean_df$Debt.service.ratio..seasonally.adjusted*100,2)) %>% 
  dplyr::select(year, quarter, mortgage_debt_ratio_percent, consumer_debt_ratio_percent) %>% 
  gather(., debt_ratio_percentages, percent, 3:4)

#changing the datatype of columns
debt_service_percentages_df[1,4] <- round(sapply(debt_service_percentages_df[1,4], as.numeric),2)
#showing the data from newly created dataframe
debt_service_percentages_df
##    year quarter      debt_ratio_percentages percent
## 1  2014      Q3 mortgage_debt_ratio_percent   46.71
## 2  2014      Q4 mortgage_debt_ratio_percent   46.25
## 3  2015      Q1 mortgage_debt_ratio_percent   45.87
## 4  2015      Q2 mortgage_debt_ratio_percent   45.31
## 5  2015      Q3 mortgage_debt_ratio_percent   44.82
## 6  2015      Q4 mortgage_debt_ratio_percent   45.12
## 7  2016      Q1 mortgage_debt_ratio_percent   45.05
## 8  2016      Q2 mortgage_debt_ratio_percent   44.60
## 9  2016      Q3 mortgage_debt_ratio_percent   44.23
## 10 2016      Q4 mortgage_debt_ratio_percent   43.67
## 11 2017      Q1 mortgage_debt_ratio_percent   43.47
## 12 2017      Q2 mortgage_debt_ratio_percent   43.27
## 13 2017      Q3 mortgage_debt_ratio_percent   43.26
## 14 2017      Q4 mortgage_debt_ratio_percent   43.02
## 15 2018      Q1 mortgage_debt_ratio_percent   43.06
## 16 2018      Q2 mortgage_debt_ratio_percent   43.09
## 17 2018      Q3 mortgage_debt_ratio_percent   42.93
## 18 2018      Q4 mortgage_debt_ratio_percent   42.92
## 19 2019      Q1 mortgage_debt_ratio_percent   42.84
## 20 2019      Q2 mortgage_debt_ratio_percent   42.62
## 21 2014      Q3 consumer_debt_ratio_percent   53.29
## 22 2014      Q4 consumer_debt_ratio_percent   53.75
## 23 2015      Q1 consumer_debt_ratio_percent   54.13
## 24 2015      Q2 consumer_debt_ratio_percent   54.69
## 25 2015      Q3 consumer_debt_ratio_percent   55.18
## 26 2015      Q4 consumer_debt_ratio_percent   54.78
## 27 2016      Q1 consumer_debt_ratio_percent   54.95
## 28 2016      Q2 consumer_debt_ratio_percent   55.30
## 29 2016      Q3 consumer_debt_ratio_percent   55.77
## 30 2016      Q4 consumer_debt_ratio_percent   56.33
## 31 2017      Q1 consumer_debt_ratio_percent   56.53
## 32 2017      Q2 consumer_debt_ratio_percent   56.73
## 33 2017      Q3 consumer_debt_ratio_percent   56.74
## 34 2017      Q4 consumer_debt_ratio_percent   56.98
## 35 2018      Q1 consumer_debt_ratio_percent   57.04
## 36 2018      Q2 consumer_debt_ratio_percent   56.91
## 37 2018      Q3 consumer_debt_ratio_percent   57.07
## 38 2018      Q4 consumer_debt_ratio_percent   57.08
## 39 2019      Q1 consumer_debt_ratio_percent   57.16
## 40 2019      Q2 consumer_debt_ratio_percent   57.38
# using ggplot to show the percentages of mortgage and consuler debt ratios in total debt ratio segregated by quarters
# geom_bar shows the percentages as bars.
# gem_text shows the percentages above each bar
# facet_wrap segregates by quarter. wrap was used instead of grid for better view of the chart.
ggplot(debt_service_percentages_df, aes(x = year, y = percent, fill = debt_ratio_percentages)) +
  geom_bar(stat = "identity", width=0.5, position=position_dodge()) + 
  geom_text(aes(label=percent), vjust=-0.1, size=2.5, position = position_dodge(width = 0.9)) + 
  facet_wrap(. ~ quarter, scales = "free_x") + 
  theme(axis.text.x = element_text(angle = 90)) + 
  labs(y = "debt_ratio_percentages", title = "Percantages of Mortgage & consumer debt ratios in debt service ratio by quarter")

# 2
# using clean_df dataframe to create subset for percentages of numbers in financial_obligations_ratio
# Percentages were calculated and gather function was used to show percent of mortgage and consumer ratios per each year and quarter combination. Ans sapply was used to change datatypes
financial_obligations_percentages_df <- clean_df %>% 
  mutate(mortgage_debt_ratio_percent = round(clean_df$Mortgage.debt.service.ratio..seasonally.adjusted/clean_df$Financial.obligations.ratio..seasonally.adjusted*100,2),
                    consumer_debt_ratio_percent = round(clean_df$Consumer.debt.service.ratio..seasonally.adjusted/clean_df$Financial.obligations.ratio..seasonally.adjusted*100,2)) %>% 
  dplyr::select(year, quarter, mortgage_debt_ratio_percent, consumer_debt_ratio_percent) %>% 
  gather(., financial_obligations_ratio_percentages, percent, 3:4)

#changing the datatype of columns
financial_obligations_percentages_df[1,4] <- round(sapply(financial_obligations_percentages_df[1,4], as.numeric),2)
#showing the data from newly created dataframe
financial_obligations_percentages_df
##    year quarter financial_obligations_ratio_percentages percent
## 1  2014      Q3             mortgage_debt_ratio_percent   30.41
## 2  2014      Q4             mortgage_debt_ratio_percent   30.02
## 3  2015      Q1             mortgage_debt_ratio_percent   29.66
## 4  2015      Q2             mortgage_debt_ratio_percent   29.17
## 5  2015      Q3             mortgage_debt_ratio_percent   28.81
## 6  2015      Q4             mortgage_debt_ratio_percent   28.87
## 7  2016      Q1             mortgage_debt_ratio_percent   28.74
## 8  2016      Q2             mortgage_debt_ratio_percent   28.48
## 9  2016      Q3             mortgage_debt_ratio_percent   28.29
## 10 2016      Q4             mortgage_debt_ratio_percent   27.92
## 11 2017      Q1             mortgage_debt_ratio_percent   27.81
## 12 2017      Q2             mortgage_debt_ratio_percent   27.70
## 13 2017      Q3             mortgage_debt_ratio_percent   27.71
## 14 2017      Q4             mortgage_debt_ratio_percent   27.58
## 15 2018      Q1             mortgage_debt_ratio_percent   27.60
## 16 2018      Q2             mortgage_debt_ratio_percent   27.65
## 17 2018      Q3             mortgage_debt_ratio_percent   27.57
## 18 2018      Q4             mortgage_debt_ratio_percent   27.65
## 19 2019      Q1             mortgage_debt_ratio_percent   27.60
## 20 2019      Q2             mortgage_debt_ratio_percent   27.48
## 21 2014      Q3             consumer_debt_ratio_percent   34.70
## 22 2014      Q4             consumer_debt_ratio_percent   34.89
## 23 2015      Q1             consumer_debt_ratio_percent   35.01
## 24 2015      Q2             consumer_debt_ratio_percent   35.22
## 25 2015      Q3             consumer_debt_ratio_percent   35.47
## 26 2015      Q4             consumer_debt_ratio_percent   35.05
## 27 2016      Q1             consumer_debt_ratio_percent   35.05
## 28 2016      Q2             consumer_debt_ratio_percent   35.31
## 29 2016      Q3             consumer_debt_ratio_percent   35.66
## 30 2016      Q4             consumer_debt_ratio_percent   36.01
## 31 2017      Q1             consumer_debt_ratio_percent   36.16
## 32 2017      Q2             consumer_debt_ratio_percent   36.31
## 33 2017      Q3             consumer_debt_ratio_percent   36.34
## 34 2017      Q4             consumer_debt_ratio_percent   36.53
## 35 2018      Q1             consumer_debt_ratio_percent   36.56
## 36 2018      Q2             consumer_debt_ratio_percent   36.51
## 37 2018      Q3             consumer_debt_ratio_percent   36.65
## 38 2018      Q4             consumer_debt_ratio_percent   36.77
## 39 2019      Q1             consumer_debt_ratio_percent   36.83
## 40 2019      Q2             consumer_debt_ratio_percent   36.99
# using ggplot to show the percentages of mortgage and consuler debt ratios in financial_obligations ratio segregated by quarters
# geom_bar shows the percentages as bars.
# gem_text shows the percentages above each bar
# facet_wrap segregates by quarter. wrap was used instead of grid for better view of the chart.
ggplot(financial_obligations_percentages_df, aes(x = year, y = percent, fill = financial_obligations_ratio_percentages)) +
  geom_bar(stat = "identity", width=0.5, position=position_dodge()) + 
  geom_text(aes(label=percent), vjust=-0.1, size=2.5, position = position_dodge(width = 0.9)) + 
  facet_wrap(. ~ quarter, scales = "free_x") + 
  theme(axis.text.x = element_text(angle = 90)) + 
  labs(y = "financial_obligations_ratio_percentages", title = "Percantages of Mortgage & consumer debt ratios in the financial obligations ratio")

#Question3: What is the ratio of other debt ratios in the overall ratio (Financial obligations ratio). # Answer: other debt ratios combined appears to be also playing major part in the increase in overall ratio. This combined other debt ratio is close to consumer debt ratio. # line graph was also created to show the total ratios as well as other ratios to see how they varied in each quarter

# creating dataframe to store the calculated remaiaing ratio from other debts
other_debt_df <- clean_df %>% 
  mutate(other.debt.ratio = round(Financial.obligations.ratio..seasonally.adjusted-Debt.service.ratio..seasonally.adjusted,2))

#summary of the other_debt_df written to a dataframe
summary.df <- data.frame(summary(other_debt_df)) %>% dplyr::select(2:3)
names(summary.df)
## [1] "Var2" "Freq"
summary.df
##                                                Var2               Freq
## 1                                              year     Min.   :2014  
## 2                                              year     1st Qu.:2015  
## 3                                              year     Median :2016  
## 4                                              year     Mean   :2016  
## 5                                              year     3rd Qu.:2018  
## 6                                              year     Max.   :2019  
## 7                                           quarter Length:20         
## 8                                           quarter Class :character  
## 9                                           quarter Mode  :character  
## 10                                          quarter               <NA>
## 11                                          quarter               <NA>
## 12                                          quarter               <NA>
## 13          Debt.service.ratio..seasonally.adjusted   Min.   : 9.690  
## 14          Debt.service.ratio..seasonally.adjusted   1st Qu.: 9.738  
## 15          Debt.service.ratio..seasonally.adjusted   Median : 9.905  
## 16          Debt.service.ratio..seasonally.adjusted   Mean   : 9.871  
## 17          Debt.service.ratio..seasonally.adjusted   3rd Qu.: 9.953  
## 18          Debt.service.ratio..seasonally.adjusted   Max.   :10.060  
## 19 Mortgage.debt.service.ratio..seasonally.adjusted    Min.   :4.130  
## 20 Mortgage.debt.service.ratio..seasonally.adjusted    1st Qu.:4.188  
## 21 Mortgage.debt.service.ratio..seasonally.adjusted    Median :4.355  
## 22 Mortgage.debt.service.ratio..seasonally.adjusted    Mean   :4.354  
## 23 Mortgage.debt.service.ratio..seasonally.adjusted    3rd Qu.:4.465  
## 24 Mortgage.debt.service.ratio..seasonally.adjusted    Max.   :4.610  
## 25 Consumer.debt.service.ratio..seasonally.adjusted    Min.   :5.260  
## 26 Consumer.debt.service.ratio..seasonally.adjusted    1st Qu.:5.440  
## 27 Consumer.debt.service.ratio..seasonally.adjusted    Median :5.540  
## 28 Consumer.debt.service.ratio..seasonally.adjusted    Mean   :5.516  
## 29 Consumer.debt.service.ratio..seasonally.adjusted    3rd Qu.:5.615  
## 30 Consumer.debt.service.ratio..seasonally.adjusted    Max.   :5.650  
## 31 Financial.obligations.ratio..seasonally.adjusted    Min.   :15.03  
## 32 Financial.obligations.ratio..seasonally.adjusted    1st Qu.:15.15  
## 33 Financial.obligations.ratio..seasonally.adjusted    Median :15.40  
## 34 Financial.obligations.ratio..seasonally.adjusted    Mean   :15.37  
## 35 Financial.obligations.ratio..seasonally.adjusted    3rd Qu.:15.53  
## 36 Financial.obligations.ratio..seasonally.adjusted    Max.   :15.73  
## 37                                 other.debt.ratio    Min.   :5.290  
## 38                                 other.debt.ratio    1st Qu.:5.395  
## 39                                 other.debt.ratio    Median :5.505  
## 40                                 other.debt.ratio    Mean   :5.496  
## 41                                 other.debt.ratio    3rd Qu.:5.603  
## 42                                 other.debt.ratio    Max.   :5.670
# extracting lowest number for all debt ratios
lowest_financial_obligation_ratio <- (filter(summary.df, Freq %like% "^(Min. )[[:print:]]+"))
lowest_financial_obligation_ratio
##                                               Var2             Freq
## 1                                             year   Min.   :2014  
## 2          Debt.service.ratio..seasonally.adjusted Min.   : 9.690  
## 3 Mortgage.debt.service.ratio..seasonally.adjusted  Min.   :4.130  
## 4 Consumer.debt.service.ratio..seasonally.adjusted  Min.   :5.260  
## 5 Financial.obligations.ratio..seasonally.adjusted  Min.   :15.03  
## 6                                 other.debt.ratio  Min.   :5.290
# extracting highest number for all debt ratios
highest_financial_obligation_ratio <- filter(summary.df, Freq %like% "^(Max. )[[:print:]]+")
highest_financial_obligation_ratio
##                                               Var2             Freq
## 1                                             year   Max.   :2019  
## 2          Debt.service.ratio..seasonally.adjusted Max.   :10.060  
## 3 Mortgage.debt.service.ratio..seasonally.adjusted  Max.   :4.610  
## 4 Consumer.debt.service.ratio..seasonally.adjusted  Max.   :5.650  
## 5 Financial.obligations.ratio..seasonally.adjusted  Max.   :15.73  
## 6                                 other.debt.ratio  Max.   :5.670
# showing the results based on minimum and maximum Financial.obligations.ratio as this the overall ratio
# year 2016 and Q3 has highest ratio
# year 2019 and Q2 has lowest ratio
(filter(other_debt_df, Financial.obligations.ratio..seasonally.adjusted %in% c(sapply(str_extract(lowest_financial_obligation_ratio$Freq, "[0-9]+\\.[0-9]+"),as.numeric),sapply(str_extract(highest_financial_obligation_ratio$Freq, "[0-9]+\\.[0-9]+"),as.numeric))))
##   year quarter Debt.service.ratio..seasonally.adjusted
## 1 2016      Q3                                   10.06
## 2 2019      Q2                                    9.69
##   Mortgage.debt.service.ratio..seasonally.adjusted
## 1                                             4.45
## 2                                             4.13
##   Consumer.debt.service.ratio..seasonally.adjusted
## 1                                             5.61
## 2                                             5.56
##   Financial.obligations.ratio..seasonally.adjusted other.debt.ratio
## 1                                            15.73             5.67
## 2                                            15.03             5.34
# using ggplot to show the line graphs for the debt rate ratio, financial obligation ratio and other debt ratio. The results aree also segregated by quarter to see if we can get some insights based on quarter. No effect of quarter n this either
ggplot(other_debt_df, aes(x=year, group = 1)) + 
  geom_point(aes(y= Financial.obligations.ratio..seasonally.adjusted)) + 
  geom_point(aes(y= Debt.service.ratio..seasonally.adjusted)) + 
  geom_point(aes(y= other.debt.ratio)) + 
  geom_line(aes(y= Financial.obligations.ratio..seasonally.adjusted, colour = "1"),arrow = arrow()) +
  geom_line(aes(y= Debt.service.ratio..seasonally.adjusted, colour = "2"),arrow = arrow()) +
  geom_line(aes(y= other.debt.ratio, colour = "3"),arrow = arrow()) +
  facet_grid(. ~ quarter) +
  scale_color_discrete(name = "Debt Ratios", labels = c("Financial.obligations.ratio", "Debt.service.ratio", "other.debt.ratio,")) + 
  labs(y="Multiple Debt service Ratios") + 
  theme(axis.text.x = element_text(size=10, angle=90))

END DATASET 1

—————————————————————————————————————————————————————————————————————————————-

START DATASET 2

Using file “Arrests.csv” from Sie Song Wong’s discussion post “Marijuana Arrests Dataset”

Step 1:

Reading input file “Arrests.csv” into R

# Reading file into R using "read.table" function
input_df <- read.table("Arrests.csv", sep = ",", header = T, stringsAsFactors = F)
#Deleting unwanted column color
input_df <- input_df[,-c(3)]
#Renamng column 1
names(input_df)[1] <-"convict_id"

#print part of the dataframe
head(input_df)
##   convict_id released year age    sex employed citizen checks
## 1          1      Yes 2002  21   Male      Yes     Yes      3
## 2          2       No 1999  17   Male      Yes     Yes      3
## 3          3      Yes 2000  24   Male      Yes     Yes      3
## 4          4       No 2000  46   Male      Yes     Yes      1
## 5          5      Yes 1999  27 Female      Yes     Yes      1
## 6          6      Yes 1998  16 Female      Yes     Yes      0

Step 2:

simple tidying

NOte: dplyr package was used infront of select function as it was not functioning properly

# Reordering columns
clean_df <- input_df %>% dplyr::select(convict_id, year, released, checks, age, sex, citizen, employed) %>% arrange(., year)
names(clean_df)[4] <-"Past_history_count"
names(clean_df)[6] <-"Gender"

##print part of the clean dataframe
head(clean_df)
##   convict_id year released Past_history_count age Gender citizen employed
## 1         14 1997      Yes                  0  42   Male     Yes      Yes
## 2         43 1997      Yes                  1  39   Male     Yes      Yes
## 3         46 1997       No                  1  21   Male      No      Yes
## 4         49 1997      Yes                  0  26 Female     Yes      Yes
## 5         53 1997      Yes                  4  18   Male     Yes      Yes
## 6         62 1997      Yes                  3  53   Male      No      Yes

Question1: Figure out which gender had more arrests between year 1997 and 2002.

Answer: Based on the data, females had fewer arrests than males.

# Creating a dataset with counts of arrests based on Gender
arrest_count_by_gender <- clean_df %>% 
  filter(., year %in% c(1997:2002)) %>%  
  group_by(Gender) %>% 
  summarise(.,count = n())

arrest_count_by_gender
## # A tibble: 2 x 2
##   Gender count
##   <chr>  <int>
## 1 Female   443
## 2 Male    4783
# barplot
bp <- barplot(arrest_count_by_gender$count, main = "Arrest count by Gender", xlab = "color", ylab = "count", 
        names.arg = arrest_count_by_gender$Gender, ylim= range(pretty(c(0, arrest_count_by_gender$count+1000))), 
        col = c("pink","blue"))
text(x = bp, y = arrest_count_by_gender$count, label = arrest_count_by_gender$count, pos = 3, cex = 0.8)
legend("topleft", unique(arrest_count_by_gender$Gender), cex = 1, fill = c("pink","blue"))

Question2: Figure out which age group had more arrests and check what is the percent of males and females in those groups. Consider adulthood (18 to 35 years), middle age (36 to 55 years), and older adulthood (56 years and older).

Answer: Based on the data, females had fewer arrests than males. And adulthood (18 to 35 years) was seen to have more arrest percentages

# creating a new data frame by adding column age_group 
agegroup_df <- clean_df %>% 
  filter(., year %in% c(1997:2002)) %>% 
  mutate(age_group = "18 below")
head(agegroup_df)
##   convict_id year released Past_history_count age Gender citizen employed
## 1         14 1997      Yes                  0  42   Male     Yes      Yes
## 2         43 1997      Yes                  1  39   Male     Yes      Yes
## 3         46 1997       No                  1  21   Male      No      Yes
## 4         49 1997      Yes                  0  26 Female     Yes      Yes
## 5         53 1997      Yes                  4  18   Male     Yes      Yes
## 6         62 1997      Yes                  3  53   Male      No      Yes
##   age_group
## 1  18 below
## 2  18 below
## 3  18 below
## 4  18 below
## 5  18 below
## 6  18 below
# updating column age_group with appropriate values
i <- 1
  while(i <= nrow(agegroup_df))
    {
    agegroup_df$age_group[i] = if(agegroup_df$age[i] >= 18 & agegroup_df$age[i] <=35)
      {
      "18 to 35"
      }else if(agegroup_df$age[i] >= 36 & agegroup_df$age[i] <=55)
      {
          "36 to 55"
      }else if(agegroup_df$age[i] >= 56)
      {
          "56 and older"
      }else
      {
          "18 below"
      }
    i <- i+1
    }

head(agegroup_df)
##   convict_id year released Past_history_count age Gender citizen employed
## 1         14 1997      Yes                  0  42   Male     Yes      Yes
## 2         43 1997      Yes                  1  39   Male     Yes      Yes
## 3         46 1997       No                  1  21   Male      No      Yes
## 4         49 1997      Yes                  0  26 Female     Yes      Yes
## 5         53 1997      Yes                  4  18   Male     Yes      Yes
## 6         62 1997      Yes                  3  53   Male      No      Yes
##   age_group
## 1  36 to 55
## 2  36 to 55
## 3  18 to 35
## 4  18 to 35
## 5  18 to 35
## 6  36 to 55
arrest_count_by_age <- agegroup_df %>% 
  group_by(age_group, Gender) %>% 
  summarise(.,count = n()) %>% 
  mutate(count_percent = round(count/sum(.$count)*100,2))
arrest_count_by_age
## # A tibble: 7 x 4
## # Groups:   age_group [4]
##   age_group    Gender count count_percent
##   <chr>        <chr>  <int>         <dbl>
## 1 18 below     Female   125          2.39
## 2 18 below     Male     934         17.9 
## 3 18 to 35     Female   259          4.96
## 4 18 to 35     Male    3305         63.2 
## 5 36 to 55     Female    59          1.13
## 6 36 to 55     Male     527         10.1 
## 7 56 and older Male      17          0.33
ggplot(arrest_count_by_age, aes(x = age_group, y = count_percent, fill = Gender)) +
  geom_bar(stat = "identity", width=0.5, position=position_dodge()) + 
  geom_text(aes(label= count_percent), vjust=-0.1, size=3.5, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ Gender) +
  theme(axis.text.x = element_text(angle = 90)) + 
  labs(y = "count_percent", title = "Arrest percent by agegroup for Male and Female")

Question3: Most times arrested and were they citizens and were they employed

Answer: Majority convicted are citizens and employed, And few were convicted who are both non citizen and not employed. Also, 85% are citizens.

# creating dataframe to answer question 3
# required columns were selected, data in citizen and employed status was updated and columns were reorganized 
other_factors_df <- clean_df %>% 
  filter(., year %in% c(1997:2002)) %>% 
  dplyr::select(convict_id, Past_history_count, citizen, employed) %>%
  mutate(citizen = replace(replace(citizen, citizen == "No", "Non Citizen"), citizen == "Yes", "Citizen")) %>%
  mutate(employed = replace(replace(employed, employed == "No", "Not employed"), employed == "Yes", "employed")) %>%
  mutate(status = str_c(citizen," & ",employed)) %>%
  dplyr::select(convict_id, status, Past_history_count)

#adding count 1 which is the current arrest record to the past history count
other_factors_df$Past_history_count <- other_factors_df$Past_history_count+1
#showing part of the dataset
head(other_factors_df)
##   convict_id                 status Past_history_count
## 1         14     Citizen & employed                  1
## 2         43     Citizen & employed                  2
## 3         46 Non Citizen & employed                  2
## 4         49     Citizen & employed                  1
## 5         53     Citizen & employed                  5
## 6         62 Non Citizen & employed                  4
#final dataset to show the arrest count and percentages by status
# Arrest count and percentages were calculated additionally in this data frame
arrest_count_by_other_factors <- other_factors_df %>%
  group_by(status) %>% 
  summarise(.,count = n()) %>% 
  mutate(count_percent = round(count/sum(.$count)*100,2))

#showing the results of arrest count and percentage
arrest_count_by_other_factors
## # A tibble: 4 x 3
##   status                     count count_percent
##   <chr>                      <int>         <dbl>
## 1 Citizen & employed          3564         68.2 
## 2 Citizen & Not employed       891         17.0 
## 3 Non Citizen & employed       547         10.5 
## 4 Non Citizen & Not employed   224          4.29
# Pie chart to give visual display of the percentages
pie(arrest_count_by_other_factors$count_percent, label = arrest_count_by_other_factors$count_percent, main = "Arrest percentage by citizenship and employment status",col = rainbow(length(arrest_count_by_other_factors$count_percent)))
legend("topright", arrest_count_by_other_factors$status, cex = 0.7,
   fill = rainbow(length(arrest_count_by_other_factors$count_percent)))

END DATASET 2

—————————————————————————————————————————————————————————————————————————————-

START DATASET 3

Using file “AB_NYC_2019.csv” from Kaggle “https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/downloads/new-york-city-airbnb-open-data.zip/3

Step 1:

Reading input file “AB_NYC_2019.csv” into R

# Reading file into R using "read.table" function
# using read.csv as read.table is data reading was not proper.Whole row is populaitng in a column randomly
input_df <- read.csv("AB_NYC_2019.csv", stringsAsFactors = F)

#showing the data in dataframe
head(input_df)
##     id                                             name host_id
## 1 2539               Clean & quiet apt home by the park    2787
## 2 2595                            Skylit Midtown Castle    2845
## 3 3647              THE VILLAGE OF HARLEM....NEW YORK !    4632
## 4 3831                  Cozy Entire Floor of Brownstone    4869
## 5 5022 Entire Apt: Spacious Studio/Loft by central park    7192
## 6 5099        Large Cozy 1 BR Apartment In Midtown East    7322
##     host_name neighbourhood_group neighbourhood latitude longitude
## 1        John            Brooklyn    Kensington 40.64749 -73.97237
## 2    Jennifer           Manhattan       Midtown 40.75362 -73.98377
## 3   Elisabeth           Manhattan        Harlem 40.80902 -73.94190
## 4 LisaRoxanne            Brooklyn  Clinton Hill 40.68514 -73.95976
## 5       Laura           Manhattan   East Harlem 40.79851 -73.94399
## 6       Chris           Manhattan   Murray Hill 40.74767 -73.97500
##         room_type price minimum_nights number_of_reviews last_review
## 1    Private room   149              1                 9  2018-10-19
## 2 Entire home/apt   225              1                45  2019-05-21
## 3    Private room   150              3                 0            
## 4 Entire home/apt    89              1               270  2019-07-05
## 5 Entire home/apt    80             10                 9  2018-11-19
## 6 Entire home/apt   200              3                74  2019-06-22
##   reviews_per_month calculated_host_listings_count availability_365
## 1              0.21                              6              365
## 2              0.38                              2              355
## 3                NA                              1              365
## 4              4.64                              1              194
## 5              0.10                              1                0
## 6              0.59                              1              129

step 2: Analysis

question1: which neightbourhood has the rooms available for 365 days and has all the room types to choose from

Anser: There are 130 cities in total which offer rooms 365 days.And there are 23 neighbourhoods with room avaialibity for 365 days with all the room type options.

Private room offering is more when checked for 365 days availability

availability_365_df <- input_df %>% 
  filter(., availability_365 == 365) %>% 
  group_by(neighbourhood,room_type) %>% 
  summarise(., roomtype_count = n()) %>%
  arrange(neighbourhood)

# what are the room tpes? "Entire home/apt" "Private room"    "Shared room"  
unique(availability_365_df$room_type)
## [1] "Entire home/apt" "Private room"    "Shared room"
# showing all the neighbourhoods which the available room types offering for 365 days
availability_365_df
## # A tibble: 233 x 3
## # Groups:   neighbourhood [130]
##    neighbourhood      room_type       roomtype_count
##    <chr>              <chr>                    <int>
##  1 Astoria            Entire home/apt              5
##  2 Astoria            Private room                 7
##  3 Astoria            Shared room                  1
##  4 Battery Park City  Entire home/apt              1
##  5 Bay Ridge          Private room                 1
##  6 Bayside            Private room                 1
##  7 Bedford-Stuyvesant Entire home/apt             17
##  8 Bedford-Stuyvesant Private room                58
##  9 Bedford-Stuyvesant Shared room                 12
## 10 Bellerose          Private room                 2
## # … with 223 more rows
# ggplot depicting the avaialbility of each of the room types in the neighbourhoods which are avaialble 365 days
ggplot(availability_365_df, aes(x = neighbourhood, y = room_type , fill = room_type, colour = room_type)) +
  geom_point() +
  theme(axis.text.x=element_blank())

neighbourood_with365_and3roomtypes_df <- availability_365_df %>% 
  summarise(., roomtype_count = n())
# showing all the neighbourhoods which have the room offering for 365 days
neighbourood_with365_and3roomtypes_df
## # A tibble: 130 x 2
##    neighbourhood      roomtype_count
##    <chr>                       <int>
##  1 Astoria                         3
##  2 Battery Park City               1
##  3 Bay Ridge                       1
##  4 Bayside                         1
##  5 Bedford-Stuyvesant              3
##  6 Bellerose                       2
##  7 Bensonhurst                     1
##  8 Boerum Hill                     2
##  9 Borough Park                    2
## 10 Briarwood                       3
## # … with 120 more rows
neighbourood_with365_and3roomtypes_df <- neighbourood_with365_and3roomtypes_df %>% 
  filter(., roomtype_count == 3)
# showing all the neighbourhoods which have all the room types offering for 365 days
neighbourood_with365_and3roomtypes_df
## # A tibble: 23 x 2
##    neighbourhood      roomtype_count
##    <chr>                       <int>
##  1 Astoria                         3
##  2 Bedford-Stuyvesant              3
##  3 Briarwood                       3
##  4 Bushwick                        3
##  5 Chelsea                         3
##  6 Crown Heights                   3
##  7 East Flatbush                   3
##  8 East Harlem                     3
##  9 East New York                   3
## 10 Flatbush                        3
## # … with 13 more rows

Analysis2: Cheapest option avaialbe in each of the nieghbourhoods

cheapest_options_in_neighbouhood_df <- input_df %>%
  group_by(neighbourhood_group,neighbourhood) %>%
  filter(., price == min(price))

# lising newighbourds with the cheapest options
cheapest_options_in_neighbouhood_df
## # A tibble: 299 x 16
## # Groups:   neighbourhood_group, neighbourhood [221]
##        id name  host_id host_name neighbourhood_g… neighbourhood latitude
##     <int> <chr>   <int> <chr>     <chr>            <chr>            <dbl>
##  1 1.69e5 " Af…  8.06e5 Vanessa   Bronx            University H…     40.9
##  2 3.75e5 Enjo…  1.89e6 Rimma & … Staten Island    Graniteville      40.6
##  3 1.13e6 "spa…  8.35e5 Leah      Brooklyn         South Slope       40.7
##  4 1.62e6 Larg…  2.20e6 Sally     Manhattan        East Village      40.7
##  5 1.68e6 Arve…  7.63e5 Christop… Queens           Arverne           40.6
##  6 1.78e6 Spac…  3.98e6 Aude      Manhattan        Battery Park…     40.7
##  7 1.80e6 Spac…  9.43e6 Donna     Staten Island    Woodrow           40.5
##  8 1.80e6 DOMI…  3.11e5 Vie       Bronx            Co-op City        40.9
##  9 2.04e6 Priv…  1.04e7 Susan     Brooklyn         Dyker Heights     40.6
## 10 3.47e6 "“No…  4.06e6 Lisa, Na… Staten Island    Eltingville       40.5
## # … with 289 more rows, and 9 more variables: longitude <dbl>,
## #   room_type <chr>, price <int>, minimum_nights <int>,
## #   number_of_reviews <int>, last_review <chr>, reviews_per_month <dbl>,
## #   calculated_host_listings_count <int>, availability_365 <int>

questoin2: What are my room type options?

answer: Manhattan has most options for “Entire home/apt” and Brooklyn has most options for “Pivate room”, while Manhattan and Brooklyn had almost the same options for “Shared room”.

Staten Island and Bronx are the one’s with very few options. Queens has moderate availability

# creating dataframe for newighbourhood groups, room type and the percentages of room types for each of the newighbourhood groups
# Notie, the sum taken is the total sum and not by the each nieghbouood group
neighbourhood_room_options_df <- input_df %>% 
  group_by(neighbourhood_group,room_type) %>% 
  summarise(., roomtype_count = n()) %>%
  mutate(roomtype_percent = round(roomtype_count/sum(.$roomtype_count)*100,2)) %>%
  arrange(neighbourhood_group)

neighbourhood_room_options_df
## # A tibble: 15 x 4
## # Groups:   neighbourhood_group [5]
##    neighbourhood_group room_type       roomtype_count roomtype_percent
##    <chr>               <chr>                    <int>            <dbl>
##  1 Bronx               Entire home/apt            379             0.78
##  2 Bronx               Private room               652             1.33
##  3 Bronx               Shared room                 60             0.12
##  4 Brooklyn            Entire home/apt           9559            19.6 
##  5 Brooklyn            Private room             10132            20.7 
##  6 Brooklyn            Shared room                413             0.84
##  7 Manhattan           Entire home/apt          13199            27.0 
##  8 Manhattan           Private room              7982            16.3 
##  9 Manhattan           Shared room                480             0.98
## 10 Queens              Entire home/apt           2096             4.29
## 11 Queens              Private room              3372             6.9 
## 12 Queens              Shared room                198             0.4 
## 13 Staten Island       Entire home/apt            176             0.36
## 14 Staten Island       Private room               188             0.38
## 15 Staten Island       Shared room                  9             0.02
ggplot(neighbourhood_room_options_df, aes(x= room_type, y = roomtype_percent, fill = room_type)) + 
  geom_bar(stat = "identity", width=0.5, position=position_dodge()) + 
  geom_text(aes(label=roomtype_percent), vjust=-0.3, size=3, position = position_dodge(width = 0.9)) + 
  facet_grid(. ~ neighbourhood_group, scales = "free_x") + 
  theme(axis.text.x = element_text(angle = 90)) + 
  labs(y = "roomtype_percent", title = "Chart showing percentages of room types availabe by the neighbourhood groups")

END DATASET 3

—————————————————————————————————————————————————————————————————————————————-