START DATASET 1
# Using file “FRB_FOR.csv” from Raghed Mirza’s discussion post “Industry capacity utilization and recessions”
# libraries used
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(DescTools)
# Reading file into R using "read.table" function
input_df <- read.table("FRB_FOR.csv", sep = ",", header = T, stringsAsFactors = F)
# Deleting unwanted rows
input_df <- input_df[-(1:5),]
# Redoing the row sequence
rownames(input_df) <- 1:nrow(input_df)
#Data type for the "Series.Description" coulmn was upated and We have the input data ready in required format for further analysis
input_df[-1] <- round(sapply(input_df[-1], as.numeric),2)
input_df
## Series.Description Debt.service.ratio..seasonally.adjusted
## 1 2014Q3 9.87
## 2 2014Q4 9.86
## 3 2015Q1 9.92
## 4 2015Q2 9.91
## 5 2015Q3 9.95
## 6 2015Q4 9.93
## 7 2016Q1 9.90
## 8 2016Q2 10.00
## 9 2016Q3 10.06
## 10 2016Q4 10.03
## 11 2017Q1 9.96
## 12 2017Q2 9.96
## 13 2017Q3 9.94
## 14 2017Q4 9.88
## 15 2018Q1 9.73
## 16 2018Q2 9.70
## 17 2018Q3 9.69
## 18 2018Q4 9.74
## 19 2019Q1 9.71
## 20 2019Q2 9.69
## Mortgage.debt.service.ratio..seasonally.adjusted
## 1 4.61
## 2 4.56
## 3 4.55
## 4 4.49
## 5 4.46
## 6 4.48
## 7 4.46
## 8 4.46
## 9 4.45
## 10 4.38
## 11 4.33
## 12 4.31
## 13 4.30
## 14 4.25
## 15 4.19
## 16 4.18
## 17 4.16
## 18 4.18
## 19 4.16
## 20 4.13
## Consumer.debt.service.ratio..seasonally.adjusted
## 1 5.26
## 2 5.30
## 3 5.37
## 4 5.42
## 5 5.49
## 6 5.44
## 7 5.44
## 8 5.53
## 9 5.61
## 10 5.65
## 11 5.63
## 12 5.65
## 13 5.64
## 14 5.63
## 15 5.55
## 16 5.52
## 17 5.53
## 18 5.56
## 19 5.55
## 20 5.56
## Financial.obligations.ratio..seasonally.adjusted
## 1 15.16
## 2 15.19
## 3 15.34
## 4 15.39
## 5 15.48
## 6 15.52
## 7 15.52
## 8 15.66
## 9 15.73
## 10 15.69
## 11 15.57
## 12 15.56
## 13 15.52
## 14 15.41
## 15 15.18
## 16 15.12
## 17 15.09
## 18 15.12
## 19 15.07
## 20 15.03
#Using pipe operator and adding new columns by mutate and seleting the required columns
#Year and quarter were seperated for better view of the data. Then data types were updated as required to be able to perform math operations.
clean_df <- input_df %>% mutate(year = str_extract(Series.Description, "[0-9]+"), quarter = str_extract(Series.Description, "[[:alpha:]]+[0-9]+")) %>% dplyr::select(year, quarter, 2:5)
#changing the datatype of columns
clean_df[-2] <- round(sapply(clean_df[-2], as.numeric),2)
clean_df
## year quarter Debt.service.ratio..seasonally.adjusted
## 1 2014 Q3 9.87
## 2 2014 Q4 9.86
## 3 2015 Q1 9.92
## 4 2015 Q2 9.91
## 5 2015 Q3 9.95
## 6 2015 Q4 9.93
## 7 2016 Q1 9.90
## 8 2016 Q2 10.00
## 9 2016 Q3 10.06
## 10 2016 Q4 10.03
## 11 2017 Q1 9.96
## 12 2017 Q2 9.96
## 13 2017 Q3 9.94
## 14 2017 Q4 9.88
## 15 2018 Q1 9.73
## 16 2018 Q2 9.70
## 17 2018 Q3 9.69
## 18 2018 Q4 9.74
## 19 2019 Q1 9.71
## 20 2019 Q2 9.69
## Mortgage.debt.service.ratio..seasonally.adjusted
## 1 4.61
## 2 4.56
## 3 4.55
## 4 4.49
## 5 4.46
## 6 4.48
## 7 4.46
## 8 4.46
## 9 4.45
## 10 4.38
## 11 4.33
## 12 4.31
## 13 4.30
## 14 4.25
## 15 4.19
## 16 4.18
## 17 4.16
## 18 4.18
## 19 4.16
## 20 4.13
## Consumer.debt.service.ratio..seasonally.adjusted
## 1 5.26
## 2 5.30
## 3 5.37
## 4 5.42
## 5 5.49
## 6 5.44
## 7 5.44
## 8 5.53
## 9 5.61
## 10 5.65
## 11 5.63
## 12 5.65
## 13 5.64
## 14 5.63
## 15 5.55
## 16 5.52
## 17 5.53
## 18 5.56
## 19 5.55
## 20 5.56
## Financial.obligations.ratio..seasonally.adjusted
## 1 15.16
## 2 15.19
## 3 15.34
## 4 15.39
## 5 15.48
## 6 15.52
## 7 15.52
## 8 15.66
## 9 15.73
## 10 15.69
## 11 15.57
## 12 15.56
## 13 15.52
## 14 15.41
## 15 15.18
## 16 15.12
## 17 15.09
## 18 15.12
## 19 15.07
## 20 15.03
question1: What is the trend of Mortgage debt service ratio and consumer debt service ratio.
finding: Mortgage deb ratio is always lower than Cosumer debt ratio. And Mortgage debt ratio has a clear downward trend while the Consumer debt ratio had fluctuating upward trend. This could be indication that consumer paying less attention to consumer debt and concentrating more on Mortgage debt. This will be a better sign for mortgage lenders.
# I am using line graph in GGplot to show the trend of the Mortgage service ratio and Consumer debt service ratio.
# Initial dataset was used here
# geom_point marks each of the ratios as dots in the plot.
# geom_line draws a line through these points and arrow pointer wass used additionally to show the direction we are using in analysis
#notes: group=1 msut be used to have the proportions of each x value to be relative to all vleus of x. Otherwise the geom_line option doesn't work in this scenario
ggplot(input_df, aes(x=Series.Description, group = 1)) +
geom_point(aes(y= Mortgage.debt.service.ratio..seasonally.adjusted)) +
geom_point(aes(y= Consumer.debt.service.ratio..seasonally.adjusted)) +
geom_line(aes(y= Mortgage.debt.service.ratio..seasonally.adjusted, colour = "1"),arrow = arrow()) +
geom_line(aes(y= Consumer.debt.service.ratio..seasonally.adjusted, colour = "2"),arrow = arrow()) +
scale_color_discrete(name = "Debt Ratios", labels = c("Mortgage", "Consumer")) +
labs(y="Multiple Debt service Ratios") +
theme(axis.text.x = element_text(size=10, angle=90))
question2: what are the percentages of mortgage service ratio and consumer service ratio in the total rate service ratio and financial obligations ratio?
Answer: Consumer service ratio played a mjor role in the total ratios. Mortgage ratio had lower percentage and consumer rate ratio had higher percentage. The data below is also segregated by quarters as I wanted to check if any particulr quarter was better(had lower number). But the quarters didn’t matter. The data appears to be synched with year on year.
# 1
# using clean_df dataframe to create subset for percentages of numbers in total_debt_service_ratio
# Percentages were calculated and gather function was used to show percent of mortgage and consumer ratios per each year and quarter combination. Ans sapply was used to change datatypes
debt_service_percentages_df <- clean_df %>%
mutate(mortgage_debt_ratio_percent = round(clean_df$Mortgage.debt.service.ratio..seasonally.adjusted/clean_df$Debt.service.ratio..seasonally.adjusted*100,2),
consumer_debt_ratio_percent = round(clean_df$Consumer.debt.service.ratio..seasonally.adjusted/clean_df$Debt.service.ratio..seasonally.adjusted*100,2)) %>%
dplyr::select(year, quarter, mortgage_debt_ratio_percent, consumer_debt_ratio_percent) %>%
gather(., debt_ratio_percentages, percent, 3:4)
#changing the datatype of columns
debt_service_percentages_df[1,4] <- round(sapply(debt_service_percentages_df[1,4], as.numeric),2)
#showing the data from newly created dataframe
debt_service_percentages_df
## year quarter debt_ratio_percentages percent
## 1 2014 Q3 mortgage_debt_ratio_percent 46.71
## 2 2014 Q4 mortgage_debt_ratio_percent 46.25
## 3 2015 Q1 mortgage_debt_ratio_percent 45.87
## 4 2015 Q2 mortgage_debt_ratio_percent 45.31
## 5 2015 Q3 mortgage_debt_ratio_percent 44.82
## 6 2015 Q4 mortgage_debt_ratio_percent 45.12
## 7 2016 Q1 mortgage_debt_ratio_percent 45.05
## 8 2016 Q2 mortgage_debt_ratio_percent 44.60
## 9 2016 Q3 mortgage_debt_ratio_percent 44.23
## 10 2016 Q4 mortgage_debt_ratio_percent 43.67
## 11 2017 Q1 mortgage_debt_ratio_percent 43.47
## 12 2017 Q2 mortgage_debt_ratio_percent 43.27
## 13 2017 Q3 mortgage_debt_ratio_percent 43.26
## 14 2017 Q4 mortgage_debt_ratio_percent 43.02
## 15 2018 Q1 mortgage_debt_ratio_percent 43.06
## 16 2018 Q2 mortgage_debt_ratio_percent 43.09
## 17 2018 Q3 mortgage_debt_ratio_percent 42.93
## 18 2018 Q4 mortgage_debt_ratio_percent 42.92
## 19 2019 Q1 mortgage_debt_ratio_percent 42.84
## 20 2019 Q2 mortgage_debt_ratio_percent 42.62
## 21 2014 Q3 consumer_debt_ratio_percent 53.29
## 22 2014 Q4 consumer_debt_ratio_percent 53.75
## 23 2015 Q1 consumer_debt_ratio_percent 54.13
## 24 2015 Q2 consumer_debt_ratio_percent 54.69
## 25 2015 Q3 consumer_debt_ratio_percent 55.18
## 26 2015 Q4 consumer_debt_ratio_percent 54.78
## 27 2016 Q1 consumer_debt_ratio_percent 54.95
## 28 2016 Q2 consumer_debt_ratio_percent 55.30
## 29 2016 Q3 consumer_debt_ratio_percent 55.77
## 30 2016 Q4 consumer_debt_ratio_percent 56.33
## 31 2017 Q1 consumer_debt_ratio_percent 56.53
## 32 2017 Q2 consumer_debt_ratio_percent 56.73
## 33 2017 Q3 consumer_debt_ratio_percent 56.74
## 34 2017 Q4 consumer_debt_ratio_percent 56.98
## 35 2018 Q1 consumer_debt_ratio_percent 57.04
## 36 2018 Q2 consumer_debt_ratio_percent 56.91
## 37 2018 Q3 consumer_debt_ratio_percent 57.07
## 38 2018 Q4 consumer_debt_ratio_percent 57.08
## 39 2019 Q1 consumer_debt_ratio_percent 57.16
## 40 2019 Q2 consumer_debt_ratio_percent 57.38
# using ggplot to show the percentages of mortgage and consuler debt ratios in total debt ratio segregated by quarters
# geom_bar shows the percentages as bars.
# gem_text shows the percentages above each bar
# facet_wrap segregates by quarter. wrap was used instead of grid for better view of the chart.
ggplot(debt_service_percentages_df, aes(x = year, y = percent, fill = debt_ratio_percentages)) +
geom_bar(stat = "identity", width=0.5, position=position_dodge()) +
geom_text(aes(label=percent), vjust=-0.1, size=2.5, position = position_dodge(width = 0.9)) +
facet_wrap(. ~ quarter, scales = "free_x") +
theme(axis.text.x = element_text(angle = 90)) +
labs(y = "debt_ratio_percentages", title = "Percantages of Mortgage & consumer debt ratios in debt service ratio by quarter")
# 2
# using clean_df dataframe to create subset for percentages of numbers in financial_obligations_ratio
# Percentages were calculated and gather function was used to show percent of mortgage and consumer ratios per each year and quarter combination. Ans sapply was used to change datatypes
financial_obligations_percentages_df <- clean_df %>%
mutate(mortgage_debt_ratio_percent = round(clean_df$Mortgage.debt.service.ratio..seasonally.adjusted/clean_df$Financial.obligations.ratio..seasonally.adjusted*100,2),
consumer_debt_ratio_percent = round(clean_df$Consumer.debt.service.ratio..seasonally.adjusted/clean_df$Financial.obligations.ratio..seasonally.adjusted*100,2)) %>%
dplyr::select(year, quarter, mortgage_debt_ratio_percent, consumer_debt_ratio_percent) %>%
gather(., financial_obligations_ratio_percentages, percent, 3:4)
#changing the datatype of columns
financial_obligations_percentages_df[1,4] <- round(sapply(financial_obligations_percentages_df[1,4], as.numeric),2)
#showing the data from newly created dataframe
financial_obligations_percentages_df
## year quarter financial_obligations_ratio_percentages percent
## 1 2014 Q3 mortgage_debt_ratio_percent 30.41
## 2 2014 Q4 mortgage_debt_ratio_percent 30.02
## 3 2015 Q1 mortgage_debt_ratio_percent 29.66
## 4 2015 Q2 mortgage_debt_ratio_percent 29.17
## 5 2015 Q3 mortgage_debt_ratio_percent 28.81
## 6 2015 Q4 mortgage_debt_ratio_percent 28.87
## 7 2016 Q1 mortgage_debt_ratio_percent 28.74
## 8 2016 Q2 mortgage_debt_ratio_percent 28.48
## 9 2016 Q3 mortgage_debt_ratio_percent 28.29
## 10 2016 Q4 mortgage_debt_ratio_percent 27.92
## 11 2017 Q1 mortgage_debt_ratio_percent 27.81
## 12 2017 Q2 mortgage_debt_ratio_percent 27.70
## 13 2017 Q3 mortgage_debt_ratio_percent 27.71
## 14 2017 Q4 mortgage_debt_ratio_percent 27.58
## 15 2018 Q1 mortgage_debt_ratio_percent 27.60
## 16 2018 Q2 mortgage_debt_ratio_percent 27.65
## 17 2018 Q3 mortgage_debt_ratio_percent 27.57
## 18 2018 Q4 mortgage_debt_ratio_percent 27.65
## 19 2019 Q1 mortgage_debt_ratio_percent 27.60
## 20 2019 Q2 mortgage_debt_ratio_percent 27.48
## 21 2014 Q3 consumer_debt_ratio_percent 34.70
## 22 2014 Q4 consumer_debt_ratio_percent 34.89
## 23 2015 Q1 consumer_debt_ratio_percent 35.01
## 24 2015 Q2 consumer_debt_ratio_percent 35.22
## 25 2015 Q3 consumer_debt_ratio_percent 35.47
## 26 2015 Q4 consumer_debt_ratio_percent 35.05
## 27 2016 Q1 consumer_debt_ratio_percent 35.05
## 28 2016 Q2 consumer_debt_ratio_percent 35.31
## 29 2016 Q3 consumer_debt_ratio_percent 35.66
## 30 2016 Q4 consumer_debt_ratio_percent 36.01
## 31 2017 Q1 consumer_debt_ratio_percent 36.16
## 32 2017 Q2 consumer_debt_ratio_percent 36.31
## 33 2017 Q3 consumer_debt_ratio_percent 36.34
## 34 2017 Q4 consumer_debt_ratio_percent 36.53
## 35 2018 Q1 consumer_debt_ratio_percent 36.56
## 36 2018 Q2 consumer_debt_ratio_percent 36.51
## 37 2018 Q3 consumer_debt_ratio_percent 36.65
## 38 2018 Q4 consumer_debt_ratio_percent 36.77
## 39 2019 Q1 consumer_debt_ratio_percent 36.83
## 40 2019 Q2 consumer_debt_ratio_percent 36.99
# using ggplot to show the percentages of mortgage and consuler debt ratios in financial_obligations ratio segregated by quarters
# geom_bar shows the percentages as bars.
# gem_text shows the percentages above each bar
# facet_wrap segregates by quarter. wrap was used instead of grid for better view of the chart.
ggplot(financial_obligations_percentages_df, aes(x = year, y = percent, fill = financial_obligations_ratio_percentages)) +
geom_bar(stat = "identity", width=0.5, position=position_dodge()) +
geom_text(aes(label=percent), vjust=-0.1, size=2.5, position = position_dodge(width = 0.9)) +
facet_wrap(. ~ quarter, scales = "free_x") +
theme(axis.text.x = element_text(angle = 90)) +
labs(y = "financial_obligations_ratio_percentages", title = "Percantages of Mortgage & consumer debt ratios in the financial obligations ratio")
#Question3: What is the ratio of other debt ratios in the overall ratio (Financial obligations ratio). # Answer: other debt ratios combined appears to be also playing major part in the increase in overall ratio. This combined other debt ratio is close to consumer debt ratio. # line graph was also created to show the total ratios as well as other ratios to see how they varied in each quarter
# creating dataframe to store the calculated remaiaing ratio from other debts
other_debt_df <- clean_df %>%
mutate(other.debt.ratio = round(Financial.obligations.ratio..seasonally.adjusted-Debt.service.ratio..seasonally.adjusted,2))
#summary of the other_debt_df written to a dataframe
summary.df <- data.frame(summary(other_debt_df)) %>% dplyr::select(2:3)
names(summary.df)
## [1] "Var2" "Freq"
summary.df
## Var2 Freq
## 1 year Min. :2014
## 2 year 1st Qu.:2015
## 3 year Median :2016
## 4 year Mean :2016
## 5 year 3rd Qu.:2018
## 6 year Max. :2019
## 7 quarter Length:20
## 8 quarter Class :character
## 9 quarter Mode :character
## 10 quarter <NA>
## 11 quarter <NA>
## 12 quarter <NA>
## 13 Debt.service.ratio..seasonally.adjusted Min. : 9.690
## 14 Debt.service.ratio..seasonally.adjusted 1st Qu.: 9.738
## 15 Debt.service.ratio..seasonally.adjusted Median : 9.905
## 16 Debt.service.ratio..seasonally.adjusted Mean : 9.871
## 17 Debt.service.ratio..seasonally.adjusted 3rd Qu.: 9.953
## 18 Debt.service.ratio..seasonally.adjusted Max. :10.060
## 19 Mortgage.debt.service.ratio..seasonally.adjusted Min. :4.130
## 20 Mortgage.debt.service.ratio..seasonally.adjusted 1st Qu.:4.188
## 21 Mortgage.debt.service.ratio..seasonally.adjusted Median :4.355
## 22 Mortgage.debt.service.ratio..seasonally.adjusted Mean :4.354
## 23 Mortgage.debt.service.ratio..seasonally.adjusted 3rd Qu.:4.465
## 24 Mortgage.debt.service.ratio..seasonally.adjusted Max. :4.610
## 25 Consumer.debt.service.ratio..seasonally.adjusted Min. :5.260
## 26 Consumer.debt.service.ratio..seasonally.adjusted 1st Qu.:5.440
## 27 Consumer.debt.service.ratio..seasonally.adjusted Median :5.540
## 28 Consumer.debt.service.ratio..seasonally.adjusted Mean :5.516
## 29 Consumer.debt.service.ratio..seasonally.adjusted 3rd Qu.:5.615
## 30 Consumer.debt.service.ratio..seasonally.adjusted Max. :5.650
## 31 Financial.obligations.ratio..seasonally.adjusted Min. :15.03
## 32 Financial.obligations.ratio..seasonally.adjusted 1st Qu.:15.15
## 33 Financial.obligations.ratio..seasonally.adjusted Median :15.40
## 34 Financial.obligations.ratio..seasonally.adjusted Mean :15.37
## 35 Financial.obligations.ratio..seasonally.adjusted 3rd Qu.:15.53
## 36 Financial.obligations.ratio..seasonally.adjusted Max. :15.73
## 37 other.debt.ratio Min. :5.290
## 38 other.debt.ratio 1st Qu.:5.395
## 39 other.debt.ratio Median :5.505
## 40 other.debt.ratio Mean :5.496
## 41 other.debt.ratio 3rd Qu.:5.603
## 42 other.debt.ratio Max. :5.670
# extracting lowest number for all debt ratios
lowest_financial_obligation_ratio <- (filter(summary.df, Freq %like% "^(Min. )[[:print:]]+"))
lowest_financial_obligation_ratio
## Var2 Freq
## 1 year Min. :2014
## 2 Debt.service.ratio..seasonally.adjusted Min. : 9.690
## 3 Mortgage.debt.service.ratio..seasonally.adjusted Min. :4.130
## 4 Consumer.debt.service.ratio..seasonally.adjusted Min. :5.260
## 5 Financial.obligations.ratio..seasonally.adjusted Min. :15.03
## 6 other.debt.ratio Min. :5.290
# extracting highest number for all debt ratios
highest_financial_obligation_ratio <- filter(summary.df, Freq %like% "^(Max. )[[:print:]]+")
highest_financial_obligation_ratio
## Var2 Freq
## 1 year Max. :2019
## 2 Debt.service.ratio..seasonally.adjusted Max. :10.060
## 3 Mortgage.debt.service.ratio..seasonally.adjusted Max. :4.610
## 4 Consumer.debt.service.ratio..seasonally.adjusted Max. :5.650
## 5 Financial.obligations.ratio..seasonally.adjusted Max. :15.73
## 6 other.debt.ratio Max. :5.670
# showing the results based on minimum and maximum Financial.obligations.ratio as this the overall ratio
# year 2016 and Q3 has highest ratio
# year 2019 and Q2 has lowest ratio
(filter(other_debt_df, Financial.obligations.ratio..seasonally.adjusted %in% c(sapply(str_extract(lowest_financial_obligation_ratio$Freq, "[0-9]+\\.[0-9]+"),as.numeric),sapply(str_extract(highest_financial_obligation_ratio$Freq, "[0-9]+\\.[0-9]+"),as.numeric))))
## year quarter Debt.service.ratio..seasonally.adjusted
## 1 2016 Q3 10.06
## 2 2019 Q2 9.69
## Mortgage.debt.service.ratio..seasonally.adjusted
## 1 4.45
## 2 4.13
## Consumer.debt.service.ratio..seasonally.adjusted
## 1 5.61
## 2 5.56
## Financial.obligations.ratio..seasonally.adjusted other.debt.ratio
## 1 15.73 5.67
## 2 15.03 5.34
# using ggplot to show the line graphs for the debt rate ratio, financial obligation ratio and other debt ratio. The results aree also segregated by quarter to see if we can get some insights based on quarter. No effect of quarter n this either
ggplot(other_debt_df, aes(x=year, group = 1)) +
geom_point(aes(y= Financial.obligations.ratio..seasonally.adjusted)) +
geom_point(aes(y= Debt.service.ratio..seasonally.adjusted)) +
geom_point(aes(y= other.debt.ratio)) +
geom_line(aes(y= Financial.obligations.ratio..seasonally.adjusted, colour = "1"),arrow = arrow()) +
geom_line(aes(y= Debt.service.ratio..seasonally.adjusted, colour = "2"),arrow = arrow()) +
geom_line(aes(y= other.debt.ratio, colour = "3"),arrow = arrow()) +
facet_grid(. ~ quarter) +
scale_color_discrete(name = "Debt Ratios", labels = c("Financial.obligations.ratio", "Debt.service.ratio", "other.debt.ratio,")) +
labs(y="Multiple Debt service Ratios") +
theme(axis.text.x = element_text(size=10, angle=90))
END DATASET 1
—————————————————————————————————————————————————————————————————————————————-
START DATASET 2
Using file “Arrests.csv” from Sie Song Wong’s discussion post “Marijuana Arrests Dataset”
# Reading file into R using "read.table" function
input_df <- read.table("Arrests.csv", sep = ",", header = T, stringsAsFactors = F)
#Deleting unwanted column color
input_df <- input_df[,-c(3)]
#Renamng column 1
names(input_df)[1] <-"convict_id"
#print part of the dataframe
head(input_df)
## convict_id released year age sex employed citizen checks
## 1 1 Yes 2002 21 Male Yes Yes 3
## 2 2 No 1999 17 Male Yes Yes 3
## 3 3 Yes 2000 24 Male Yes Yes 3
## 4 4 No 2000 46 Male Yes Yes 1
## 5 5 Yes 1999 27 Female Yes Yes 1
## 6 6 Yes 1998 16 Female Yes Yes 0
# Reordering columns
clean_df <- input_df %>% dplyr::select(convict_id, year, released, checks, age, sex, citizen, employed) %>% arrange(., year)
names(clean_df)[4] <-"Past_history_count"
names(clean_df)[6] <-"Gender"
##print part of the clean dataframe
head(clean_df)
## convict_id year released Past_history_count age Gender citizen employed
## 1 14 1997 Yes 0 42 Male Yes Yes
## 2 43 1997 Yes 1 39 Male Yes Yes
## 3 46 1997 No 1 21 Male No Yes
## 4 49 1997 Yes 0 26 Female Yes Yes
## 5 53 1997 Yes 4 18 Male Yes Yes
## 6 62 1997 Yes 3 53 Male No Yes
Question1: Figure out which gender had more arrests between year 1997 and 2002.
Answer: Based on the data, females had fewer arrests than males.
# Creating a dataset with counts of arrests based on Gender
arrest_count_by_gender <- clean_df %>%
filter(., year %in% c(1997:2002)) %>%
group_by(Gender) %>%
summarise(.,count = n())
arrest_count_by_gender
## # A tibble: 2 x 2
## Gender count
## <chr> <int>
## 1 Female 443
## 2 Male 4783
# barplot
bp <- barplot(arrest_count_by_gender$count, main = "Arrest count by Gender", xlab = "color", ylab = "count",
names.arg = arrest_count_by_gender$Gender, ylim= range(pretty(c(0, arrest_count_by_gender$count+1000))),
col = c("pink","blue"))
text(x = bp, y = arrest_count_by_gender$count, label = arrest_count_by_gender$count, pos = 3, cex = 0.8)
legend("topleft", unique(arrest_count_by_gender$Gender), cex = 1, fill = c("pink","blue"))
Question2: Figure out which age group had more arrests and check what is the percent of males and females in those groups. Consider adulthood (18 to 35 years), middle age (36 to 55 years), and older adulthood (56 years and older).
Answer: Based on the data, females had fewer arrests than males. And adulthood (18 to 35 years) was seen to have more arrest percentages
# creating a new data frame by adding column age_group
agegroup_df <- clean_df %>%
filter(., year %in% c(1997:2002)) %>%
mutate(age_group = "18 below")
head(agegroup_df)
## convict_id year released Past_history_count age Gender citizen employed
## 1 14 1997 Yes 0 42 Male Yes Yes
## 2 43 1997 Yes 1 39 Male Yes Yes
## 3 46 1997 No 1 21 Male No Yes
## 4 49 1997 Yes 0 26 Female Yes Yes
## 5 53 1997 Yes 4 18 Male Yes Yes
## 6 62 1997 Yes 3 53 Male No Yes
## age_group
## 1 18 below
## 2 18 below
## 3 18 below
## 4 18 below
## 5 18 below
## 6 18 below
# updating column age_group with appropriate values
i <- 1
while(i <= nrow(agegroup_df))
{
agegroup_df$age_group[i] = if(agegroup_df$age[i] >= 18 & agegroup_df$age[i] <=35)
{
"18 to 35"
}else if(agegroup_df$age[i] >= 36 & agegroup_df$age[i] <=55)
{
"36 to 55"
}else if(agegroup_df$age[i] >= 56)
{
"56 and older"
}else
{
"18 below"
}
i <- i+1
}
head(agegroup_df)
## convict_id year released Past_history_count age Gender citizen employed
## 1 14 1997 Yes 0 42 Male Yes Yes
## 2 43 1997 Yes 1 39 Male Yes Yes
## 3 46 1997 No 1 21 Male No Yes
## 4 49 1997 Yes 0 26 Female Yes Yes
## 5 53 1997 Yes 4 18 Male Yes Yes
## 6 62 1997 Yes 3 53 Male No Yes
## age_group
## 1 36 to 55
## 2 36 to 55
## 3 18 to 35
## 4 18 to 35
## 5 18 to 35
## 6 36 to 55
arrest_count_by_age <- agegroup_df %>%
group_by(age_group, Gender) %>%
summarise(.,count = n()) %>%
mutate(count_percent = round(count/sum(.$count)*100,2))
arrest_count_by_age
## # A tibble: 7 x 4
## # Groups: age_group [4]
## age_group Gender count count_percent
## <chr> <chr> <int> <dbl>
## 1 18 below Female 125 2.39
## 2 18 below Male 934 17.9
## 3 18 to 35 Female 259 4.96
## 4 18 to 35 Male 3305 63.2
## 5 36 to 55 Female 59 1.13
## 6 36 to 55 Male 527 10.1
## 7 56 and older Male 17 0.33
ggplot(arrest_count_by_age, aes(x = age_group, y = count_percent, fill = Gender)) +
geom_bar(stat = "identity", width=0.5, position=position_dodge()) +
geom_text(aes(label= count_percent), vjust=-0.1, size=3.5, position = position_dodge(width = 0.9)) +
facet_grid(. ~ Gender) +
theme(axis.text.x = element_text(angle = 90)) +
labs(y = "count_percent", title = "Arrest percent by agegroup for Male and Female")
Question3: Most times arrested and were they citizens and were they employed
Answer: Majority convicted are citizens and employed, And few were convicted who are both non citizen and not employed. Also, 85% are citizens.
# creating dataframe to answer question 3
# required columns were selected, data in citizen and employed status was updated and columns were reorganized
other_factors_df <- clean_df %>%
filter(., year %in% c(1997:2002)) %>%
dplyr::select(convict_id, Past_history_count, citizen, employed) %>%
mutate(citizen = replace(replace(citizen, citizen == "No", "Non Citizen"), citizen == "Yes", "Citizen")) %>%
mutate(employed = replace(replace(employed, employed == "No", "Not employed"), employed == "Yes", "employed")) %>%
mutate(status = str_c(citizen," & ",employed)) %>%
dplyr::select(convict_id, status, Past_history_count)
#adding count 1 which is the current arrest record to the past history count
other_factors_df$Past_history_count <- other_factors_df$Past_history_count+1
#showing part of the dataset
head(other_factors_df)
## convict_id status Past_history_count
## 1 14 Citizen & employed 1
## 2 43 Citizen & employed 2
## 3 46 Non Citizen & employed 2
## 4 49 Citizen & employed 1
## 5 53 Citizen & employed 5
## 6 62 Non Citizen & employed 4
#final dataset to show the arrest count and percentages by status
# Arrest count and percentages were calculated additionally in this data frame
arrest_count_by_other_factors <- other_factors_df %>%
group_by(status) %>%
summarise(.,count = n()) %>%
mutate(count_percent = round(count/sum(.$count)*100,2))
#showing the results of arrest count and percentage
arrest_count_by_other_factors
## # A tibble: 4 x 3
## status count count_percent
## <chr> <int> <dbl>
## 1 Citizen & employed 3564 68.2
## 2 Citizen & Not employed 891 17.0
## 3 Non Citizen & employed 547 10.5
## 4 Non Citizen & Not employed 224 4.29
# Pie chart to give visual display of the percentages
pie(arrest_count_by_other_factors$count_percent, label = arrest_count_by_other_factors$count_percent, main = "Arrest percentage by citizenship and employment status",col = rainbow(length(arrest_count_by_other_factors$count_percent)))
legend("topright", arrest_count_by_other_factors$status, cex = 0.7,
fill = rainbow(length(arrest_count_by_other_factors$count_percent)))
END DATASET 2
—————————————————————————————————————————————————————————————————————————————-
START DATASET 3
Using file “AB_NYC_2019.csv” from Kaggle “https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/downloads/new-york-city-airbnb-open-data.zip/3”
# Reading file into R using "read.table" function
# using read.csv as read.table is data reading was not proper.Whole row is populaitng in a column randomly
input_df <- read.csv("AB_NYC_2019.csv", stringsAsFactors = F)
#showing the data in dataframe
head(input_df)
## id name host_id
## 1 2539 Clean & quiet apt home by the park 2787
## 2 2595 Skylit Midtown Castle 2845
## 3 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632
## 4 3831 Cozy Entire Floor of Brownstone 4869
## 5 5022 Entire Apt: Spacious Studio/Loft by central park 7192
## 6 5099 Large Cozy 1 BR Apartment In Midtown East 7322
## host_name neighbourhood_group neighbourhood latitude longitude
## 1 John Brooklyn Kensington 40.64749 -73.97237
## 2 Jennifer Manhattan Midtown 40.75362 -73.98377
## 3 Elisabeth Manhattan Harlem 40.80902 -73.94190
## 4 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976
## 5 Laura Manhattan East Harlem 40.79851 -73.94399
## 6 Chris Manhattan Murray Hill 40.74767 -73.97500
## room_type price minimum_nights number_of_reviews last_review
## 1 Private room 149 1 9 2018-10-19
## 2 Entire home/apt 225 1 45 2019-05-21
## 3 Private room 150 3 0
## 4 Entire home/apt 89 1 270 2019-07-05
## 5 Entire home/apt 80 10 9 2018-11-19
## 6 Entire home/apt 200 3 74 2019-06-22
## reviews_per_month calculated_host_listings_count availability_365
## 1 0.21 6 365
## 2 0.38 2 355
## 3 NA 1 365
## 4 4.64 1 194
## 5 0.10 1 0
## 6 0.59 1 129
question1: which neightbourhood has the rooms available for 365 days and has all the room types to choose from
Anser: There are 130 cities in total which offer rooms 365 days.And there are 23 neighbourhoods with room avaialibity for 365 days with all the room type options.
Private room offering is more when checked for 365 days availability
availability_365_df <- input_df %>%
filter(., availability_365 == 365) %>%
group_by(neighbourhood,room_type) %>%
summarise(., roomtype_count = n()) %>%
arrange(neighbourhood)
# what are the room tpes? "Entire home/apt" "Private room" "Shared room"
unique(availability_365_df$room_type)
## [1] "Entire home/apt" "Private room" "Shared room"
# showing all the neighbourhoods which the available room types offering for 365 days
availability_365_df
## # A tibble: 233 x 3
## # Groups: neighbourhood [130]
## neighbourhood room_type roomtype_count
## <chr> <chr> <int>
## 1 Astoria Entire home/apt 5
## 2 Astoria Private room 7
## 3 Astoria Shared room 1
## 4 Battery Park City Entire home/apt 1
## 5 Bay Ridge Private room 1
## 6 Bayside Private room 1
## 7 Bedford-Stuyvesant Entire home/apt 17
## 8 Bedford-Stuyvesant Private room 58
## 9 Bedford-Stuyvesant Shared room 12
## 10 Bellerose Private room 2
## # … with 223 more rows
# ggplot depicting the avaialbility of each of the room types in the neighbourhoods which are avaialble 365 days
ggplot(availability_365_df, aes(x = neighbourhood, y = room_type , fill = room_type, colour = room_type)) +
geom_point() +
theme(axis.text.x=element_blank())
neighbourood_with365_and3roomtypes_df <- availability_365_df %>%
summarise(., roomtype_count = n())
# showing all the neighbourhoods which have the room offering for 365 days
neighbourood_with365_and3roomtypes_df
## # A tibble: 130 x 2
## neighbourhood roomtype_count
## <chr> <int>
## 1 Astoria 3
## 2 Battery Park City 1
## 3 Bay Ridge 1
## 4 Bayside 1
## 5 Bedford-Stuyvesant 3
## 6 Bellerose 2
## 7 Bensonhurst 1
## 8 Boerum Hill 2
## 9 Borough Park 2
## 10 Briarwood 3
## # … with 120 more rows
neighbourood_with365_and3roomtypes_df <- neighbourood_with365_and3roomtypes_df %>%
filter(., roomtype_count == 3)
# showing all the neighbourhoods which have all the room types offering for 365 days
neighbourood_with365_and3roomtypes_df
## # A tibble: 23 x 2
## neighbourhood roomtype_count
## <chr> <int>
## 1 Astoria 3
## 2 Bedford-Stuyvesant 3
## 3 Briarwood 3
## 4 Bushwick 3
## 5 Chelsea 3
## 6 Crown Heights 3
## 7 East Flatbush 3
## 8 East Harlem 3
## 9 East New York 3
## 10 Flatbush 3
## # … with 13 more rows
Analysis2: Cheapest option avaialbe in each of the nieghbourhoods
cheapest_options_in_neighbouhood_df <- input_df %>%
group_by(neighbourhood_group,neighbourhood) %>%
filter(., price == min(price))
# lising newighbourds with the cheapest options
cheapest_options_in_neighbouhood_df
## # A tibble: 299 x 16
## # Groups: neighbourhood_group, neighbourhood [221]
## id name host_id host_name neighbourhood_g… neighbourhood latitude
## <int> <chr> <int> <chr> <chr> <chr> <dbl>
## 1 1.69e5 " Af… 8.06e5 Vanessa Bronx University H… 40.9
## 2 3.75e5 Enjo… 1.89e6 Rimma & … Staten Island Graniteville 40.6
## 3 1.13e6 "spa… 8.35e5 Leah Brooklyn South Slope 40.7
## 4 1.62e6 Larg… 2.20e6 Sally Manhattan East Village 40.7
## 5 1.68e6 Arve… 7.63e5 Christop… Queens Arverne 40.6
## 6 1.78e6 Spac… 3.98e6 Aude Manhattan Battery Park… 40.7
## 7 1.80e6 Spac… 9.43e6 Donna Staten Island Woodrow 40.5
## 8 1.80e6 DOMI… 3.11e5 Vie Bronx Co-op City 40.9
## 9 2.04e6 Priv… 1.04e7 Susan Brooklyn Dyker Heights 40.6
## 10 3.47e6 "“No… 4.06e6 Lisa, Na… Staten Island Eltingville 40.5
## # … with 289 more rows, and 9 more variables: longitude <dbl>,
## # room_type <chr>, price <int>, minimum_nights <int>,
## # number_of_reviews <int>, last_review <chr>, reviews_per_month <dbl>,
## # calculated_host_listings_count <int>, availability_365 <int>
questoin2: What are my room type options?
answer: Manhattan has most options for “Entire home/apt” and Brooklyn has most options for “Pivate room”, while Manhattan and Brooklyn had almost the same options for “Shared room”.
Staten Island and Bronx are the one’s with very few options. Queens has moderate availability
# creating dataframe for newighbourhood groups, room type and the percentages of room types for each of the newighbourhood groups
# Notie, the sum taken is the total sum and not by the each nieghbouood group
neighbourhood_room_options_df <- input_df %>%
group_by(neighbourhood_group,room_type) %>%
summarise(., roomtype_count = n()) %>%
mutate(roomtype_percent = round(roomtype_count/sum(.$roomtype_count)*100,2)) %>%
arrange(neighbourhood_group)
neighbourhood_room_options_df
## # A tibble: 15 x 4
## # Groups: neighbourhood_group [5]
## neighbourhood_group room_type roomtype_count roomtype_percent
## <chr> <chr> <int> <dbl>
## 1 Bronx Entire home/apt 379 0.78
## 2 Bronx Private room 652 1.33
## 3 Bronx Shared room 60 0.12
## 4 Brooklyn Entire home/apt 9559 19.6
## 5 Brooklyn Private room 10132 20.7
## 6 Brooklyn Shared room 413 0.84
## 7 Manhattan Entire home/apt 13199 27.0
## 8 Manhattan Private room 7982 16.3
## 9 Manhattan Shared room 480 0.98
## 10 Queens Entire home/apt 2096 4.29
## 11 Queens Private room 3372 6.9
## 12 Queens Shared room 198 0.4
## 13 Staten Island Entire home/apt 176 0.36
## 14 Staten Island Private room 188 0.38
## 15 Staten Island Shared room 9 0.02
ggplot(neighbourhood_room_options_df, aes(x= room_type, y = roomtype_percent, fill = room_type)) +
geom_bar(stat = "identity", width=0.5, position=position_dodge()) +
geom_text(aes(label=roomtype_percent), vjust=-0.3, size=3, position = position_dodge(width = 0.9)) +
facet_grid(. ~ neighbourhood_group, scales = "free_x") +
theme(axis.text.x = element_text(angle = 90)) +
labs(y = "roomtype_percent", title = "Chart showing percentages of room types availabe by the neighbourhood groups")
END DATASET 3
—————————————————————————————————————————————————————————————————————————————-