## Warning: package 'maps' was built under R version 4.2.1
data_2012 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2012.csv")
## New names:
## Rows: 53368 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (18): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (14): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, mths_since_last_del...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
data_2013 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2013.csv")
## New names:
## Rows: 134814 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
data_2014 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2014.csv")
## New names:
## Rows: 235629 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
data_2015 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2015.csv")
## New names:
## • `` -> `...1`
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 421095 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne...
## dbl (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_2016 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2016.csv")
## New names:
## Rows: 434407 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
data_2017 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2017.csv")
## New names:
## • `` -> `...1`
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 443579 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne...
## dbl (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_states <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\States.csv")
## Rows: 52 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Geography
## dbl (7): Num_Households, Median_income_Households, Unemployment rate; Estima...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_regions <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\states_regions.csv")
## Rows: 51 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): State, State_Code, Region, Division
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Concatenate all 6 tables into 1
LC_data <- rbind(data_2012, data_2013, data_2014, data_2015, data_2016, data_2017)
#dropped the previous index column because it doesn't apply to the new table, dropped the issue_d column because we already have that divided into issue_Month and issue_Year column.
LC_data_drop_columns <- subset(LC_data, select= -c(...1, issue_d))
#left join states data and regions data on state name, Puerto Rico is missing from the regions file.
data_states_regions <- data_states %>%
left_join(data_regions, c("Geography"="State"))
#join all the provided files, keeping all observations in loan data tables.
LC_data_joined <- LC_data_drop_columns %>%
left_join(data_states_regions, c("addr_state"="State_Code"))
#Remove NAs in states and divisions
#create bar chart of loan counts based on states and divisions, where x=Geography, bar color =Division, ordered by the loan count DESC.
df1 <- subset(LC_data_joined, !is.na(Geography)&!is.na(Division))
ggplot(data=df1, aes(x = fct_rev(fct_infreq(Geography)), fill=Division)) +
geom_bar()+
theme(axis.text.y = element_text(size=10))+
coord_flip()
ggplot(df1) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(Division)), fill=Region))+
theme(axis.text.y = element_text(size=10))+
coord_flip()
ggplot(df1) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(Region))))+
theme(axis.text.y = element_text(size=10))+
coord_flip()
#Facet the distribution by state chart by Division.
ggplot(df1) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(Geography)), fill=Division), na.rm=FALSE)+
theme(axis.text.y = element_text(size=10))+
coord_flip()+
facet_wrap(~Division,nrow=2)
#Facet by regions
ggplot(df1) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(Geography)), fill=Region), na.rm=FALSE)+
theme(axis.text.y = element_text(size=10))+
coord_flip()+
facet_wrap(~Region,nrow=1)
ggplot(df1) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(Division)), fill=Region), na.rm=FALSE)+
theme(axis.text.y = element_text(size=10))+
coord_flip()+
facet_wrap(~Region,nrow=1)
#Count observations per state
df2 <- add_count(df1, Geography)
#Calculation loan count per capita
df3 <- mutate(df2, loan_per_capita = n / population)
#Select distinct geography + loan_per_capita
df4 <- df3 %>%
select (Geography, loan_per_capita)
df5 <- df4 %>% distinct()
#Create bar chart of loan_per_capita for each state
ggplot(df5) +
geom_col(mapping = aes(x = fct_rev(fct_rev(fct_reorder(Geography, loan_per_capita))), y = loan_per_capita))+
coord_flip()
loan_amount_average_state <- group_by(df1, Geography)
loan_amount_average_state <- summarise(loan_amount_average_state, average = mean(loan_amnt))
arrange(loan_amount_average_state, desc(average))
## # A tibble: 51 × 2
## Geography average
## <chr> <dbl>
## 1 Alaska 17058.
## 2 Virginia 15952.
## 3 District of Columbia 15730.
## 4 Maryland 15689.
## 5 New Jersey 15684.
## 6 Massachusetts 15673.
## 7 Hawaii 15586.
## 8 Texas 15569.
## 9 Wyoming 15466.
## 10 Illinois 15315.
## # … with 41 more rows
loan_amount_average_division <- group_by(df1, Division)
loan_amount_average_division<- summarise(loan_amount_average_division, average = mean(loan_amnt))
arrange(loan_amount_average_division, desc(average))
## # A tibble: 9 × 2
## Division average
## <chr> <dbl>
## 1 West South Central 15309.
## 2 New England 15169.
## 3 Pacific 15040.
## 4 Middle Atlantic 14903.
## 5 South Atlantic 14867.
## 6 East North Central 14567.
## 7 Mountain 14534.
## 8 East South Central 14459.
## 9 West North Central 14418.
#average loan amount by grade
loan_amount_average_grade <- group_by(df1, grade)
loan_amount_average_grade <- summarise(loan_amount_average_grade, average = mean(loan_amnt))
arrange(loan_amount_average_grade, desc(average))
## # A tibble: 7 × 2
## grade average
## <chr> <dbl>
## 1 G 20577.
## 2 F 19202.
## 3 E 17895.
## 4 D 15752.
## 5 C 14828.
## 6 A 14265.
## 7 B 13688.
#average interest rate by grade
interest_rate_grade <- group_by(df1, grade)
interest_rate_grade<- summarise(interest_rate_grade, average = mean(as.numeric(sub("%", "",int_rate))/100))
arrange(interest_rate_grade, desc(average))
## # A tibble: 7 × 2
## grade average
## <chr> <dbl>
## 1 G 0.283
## 2 F 0.253
## 3 E 0.214
## 4 D 0.178
## 5 C 0.140
## 6 B 0.106
## 7 A 0.0708
#number of loans per state per year
loan_number_state_year <- group_by(df1, Geography, issue_Year)
loan_number_state_year <- loan_number_state_year %>% count(issue_Year)
#average loan amount per state per year
loan_amount_average_state_year <- group_by(df1, Geography, issue_Year)
loan_amount_average_state_year <- summarise(loan_amount_average_state_year, average = mean(loan_amnt))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
#average interest rate per state per year
interest_rate_state_year <- group_by(df1, Geography, issue_Year)
interest_rate_state_year<- summarise(interest_rate_state_year, average = mean(as.numeric(sub("%", "",int_rate))/100))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
#Number of loans per year facet by state
ggplot(loan_number_state_year) +
geom_col(mapping = aes(x = issue_Year, y = n))+
facet_wrap(~Geography,nrow=5)
#average loan amount per year facet by state
ggplot(loan_amount_average_state_year) +
geom_col(mapping = aes(x = issue_Year, y = average))+
facet_wrap(~Geography,nrow=5)
#average interest rate per year facet by state
ggplot(interest_rate_state_year) +
geom_col(mapping = aes(x = issue_Year, y = average))+
facet_wrap(~Geography,nrow=5)
loan_amount_average_state_pop <- group_by(df1, Geography, population)
loan_amount_average_state_pop <- summarise(loan_amount_average_state_pop, average = mean(loan_amnt))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
arrange(loan_amount_average_state_pop, desc(population))
## # A tibble: 51 × 3
## # Groups: Geography [51]
## Geography population average
## <chr> <dbl> <dbl>
## 1 California 38982847 15068.
## 2 Texas 27419612 15569.
## 3 Florida 20278447 14139.
## 4 New York 19798228 14688.
## 5 Illinois 12854526 15315.
## 6 Pennsylvania 12790505 14588.
## 7 Ohio 11609756 14182.
## 8 Georgia 10201635 15111.
## 9 North Carolina 10052564 14609.
## 10 Michigan 9925568 14141.
## # … with 41 more rows
#create bar chart where x=states, y=average loan amounts, order states by population size.
ggplot(loan_amount_average_state_pop) +
geom_col(mapping = aes(x = fct_rev(fct_rev(fct_reorder(Geography, population))), y = average))+
coord_flip()
#compare the proportions of each grade of loans between states, order states by Median Income Households.
ggplot(df1) +
geom_bar(mapping = aes(x = fct_reorder(Geography, Median_income_Households), fill=grade), position="fill")+
scale_fill_manual(values=c("#78c679", "#74a9cf", "#9e9ac8", "#ffffd4", "#fecc5c", "#fd8d3c", "#e31a1c"))+
coord_flip()
#create bins for interest rates
df1 <- transform(df1, int_bin = cut(as.numeric(sub("%", "",int_rate)), breaks = c(0, 7.5, 10, 12.5, 15, 20, 30), labels = c("0-7.5", "7.5-10", "10-12.5", "12.5-15", "15-20", "20+")))
#remove missing values in interest rate
df6 <- subset(df1, !is.na(int_rate))
#plot interest rate groups by loan grade
ggplot(df6)+
geom_bar(mapping=aes(x=int_bin, fill = grade))+
facet_grid(grade ~.)+
scale_fill_manual(values=c("#78c679", "#74a9cf", "#9e9ac8", "#ffffd4", "#fecc5c", "#fd8d3c", "#e31a1c"))
#left join our data set to us_states data, change all states headers to lowercase for mapping purpose.
us_states <- map_data("state")
loan_amount_average_state$state_lowercase = tolower(loan_amount_average_state$Geography)
avg_loan_map <- us_states %>%
left_join(loan_amount_average_state, c("region"="state_lowercase"))
#create the map color coded by average loan amount
ggplot(data=avg_loan_map,
mapping=aes(x=long, y=lat, group=group, fill=average))+
geom_polygon(color="white")+
coord_map(projection="albers", lat0=39, lat1=45)+
scale_fill_gradient(low="White", high="#4273c2")
#create the variable for DTI (debt-to-loan ratio)
df1$DTI <- df1$loan_amnt/df1$annual_inc
ggplot(data = df1, mapping=aes(x=DTI)) +
geom_freqpoly(binwidth=0.05) +
coord_cartesian(xlim = c(0.01, 0.75))
## Warning: Removed 552 rows containing non-finite values (stat_bin).
ggplot(data = df1, mapping=aes(x=emp_length, y=loan_amnt))+
geom_boxplot(mapping=aes(group=cut_width(loan_amnt,10000)))
loan_number_region <- group_by(df1,Region)
loan_number_region <- loan_number_region %>% count(Region)
loan_number_region <- data_regions %>%
left_join(loan_number_region, by="Region")
loan_number_region$state_lowercase=tolower(loan_number_region$State)
regional_map <- us_states %>%
left_join(loan_number_region, c("region"="state_lowercase"))
#create the regional map color coded by number of loans
ggplot(data=regional_map,
mapping=aes(x=long, y=lat, group=group, fill=n))+
geom_polygon(color="white")+
scale_fill_gradient(low="White", high="#4273c2")