## Warning: package 'maps' was built under R version 4.2.1


Merging and Cleaning

data_2012 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2012.csv")
## New names:
## Rows: 53368 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (18): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (14): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, mths_since_last_del...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
data_2013 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2013.csv")
## New names:
## Rows: 134814 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
data_2014 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2014.csv")
## New names:
## Rows: 235629 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
data_2015 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2015.csv")
## New names:
## • `` -> `...1`
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 421095 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne...
## dbl (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_2016 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2016.csv")
## New names:
## Rows: 434407 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
data_2017 <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\data2017.csv")
## New names:
## • `` -> `...1`
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 443579 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne...
## dbl (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_states <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\States.csv")
## Rows: 52 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Geography
## dbl (7): Num_Households, Median_income_Households, Unemployment rate; Estima...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_regions <- read_csv("F:\\2022_NEC\\R\\Week11\\Final Case Analysis\\states_regions.csv")
## Rows: 51 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): State, State_Code, Region, Division
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Concatenate all 6 tables into 1
LC_data <- rbind(data_2012, data_2013, data_2014, data_2015, data_2016, data_2017)

#dropped the previous index column because it doesn't apply to the new table, dropped the issue_d column because we already have that divided into issue_Month and issue_Year column. 
LC_data_drop_columns <- subset(LC_data, select= -c(...1, issue_d))
#left join states data and regions data on state name, Puerto Rico is missing from the regions file.
data_states_regions <- data_states %>%
  left_join(data_regions, c("Geography"="State"))
#join all the provided files, keeping all observations in loan data tables.
LC_data_joined <- LC_data_drop_columns %>%
  left_join(data_states_regions, c("addr_state"="State_Code"))


Analysis


1) Find the distribution of number of loans by state, regions and divisions. Describe in your own words the geographic differences in the number of loans. Also, analyze your results by comparing number of loans per capita. Did you notice any missing states in the Lending Club data? If yes, then find out why.

#Remove NAs in states and divisions
#create bar chart of loan counts based on states and divisions, where x=Geography, bar color =Division, ordered by the loan count DESC.
df1 <- subset(LC_data_joined, !is.na(Geography)&!is.na(Division))
ggplot(data=df1, aes(x = fct_rev(fct_infreq(Geography)), fill=Division)) + 
  geom_bar()+
  theme(axis.text.y = element_text(size=10))+
  coord_flip()

ggplot(df1) + 
  geom_bar(mapping = aes(x = fct_rev(fct_infreq(Division)), fill=Region))+
  theme(axis.text.y = element_text(size=10))+
  coord_flip()

ggplot(df1) + 
  geom_bar(mapping = aes(x = fct_rev(fct_infreq(Region))))+
  theme(axis.text.y = element_text(size=10))+
  coord_flip()

  • First I created three bar charts that represent the distribution of loans by states, divisions and regions. From the first plot, we can see that a lot of the loans are done in California, New York, Texas and Florida. However, the states that have the most loans are not necessarily in the division or region that have the most loans. For example, California has the most loans among all of the states, however, the division of California which is Pacific, has the 2nd most loans among all divisions. From the distribution of loan counts by regions, we know that most of the loans are done in the South and West Region.
  • To learn more about the relationship between loan counts and states vs. regions, next I facet the first chart by division.
#Facet the distribution by state chart by Division.
ggplot(df1) + 
  geom_bar(mapping = aes(x = fct_rev(fct_infreq(Geography)), fill=Division), na.rm=FALSE)+
  theme(axis.text.y = element_text(size=10))+
  coord_flip()+
  facet_wrap(~Division,nrow=2)

  • From this chart, we can clearly see that for each division, which states have the most loans. We can see that in some of the divisions, most of the loans are done in 1 state, such as the California in Division Pacific and the Texas in Division West South Central.
#Facet by regions
ggplot(df1) + 
  geom_bar(mapping = aes(x = fct_rev(fct_infreq(Geography)), fill=Region), na.rm=FALSE)+
  theme(axis.text.y = element_text(size=10))+
  coord_flip()+
  facet_wrap(~Region,nrow=1)

ggplot(df1) + 
  geom_bar(mapping = aes(x = fct_rev(fct_infreq(Division)), fill=Region), na.rm=FALSE)+
  theme(axis.text.y = element_text(size=10))+
  coord_flip()+
  facet_wrap(~Region,nrow=1)

  • By faceting the distribution of loans by state into regions, we know for each region which states have the most loans. By faceting the distribution of loans by divisions into regions, we can see that Lending Club’s loan counts in each region are not that much different from each other, the company tend to do loans in South and West slightly more than the other two regions.
#Count observations per state
df2 <- add_count(df1, Geography)

#Calculation loan count per capita
df3 <- mutate(df2, loan_per_capita = n / population)

#Select distinct geography + loan_per_capita
df4 <- df3 %>%
  select (Geography, loan_per_capita)
df5 <- df4 %>% distinct()

#Create bar chart of loan_per_capita for each state
ggplot(df5) + 
  geom_col(mapping = aes(x = fct_rev(fct_rev(fct_reorder(Geography, loan_per_capita))), y = loan_per_capita))+
  coord_flip()

  • The distribution of loans per state per capita is different from per state. For example even though California has the most number of loans, the loans per capita is only ranked the 11th among all states.
  • Lending Club doesn’t have any loan data for Iowa state, I did some research and found that Lending Club loans are not available in Iowa.


2) Compare the average amount of loans granted by all states and divisions. Which states and divisions have the highest and lowest average loan amounts?

loan_amount_average_state <- group_by(df1, Geography)
loan_amount_average_state <- summarise(loan_amount_average_state, average = mean(loan_amnt))
arrange(loan_amount_average_state, desc(average))
## # A tibble: 51 × 2
##    Geography            average
##    <chr>                  <dbl>
##  1 Alaska                17058.
##  2 Virginia              15952.
##  3 District of Columbia  15730.
##  4 Maryland              15689.
##  5 New Jersey            15684.
##  6 Massachusetts         15673.
##  7 Hawaii                15586.
##  8 Texas                 15569.
##  9 Wyoming               15466.
## 10 Illinois              15315.
## # … with 41 more rows
loan_amount_average_division <- group_by(df1, Division)
loan_amount_average_division<- summarise(loan_amount_average_division, average = mean(loan_amnt))
arrange(loan_amount_average_division, desc(average))
## # A tibble: 9 × 2
##   Division           average
##   <chr>                <dbl>
## 1 West South Central  15309.
## 2 New England         15169.
## 3 Pacific             15040.
## 4 Middle Atlantic     14903.
## 5 South Atlantic      14867.
## 6 East North Central  14567.
## 7 Mountain            14534.
## 8 East South Central  14459.
## 9 West North Central  14418.
  • Among all states, Alaska has the highest average loan amount which is $17,057.72, Montana has the lowest average loan amount which is $13696.50. I removed Iowa from the result because that might be an error, since loans are not available in Iowa.
  • Among all divisions, West South Central has the highest average loan amount which is $15,308.98, West North Central has the lowest average loan amount, which is $14,417.97.


3) Compare the average interest rate charged and average loan amount by the loan Grade. Do you notice any patterns?

#average loan amount by grade
loan_amount_average_grade <- group_by(df1, grade)
loan_amount_average_grade <- summarise(loan_amount_average_grade, average = mean(loan_amnt))
arrange(loan_amount_average_grade, desc(average))
## # A tibble: 7 × 2
##   grade average
##   <chr>   <dbl>
## 1 G      20577.
## 2 F      19202.
## 3 E      17895.
## 4 D      15752.
## 5 C      14828.
## 6 A      14265.
## 7 B      13688.
#average interest rate by grade
interest_rate_grade <- group_by(df1, grade)
interest_rate_grade<- summarise(interest_rate_grade, average = mean(as.numeric(sub("%", "",int_rate))/100))
arrange(interest_rate_grade, desc(average))
## # A tibble: 7 × 2
##   grade average
##   <chr>   <dbl>
## 1 G      0.283 
## 2 F      0.253 
## 3 E      0.214 
## 4 D      0.178 
## 5 C      0.140 
## 6 B      0.106 
## 7 A      0.0708
  • Generally speaking, loans with a higher grade would have a lower interest rate, and lower loan amount.


4) Run a frequency distribution of number of loans, average loan amount and average interest rate for each state by year (2012 through 2017). Describe the changing patterns in those numbers.

#number of loans per state per year
loan_number_state_year <- group_by(df1, Geography, issue_Year)
loan_number_state_year <- loan_number_state_year %>% count(issue_Year)

#average loan amount per state per year
loan_amount_average_state_year <- group_by(df1, Geography, issue_Year)
loan_amount_average_state_year <- summarise(loan_amount_average_state_year, average = mean(loan_amnt))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
#average interest rate per state per year
interest_rate_state_year <- group_by(df1, Geography, issue_Year)
interest_rate_state_year<- summarise(interest_rate_state_year, average = mean(as.numeric(sub("%", "",int_rate))/100))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
#Number of loans per year facet by state
ggplot(loan_number_state_year) + 
  geom_col(mapping = aes(x = issue_Year, y = n))+
  facet_wrap(~Geography,nrow=5)

#average loan amount per year facet by state
ggplot(loan_amount_average_state_year) + 
  geom_col(mapping = aes(x = issue_Year, y = average))+
  facet_wrap(~Geography,nrow=5)

#average interest rate per year facet by state
ggplot(interest_rate_state_year) + 
  geom_col(mapping = aes(x = issue_Year, y = average))+
  facet_wrap(~Geography,nrow=5)

  • Number of Loans: each state generally share a similar change in number of loans from 2012 to 2017, where there’s normally an obvious increase in the number of loans from year 2012 to 2015. And then from 2015 to 2017, the loan numbers don’t change much, in some states there are even slight decreases in the loan numbers between the year 2015 and 2017.
  • Average Loan Amount: The average loan amount of each state doesn’t really change much between year 2012 and 2017, except for certain states such as Maine, the average loan amount in 2014 is way higher than the other years.
  • Average Interest Rates: Interest rates don’t change much either. For Nebraska and Maine, the interest rates were higher during the year when the loan amount were higher than the other years.


5) Is there a relationship between the population size of a state and the average loan amount given? Is there a relationship between Grade of loans and median income level in a state?

loan_amount_average_state_pop <- group_by(df1, Geography, population)
loan_amount_average_state_pop <- summarise(loan_amount_average_state_pop, average = mean(loan_amnt))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
arrange(loan_amount_average_state_pop, desc(population))
## # A tibble: 51 × 3
## # Groups:   Geography [51]
##    Geography      population average
##    <chr>               <dbl>   <dbl>
##  1 California       38982847  15068.
##  2 Texas            27419612  15569.
##  3 Florida          20278447  14139.
##  4 New York         19798228  14688.
##  5 Illinois         12854526  15315.
##  6 Pennsylvania     12790505  14588.
##  7 Ohio             11609756  14182.
##  8 Georgia          10201635  15111.
##  9 North Carolina   10052564  14609.
## 10 Michigan          9925568  14141.
## # … with 41 more rows
#create bar chart where x=states, y=average loan amounts, order states by population size.
ggplot(loan_amount_average_state_pop) + 
  geom_col(mapping = aes(x = fct_rev(fct_rev(fct_reorder(Geography, population))), y = average))+
  coord_flip()

  • There’re not really an obvious relationship between the population size and the average loan amount of a state.
#compare the proportions of each grade of loans between states, order states by Median Income Households.
ggplot(df1) + 
  geom_bar(mapping = aes(x = fct_reorder(Geography, Median_income_Households), fill=grade), position="fill")+
  scale_fill_manual(values=c("#78c679", "#74a9cf", "#9e9ac8", "#ffffd4", "#fecc5c", "#fd8d3c", "#e31a1c"))+
  coord_flip()

  • My assumption was that states with generally higher income would have better quality loans, however it’s hard to compare the number of loans by grade. So I create the bar chart based on proportions, but it seems that the proportions are very similar between each state.


6) This is an open-ended question where you are asked to share an interesting fact that you found through data analysis. 1)

  • By completing the data analysis task 3, I know better about how loans are graded, and the relationship between loan grade and interest rate. When borrowers need more money, it is generally considered riskier than smaller amount of loan because borrowers’ capability to repay might be low, therefore the lender tend to offer a higher interest rates to make sure the the borrower pay back sooner.


Visualization


1) Create a plot of interest rates and Grade of a loan and describe the pattern.

#create bins for interest rates
df1 <- transform(df1, int_bin = cut(as.numeric(sub("%", "",int_rate)), breaks = c(0, 7.5, 10, 12.5, 15, 20, 30), labels = c("0-7.5", "7.5-10", "10-12.5", "12.5-15", "15-20", "20+")))

#remove missing values in interest rate
df6 <- subset(df1, !is.na(int_rate))

#plot interest rate groups by loan grade
ggplot(df6)+
 geom_bar(mapping=aes(x=int_bin, fill = grade))+ 
 facet_grid(grade ~.)+
  scale_fill_manual(values=c("#78c679", "#74a9cf", "#9e9ac8", "#ffffd4", "#fecc5c", "#fd8d3c", "#e31a1c"))

  • There’s a strong relationship between loan grade and interest rates, where loans with higher grade normally have lower interest rates.


2) Create a map of US states and color code the map with the average amount of loans given.

#left join our data set to us_states data, change all states headers to lowercase for mapping purpose.
us_states <- map_data("state")
loan_amount_average_state$state_lowercase = tolower(loan_amount_average_state$Geography)
avg_loan_map <- us_states %>%
  left_join(loan_amount_average_state, c("region"="state_lowercase"))

#create the map color coded by average loan amount
ggplot(data=avg_loan_map,
       mapping=aes(x=long, y=lat, group=group, fill=average))+
  geom_polygon(color="white")+
  coord_map(projection="albers", lat0=39, lat1=45)+
  scale_fill_gradient(low="White", high="#4273c2")


3) Show visually the relationship between the annual income of the recipient and the loan amount obtained from Lending Club

#create the variable for DTI (debt-to-loan ratio)
df1$DTI <- df1$loan_amnt/df1$annual_inc

ggplot(data = df1, mapping=aes(x=DTI)) + 
  geom_freqpoly(binwidth=0.05) + 
    coord_cartesian(xlim = c(0.01, 0.75))
## Warning: Removed 552 rows containing non-finite values (stat_bin).

  • From this graph we can tell that most of the loan amount are between 5%-30% of the borrower’s annual income.


4) Create a plot that shows the relationship between the length of employment and amount of loan obtained.

ggplot(data = df1, mapping=aes(x=emp_length, y=loan_amnt))+
geom_boxplot(mapping=aes(group=cut_width(loan_amnt,10000)))


5) Create a “regional” map and show an interesting relationship of your liking.

loan_number_region <- group_by(df1,Region)
loan_number_region <- loan_number_region %>% count(Region)

loan_number_region <- data_regions %>%
  left_join(loan_number_region, by="Region")
loan_number_region$state_lowercase=tolower(loan_number_region$State)

regional_map <- us_states %>%
  left_join(loan_number_region, c("region"="state_lowercase"))

#create the regional map color coded by number of loans
ggplot(data=regional_map,
       mapping=aes(x=long, y=lat, group=group, fill=n))+
  geom_polygon(color="white")+
  scale_fill_gradient(low="White", high="#4273c2")

  • This is the regional map of number of loans.