library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)      # For data manipulation
library(ggplot2)

##First, read all the file into dataframe. In the future, I need to figure out a way to read multiple files all at once. But for this one, I read everything seperately. I noticed that the data2012 does not have column names. I copy and paste directly from data2013. But later find out the other 4 data files also have no column names. I will deal with it later.

#cname_path <- 'C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\DataDictionary.txt'
file_path1 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2012.csv"
file_path2 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2013.csv"
file_path3 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2014.csv"
file_path4 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2015.csv"
file_path5 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2016.csv"
file_path6 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2017.csv"
States <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\States.csv"
States_regions <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\states_regions.csv"
d1 <- read_csv(file_path1)
## New names:
## Rows: 53368 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (19): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (13): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, open_acc, total_pym...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
d2 <- read_csv(file_path2)
## New names:
## Rows: 134814 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
d3 <- read_csv(file_path3)
## New names:
## Rows: 235629 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): Unnamed: 5, Unnamed: 6, Unnamed: 8, Unnamed: 9, Unnamed: 10, Unnam... dbl
## (16): ...1, Unnamed: 2, Unnamed: 13, Unnamed: 24, Unnamed: 25, Unnamed: ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
d4 <- read_csv(file_path4)
## New names:
## Rows: 421095 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): Unnamed: 5, Unnamed: 6, Unnamed: 8, Unnamed: 9, Unnamed: 10, Unnam... dbl
## (16): ...1, Unnamed: 2, Unnamed: 13, Unnamed: 24, Unnamed: 25, Unnamed: ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
d5 <- read_csv(file_path5)
## New names:
## Rows: 434407 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): Unnamed: 5, Unnamed: 6, Unnamed: 8, Unnamed: 9, Unnamed: 10, Unnam... dbl
## (16): ...1, Unnamed: 2, Unnamed: 13, Unnamed: 24, Unnamed: 25, Unnamed: ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
d6 <- read_csv(file_path6)
## New names:
## Rows: 443579 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): Unnamed: 5, Unnamed: 6, Unnamed: 8, Unnamed: 9, Unnamed: 10, Unnam... dbl
## (16): ...1, Unnamed: 2, Unnamed: 13, Unnamed: 24, Unnamed: 25, Unnamed: ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
df_States <-   read_csv(States)
## Rows: 52 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Geography
## dbl (7): Num_Households, Median_income_Households, Unemployment rate; Estima...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_States_regions <-   read_csv(States_regions)
## New names:
## Rows: 51 Columns: 5
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): State, State Code, Region, Division lgl (1): ...5
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...5`

This step helps me to fill all the dataframe that are missing column names.

cname <- colnames(d1)
colnames(d3) <- cname
colnames(d4) <- cname
colnames(d5) <- cname
colnames(d6) <- cname

I got the errors that tell me below columns in data2012 was imported as character instead of numeric. So I converted all of them manually. Then I bind all the data frames.

d1$inq_last_6mths <- as.numeric(d1$inq_last_6mths)
## Warning: NAs introduced by coercion
d1$mths_since_last_delinq <- as.numeric(d1$mths_since_last_delinq)
## Warning: NAs introduced by coercion
d1$pub_rec_bankruptcies<- as.numeric(d1$pub_rec_bankruptcies)
## Warning: NAs introduced by coercion
df <- bind_rows(d1,d2,d3,d4,d5,d6)

This step I merged the state column with the common value column. We can see that 12 of the columns does not have anything but case id and origin.index. It’s safe to remove them to keep a clean record of data.

df %>%
  count(is.na(grade))
dff <- df %>%
  filter(!is.na(grade))
state_dd <- left_join(df_States,df_States_regions, join_by("Geography" == "State"))
head(state_dd)

Final step, merge the large data frame with the state info on the common value column.

df_final <-left_join(dff,state_dd, join_by("addr_state"=="State Code"))
#write.csv(df_final, "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\Lending_group.csv", row.names=FALSE)

For now, we have a basic data frame that contains all the data. Just print here and see what type of question can we ask.

head(df_final)
df_final %>%
  count(is.na(addr_state), is.na(Geography), is.na(Region), is.na(Division))

Analysis Use the above file to analyze and answer the following questions: 1) Find the distribution of number of loans by state, regions and divisions. Describe in your own words the geographic differences in the number of loans. Also, analyze your results by comparing number of loans per capita. Did you notice any missing states in the Lending Club data? If yes, then find out why.

dist_state <- df_final %>%
  group_by(Geography) %>%
  summarise(nb =n()) %>%
  arrange(desc(nb))


dist_regions <- df_final %>%
  group_by(Region) %>%
  summarise(nb =n()) %>%
  arrange(desc(nb))

dist_division <- df_final %>%
  group_by(Division) %>%
  summarise(nb =n()) %>%
  arrange(desc(nb))
dist_state$Geography <- factor(dist_state$Geography, levels = dist_state$Geography[order(dist_state$nb)])

ggplot(dist_state, aes(x =Geography, y = nb), color = Geography) +
  geom_bar(stat = "identity")+
    theme(axis.text.y = element_text(size=10))+
  coord_flip()

California has the most loan cases. The next 2 are New York and Texas. Iowa, Puerto Rico and North Dakota are the 3 least loan state.

dist_regions$Region <- factor(dist_regions$Region, levels = dist_regions$Region[order(dist_regions$nb)]) 
  

ggplot(dist_regions, aes(x = Region,  y = nb)) +
  geom_bar(stat = "identity")+ 
  theme(axis.text.y = element_text(angle = 90, vjust = 0.5, hjust=1))

dist_division$Division <- factor(dist_division$Division, levels = dist_division$Division[order(dist_division$nb)])

ggplot(dist_division, aes(x = Division,  y = nb),fill= Region) +
  geom_bar(stat = "identity")+coord_flip()+ 
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=1))

Looks like South Atlantic has the most loan cases compares to other divisions. Regionally, South has the most loan cases. Statewise, California has the most cases.

We have a trend in Division, Region, and states.I wonder if this related to the population in each category. Will explore in the following analysis. I realize that the calculation in the beginning might not required to plot the bar chart. Next I want to filter out the NA value and use the new data set to plot.

n_t1 <- df_final

ggplot(n_t1)+
   geom_bar(mapping = aes(x = fct_rev(fct_infreq(Geography)), fill=Division), na.rm=FALSE)+
  theme(axis.text.y = element_text(size=10))+
  coord_flip()+
  facet_wrap(~Division,nrow=2)

We can see that although California has the most cases, the Pacific division it belongs to is the 2nd on the case number. On the other hand, South Atlantic, which include Florida,is the top on the cases.

#Facet by regions
ggplot(n_t1) + 
  geom_bar(mapping = aes(x = fct_rev(fct_infreq(Geography)), fill=Region), na.rm=FALSE)+
  theme(axis.text.y = element_text(size=10))+
  coord_flip()+
  facet_wrap(~Region,nrow=1)

state_dd %>%
  group_by(Region) %>%
  count()

From the chart we can tell that the loan cases is influenced by how many states under that region. And then influenced by the state cases.

Then next question is, how does the loan cases influenced by the population by each state?

#Count observations per state
df2 <- add_count(n_t1, Geography)

#Calculation loan count per capita
df3 <- mutate(df2, loan_per_capita = n / population)

#Select distinct geography + loan_per_capita
df4 <- df3 %>%
  select (Geography, loan_per_capita) %>% 
  distinct()

#Create bar chart of loan_per_capita for each state
ggplot(df4) + 
  geom_col(mapping = aes(x = fct_rev(fct_rev(fct_reorder(Geography, loan_per_capita))), y = loan_per_capita))+
  coord_flip()

Remove the influence of the population, Nevada has the highest loan cases. Iowa still the least. New York is still on the top. We should use the adjusted measurement such as mean or percentage instead of the total number in the following analysis to get a less biased result.

  1. Compare the average amount of loans granted by all states and divisions. Which states and divisions have the highest and lowest average loan amounts?
n_t2<-df_final %>%
  group_by(Geography, Division) %>%
  summarise(avg_loan = mean(loan_amnt, na.rm=TRUE)) %>%
  arrange(desc(avg_loan))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
n_t2

From the data frame, we can see that Alaska, Pacific has the highest average loan of 17057.72. Iowa, West North Central has the lowest average loan of 6962.5.

n_t3<-df_final %>%
  filter(!is.na(loan_amnt))%>%
  group_by(Geography) %>%
  summarise(avg_loan = mean(loan_amnt, na.rm=TRUE)) %>%
  arrange(desc(avg_loan))
n_t3

Interesting thing is that Alaska has the highest average loan. Our top cases state such as CA, NY, and FL are in the middle of the list.

n_t4<-df_final %>%
  filter(!is.na(Division)) %>%
  group_by( Division) %>%
  summarise(avg_loan = mean(loan_amnt, na.rm=TRUE)) %>%
  arrange(desc(avg_loan))
n_t4

West South Central has the highest average loan of 15308.98. And West North Central has the lowest average loan of 14417.97.

  1. Compare the average interest rate charged and average loan amount by the loan Grade. Do you notice any patterns? Interest rate was used a lot so we add a column we can directly use.
df_f <- df_final %>%
  mutate(rate_n =as.numeric(sub("%","",int_rate),na.rm=TRUE))
head(df_f)
df_f %>%
  group_by(grade) %>%
  summarise(avg_loan = mean(loan_amnt,na.rm = TRUE),
            avg_int = mean(rate_n)) %>%
  arrange(desc(avg_int))

The interesting thing is, the higher the interest rate, the lower the loan amount. And the average interest rate increase as the grade proceed alphabetically.

df_f %>%
  group_by(sub_grade) %>%
  summarise(avg_loan = mean(loan_amnt,na.rm = TRUE),
            avg_int = mean(rate_n)) %>%
  arrange(desc(avg_int))

We can see the same pattern in sub-grade.

  1. Run a frequency distribution of number of loans, average loan amount and average interest rate for each state by year (2012 through 2017). Describe the changing patterns in those numbers.
avg_result <-df_f %>%
  group_by(Geography, issue_Year) %>%
  summarise(avg_loan= mean(loan_amnt, na.rm=TRUE),
            avg_int = mean(rate_n))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
head(avg_result)

Facet Geography vs years

df_f %>%
  filter(!is.na(Geography), !is.na(issue_Year)) %>%
  count(Geography, issue_Year)%>% 
  ggplot(aes(issue_Year,n))+
  geom_bar(stat="identity")+
  facet_wrap(~Geography, nrow=6)

I can see the pattern that the loan numbers increased from 2012-2016 at each state. And it got stable in 2016 and 2017.

avg_result  %>%
  filter(!is.na(Geography), !is.na(issue_Year)) %>%
  ggplot(aes(issue_Year,avg_loan))+
  geom_bar(stat="identity")+
  facet_wrap(~Geography, nrow=6)

it’s interesting to see that most of the state has a rather stable average loan amount. Massachusetts has no loan in 2012 and 2025. Mississippi only has loan in 2013 and 2014. Nevada started the loan in 2014 and the average loan dropped after the first year. Oklahoma has no loan record in 2014. Texas started the loan in 2015. Average loan for each state are on the same level and kept stable.

avg_result  %>%
  ggplot(aes(issue_Year,avg_int))+
  geom_bar(stat="identity")+
  facet_wrap(~Geography, nrow=6)

The interest rate for each state follows the same pattern with the average_loan. Average interest rate for Oklahoma in 2012 is rather higher than any of other state or year. Then dropped to the similar level as the other state.

  1. Is there a relationship with the population size of a state and the average loan amount given? Is there a relationship between Grade of loans and median income level in a state?
popsize_loan <- df_f %>%
  group_by(Geography, population) %>%
  summarise(avg= mean(loan_amnt, na.rm = TRUE))%>%
  arrange(desc(avg))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
head(popsize_loan)
ggplot(popsize_loan) + 
  geom_col(mapping = aes(x = fct_rev(fct_rev(fct_reorder(Geography, population))), y = avg))+
  coord_flip()

Two outliers we can get from the chart is Iowa and Alaska. Iowa has the lowest average amount but their population is in the middle level of all the states. And Alaska has the highest average loan despite the total population as the 4th least of all the states. Besides these 2, we can see that the state population size does not influence the average loan amount in an obvious pattern.

Is there a relationship between Grade of loans and median income level in a state?

ggplot(df_f) + 
  geom_bar(mapping = aes(x = fct_reorder(Geography, Median_income_Households), fill=grade), position="fill")+
    coord_flip()

We don’t have a clear pattern for this grade and median loan level. The only outlier is Iwoa.

  1. This is an open-ended question where you are asked to share an interesting fact that you found through data analysis.

From above analysis, we tried to see the relationship of cases with Region, Division, States. And clearly there is a pattern. But remove the influence of population, the pattern become less obvious. We can also see outliers such as Alaska and Iowa. And the loan start date difference is couple of the states.

Visualization 1) Create a plot of interest rates and Grade or a loan and describe the pattern.

Like we analyzed in the previous steps, the interest rate increase as the Grade proceed alphabetically. And clearly the range of different grade can be from 5%(A) to almost 30%(G).

ggplot(df_f, aes(x=grade, y = rate_n)) +
  geom_boxplot(fill = "lightblue", color = "black") 

  1. Create a map of US states and color code the map with the average amount of loans given.
avg_loa_by_state <- df_f %>%
  group_by(Geography) %>%
  summarise(avg_loan = mean(loan_amnt, na.rm= TRUE))

head(avg_loa_by_state)

Load the state map to the data set. Then join it with the average state loan.

library(ggplot2)
state_map <-map_data("state")
head(state_map)
new_table <- avg_loa_by_state %>%
  mutate(state = tolower(Geography)) %>%
  select(state, avg_loan)


state_map_with_data <- new_table %>%
  left_join(state_map, join_by( "state"=="region" ))
head(state_map_with_data)
ggplot(state_map_with_data, aes(x = long, y = lat, group = group, fill = avg_loan)) +
    geom_polygon(color="white")

  1. Show visually the relationship between the annual income of the recipient and the loan amount obtained from Lending Club
income_loan <- df_f %>%
  summarise(rate=loan_amnt/annual_inc)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(income_loan, aes(rate))+
   geom_freqpoly(binwidth=0.05) + 
    coord_cartesian(xlim = c(0.01, 1))
## Warning: Removed 552 rows containing non-finite values (`stat_bin()`).

Looks like most of the loan rate are between 2% to 35%.

income_loan %>%
  filter(!is.na(rate))%>%
  summarise(max= max(rate),
            min= min(rate),
            med=median(rate),
            mod = mode(rate)
            )

Due to the missing value, the highest rate is infinite. Median rate is 20%.

  1. Create a plot that shows the relationship between the length of employment and amount of loan obtained.
emp_loan <- df_final %>%
  filter(!is.na(emp_length), !is.na(loan_amnt))
  
ggplot(emp_loan, aes(emp_length, loan_amnt, na.rm=TRUE))+
         geom_boxplot()+
  theme(axis.text.x =  element_text(angle = 90, vjust = 0.5, hjust=1))

We can tell that in general, the longer the employment,the higher the loan_amount. But in general, the employment length is not a huge influence to the loan amount.

  1. Create a “regional” map and show an interesting relationship of your liking
avg_int_region <- df_f %>%
  group_by(Region) %>%
  summarise(avg_int = mean(rate_n, na.rm= TRUE)) %>%
  arrange(desc(avg_int))

avg_int_region
Region_map_with_data <- avg_int_region %>%
  left_join(state_dd) %>%
  mutate(state=tolower(Geography))%>%
  right_join(state_map, join_by("state"=="region"))
## Joining with `by = join_by(Region)`
head(Region_map_with_data)
ggplot(Region_map_with_data , aes(x = long, y = lat, group = group, fill = avg_int)) +
    geom_polygon(color="white")

From the plot, we can see South has the highest average interest rate. West has the lowest average interest rate. But according to the table. These 4 region does not have huge difference.