library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr) # For data manipulation
library(ggplot2)
##First, read all the file into dataframe. In the future, I need to figure out a way to read multiple files all at once. But for this one, I read everything seperately. I noticed that the data2012 does not have column names. I copy and paste directly from data2013. But later find out the other 4 data files also have no column names. I will deal with it later.
#cname_path <- 'C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\DataDictionary.txt'
file_path1 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2012.csv"
file_path2 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2013.csv"
file_path3 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2014.csv"
file_path4 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2015.csv"
file_path5 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2016.csv"
file_path6 <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\data2017.csv"
States <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\States.csv"
States_regions <- "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\states_regions.csv"
d1 <- read_csv(file_path1)
## New names:
## Rows: 53368 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (19): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (13): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, open_acc, total_pym...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
d2 <- read_csv(file_path2)
## New names:
## Rows: 134814 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne... dbl
## (16): ...1, loan_amnt, annual_inc, dti, delinq_2yrs, inq_last_6mths, mth...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
d3 <- read_csv(file_path3)
## New names:
## Rows: 235629 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): Unnamed: 5, Unnamed: 6, Unnamed: 8, Unnamed: 9, Unnamed: 10, Unnam... dbl
## (16): ...1, Unnamed: 2, Unnamed: 13, Unnamed: 24, Unnamed: 25, Unnamed: ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
d4 <- read_csv(file_path4)
## New names:
## Rows: 421095 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): Unnamed: 5, Unnamed: 6, Unnamed: 8, Unnamed: 9, Unnamed: 10, Unnam... dbl
## (16): ...1, Unnamed: 2, Unnamed: 13, Unnamed: 24, Unnamed: 25, Unnamed: ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
d5 <- read_csv(file_path5)
## New names:
## Rows: 434407 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): Unnamed: 5, Unnamed: 6, Unnamed: 8, Unnamed: 9, Unnamed: 10, Unnam... dbl
## (16): ...1, Unnamed: 2, Unnamed: 13, Unnamed: 24, Unnamed: 25, Unnamed: ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
d6 <- read_csv(file_path6)
## New names:
## Rows: 443579 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): Unnamed: 5, Unnamed: 6, Unnamed: 8, Unnamed: 9, Unnamed: 10, Unnam... dbl
## (16): ...1, Unnamed: 2, Unnamed: 13, Unnamed: 24, Unnamed: 25, Unnamed: ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
df_States <- read_csv(States)
## Rows: 52 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Geography
## dbl (7): Num_Households, Median_income_Households, Unemployment rate; Estima...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_States_regions <- read_csv(States_regions)
## New names:
## Rows: 51 Columns: 5
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): State, State Code, Region, Division lgl (1): ...5
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...5`
This step helps me to fill all the dataframe that are missing column names.
cname <- colnames(d1)
colnames(d3) <- cname
colnames(d4) <- cname
colnames(d5) <- cname
colnames(d6) <- cname
I got the errors that tell me below columns in data2012 was imported as character instead of numeric. So I converted all of them manually. Then I bind all the data frames.
d1$inq_last_6mths <- as.numeric(d1$inq_last_6mths)
## Warning: NAs introduced by coercion
d1$mths_since_last_delinq <- as.numeric(d1$mths_since_last_delinq)
## Warning: NAs introduced by coercion
d1$pub_rec_bankruptcies<- as.numeric(d1$pub_rec_bankruptcies)
## Warning: NAs introduced by coercion
df <- bind_rows(d1,d2,d3,d4,d5,d6)
This step I merged the state column with the common value column. We can see that 12 of the columns does not have anything but case id and origin.index. It’s safe to remove them to keep a clean record of data.
df %>%
count(is.na(grade))
dff <- df %>%
filter(!is.na(grade))
state_dd <- left_join(df_States,df_States_regions, join_by("Geography" == "State"))
head(state_dd)
Final step, merge the large data frame with the state info on the common value column.
df_final <-left_join(dff,state_dd, join_by("addr_state"=="State Code"))
#write.csv(df_final, "C:\\Users\\autom\\Documents\\2023\\2023summer\\R studio and tools\\FinalCase\\Lending_group.csv", row.names=FALSE)
For now, we have a basic data frame that contains all the data. Just print here and see what type of question can we ask.
head(df_final)
df_final %>%
count(is.na(addr_state), is.na(Geography), is.na(Region), is.na(Division))
Analysis Use the above file to analyze and answer the following questions: 1) Find the distribution of number of loans by state, regions and divisions. Describe in your own words the geographic differences in the number of loans. Also, analyze your results by comparing number of loans per capita. Did you notice any missing states in the Lending Club data? If yes, then find out why.
dist_state <- df_final %>%
group_by(Geography) %>%
summarise(nb =n()) %>%
arrange(desc(nb))
dist_regions <- df_final %>%
group_by(Region) %>%
summarise(nb =n()) %>%
arrange(desc(nb))
dist_division <- df_final %>%
group_by(Division) %>%
summarise(nb =n()) %>%
arrange(desc(nb))
dist_state$Geography <- factor(dist_state$Geography, levels = dist_state$Geography[order(dist_state$nb)])
ggplot(dist_state, aes(x =Geography, y = nb), color = Geography) +
geom_bar(stat = "identity")+
theme(axis.text.y = element_text(size=10))+
coord_flip()
California has the most loan cases. The next 2 are New York and Texas.
Iowa, Puerto Rico and North Dakota are the 3 least loan state.
dist_regions$Region <- factor(dist_regions$Region, levels = dist_regions$Region[order(dist_regions$nb)])
ggplot(dist_regions, aes(x = Region, y = nb)) +
geom_bar(stat = "identity")+
theme(axis.text.y = element_text(angle = 90, vjust = 0.5, hjust=1))
dist_division$Division <- factor(dist_division$Division, levels = dist_division$Division[order(dist_division$nb)])
ggplot(dist_division, aes(x = Division, y = nb),fill= Region) +
geom_bar(stat = "identity")+coord_flip()+
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=1))
Looks like South Atlantic has the most loan cases compares to other divisions. Regionally, South has the most loan cases. Statewise, California has the most cases.
We have a trend in Division, Region, and states.I wonder if this related to the population in each category. Will explore in the following analysis. I realize that the calculation in the beginning might not required to plot the bar chart. Next I want to filter out the NA value and use the new data set to plot.
n_t1 <- df_final
ggplot(n_t1)+
geom_bar(mapping = aes(x = fct_rev(fct_infreq(Geography)), fill=Division), na.rm=FALSE)+
theme(axis.text.y = element_text(size=10))+
coord_flip()+
facet_wrap(~Division,nrow=2)
We can see that although California has the most cases, the Pacific
division it belongs to is the 2nd on the case number. On the other hand,
South Atlantic, which include Florida,is the top on the cases.
#Facet by regions
ggplot(n_t1) +
geom_bar(mapping = aes(x = fct_rev(fct_infreq(Geography)), fill=Region), na.rm=FALSE)+
theme(axis.text.y = element_text(size=10))+
coord_flip()+
facet_wrap(~Region,nrow=1)
state_dd %>%
group_by(Region) %>%
count()
From the chart we can tell that the loan cases is influenced by how many states under that region. And then influenced by the state cases.
Then next question is, how does the loan cases influenced by the population by each state?
#Count observations per state
df2 <- add_count(n_t1, Geography)
#Calculation loan count per capita
df3 <- mutate(df2, loan_per_capita = n / population)
#Select distinct geography + loan_per_capita
df4 <- df3 %>%
select (Geography, loan_per_capita) %>%
distinct()
#Create bar chart of loan_per_capita for each state
ggplot(df4) +
geom_col(mapping = aes(x = fct_rev(fct_rev(fct_reorder(Geography, loan_per_capita))), y = loan_per_capita))+
coord_flip()
Remove the influence of the population, Nevada has the highest loan
cases. Iowa still the least. New York is still on the top. We should use
the adjusted measurement such as mean or percentage instead of the total
number in the following analysis to get a less biased result.
n_t2<-df_final %>%
group_by(Geography, Division) %>%
summarise(avg_loan = mean(loan_amnt, na.rm=TRUE)) %>%
arrange(desc(avg_loan))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
n_t2
From the data frame, we can see that Alaska, Pacific has the highest average loan of 17057.72. Iowa, West North Central has the lowest average loan of 6962.5.
n_t3<-df_final %>%
filter(!is.na(loan_amnt))%>%
group_by(Geography) %>%
summarise(avg_loan = mean(loan_amnt, na.rm=TRUE)) %>%
arrange(desc(avg_loan))
n_t3
Interesting thing is that Alaska has the highest average loan. Our top cases state such as CA, NY, and FL are in the middle of the list.
n_t4<-df_final %>%
filter(!is.na(Division)) %>%
group_by( Division) %>%
summarise(avg_loan = mean(loan_amnt, na.rm=TRUE)) %>%
arrange(desc(avg_loan))
n_t4
West South Central has the highest average loan of 15308.98. And West North Central has the lowest average loan of 14417.97.
df_f <- df_final %>%
mutate(rate_n =as.numeric(sub("%","",int_rate),na.rm=TRUE))
head(df_f)
df_f %>%
group_by(grade) %>%
summarise(avg_loan = mean(loan_amnt,na.rm = TRUE),
avg_int = mean(rate_n)) %>%
arrange(desc(avg_int))
The interesting thing is, the higher the interest rate, the lower the loan amount. And the average interest rate increase as the grade proceed alphabetically.
df_f %>%
group_by(sub_grade) %>%
summarise(avg_loan = mean(loan_amnt,na.rm = TRUE),
avg_int = mean(rate_n)) %>%
arrange(desc(avg_int))
We can see the same pattern in sub-grade.
avg_result <-df_f %>%
group_by(Geography, issue_Year) %>%
summarise(avg_loan= mean(loan_amnt, na.rm=TRUE),
avg_int = mean(rate_n))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
head(avg_result)
Facet Geography vs years
df_f %>%
filter(!is.na(Geography), !is.na(issue_Year)) %>%
count(Geography, issue_Year)%>%
ggplot(aes(issue_Year,n))+
geom_bar(stat="identity")+
facet_wrap(~Geography, nrow=6)
I can see the pattern that the loan numbers increased from 2012-2016 at
each state. And it got stable in 2016 and 2017.
avg_result %>%
filter(!is.na(Geography), !is.na(issue_Year)) %>%
ggplot(aes(issue_Year,avg_loan))+
geom_bar(stat="identity")+
facet_wrap(~Geography, nrow=6)
it’s interesting to see that most of the state has a rather stable
average loan amount. Massachusetts has no loan in 2012 and 2025.
Mississippi only has loan in 2013 and 2014. Nevada started the loan in
2014 and the average loan dropped after the first year. Oklahoma has no
loan record in 2014. Texas started the loan in 2015. Average loan for
each state are on the same level and kept stable.
avg_result %>%
ggplot(aes(issue_Year,avg_int))+
geom_bar(stat="identity")+
facet_wrap(~Geography, nrow=6)
The interest rate for each state follows the same pattern with the
average_loan. Average interest rate for Oklahoma in 2012 is rather
higher than any of other state or year. Then dropped to the similar
level as the other state.
popsize_loan <- df_f %>%
group_by(Geography, population) %>%
summarise(avg= mean(loan_amnt, na.rm = TRUE))%>%
arrange(desc(avg))
## `summarise()` has grouped output by 'Geography'. You can override using the
## `.groups` argument.
head(popsize_loan)
ggplot(popsize_loan) +
geom_col(mapping = aes(x = fct_rev(fct_rev(fct_reorder(Geography, population))), y = avg))+
coord_flip()
Two outliers we can get from the chart is Iowa and Alaska. Iowa has the
lowest average amount but their population is in the middle level of all
the states. And Alaska has the highest average loan despite the total
population as the 4th least of all the states. Besides these 2, we can
see that the state population size does not influence the average loan
amount in an obvious pattern.
Is there a relationship between Grade of loans and median income level in a state?
ggplot(df_f) +
geom_bar(mapping = aes(x = fct_reorder(Geography, Median_income_Households), fill=grade), position="fill")+
coord_flip()
We don’t have a clear pattern for this grade and median loan level. The only outlier is Iwoa.
From above analysis, we tried to see the relationship of cases with Region, Division, States. And clearly there is a pattern. But remove the influence of population, the pattern become less obvious. We can also see outliers such as Alaska and Iowa. And the loan start date difference is couple of the states.
Visualization 1) Create a plot of interest rates and Grade or a loan and describe the pattern.
Like we analyzed in the previous steps, the interest rate increase as the Grade proceed alphabetically. And clearly the range of different grade can be from 5%(A) to almost 30%(G).
ggplot(df_f, aes(x=grade, y = rate_n)) +
geom_boxplot(fill = "lightblue", color = "black")
avg_loa_by_state <- df_f %>%
group_by(Geography) %>%
summarise(avg_loan = mean(loan_amnt, na.rm= TRUE))
head(avg_loa_by_state)
Load the state map to the data set. Then join it with the average state loan.
library(ggplot2)
state_map <-map_data("state")
head(state_map)
new_table <- avg_loa_by_state %>%
mutate(state = tolower(Geography)) %>%
select(state, avg_loan)
state_map_with_data <- new_table %>%
left_join(state_map, join_by( "state"=="region" ))
head(state_map_with_data)
ggplot(state_map_with_data, aes(x = long, y = lat, group = group, fill = avg_loan)) +
geom_polygon(color="white")
income_loan <- df_f %>%
summarise(rate=loan_amnt/annual_inc)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(income_loan, aes(rate))+
geom_freqpoly(binwidth=0.05) +
coord_cartesian(xlim = c(0.01, 1))
## Warning: Removed 552 rows containing non-finite values (`stat_bin()`).
Looks like most of the loan rate are between 2% to 35%.
income_loan %>%
filter(!is.na(rate))%>%
summarise(max= max(rate),
min= min(rate),
med=median(rate),
mod = mode(rate)
)
Due to the missing value, the highest rate is infinite. Median rate is 20%.
emp_loan <- df_final %>%
filter(!is.na(emp_length), !is.na(loan_amnt))
ggplot(emp_loan, aes(emp_length, loan_amnt, na.rm=TRUE))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
We can tell that in general, the longer the employment,the higher the
loan_amount. But in general, the employment length is not a huge
influence to the loan amount.
avg_int_region <- df_f %>%
group_by(Region) %>%
summarise(avg_int = mean(rate_n, na.rm= TRUE)) %>%
arrange(desc(avg_int))
avg_int_region
Region_map_with_data <- avg_int_region %>%
left_join(state_dd) %>%
mutate(state=tolower(Geography))%>%
right_join(state_map, join_by("state"=="region"))
## Joining with `by = join_by(Region)`
head(Region_map_with_data)
ggplot(Region_map_with_data , aes(x = long, y = lat, group = group, fill = avg_int)) +
geom_polygon(color="white")
From the plot, we can see South has the highest average interest rate. West has the lowest average interest rate. But according to the table. These 4 region does not have huge difference.