This report has the objective of analysing datasets from the Lending Club. There are two datasets called ‘LC_Accept.csv’ and ‘LC-Decline.csv’, they have information about the services of lending money online by matching individual lenders and borrowers.
Beyond that, the ‘LC_Accept.csv’ contains the accepted services, on the other hand, the ‘LC-Decline.csv’ contain the decline services.
#Set working directory
setwd("C:/Users/olive/Desktop/peer-to-peer (P2P) lending/Datasets")
#Set US location
Sys.setenv(LANG = "en_US.UTF-8")
#Load packages
library(ggplot2)
library(plotly)
library(zoo)
#Read dataset
accept <- read.csv('LC_Accept.csv',stringsAsFactors = FALSE)
decline <- read.csv('LC_Decline.csv',stringsAsFactors = FALSE)
#Print dataset structure
print(str(accept))
## 'data.frame': 71695 obs. of 7 variables:
## $ Date : int 201501 201501 201501 201501 201501 201501 201501 201501 201501 201501 ...
## $ State : chr "SD" "TX" "CO" "WA" ...
## $ Purpose : chr "Debt_Consolidation" "Credit_Card" "Debt_Consolidation" "Debt_Consolidation" ...
## $ Amount_Requested : int 18000 21000 9600 23000 22475 15850 24000 24800 18000 15000 ...
## $ Risk_Score : int 700 690 665 745 715 710 660 665 680 705 ...
## $ Debt_Income_Ratio: num 0.219 0.225 0.122 0.148 0.178 ...
## $ Employment_Length: int 4 11 7 1 8 1 11 6 9 4 ...
## NULL
print(str(decline))
## 'data.frame': 287169 obs. of 7 variables:
## $ Date : int 201501 201501 201501 201501 201501 201501 201501 201501 201501 201501 ...
## $ State : chr "IL" "WA" "FL" "NY" ...
## $ Purpose : chr "Debt_Consolidation" "Credit_Card" "Credit_Card" "Purchase" ...
## $ Amount_Requested : int 20000 35000 5000 15000 10500 5000 2000 5000 13000 35000 ...
## $ Risk_Score : int 514 671 661 617 527 602 657 645 720 708 ...
## $ Debt_Income_Ratio: num 0.1562 0.0635 0.4876 0.1664 0.1515 ...
## $ Employment_Length: int 0 0 0 0 0 0 0 0 0 0 ...
## NULL
#Transform as date for accept dataset
accept$Date <- paste(accept$Date, '15', sep='')
accept$Date <- as.Date(accept$Date,'%Y%m%d')
#Transform as date for decline dataset
decline$Date <- paste(decline$Date, '15', sep='')
decline$Date <- as.Date(decline$Date,'%Y%m%d')
#Groub by date
print('Group by Date')
## [1] "Group by Date"
accept_date <- aggregate(accept$Amount_Requested, by=list(accept$Date), FUN=sum)
decline_date <- aggregate(decline$Amount_Requested, by=list(decline$Date), FUN=sum)
print(accept_date)
## Group.1 x
## 1 2015-01-15 25517150
## 2 2015-02-15 17541050
## 3 2015-03-15 18664625
## 4 2015-04-15 25743000
## 5 2015-05-15 23018050
## 6 2015-06-15 20639300
## 7 2015-07-15 32804500
## 8 2015-08-15 26356725
## 9 2015-09-15 21206725
## 10 2015-10-15 35768650
## 11 2015-11-15 26991625
## 12 2015-12-15 31317625
## 13 2016-01-15 24935475
## 14 2016-02-15 29344100
## 15 2016-03-15 45028425
## 16 2016-04-15 25828525
## 17 2016-05-15 20174300
## 18 2016-06-15 23405425
## 19 2016-07-15 23140325
## 20 2016-08-15 24583375
## 21 2016-09-15 19355250
## 22 2016-10-15 21410875
## 23 2016-11-15 22990725
## 24 2016-12-15 24435925
## 25 2017-01-15 22019875
## 26 2017-02-15 19334325
## 27 2017-03-15 25999150
## 28 2017-04-15 20616250
## 29 2017-05-15 26107275
## 30 2017-06-15 25548250
## 31 2017-07-15 26902050
## 32 2017-08-15 29407025
## 33 2017-09-15 27749125
## 34 2017-10-15 27031750
## 35 2017-11-15 29939650
## 36 2017-12-15 27485400
## 37 2018-01-15 27479350
## 38 2018-02-15 24473400
## 39 2018-03-15 28974050
## 40 2018-04-15 32183775
## 41 2018-05-15 34049975
## 42 2018-06-15 30503075
print(decline_date)
## Group.1 x
## 1 2015-01-15 124829400
## 2 2015-02-15 92951725
## 3 2015-03-15 21827775
## 4 2015-04-15 6713750
## 5 2015-05-15 5329425
## 6 2015-06-15 6791500
## 7 2015-07-15 11697400
## 8 2015-08-15 9356975
## 9 2015-09-15 5884375
## 10 2015-10-15 14752255
## 11 2015-11-15 18251100
## 12 2015-12-15 22616050
## 13 2016-01-15 33732150
## 14 2016-02-15 51245850
## 15 2016-03-15 59921625
## 16 2016-04-15 48420575
## 17 2016-05-15 15563300
## 18 2016-06-15 9318075
## 19 2016-07-15 17674550
## 20 2016-08-15 27234700
## 21 2016-09-15 57120775
## 22 2016-10-15 105852975
## 23 2016-11-15 136059675
## 24 2016-12-15 121367050
## 25 2017-01-15 133772000
## 26 2017-02-15 119076450
## 27 2017-03-15 138840925
## 28 2017-04-15 123821125
## 29 2017-05-15 154172600
## 30 2017-06-15 138475350
## 31 2017-07-15 166736175
## 32 2017-08-15 188308875
## 33 2017-09-15 219165775
## 34 2017-10-15 295649325
## 35 2017-11-15 362381575
## 36 2017-12-15 322945175
## 37 2018-01-15 225254425
## 38 2018-02-15 11477200
## 39 2018-03-15 12151925
## 40 2018-04-15 12209625
## 41 2018-05-15 13216250
## 42 2018-06-15 7249200
#Set columns names
names(accept_date) <- c('Date','Amount Requested')
colnames(decline_date) <- colnames(accept_date)
The first task of the report is to analyse the acceptance rates of loan applications over the months. The formula used to calculate the acceptance rate was rate = accept/ (accept + decline).
It was created a line plot to illustrate how the acceptance rate behaves through time, the graph show the period from January 2015 until June 2018.
#Rate variable
rate <-
accept_date$`Amount Requested` / (accept_date$`Amount Requested` + decline_date$`Amount Requested`)
#Dataset for graph 1
graph_df <- data.frame(Date=accept_date$Date,
Rate = rate)
#Transform date into month and year
graph_df$Date <- as.yearmon(graph_df$Date, '%Y%m')
#Plot Task 1
plot1 <- ggplot(graph_df, aes(x=Date,y=Rate))+
geom_line()+
scale_y_continuous(breaks = seq(0, 1, by = 0.1))+
theme_bw()
ggplotly(plot1)
Observing the graph from Task 1 it is possible to conclude that, the most important periods to increase the acceptance of the lending money services provided were during the December 2017 to February 2018 and from January 2015 until April 2015.
The second task consists of creating a side-by-side bar chart of loan purposes grouped by the status of accept and decline. The datasets have a total of five loan purposes.
#Unite dataframes for graph
graph_df_task2 <-
data.frame(
Purpose = c(accept$Purpose, decline$Purpose),
Status = c(rep('Accept', nrow(accept)), rep('Decline', nrow(decline)))
)
graph_df_task2 <- graph_df_task2 %>%
group_by(Purpose, Status) %>%
summarise(count = n())
#Plot Task 2
plot2 <- ggplot(graph_df_task2, aes(x = Purpose, y = count, fill = Status)) +
geom_col(position='dodge') +
scale_fill_manual(values = c('green','red'))+
scale_y_continuous(breaks = seq(0, 140000, by = 25000))+
labs(y='Count')+
theme_bw()
ggplotly(plot2,tooltip = c('count'))
The graph created in Task 2 shows that the purpose Debt Consolidation was the one with more number of accept, in the other hand, this was the same purpose with the higher number of decline.
In this final task, it will be created a graph to compare the overall value of the amount requested for the services. It will be possible to exam the variance of the amount requested for the decline and accept services.
#Unite dataframes for graph
graph_df_task3 <-
data.frame(
Amount_Requested = c(accept$Amount_Requested, decline$Amount_Requested),
Status = c(rep('Accept', nrow(accept)), rep('Decline', nrow(decline)))
)
#Plot Task 3
plot3 <- ggplot(graph_df_task3, aes(y = Amount_Requested, x = Status, fill=Status)) +
geom_boxplot() +
scale_fill_manual(values = c('green','red'))+
theme_bw() +
labs(x='')+
theme(axis.text.x = element_blank())
ggplotly(plot3)
It can be observed with the graph for Task 3 that the accept amounts requested are in general higher than the declines ones. Beyond that, it can be noticed that the decline amounts requested have a higher variance.