This markdown file presents the R code and analysis presented during my Strata 2013 forecasting presentation:
http://strataconf.com/strata2013/public/schedule/detail/27630
I go through a quick forecasting example using Lending Club and Prosper's loan data which are available for download here:
https://www.lendingclub.com/info/download-data.action http://www.prosper.com/tools/DataExport.aspx
Load the libraries and data
library(ggplot2)
library(gridExtra)
## Loading required package: grid
library(scales)
library(zoo)
## Attaching package: 'zoo'
## The following object(s) are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(forecast)
## This is forecast 4.01
library(TTR)
## Loading required package: xts
setwd("~/programming/r_data/lending_club")
lc <- read.csv("lendingclub0205.csv")
prosper <- read.csv("prosper0205.csv")
Sanatize and clean the columns (casts, conversions, renaming, etc.)
lc$Interest.Rate <- as.numeric(sub("%", "", as.character(lc$Interest.Rate)))/100
lc$Application.Date <- as.Date(as.character(lc$Application.Date), format = "%m/%d/%y")
lc$Application.Expiration.Date <- as.Date(as.character(lc$Application.Expiration.Date),
format = "%m/%d/%y")
lc$Issued.Date <- as.Date(as.character(lc$Issued.Date), format = "%m/%d/%y")
lc$Earliest.CREDIT.Line <- as.Date(as.character(lc$Earliest.CREDIT.Line), format = "%m/%d/%y")
lc$Debt.To.Income.Ratio <- as.numeric(sub("%", "", as.character(lc$Debt.To.Income.Ratio)))/100
lc$Revolving.Line.Utilization <- as.numeric(sub("%", "", as.character(lc$Revolving.Line.Utilization)))/100
names(lc) <- c("loanid", "amount.requested", "amount.funded", "interest", "loan.length",
"app.date", "app.expiration.date", "issue.date", "credit.grade", "loan.title",
"loan.purpose", "loan.description", "monthly.payment", "status", "total.funded",
"debt.to.income", "remaining.principle.funded", "payments.to.date.funded",
"remaining.principle", "payments.to.date", "screen.name", "city", "state",
"home.ownership", "monthly.income", "fico.range", "earliest.credit.line",
"open.credit.lines", "total.credit.lines", "revolving.credit.balance", "revolving.line.utilization",
"6month.inquiries", "accounts.now.delinquent", "delinquent.amount", "2year.delinquencies",
"months.since.delinquency", "records.on.file", "months.since.last.record",
"education", "employment.length", "code", "initial.listing.status")
Clean Propser data
f <- function(s) strsplit(s, "_")[[1]][1]
prosper$ListingCreationDate <- as.Date(sapply(as.character(prosper$ListingCreationDate),
f))
prosper$ClosedDate <- as.Date(sapply(as.character(prosper$ClosedDate), f))
prosper$DateCreditPulled <- as.Date(sapply(as.character(prosper$DateCreditPulled),
f))
prosper$LoanOriginationDate <- as.Date(sapply(as.character(prosper$LoanOriginationDate),
f))
prosper$ClosedDate <- as.Date(sapply(as.character(prosper$ClosedDate), f))
Filter out pre 2013-02-01 data so that we don't have a half month sitting around
lc <- lc[lc$issue.date <= "2013-01-31", ]
prosper <- prosper[prosper$LoanOriginationDate <= "2013-01-31", ]
Generate plotting xlim date limits and ylim value limits
lc$num.issue.date <- as.numeric(lc$issue.date)
prosper$num.issue.date <- as.numeric(prosper$LoanOriginationDate)
min.date = min(lc$issue.date, prosper$LoanOriginationDate, na.rm = T)
max.date = as.Date("2013-01-31")
max.ylim = max(tapply(lc$issue.date, format(lc$issue.date, "%m-%Y"), FUN = length),
tapply(prosper$LoanOriginationDate, format(prosper$LoanOriginationDate,
"%m-%Y"), FUN = length))
Plot number of loans funded by month
lc.date.plot <- ggplot(lc, aes(issue.date, ..count..)) + geom_histogram(binwidth = 30,
colour = "white", fill = "red") + scale_x_date(breaks = as.Date(seq(min.date,
max.date, 90)), labels = date_format("%b %Y"), limits = c(min.date, max.date)) +
scale_y_continuous(name = "New Loans", limits = c(0, max.ylim)) + theme_bw() +
xlab(NULL) + theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title.y = element_text(hjust = 0.5, vjust = 0.2, size = 15, angle = 90),
plot.title = element_text(vjust = 1.5, size = 20, face = "bold")) + labs(title = "Monthly New Loans (LC)")
prosper.date.plot <- ggplot(prosper, aes(LoanOriginationDate, ..count..)) +
geom_histogram(binwidth = 30, colour = "white", fill = "blue") + scale_x_date(breaks = as.Date(seq(min.date,
max.date, 90)), labels = date_format("%b %Y"), limits = c(min.date, max.date)) +
scale_y_continuous(name = "New Loans", limits = c(0, max.ylim)) + theme_bw() +
xlab(NULL) + labs(title = "Monthly New Loans (PR)") + theme(axis.text.x = element_text(angle = 45,
hjust = 1, vjust = 1), axis.title.y = element_text(hjust = 0.5, vjust = 0.2,
size = 15, angle = 90), plot.title = element_text(vjust = 1.5, size = 20,
face = "bold"))
combined.date.plot <- grid.arrange(lc.date.plot, prosper.date.plot, nrow = 2,
ncol = 1)
combined.date.plot
## NULL
Create data to replicate total amount funded by Lending Club and Prosper.
lc.funded <- tapply(lc$total.funded, format(lc$issue.date, "%Y-%m"), FUN = sum)
lc.funded <- data.frame(rownames(lc.funded), lc.funded)
names(lc.funded) <- c("date", "total")
lc.funded$total <- as.numeric(lc.funded$total)
lc.funded$date <- as.yearmon(lc.funded$date)
prosper.funded <- tapply(prosper$LoanOriginalAmount, format(prosper$LoanOriginationDate,
"%Y-%m"), FUN = sum)
prosper.funded <- data.frame(rownames(prosper.funded), prosper.funded)
names(prosper.funded) <- c("date", "total")
prosper.funded$date <- as.yearmon(prosper.funded$date)
total.funded <- merge(prosper.funded, lc.funded, by = "date", all = T)
total.funded$total.y[is.na(total.funded$total.y)] <- 0
total.funded$total.x[is.na(total.funded$total.x)] <- 0
Plot Total Funds Disbursed
max.ylim <- max(lc.funded$total, prosper.funded$total)
my_labs <- total.funded$date[seq(from = 3, to = length(total.funded$date), b = 3)]
total.funded$date <- as.character(total.funded$date)
total.funded$date <- factor(total.funded$date, levels = unique(as.character(total.funded$date)))
total.funded$total.x <- as.numeric(total.funded$total.x)
total.funded$total.y <- as.numeric(total.funded$total.y)
lc.rev.plot <- ggplot(total.funded, aes(x = date, y = total.y)) + geom_bar(fill = "red",
width = 0.9, stat = "identity") + theme_bw() + xlab(NULL) + scale_y_continuous(name = "New Loan Disbursements ($)",
limits = c(0, max.ylim), labels = comma) + scale_x_discrete(breaks = my_labs) +
labs(title = "Monthly New Loan Disbursements $ (LC)") + theme(axis.text.x = element_text(angle = 45,
hjust = 1, vjust = 1), axis.title.y = element_text(hjust = 0.5, vjust = 0.2,
size = 15, angle = 90), plot.title = element_text(vjust = 1.5, size = 20,
face = "bold"))
prosper.rev.plot <- ggplot(total.funded, aes(x = date, y = total.x)) + geom_bar(fill = "blue",
width = 0.9, stat = "identity") + theme_bw() + xlab(NULL) + scale_y_continuous(name = "New Loan Disbursements ($)",
limits = c(0, max.ylim), labels = comma) + scale_x_discrete(breaks = my_labs) +
labs(title = "Monthly New Loan Disbursements $ (PR)") + theme(axis.text.x = element_text(angle = 45,
hjust = 1, vjust = 1), axis.title.y = element_text(hjust = 0.5, vjust = 0.2,
size = 15, angle = 90), plot.title = element_text(vjust = 1.5, size = 20,
face = "bold"))
combined.rev.plot <- grid.arrange(lc.rev.plot, prosper.rev.plot, nrow = 2, ncol = 1)
combined.rev.plot
## NULL
Create Daily and Monthly Time Series
lc.dayfunded <- tapply(lc$total.funded, format(lc$issue.date, "%Y-%m-%d"), FUN = sum)
lc.dayfunded <- data.frame(rownames(lc.dayfunded), lc.dayfunded)
names(lc.dayfunded) <- c("date", "total")
lc.dayfunded$date <- as.Date(lc.dayfunded$date)
zm <- zoo(total.funded$total, lc.funded$date)
zd <- zoo(lc.dayfunded$total, lc.dayfunded$date)
mts <- ts(as.numeric(lc.funded$total), frequency = 12, start = c(2007, 6))
dts <- as.ts(zd)
dts[is.na(dts)] <- 0
plot(naive(mts), main = "Naive Forecast (LC)")
Create and Plot Moving Averages
lcSMA3 <- SMA(dts, n = 3)
lcSMA10 <- SMA(dts, n = 10)
lcSMA20 <- SMA(dts, n = 20)
plot.ts(lcSMA3, main = "MA(3)")
plot.ts(lcSMA10, main = "MA(10)")
plot.ts(lcSMA20, main = "MA(20)")
Analyze and plot differences of data
dtsdiff1 <- diff(dts, differences = 1)
plot.ts(dts, main = "No Differences (LC)", ylab = "Loans Funded ($)")
plot.ts(dtsdiff1, main = "First Difference (LC)", ylab = "Loans Funded ($)")
Analyze and plot logs of data
logdts <- log(dts)
logdiffdts <- diff(log(lcSMA20), differences = 1)
logdiffdts[is.na(logdiffdts)] <- 0
plot.ts(dtsdiff1, main = "First Difference (LC)", ylab = "Loans Funded ($)")
plot(logdiffdts, main = "First Difference of Log", ylab = "Loans Funded ($)")
Holt Winters in the house
hw <- HoltWinters(logdiffdts, beta = FALSE, gamma = FALSE)
hwforecasts <- forecast.HoltWinters(hw, h = 356)
plot.forecast(hwforecasts)
The ARIMA show
lag.plot(mts, lags = 9, do.lines = FALSE)
auto.arima(mts)
## Warning: p-value smaller than printed p-value
## Warning: p-value smaller than printed p-value
## Warning: p-value greater than printed p-value
## Series: mts
## ARIMA(1,2,1)(0,1,1)[12]
##
## Coefficients:
## ar1 ma1 sma1
## 0.396 -0.734 -0.367
## s.e. 0.199 0.125 0.225
##
## sigma^2 estimated as 1.94e+12: log likelihood=-844.4
## AIC=1697 AICc=1698 BIC=1705
lcarima <- arima(mts, order = c(1, 2, 1))
lcarimaforecasts <- forecast.Arima(lcarima, h = 6)
plot.forecast(lcarimaforecasts, ylab = "Loans Funded ($)")