2013 Strata Presentation

Introduction to Forecasting

Mike Bailey, Economist @ Facebook

mcbailey@fb.com

This markdown file presents the R code and analysis presented during my Strata 2013 forecasting presentation:

http://strataconf.com/strata2013/public/schedule/detail/27630

I go through a quick forecasting example using Lending Club and Prosper's loan data which are available for download here:

https://www.lendingclub.com/info/download-data.action http://www.prosper.com/tools/DataExport.aspx

Load the libraries and data

library(ggplot2)
library(gridExtra)
## Loading required package: grid
library(scales)
library(zoo)
## Attaching package: 'zoo'
## The following object(s) are masked from 'package:base':
## 
## as.Date, as.Date.numeric
library(forecast)
## This is forecast 4.01
library(TTR)
## Loading required package: xts
setwd("~/programming/r_data/lending_club")
lc <- read.csv("lendingclub0205.csv")
prosper <- read.csv("prosper0205.csv")

Sanatize and clean the columns (casts, conversions, renaming, etc.)

lc$Interest.Rate <- as.numeric(sub("%", "", as.character(lc$Interest.Rate)))/100
lc$Application.Date <- as.Date(as.character(lc$Application.Date), format = "%m/%d/%y")
lc$Application.Expiration.Date <- as.Date(as.character(lc$Application.Expiration.Date), 
    format = "%m/%d/%y")
lc$Issued.Date <- as.Date(as.character(lc$Issued.Date), format = "%m/%d/%y")
lc$Earliest.CREDIT.Line <- as.Date(as.character(lc$Earliest.CREDIT.Line), format = "%m/%d/%y")
lc$Debt.To.Income.Ratio <- as.numeric(sub("%", "", as.character(lc$Debt.To.Income.Ratio)))/100
lc$Revolving.Line.Utilization <- as.numeric(sub("%", "", as.character(lc$Revolving.Line.Utilization)))/100
names(lc) <- c("loanid", "amount.requested", "amount.funded", "interest", "loan.length", 
    "app.date", "app.expiration.date", "issue.date", "credit.grade", "loan.title", 
    "loan.purpose", "loan.description", "monthly.payment", "status", "total.funded", 
    "debt.to.income", "remaining.principle.funded", "payments.to.date.funded", 
    "remaining.principle", "payments.to.date", "screen.name", "city", "state", 
    "home.ownership", "monthly.income", "fico.range", "earliest.credit.line", 
    "open.credit.lines", "total.credit.lines", "revolving.credit.balance", "revolving.line.utilization", 
    "6month.inquiries", "accounts.now.delinquent", "delinquent.amount", "2year.delinquencies", 
    "months.since.delinquency", "records.on.file", "months.since.last.record", 
    "education", "employment.length", "code", "initial.listing.status")

Clean Propser data

f <- function(s) strsplit(s, "_")[[1]][1]
prosper$ListingCreationDate <- as.Date(sapply(as.character(prosper$ListingCreationDate), 
    f))
prosper$ClosedDate <- as.Date(sapply(as.character(prosper$ClosedDate), f))
prosper$DateCreditPulled <- as.Date(sapply(as.character(prosper$DateCreditPulled), 
    f))
prosper$LoanOriginationDate <- as.Date(sapply(as.character(prosper$LoanOriginationDate), 
    f))
prosper$ClosedDate <- as.Date(sapply(as.character(prosper$ClosedDate), f))

Filter out pre 2013-02-01 data so that we don't have a half month sitting around

lc <- lc[lc$issue.date <= "2013-01-31", ]
prosper <- prosper[prosper$LoanOriginationDate <= "2013-01-31", ]

Generate plotting xlim date limits and ylim value limits

lc$num.issue.date <- as.numeric(lc$issue.date)
prosper$num.issue.date <- as.numeric(prosper$LoanOriginationDate)
min.date = min(lc$issue.date, prosper$LoanOriginationDate, na.rm = T)
max.date = as.Date("2013-01-31")
max.ylim = max(tapply(lc$issue.date, format(lc$issue.date, "%m-%Y"), FUN = length), 
    tapply(prosper$LoanOriginationDate, format(prosper$LoanOriginationDate, 
        "%m-%Y"), FUN = length))

Plot number of loans funded by month

lc.date.plot <- ggplot(lc, aes(issue.date, ..count..)) + geom_histogram(binwidth = 30, 
    colour = "white", fill = "red") + scale_x_date(breaks = as.Date(seq(min.date, 
    max.date, 90)), labels = date_format("%b %Y"), limits = c(min.date, max.date)) + 
    scale_y_continuous(name = "New Loans", limits = c(0, max.ylim)) + theme_bw() + 
    xlab(NULL) + theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
    axis.title.y = element_text(hjust = 0.5, vjust = 0.2, size = 15, angle = 90), 
    plot.title = element_text(vjust = 1.5, size = 20, face = "bold")) + labs(title = "Monthly New Loans (LC)")

prosper.date.plot <- ggplot(prosper, aes(LoanOriginationDate, ..count..)) + 
    geom_histogram(binwidth = 30, colour = "white", fill = "blue") + scale_x_date(breaks = as.Date(seq(min.date, 
    max.date, 90)), labels = date_format("%b %Y"), limits = c(min.date, max.date)) + 
    scale_y_continuous(name = "New Loans", limits = c(0, max.ylim)) + theme_bw() + 
    xlab(NULL) + labs(title = "Monthly New Loans (PR)") + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1, vjust = 1), axis.title.y = element_text(hjust = 0.5, vjust = 0.2, 
    size = 15, angle = 90), plot.title = element_text(vjust = 1.5, size = 20, 
    face = "bold"))

combined.date.plot <- grid.arrange(lc.date.plot, prosper.date.plot, nrow = 2, 
    ncol = 1)

plot of chunk unnamed-chunk-6

combined.date.plot
## NULL

Create data to replicate total amount funded by Lending Club and Prosper.

lc.funded <- tapply(lc$total.funded, format(lc$issue.date, "%Y-%m"), FUN = sum)
lc.funded <- data.frame(rownames(lc.funded), lc.funded)
names(lc.funded) <- c("date", "total")
lc.funded$total <- as.numeric(lc.funded$total)
lc.funded$date <- as.yearmon(lc.funded$date)
prosper.funded <- tapply(prosper$LoanOriginalAmount, format(prosper$LoanOriginationDate, 
    "%Y-%m"), FUN = sum)
prosper.funded <- data.frame(rownames(prosper.funded), prosper.funded)
names(prosper.funded) <- c("date", "total")
prosper.funded$date <- as.yearmon(prosper.funded$date)
total.funded <- merge(prosper.funded, lc.funded, by = "date", all = T)
total.funded$total.y[is.na(total.funded$total.y)] <- 0
total.funded$total.x[is.na(total.funded$total.x)] <- 0

Plot Total Funds Disbursed

max.ylim <- max(lc.funded$total, prosper.funded$total)
my_labs <- total.funded$date[seq(from = 3, to = length(total.funded$date), b = 3)]

total.funded$date <- as.character(total.funded$date)
total.funded$date <- factor(total.funded$date, levels = unique(as.character(total.funded$date)))
total.funded$total.x <- as.numeric(total.funded$total.x)
total.funded$total.y <- as.numeric(total.funded$total.y)

lc.rev.plot <- ggplot(total.funded, aes(x = date, y = total.y)) + geom_bar(fill = "red", 
    width = 0.9, stat = "identity") + theme_bw() + xlab(NULL) + scale_y_continuous(name = "New Loan Disbursements ($)", 
    limits = c(0, max.ylim), labels = comma) + scale_x_discrete(breaks = my_labs) + 
    labs(title = "Monthly New Loan Disbursements $ (LC)") + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1, vjust = 1), axis.title.y = element_text(hjust = 0.5, vjust = 0.2, 
    size = 15, angle = 90), plot.title = element_text(vjust = 1.5, size = 20, 
    face = "bold"))

prosper.rev.plot <- ggplot(total.funded, aes(x = date, y = total.x)) + geom_bar(fill = "blue", 
    width = 0.9, stat = "identity") + theme_bw() + xlab(NULL) + scale_y_continuous(name = "New Loan Disbursements ($)", 
    limits = c(0, max.ylim), labels = comma) + scale_x_discrete(breaks = my_labs) + 
    labs(title = "Monthly New Loan Disbursements $ (PR)") + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1, vjust = 1), axis.title.y = element_text(hjust = 0.5, vjust = 0.2, 
    size = 15, angle = 90), plot.title = element_text(vjust = 1.5, size = 20, 
    face = "bold"))

combined.rev.plot <- grid.arrange(lc.rev.plot, prosper.rev.plot, nrow = 2, ncol = 1)

plot of chunk unnamed-chunk-9

combined.rev.plot
## NULL

Create Daily and Monthly Time Series

lc.dayfunded <- tapply(lc$total.funded, format(lc$issue.date, "%Y-%m-%d"), FUN = sum)
lc.dayfunded <- data.frame(rownames(lc.dayfunded), lc.dayfunded)
names(lc.dayfunded) <- c("date", "total")
lc.dayfunded$date <- as.Date(lc.dayfunded$date)

zm <- zoo(total.funded$total, lc.funded$date)
zd <- zoo(lc.dayfunded$total, lc.dayfunded$date)
mts <- ts(as.numeric(lc.funded$total), frequency = 12, start = c(2007, 6))
dts <- as.ts(zd)
dts[is.na(dts)] <- 0
plot(naive(mts), main = "Naive Forecast (LC)")

plot of chunk unnamed-chunk-12

Create and Plot Moving Averages

lcSMA3 <- SMA(dts, n = 3)
lcSMA10 <- SMA(dts, n = 10)
lcSMA20 <- SMA(dts, n = 20)
plot.ts(lcSMA3, main = "MA(3)")

plot of chunk unnamed-chunk-14

plot.ts(lcSMA10, main = "MA(10)")

plot of chunk unnamed-chunk-15

plot.ts(lcSMA20, main = "MA(20)")

plot of chunk unnamed-chunk-16

Analyze and plot differences of data

dtsdiff1 <- diff(dts, differences = 1)
plot.ts(dts, main = "No Differences (LC)", ylab = "Loans Funded ($)")

plot of chunk unnamed-chunk-18

plot.ts(dtsdiff1, main = "First Difference (LC)", ylab = "Loans Funded ($)")

plot of chunk unnamed-chunk-19

Analyze and plot logs of data

logdts <- log(dts)
logdiffdts <- diff(log(lcSMA20), differences = 1)
logdiffdts[is.na(logdiffdts)] <- 0
plot.ts(dtsdiff1, main = "First Difference (LC)", ylab = "Loans Funded ($)")

plot of chunk unnamed-chunk-21

plot(logdiffdts, main = "First Difference of Log", ylab = "Loans Funded ($)")

plot of chunk unnamed-chunk-22

Holt Winters in the house

hw <- HoltWinters(logdiffdts, beta = FALSE, gamma = FALSE)
hwforecasts <- forecast.HoltWinters(hw, h = 356)
plot.forecast(hwforecasts)

plot of chunk unnamed-chunk-24

The ARIMA show

lag.plot(mts, lags = 9, do.lines = FALSE)

plot of chunk unnamed-chunk-25

auto.arima(mts)
## Warning: p-value smaller than printed p-value
## Warning: p-value smaller than printed p-value
## Warning: p-value greater than printed p-value
## Series: mts 
## ARIMA(1,2,1)(0,1,1)[12]                    
## 
## Coefficients:
##         ar1     ma1    sma1
##       0.396  -0.734  -0.367
## s.e.  0.199   0.125   0.225
## 
## sigma^2 estimated as 1.94e+12:  log likelihood=-844.4
## AIC=1697   AICc=1698   BIC=1705
lcarima <- arima(mts, order = c(1, 2, 1))
lcarimaforecasts <- forecast.Arima(lcarima, h = 6)
plot.forecast(lcarimaforecasts, ylab = "Loans Funded ($)")

plot of chunk unnamed-chunk-27