Loan Status : Time Series

Load the data

loan.all <- read.csv("LoanStats3a.csv", skip = 1)

## Observation It's a wide data set. Need to narrow it down to desired
## features

subsetting data

column.list <- c("id", "list_d", "accept_d", "issue_d", "loan_amnt", "loan_status")
loan <- loan.all[column.list]
lapply(loan, class)

## $id
## [1] "factor"
## 
## $list_d
## [1] "factor"
## 
## $accept_d
## [1] "factor"
## 
## $issue_d
## [1] "factor"
## 
## $loan_amnt
## [1] "integer"
## 
## $loan_status
## [1] "factor"


## Observation Dates are factors still

Pre-processing data


library(lubridate)

loan$list_d <- as.Date(loan$list_d)
loan$accept_d <- as.Date(loan$accept_d)
loan$issue_d <- as.Date(loan$issue_d)
loan$ones <- 1

loan$year <- year(loan$list_d)
loan$month <- month(loan$list_d)
loan$week <- week(loan$list_d)
loan$day <- day(loan$list_d)

head(loan)

##      id     list_d   accept_d    issue_d loan_amnt loan_status ones year
## 1 54734 2009-07-26 2009-07-26 2009-08-05     25000  Fully Paid    1 2009
## 2 55742 2008-05-12 2008-05-12 2008-05-27      7000  Fully Paid    1 2008
## 3 57245 2010-03-11 2010-03-11 2010-03-22      1200  Fully Paid    1 2010
## 4 57416 2009-11-04 2009-11-04 2009-11-12     10800  Fully Paid    1 2009
## 5 58915 2008-04-08 2008-04-08 2008-04-23      7500  Fully Paid    1 2008
## 6 59006 2009-09-14 2009-09-14 2009-09-18      3000  Fully Paid    1 2009
##   month week day
## 1     7   30  26
## 2     5   20  12
## 3     3   11  11
## 4    11   45   4
## 5     4   15   8
## 6     9   37  14

Number of loans by week

wloan <- aggregate(loan$ones, by = list(loan$year, loan$week), sum, na.rm = TRUE)
head(wloan)

##   Group.1 Group.2   x
## 1    2008       1  40
## 2    2009       1  44
## 3    2010       1  96
## 4    2011       1 345
## 5    2008       2  82
## 6    2009       2  63

colnames(wloan) <- c("year", "week", "count")

wloan <- wloan[order(wloan$year, wloan$week), ]
head(wloan)

##     year week count
## 81  2007   21     3
## 86  2007   22    11
## 91  2007   23     9
## 96  2007   24    18
## 101 2007   25    11
## 106 2007   26    15

tail(wloan)

##     year week count
## 210 2011   46   467
## 215 2011   47   805
## 220 2011   48   565
## 225 2011   49   962
## 230 2011   50   491
## 235 2011   51    10

Create test & train data set


## Training set comprises of 80% of all observations

trainCount <- trunc(nrow(wloan) * 0.8)
totalCount <- nrow(wloan)

wloanTrain <- wloan[1:trainCount, ]
wloanTest <- wloan[trainCount + 1:totalCount, ]

train.start <- c(wloan[1, ]$year, wloan[1, ]$week)
train.end <- c(wloan[trainCount, ]$year, wloan[trainCount, ]$week)

test.start <- c(wloan[trainCount + 1, ]$year, wloan[trainCount + 1, ]$week)
test.end <- c(wloan[totalCount, ]$year, wloan[totalCount, ]$week)

all.ts <- ts(wloan$count, start = train.start, test.end, frequency = 52)
train.ts <- ts(wloanTrain$count, start = train.start, end = train.end, frequency = 52)
test.ts <- ts(wloanTest$count, start = test.start, end = test.end, frequency = 52)

Plot time series for entire period

par(mfrow = c(1, 1))
plot.ts(all.ts, type = "l", col = "red", lwd = 2, ylab = "Number of loans", 
    main = "Number of loans over time")

plot of chunk unnamed-chunk-6


## Observation :: The loans seems to osicllate with spikes & valleys.  It
## also has some trend for part of the time where it goes up & then up for
## some time.  Important thing to note here is that the data set is not
## stationary

Smoothing the Time Series object by Moving Average of order 3

library("TTR")

## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

wtsSMA3 <- SMA(all.ts, n = 3)
plot.ts(wtsSMA3, col = "red", lwd = 2, ylab = "Number of loans", main = "Number of loans over time (n=3)")

plot of chunk unnamed-chunk-7

Smoothing the Time Series object by Moving Average of order 8

wtsSMA8 <- SMA(all.ts, n = 8)
plot.ts(wtsSMA8, col = "red", lwd = 2, ylab = "Number of loans", main = "Number of loans over time (n=8)")

plot of chunk unnamed-chunk-8

Seasonal Decomposition

plot(stl(all.ts, s.window = "period"), col = "red", lwd = 2, main = "Seasonal Decomposition")

plot of chunk unnamed-chunk-9

Plot ACF(Auto-correlation) and PACF(Partial Auto-correlation) on same graph

par(mfrow = c(1, 1))
acf(all.ts)

plot of chunk unnamed-chunk-10


par(mfrow = c(1, 1))
pacf(all.ts)

plot of chunk unnamed-chunk-10


## Observation :: ACF does not tail off quickly, hence its not stationary
## Trend and seasonality cause the data to be not stationary Hence we need to
## some differencing to make it stationary

Difference the series and plot it

par(mfrow = c(1, 1))
diff.loan <- diff(all.ts, differences = 1)
count <- length(diff.loan)
plot(1:count, diff.loan, type = "l", lwd = 2, col = "red")

plot of chunk unnamed-chunk-11


## Observation :: There is no trend or seasonality right now.  The time
## series of FIRST differences (above) does appear to be stationary in mean
## and variance,

Plot ACF(Auto-correlation) and PACF(Partial Auto-correlation) on same graph

par(mfrow = c(1, 1))
acf(diff.loan)

plot of chunk unnamed-chunk-12

acf(diff.loan, plot = FALSE)

## 
## Autocorrelations of series 'diff.loan', by lag
## 
## 0.0000 0.0192 0.0385 0.0577 0.0769 0.0962 0.1154 0.1346 0.1538 0.1731 
##  1.000 -0.221 -0.144 -0.007  0.015  0.019 -0.075  0.058  0.024  0.005 
## 0.1923 0.2115 0.2308 0.2500 0.2692 0.2885 0.3077 0.3269 0.3462 0.3654 
## -0.072  0.150 -0.090 -0.036  0.078 -0.126  0.007  0.073  0.111 -0.081 
## 0.3846 0.4038 0.4231 0.4423 
## -0.100  0.087 -0.062  0.103


## Observation We see from the correlogram that the autocorrelation at lag 1
## exceeds the significance bounds, but all other autocorrelations between
## lags 1-20 do not exceed the significance bounds.


pacf(diff.loan)

plot of chunk unnamed-chunk-12


## Observation The partial correlogram tails off to zero after lag 3

## Conclusion :: Since the correlogram is zero after lag 1, and the partial
## correlogram tails off to zero after lag 3, this means that the following
## ARMA (autoregressive moving average) models are possible for the time
## series of first differences:

## (A) an ARMA(3,0) model, that is, an autoregressive model of order p=3,
## since the partial autocorrelogram is zero after lag 3, and the
## autocorrelogram tails off to zero (although perhaps too abruptly for this
## model to be appropriate)

## (B) an ARMA(0,1) model, that is, a moving average model of order q=1,
## since the autocorrelogram is zero after lag 1 and the partial
## autocorrelogram tails off to zero

## (C) an ARMA(p,q) model, that is, a mixed model with p and q greater than
## 0, since the autocorrelogram and partial correlogram tail off to zero

## We assume that p=3 and q=1 are the paramters for an ideal ARIMA fit

Fit various models on Training set

library(forecast)

## Loading required package: timeDate
## This is forecast 5.4


############ Option 1 - Manual Arima ############

## Using ARIMA(0,1,1) model for time series, where d(=1)is the order of
## differencing used, and p & q are selected based on the criteria listed
## above

manual.fit <- Arima(train.ts, order = c(3, 1, 1))
manual.fit

## Series: train.ts 
## ARIMA(3,1,1)                    
## 
## Coefficients:
##         ar1     ar2     ar3     ma1
##       0.171  -0.206  -0.068  -0.502
## s.e.  0.267   0.101   0.126   0.263
## 
## sigma^2 estimated as 532:  log likelihood=-861.6
## AIC=1733   AICc=1734   BIC=1749


## Observation Standard Error: 0.1106, AIC: 1733.47

############ Option 2 - Auto-Arima #############

auto.fit <- auto.arima(train.ts)

## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots

auto.fit

## Series: train.ts 
## ARIMA(2,1,4)(0,1,1)[52]                    
## 
## Coefficients:
##          ar1     ar2    ma1   ma2     ma3     ma4    sma1
##       -0.748  -0.925  0.419  0.42  -0.601  -0.274  -0.471
## s.e.   0.056   0.040  0.121  0.14   0.099   0.104   0.179
## 
## sigma^2 estimated as 681:  log likelihood=-648.7
## AIC=1313   AICc=1315   BIC=1337


## Observation Standard Error: .0703, AIC: 1313.4 The selected order
## parameter is ARIMA(2,1,4)(0,1,1) with auto arima Gives better AIC value
## The Akaike Information Critera (AIC) is a widely used measure of a
## statistical model. It basically quantifies 1) the goodness of fit, and 2)
## the simplicity/parsimony, of the model into a single statistic. When
## comparing two models, the one with the lower AIC is generally “better”.

Examining In-sample accuracy parameters

accuracy(manual.fit)

##                 ME  RMSE   MAE    MPE  MAPE   MASE
## Training set 3.176 23.01 15.81 -11.59 31.13 0.2063

accuracy(auto.fit)

##                 ME  RMSE   MAE   MPE  MAPE   MASE
## Training set 2.238 22.16 13.81 -5.87 18.37 0.1802

Examining out-of sample accuracy parameters

accuracy(forecast(manual.fit, 52), test.ts)

##                   ME   RMSE    MAE    MPE  MAPE   MASE   ACF1 Theil's U
## Training set   3.176  23.01  15.81 -11.59 31.13 0.2063     NA        NA
## Test set     146.599 199.19 158.98 -23.59 86.12 2.0022 0.3832      1.61

plot(forecast(manual.fit, 52))
lines(test.ts)

plot of chunk unnamed-chunk-15


accuracy(forecast(auto.fit, 52), test.ts)

##                  ME   RMSE   MAE    MPE  MAPE   MASE   ACF1 Theil's U
## Training set  2.238  22.16 13.81  -5.87 18.37 0.1802     NA        NA
## Test set     65.707 136.78 92.81 -63.96 94.46 1.1688 0.1737     1.086

plot(forecast(auto.fit, 52))
lines(test.ts)

plot of chunk unnamed-chunk-15

Plotting Forecast Errors


## To check whether the forecast errors are normally distributed with mean
## zero, we can plot a histogram of the forecast errors, with an overlaid
## normal curve that has mean zero and the same standard deviation as the
## distribution of forecast errors. To do this, we can define a function
## “plotForecastErrors()”

plotForecastErrors <- function(forecasterrors, title = "Histogram of forecasterrors") {
    # make a histogram of the forecast errors:
    mybinsize <- IQR(forecasterrors)/4
    mysd <- sd(forecasterrors)
    mymin <- min(forecasterrors) - mysd * 5
    mymax <- max(forecasterrors) + mysd * 3
    # generate normally distributed data with mean 0 and standard deviation mysd
    mynorm <- rnorm(10000, mean = 0, sd = mysd)
    mymin2 <- min(mynorm)
    mymax2 <- max(mynorm)
    if (mymin2 < mymin) {
        mymin <- mymin2
    }
    if (mymax2 > mymax) {
        mymax <- mymax2
    }
    # make a red histogram of the forecast errors, with the normally distributed
    # data overlaid:
    mybins <- seq(mymin, mymax, mybinsize)
    hist(forecasterrors, col = "red", freq = FALSE, breaks = mybins, main = title)
    # freq=FALSE ensures the area under the histogram = 1 generate normally
    # distributed data with mean 0 and standard deviation mysd
    myhist <- hist(mynorm, plot = FALSE, breaks = mybins)
    # plot the normal curve as a blue line on top of the histogram of forecast
    # errors:
    points(myhist$mids, myhist$density, type = "l", col = "blue", lwd = 2)
}

par(mfrow = c(1, 2))
plot.ts(manual.fit$residuals, ylab = "residuals", main = "Residual Plot - ARIMA")
plot.ts(auto.fit$residuals, , ylab = "residuals", main = "Residual Plot - AUTO ARIMA")

plot of chunk unnamed-chunk-16


par(mfrow = c(1, 2))
plotForecastErrors(manual.fit$residuals, "Forecast Errors for ARIMA")
plotForecastErrors(auto.fit$residuals, "Forecast Errors for AUTO.ARIMA")

plot of chunk unnamed-chunk-16


## Observation The plot shows that the distribution of forecast errors is
## roughly centred on zero, and is more or less normally distributed. And the
## Error density is lower for Auto Arima

Box test : Hypothesis Test to determine residuals from the ARIMA model have no autocorrelation

acf(manual.fit$residuals, lag.max = 20)

plot of chunk unnamed-chunk-17

Box.test(manual.fit$residuals, type = "Ljung-Box", lag = 20)

## 
##  Box-Ljung test
## 
## data:  manual.fit$residuals
## X-squared = 15.46, df = 20, p-value = 0.7494


## Hypothesis Test / Results Ho - There is NO Auto-correlation between
## residuals Ha - There IS Auto-correlation between residuals Since the
## correlogram shows that none of the sample autocorrelations for lags 1-20
## exceed the significance bounds, and the p-value for the Ljung-Box test is
## 0.75, we can conclude that there is very little evidence for non-zero
## autocorrelations in the forecast errors at lags 1-20.