loan.all <- read.csv("LoanStats3a.csv", skip = 1)
## Observation It's a wide data set. Need to narrow it down to desired
## features
column.list <- c("id", "list_d", "accept_d", "issue_d", "loan_amnt", "loan_status")
loan <- loan.all[column.list]
lapply(loan, class)
## $id
## [1] "factor"
##
## $list_d
## [1] "factor"
##
## $accept_d
## [1] "factor"
##
## $issue_d
## [1] "factor"
##
## $loan_amnt
## [1] "integer"
##
## $loan_status
## [1] "factor"
## Observation Dates are factors still
library(lubridate)
loan$list_d <- as.Date(loan$list_d)
loan$accept_d <- as.Date(loan$accept_d)
loan$issue_d <- as.Date(loan$issue_d)
loan$ones <- 1
loan$year <- year(loan$list_d)
loan$month <- month(loan$list_d)
loan$week <- week(loan$list_d)
loan$day <- day(loan$list_d)
head(loan)
## id list_d accept_d issue_d loan_amnt loan_status ones year
## 1 54734 2009-07-26 2009-07-26 2009-08-05 25000 Fully Paid 1 2009
## 2 55742 2008-05-12 2008-05-12 2008-05-27 7000 Fully Paid 1 2008
## 3 57245 2010-03-11 2010-03-11 2010-03-22 1200 Fully Paid 1 2010
## 4 57416 2009-11-04 2009-11-04 2009-11-12 10800 Fully Paid 1 2009
## 5 58915 2008-04-08 2008-04-08 2008-04-23 7500 Fully Paid 1 2008
## 6 59006 2009-09-14 2009-09-14 2009-09-18 3000 Fully Paid 1 2009
## month week day
## 1 7 30 26
## 2 5 20 12
## 3 3 11 11
## 4 11 45 4
## 5 4 15 8
## 6 9 37 14
wloan <- aggregate(loan$ones, by = list(loan$year, loan$week), sum, na.rm = TRUE)
head(wloan)
## Group.1 Group.2 x
## 1 2008 1 40
## 2 2009 1 44
## 3 2010 1 96
## 4 2011 1 345
## 5 2008 2 82
## 6 2009 2 63
colnames(wloan) <- c("year", "week", "count")
wloan <- wloan[order(wloan$year, wloan$week), ]
head(wloan)
## year week count
## 81 2007 21 3
## 86 2007 22 11
## 91 2007 23 9
## 96 2007 24 18
## 101 2007 25 11
## 106 2007 26 15
tail(wloan)
## year week count
## 210 2011 46 467
## 215 2011 47 805
## 220 2011 48 565
## 225 2011 49 962
## 230 2011 50 491
## 235 2011 51 10
## Training set comprises of 80% of all observations
trainCount <- trunc(nrow(wloan) * 0.8)
totalCount <- nrow(wloan)
wloanTrain <- wloan[1:trainCount, ]
wloanTest <- wloan[trainCount + 1:totalCount, ]
train.start <- c(wloan[1, ]$year, wloan[1, ]$week)
train.end <- c(wloan[trainCount, ]$year, wloan[trainCount, ]$week)
test.start <- c(wloan[trainCount + 1, ]$year, wloan[trainCount + 1, ]$week)
test.end <- c(wloan[totalCount, ]$year, wloan[totalCount, ]$week)
all.ts <- ts(wloan$count, start = train.start, test.end, frequency = 52)
train.ts <- ts(wloanTrain$count, start = train.start, end = train.end, frequency = 52)
test.ts <- ts(wloanTest$count, start = test.start, end = test.end, frequency = 52)
par(mfrow = c(1, 1))
plot.ts(all.ts, type = "l", col = "red", lwd = 2, ylab = "Number of loans",
main = "Number of loans over time")
## Observation :: The loans seems to osicllate with spikes & valleys. It
## also has some trend for part of the time where it goes up & then up for
## some time. Important thing to note here is that the data set is not
## stationary
library("TTR")
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
wtsSMA3 <- SMA(all.ts, n = 3)
plot.ts(wtsSMA3, col = "red", lwd = 2, ylab = "Number of loans", main = "Number of loans over time (n=3)")
wtsSMA8 <- SMA(all.ts, n = 8)
plot.ts(wtsSMA8, col = "red", lwd = 2, ylab = "Number of loans", main = "Number of loans over time (n=8)")
plot(stl(all.ts, s.window = "period"), col = "red", lwd = 2, main = "Seasonal Decomposition")
par(mfrow = c(1, 1))
acf(all.ts)
par(mfrow = c(1, 1))
pacf(all.ts)
## Observation :: ACF does not tail off quickly, hence its not stationary
## Trend and seasonality cause the data to be not stationary Hence we need to
## some differencing to make it stationary
par(mfrow = c(1, 1))
diff.loan <- diff(all.ts, differences = 1)
count <- length(diff.loan)
plot(1:count, diff.loan, type = "l", lwd = 2, col = "red")
## Observation :: There is no trend or seasonality right now. The time
## series of FIRST differences (above) does appear to be stationary in mean
## and variance,
par(mfrow = c(1, 1))
acf(diff.loan)
acf(diff.loan, plot = FALSE)
##
## Autocorrelations of series 'diff.loan', by lag
##
## 0.0000 0.0192 0.0385 0.0577 0.0769 0.0962 0.1154 0.1346 0.1538 0.1731
## 1.000 -0.221 -0.144 -0.007 0.015 0.019 -0.075 0.058 0.024 0.005
## 0.1923 0.2115 0.2308 0.2500 0.2692 0.2885 0.3077 0.3269 0.3462 0.3654
## -0.072 0.150 -0.090 -0.036 0.078 -0.126 0.007 0.073 0.111 -0.081
## 0.3846 0.4038 0.4231 0.4423
## -0.100 0.087 -0.062 0.103
## Observation We see from the correlogram that the autocorrelation at lag 1
## exceeds the significance bounds, but all other autocorrelations between
## lags 1-20 do not exceed the significance bounds.
pacf(diff.loan)
## Observation The partial correlogram tails off to zero after lag 3
## Conclusion :: Since the correlogram is zero after lag 1, and the partial
## correlogram tails off to zero after lag 3, this means that the following
## ARMA (autoregressive moving average) models are possible for the time
## series of first differences:
## (A) an ARMA(3,0) model, that is, an autoregressive model of order p=3,
## since the partial autocorrelogram is zero after lag 3, and the
## autocorrelogram tails off to zero (although perhaps too abruptly for this
## model to be appropriate)
## (B) an ARMA(0,1) model, that is, a moving average model of order q=1,
## since the autocorrelogram is zero after lag 1 and the partial
## autocorrelogram tails off to zero
## (C) an ARMA(p,q) model, that is, a mixed model with p and q greater than
## 0, since the autocorrelogram and partial correlogram tail off to zero
## We assume that p=3 and q=1 are the paramters for an ideal ARIMA fit
library(forecast)
## Loading required package: timeDate
## This is forecast 5.4
############ Option 1 - Manual Arima ############
## Using ARIMA(0,1,1) model for time series, where d(=1)is the order of
## differencing used, and p & q are selected based on the criteria listed
## above
manual.fit <- Arima(train.ts, order = c(3, 1, 1))
manual.fit
## Series: train.ts
## ARIMA(3,1,1)
##
## Coefficients:
## ar1 ar2 ar3 ma1
## 0.171 -0.206 -0.068 -0.502
## s.e. 0.267 0.101 0.126 0.263
##
## sigma^2 estimated as 532: log likelihood=-861.6
## AIC=1733 AICc=1734 BIC=1749
## Observation Standard Error: 0.1106, AIC: 1733.47
############ Option 2 - Auto-Arima #############
auto.fit <- auto.arima(train.ts)
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
## Warning: Unable to check for unit roots
auto.fit
## Series: train.ts
## ARIMA(2,1,4)(0,1,1)[52]
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4 sma1
## -0.748 -0.925 0.419 0.42 -0.601 -0.274 -0.471
## s.e. 0.056 0.040 0.121 0.14 0.099 0.104 0.179
##
## sigma^2 estimated as 681: log likelihood=-648.7
## AIC=1313 AICc=1315 BIC=1337
## Observation Standard Error: .0703, AIC: 1313.4 The selected order
## parameter is ARIMA(2,1,4)(0,1,1) with auto arima Gives better AIC value
## The Akaike Information Critera (AIC) is a widely used measure of a
## statistical model. It basically quantifies 1) the goodness of fit, and 2)
## the simplicity/parsimony, of the model into a single statistic. When
## comparing two models, the one with the lower AIC is generally “better”.
accuracy(manual.fit)
## ME RMSE MAE MPE MAPE MASE
## Training set 3.176 23.01 15.81 -11.59 31.13 0.2063
accuracy(auto.fit)
## ME RMSE MAE MPE MAPE MASE
## Training set 2.238 22.16 13.81 -5.87 18.37 0.1802
accuracy(forecast(manual.fit, 52), test.ts)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 3.176 23.01 15.81 -11.59 31.13 0.2063 NA NA
## Test set 146.599 199.19 158.98 -23.59 86.12 2.0022 0.3832 1.61
plot(forecast(manual.fit, 52))
lines(test.ts)
accuracy(forecast(auto.fit, 52), test.ts)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 2.238 22.16 13.81 -5.87 18.37 0.1802 NA NA
## Test set 65.707 136.78 92.81 -63.96 94.46 1.1688 0.1737 1.086
plot(forecast(auto.fit, 52))
lines(test.ts)
## To check whether the forecast errors are normally distributed with mean
## zero, we can plot a histogram of the forecast errors, with an overlaid
## normal curve that has mean zero and the same standard deviation as the
## distribution of forecast errors. To do this, we can define a function
## “plotForecastErrors()”
plotForecastErrors <- function(forecasterrors, title = "Histogram of forecasterrors") {
# make a histogram of the forecast errors:
mybinsize <- IQR(forecasterrors)/4
mysd <- sd(forecasterrors)
mymin <- min(forecasterrors) - mysd * 5
mymax <- max(forecasterrors) + mysd * 3
# generate normally distributed data with mean 0 and standard deviation mysd
mynorm <- rnorm(10000, mean = 0, sd = mysd)
mymin2 <- min(mynorm)
mymax2 <- max(mynorm)
if (mymin2 < mymin) {
mymin <- mymin2
}
if (mymax2 > mymax) {
mymax <- mymax2
}
# make a red histogram of the forecast errors, with the normally distributed
# data overlaid:
mybins <- seq(mymin, mymax, mybinsize)
hist(forecasterrors, col = "red", freq = FALSE, breaks = mybins, main = title)
# freq=FALSE ensures the area under the histogram = 1 generate normally
# distributed data with mean 0 and standard deviation mysd
myhist <- hist(mynorm, plot = FALSE, breaks = mybins)
# plot the normal curve as a blue line on top of the histogram of forecast
# errors:
points(myhist$mids, myhist$density, type = "l", col = "blue", lwd = 2)
}
par(mfrow = c(1, 2))
plot.ts(manual.fit$residuals, ylab = "residuals", main = "Residual Plot - ARIMA")
plot.ts(auto.fit$residuals, , ylab = "residuals", main = "Residual Plot - AUTO ARIMA")
par(mfrow = c(1, 2))
plotForecastErrors(manual.fit$residuals, "Forecast Errors for ARIMA")
plotForecastErrors(auto.fit$residuals, "Forecast Errors for AUTO.ARIMA")
## Observation The plot shows that the distribution of forecast errors is
## roughly centred on zero, and is more or less normally distributed. And the
## Error density is lower for Auto Arima
acf(manual.fit$residuals, lag.max = 20)
Box.test(manual.fit$residuals, type = "Ljung-Box", lag = 20)
##
## Box-Ljung test
##
## data: manual.fit$residuals
## X-squared = 15.46, df = 20, p-value = 0.7494
## Hypothesis Test / Results Ho - There is NO Auto-correlation between
## residuals Ha - There IS Auto-correlation between residuals Since the
## correlogram shows that none of the sample autocorrelations for lags 1-20
## exceed the significance bounds, and the p-value for the Ljung-Box test is
## 0.75, we can conclude that there is very little evidence for non-zero
## autocorrelations in the forecast errors at lags 1-20.