#load all packages required for the project
require(corrplot)
## Loading required package: corrplot
## corrplot 0.84 loaded
require(lattice)
## Loading required package: lattice
require(MASS)
## Loading required package: MASS
require(leaps)
## Loading required package: leaps
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
require(DAAG)
## Loading required package: DAAG
## 
## Attaching package: 'DAAG'
## The following object is masked from 'package:MASS':
## 
##     hills
require(pROC)
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
## 
##     auc
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
require(ROCR)
## Loading required package: ROCR
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
require(ggplot2)
## Loading required package: ggplot2
require(class)
## Loading required package: class
require(chemometrics)
## Loading required package: chemometrics
## Loading required package: rpart
require(randomForest)
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
require(car)
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:DAAG':
## 
##     vif
require(Amelia)
## Loading required package: Amelia
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
require(mlbench)
## Loading required package: mlbench
#importing Dataset 
library(readr)
credit_data <- read_csv("~/Masters of Predictive Analytics/Predictive Behaviour Modelling in Business/credit_data.txt")
## Parsed with column specification:
## cols(
##   CARDHLDR = col_integer(),
##   DEFAULT = col_integer(),
##   AGE = col_double(),
##   ACADMOS = col_integer(),
##   ADEPCNT = col_integer(),
##   MAJORDRG = col_integer(),
##   MINORDRG = col_integer(),
##   OWNRENT = col_integer(),
##   INCOME = col_double(),
##   SELFEMPL = col_integer(),
##   INCPER = col_double(),
##   EXP_INC = col_double(),
##   SPENDING = col_double(),
##   LOGSPEND = col_double()
## )
attach(credit_data)
View(credit_data)
#Check Variable Names
names(credit_data)
##  [1] "CARDHLDR" "DEFAULT"  "AGE"      "ACADMOS"  "ADEPCNT"  "MAJORDRG"
##  [7] "MINORDRG" "OWNRENT"  "INCOME"   "SELFEMPL" "INCPER"   "EXP_INC" 
## [13] "SPENDING" "LOGSPEND"
summary(credit_data)
##     CARDHLDR         DEFAULT             AGE           ACADMOS      
##  Min.   :0.0000   Min.   :0.00000   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:1.0000   1st Qu.:0.00000   1st Qu.:25.67   1st Qu.: 12.00  
##  Median :1.0000   Median :0.00000   Median :31.50   Median : 30.00  
##  Mean   :0.7809   Mean   :0.07409   Mean   :33.47   Mean   : 55.32  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:39.33   3rd Qu.: 72.00  
##  Max.   :1.0000   Max.   :1.00000   Max.   :88.67   Max.   :576.00  
##                                                                     
##     ADEPCNT         MAJORDRG          MINORDRG          OWNRENT     
##  Min.   :0.000   Min.   : 0.0000   Min.   : 0.0000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:0.000  
##  Median :1.000   Median : 0.0000   Median : 0.0000   Median :0.000  
##  Mean   :1.017   Mean   : 0.4628   Mean   : 0.2905   Mean   :0.456  
##  3rd Qu.:2.000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.:1.000  
##  Max.   :9.000   Max.   :22.0000   Max.   :11.0000   Max.   :1.000  
##                                                                     
##      INCOME        SELFEMPL           INCPER            EXP_INC         
##  Min.   :  50   Min.   :0.00000   Min.   :   362.5   Min.   :0.0000882  
##  1st Qu.:1667   1st Qu.:0.00000   1st Qu.: 12000.0   1st Qu.:0.0027060  
##  Median :2167   Median :0.00000   Median : 19000.0   Median :0.0392857  
##  Mean   :2510   Mean   :0.05794   Mean   : 21719.7   Mean   :0.0709737  
##  3rd Qu.:2917   3rd Qu.:0.00000   3rd Qu.: 27658.7   3rd Qu.:0.0956546  
##  Max.   :8333   Max.   :1.00000   Max.   :150000.0   Max.   :2.0377278  
##                                                                         
##     SPENDING           LOGSPEND     
##  Min.   :   0.111   Min.   :-2.197  
##  1st Qu.:  58.753   1st Qu.: 4.073  
##  Median : 139.992   Median : 4.942  
##  Mean   : 226.983   Mean   : 4.729  
##  3rd Qu.: 284.440   3rd Qu.: 5.651  
##  Max.   :4810.309   Max.   : 8.479  
##  NA's   :2945       NA's   :2945
#Check for missing data
#The x-axis shows attributes and the y-axis shows instances. Horizontal lines indicate missing data for an instance, vertical blocks represent missing data for an attribute.
library(Amelia)
library(mlbench)
missmap(credit_data, col=c("blue", "red"), legend=FALSE)
## Warning in if (class(obj) == "amelia") {: the condition has length > 1 and
## only the first element will be used
## Warning: Unknown or uninitialised column: 'arguments'.

## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'imputations'.

Missing data is confined to the SPENDING and LOGSPEND variables (2,945 missing values each).
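
As a quick numeric cross-check of the missingness map, the per-column counts can be tallied directly (a small sketch using the data frame loaded above):

#Count missing values per column and the proportion of incomplete rows
colSums(is.na(credit_data))
mean(!complete.cases(credit_data))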

#Data Analysis
library(car)
scatterplot(AGE~ADEPCNT, data=credit_data, main="Scatterplot with boxplot") 

There appear to be errors in the data, as some records show people aged under 18 with multiple dependants (ADEPCNT). We will remove these outliers.

#Remove records with an age of 18 or under
credit_data1 <- subset(credit_data ,AGE>18)
#Updated Scatterplot with AGE greater than 18 
scatterplot(AGE~ADEPCNT, data=credit_data1, main="Scatterplot with boxplot")

boxplot(AGE~ credit_data1$DEFAULT, data=credit_data1, ylab="Age(years)", xlab=" Default Status ( 0 = No-Default, 1 = Default)", main="Relationship Between Age & Default Status") 

Most defaults occur among cardholders aged just under 30.
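
To put a number on that impression, the age distribution can be summarised within each default group (a quick sketch using the cleaned data):

#Five-number summary of AGE for non-defaulters (0) and defaulters (1)
tapply(credit_data1$AGE, credit_data1$DEFAULT, summary)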

#plot() has no data argument, so reference the columns of the cleaned data directly
plot(credit_data1$INCOME, credit_data1$MAJORDRG, xlab = "INCOME", ylab = "MAJORDRG", main = "Relationship of income levels to loan payments that are 60 days overdue", col = 'cyan4')

The higher the income, the fewer major derogatory reports (MAJORDRG).
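
A rank correlation gives a rough check on this visual impression (a sketch; Spearman is used because MAJORDRG is a heavily skewed count):

#Rank correlation between income and major derogatory reports
cor(credit_data1$INCOME, credit_data1$MAJORDRG, method = "spearman")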

mix <- table(OWNRENT, credit_data$DEFAULT)/13390*100
plot(mix, main = "Home owners and renters who have defaulted and who have not", col = 'blue')

mix    
##        
## OWNRENT         0         1
##       0 50.029873  4.592980
##       1 42.935026  2.845407

Renters account for a larger share of defaults than home owners.
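
The table above shows cell percentages of all observations; the default rate within each ownership group makes the renter/owner comparison more direct (a small sketch):

#Row-wise proportions: default rate among renters (OWNRENT = 0) vs owners (OWNRENT = 1)
prop.table(table(credit_data$OWNRENT, credit_data$DEFAULT), margin = 1)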

Let’s start by calculating the correlation between each pair of numeric variables. These pair-wise correlations can be plotted in a correlation matrix plot to give an idea of which variables change together.

library(corrplot)
correlations <- cor(credit_data1[,1:14], use = "pairwise.complete.obs")  #SPENDING and LOGSPEND still contain missing values at this stage
corrplot(correlations, method="circle")

In the plot above, dark blue indicates a strong positive correlation and dark orange a moderate to strong negative correlation. The diagonal is each variable's correlation with itself, which is always 1.

#Correlation
pairs(credit_data1, col = credit_data1$DEFAULT + 1)  #shift the colour index so non-defaults (0) plot in black and defaults (1) in red

#As there are 2945 missing values in SPENDING and LOGSPEND out of 13444 observations, we will impute them with the variable mean
#First logspend
credit_data2 = transform(credit_data1, LOGSPEND = ifelse(is.na(LOGSPEND), mean(LOGSPEND, na.rm=TRUE), LOGSPEND))
View(credit_data2)
#Now we impute for spending
credit_data3 = transform(credit_data2, SPENDING = ifelse(is.na(SPENDING), mean(SPENDING, na.rm=TRUE), SPENDING))
View(credit_data3)
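#A quick sanity check (sketch): no missing values should remain after the mean imputation
colSums(is.na(credit_data3))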
#Let's split the data into training and test data
install.packages("caret")
## Installing package into 'C:/Users/rhysa/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(caret)
set.seed(4)
Creditindex<-createDataPartition(credit_data3$DEFAULT, p= 0.8)[[1]]
CreditTrain<- credit_data3[Creditindex, ]
CreditTest<- credit_data3[-Creditindex, ]
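#Quick check (a sketch): createDataPartition aims to keep the outcome distribution similar across the split
c(train = mean(CreditTrain$DEFAULT), test = mean(CreditTest$DEFAULT))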
#Build logistic regression
#All variables
log_model <- glm(CreditTrain$DEFAULT~.,family='binomial'(logit),data=CreditTrain)
summary(log_model)
## 
## Call:
## glm(formula = CreditTrain$DEFAULT ~ ., family = binomial(logit), 
##     data = CreditTrain)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.67375  -0.47180  -0.36464  -0.00008   3.11379  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.817e+01  2.083e+02  -0.087 0.930492    
## CARDHLDR     1.801e+01  2.083e+02   0.086 0.931078    
## AGE         -1.162e-02  4.649e-03  -2.499 0.012449 *  
## ACADMOS      4.679e-04  6.567e-04   0.713 0.476131    
## ADEPCNT      3.291e-02  5.078e-02   0.648 0.516974    
## MAJORDRG     2.534e-01  7.812e-02   3.244 0.001180 ** 
## MINORDRG     2.074e-01  5.512e-02   3.762 0.000169 ***
## OWNRENT     -3.057e-01  8.874e-02  -3.445 0.000572 ***
## INCOME      -2.277e-04  6.396e-05  -3.559 0.000372 ***
## SELFEMPL     2.469e-03  1.831e-01   0.013 0.989244    
## INCPER      -9.389e-06  5.602e-06  -1.676 0.093753 .  
## EXP_INC      2.740e+00  8.695e-01   3.151 0.001626 ** 
## SPENDING    -5.956e-04  4.333e-04  -1.375 0.169250    
## LOGSPEND    -2.428e-01  3.258e-02  -7.454 9.07e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5720.1  on 10711  degrees of freedom
## Residual deviance: 5062.6  on 10698  degrees of freedom
## AIC: 5090.6
## 
## Number of Fisher Scoring iterations: 18
#Intercept Only
log_model0 <- glm(CreditTrain$DEFAULT~ 1,family='binomial'(logit),data=CreditTrain)
## odds ratios only
exp(coef(log_model))
##  (Intercept)     CARDHLDR          AGE      ACADMOS      ADEPCNT 
## 1.289226e-08 6.652501e+07 9.884480e-01 1.000468e+00 1.033454e+00 
##     MAJORDRG     MINORDRG      OWNRENT       INCOME     SELFEMPL 
## 1.288390e+00 1.230421e+00 7.366141e-01 9.997724e-01 1.002472e+00 
##       INCPER      EXP_INC     SPENDING     LOGSPEND 
## 9.999906e-01 1.548899e+01 9.994045e-01 7.843948e-01
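
An odds ratio below 1 (for example OWNRENT at roughly 0.74) means lower odds of default per unit increase in that variable. Approximate Wald confidence intervals on the odds-ratio scale can be added as a quick sketch:

#95% Wald confidence intervals for the odds ratios
exp(confint.default(log_model))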
#Number of Defaults to No-defaults
library(gmodels)
## 
## Attaching package: 'gmodels'
## The following object is masked from 'package:pROC':
## 
##     ci
CrossTable(CreditTrain$DEFAULT)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  10712 
## 
##  
##           |         0 |         1 | 
##           |-----------|-----------|
##           |      9906 |       806 | 
##           |     0.925 |     0.075 | 
##           |-----------|-----------|
## 
## 
## 
## 
glmforward<-step(log_model0, scope=formula(log_model), direction="forward", trace=0)
summary(glmforward)
## 
## Call:
## glm(formula = CreditTrain$DEFAULT ~ CARDHLDR + INCOME + LOGSPEND + 
##     MINORDRG + EXP_INC + OWNRENT + INCPER + MAJORDRG + AGE + 
##     SPENDING, family = binomial(logit), data = CreditTrain)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.66312  -0.47187  -0.36540  -0.00008   3.12975  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.817e+01  2.077e+02  -0.087 0.930276    
## CARDHLDR     1.804e+01  2.077e+02   0.087 0.930775    
## INCOME      -2.093e-04  5.601e-05  -3.736 0.000187 ***
## LOGSPEND    -2.439e-01  3.252e-02  -7.500 6.39e-14 ***
## MINORDRG     2.088e-01  5.508e-02   3.791 0.000150 ***
## EXP_INC      2.735e+00  8.676e-01   3.153 0.001616 ** 
## OWNRENT     -2.871e-01  8.632e-02  -3.326 0.000879 ***
## INCPER      -1.213e-05  3.710e-06  -3.269 0.001078 ** 
## MAJORDRG     2.585e-01  7.791e-02   3.319 0.000904 ***
## AGE         -1.029e-02  4.373e-03  -2.353 0.018615 *  
## SPENDING    -5.924e-04  4.324e-04  -1.370 0.170675    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5720.1  on 10711  degrees of freedom
## Residual deviance: 5063.5  on 10701  degrees of freedom
## AIC: 5085.5
## 
## Number of Fisher Scoring iterations: 18
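
Since forward selection is driven by AIC, a quick sketch comparing the full and selected models confirms the reduction in AIC reported above:

#Compare AIC of the full model and the forward-selected model
AIC(log_model, glmforward)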
predictions_test <- predict(glmforward, newdata = CreditTest, type = "response")
range(predictions_test)  #Look at the range of the predicted default probabilities on the test set
## [1] 2.220488e-10 5.698558e-01
require(lattice)
#Condition on the test-set outcomes so the conditioning variable matches the length of the predictions
histogram(~ predictions_test | factor(CreditTest$DEFAULT), type = "count", xlab = "Probability of Default", strip = strip.custom(factor.levels = c("True outcome: No Default", "True outcome: Default")), nint = 10, col = "royalblue")

histogram(~ I(1 - predictions_test) | factor(CreditTest$DEFAULT), type = "count", xlab = "Probability of No Default", strip = strip.custom(factor.levels = c("True outcome: No Default", "True outcome: Default")), nint = 10, col = "royalblue")

plot(calibration(factor(CreditTest$DEFAULT) ~ predictions_test, cuts = 10, class = "1"))  #calibration() requires a factor response; "1" (default) is the class of interest
#Confusion Matrix
fwd.pred <- ifelse(predictions_test>0.5,1,0) #Make a binary predictions-vector using a cut-off of 50%
CreditTest1 <- data.frame(CreditTest, fwd.pred)
conf <- table(CreditTest1$DEFAULT, fwd.pred, dnn = c("Actual", "Predicted"))  #rows are actual outcomes, columns are predictions
print(conf)
##       Predicted
## Actual    0    1
##      0 2490    1
##      1   187    0
#Treat "no default" (0) as the positive class for the metrics below
TP <- conf[1, 1]
FN <- conf[1, 2] 
FP <- conf[2, 1] 
TN <- conf[2, 2]

# Calculate and print the accuracy: acc
acc.fwd = (TP+TN)/(TP+FN+FP+TN)
print(acc.fwd)
## [1] 0.9297984
# Calculate and print out the precision: prec
prec = (TP)/(TP+FP)
print(prec)
## [1] 0.9301457
# Calculate and print out the recall: rec (Sensitivity)
rec.fwd = (TP)/(TP+FN)
print(rec.fwd)
## [1] 0.9995986
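
The same counts also give specificity and the F1 score, which are worth reporting because the classes are heavily imbalanced (a small sketch, still treating "no default" (0) as the positive class):

# Specificity: proportion of actual defaults correctly identified
spec.fwd = (TN)/(TN+FP)
print(spec.fwd)
# F1 score: harmonic mean of precision and recall
f1.fwd = 2*prec*rec.fwd/(prec+rec.fwd)
print(f1.fwd)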
library(ROCR)
library(pROC)
p <- predict(log_model, newdata = CreditTest, type = "response")
pr <- prediction(p, CreditTest$DEFAULT)
perf <- performance(pr, measure = "tpr", x.measure = "fpr")
par(pty = "s")
plot(perf)
abline(a=0, b= 1)

par(pty = "s")
plot(roc(CreditTest$DEFAULT, predictions_test), legacy.axes = TRUE)

#Area under the Curve
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.7436676
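
As a cross-check, the AUC for the forward-selected model's test-set predictions can also be computed with pROC, which additionally gives a confidence interval (a quick sketch):

#AUC with a 95% confidence interval from pROC
ci.auc(roc(CreditTest$DEFAULT, predictions_test))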
#Classification Tree

library(rpart)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## The following object is masked from 'package:car':
## 
##     Predict
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library(rpart.plot)
library(partykit)
## Loading required package: libcoin
## 
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner,
##     node_surv, node_terminal, varimp
attach(CreditTrain)
## The following objects are masked from credit_data:
## 
##     ACADMOS, ADEPCNT, AGE, CARDHLDR, DEFAULT, EXP_INC, INCOME,
##     INCPER, LOGSPEND, MAJORDRG, MINORDRG, OWNRENT, SELFEMPL,
##     SPENDING
credit.rpart <- rpart(DEFAULT~., data = CreditTrain, cp = 0.0023)  #fit on the training data; the raw credit_data still contains under-18 records and missing values
par(pty="s")
rpart.plot(credit.rpart, tweak = 1)

#Base-graphics view of the same tree: plot the rpart object itself rather than the value returned by rpart.plot
plot(credit.rpart, uniform = TRUE, main = "Classification Tree for Default Characteristics (0 = No Default, 1 = Default)")
text(credit.rpart, use.n = TRUE, all = TRUE, cex = 0.5)

#prune the tree
pfit <- prune(credit.rpart, cp = credit.rpart$cptable[which.min(credit.rpart$cptable[,"xerror"]), "CP"])
#plot Tree
plot(pfit, uniform=TRUE, main = "Pruned Classification Tree for Default Characteristics")
text(pfit, use.n=TRUE, all= TRUE, cex = 0.5)
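
The CP used for pruning above comes from the cross-validation results stored in the fitted tree; these can be inspected directly (a sketch):

#Cross-validated error by complexity parameter, used to choose the pruning CP
printcp(credit.rpart)
plotcp(credit.rpart)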

#prediction of pruned tree on the test set
pred.prune <- predict(pfit, newdata = CreditTest)
pred_cutoff <- ifelse(pred.prune>0.5,1,0) #Make binary predictions-vector using a cut-off of 50%
credittree <- data.frame(CreditTest, pred_cutoff)
conf.tree <- table(credittree$DEFAULT, pred_cutoff, dnn = c("Actual", "Predicted"))
print(conf.tree)
TP.tree <- conf.tree[1, 1]
FN.tree <- conf.tree[1, 2] 
FP.tree <- conf.tree[2, 1] 
TN.tree <- conf.tree[2, 2]
# Calculate and print the accuracy: acc
acc.tree = (TP.tree+TN.tree)/(TP.tree+FN.tree+FP.tree+TN.tree)
print(acc.tree)
# Calculate and print out the recall: rec (Sensitivity)
rec.tree = (TP.tree)/(TP.tree+FN.tree)
print(rec.tree)
library(ROCR)
#Generate ROC curve using the pruned-tree predictions and the test data set
credit.roc <- predict(pfit, CreditTest)

pred6 <- prediction(credit.roc, CreditTest$DEFAULT)
perf6 <- performance(pred6, "tpr", "fpr")
plot(perf6)
abline(a=0, b= 1)

plot(roc(CreditTest$DEFAULT, credit.roc), legacy.axes = TRUE)
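
For comparability with the logistic regression, the pruned tree's area under the curve can be extracted the same way (a quick sketch):

#Area under the ROC curve for the pruned tree
performance(pred6, measure = "auc")@y.values[[1]]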

#LoanTrain = LoanTrain[,-2]  #Method to remove variables that aren't strong; removed here for ease of data transformation
#Random Forest
library(randomForest)
credit.rf <- randomForest(DEFAULT~.,data= CreditTrain) 
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
print(credit.rf) # view results 
## 
## Call:
##  randomForest(formula = DEFAULT ~ ., data = CreditTrain) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.06850596
##                     % Var explained: 1.55
#plot variable importance 
varImpPlot(credit.rf)
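
The same importances can be listed numerically; for this regression-mode forest the measure is IncNodePurity (a quick sketch):

#Variable importance as a sorted table
imp <- importance(credit.rf)
imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE]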

#Gini Index
testp4<-predict(credit.rf, CreditTest) 
#confusionMatrix() needs factors with the same levels; the regression-mode forest returns probabilities, so threshold them at 0.5 first
confusionMatrix(factor(ifelse(testp4 > 0.5, 1, 0), levels = c(0, 1)), reference = factor(CreditTest$DEFAULT, levels = c(0, 1)), positive = "0", dnn = c("Predicted", "Actual"))
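
Finally, to compare the random forest with the earlier models on the same footing, its test-set predictions can be run through the same ROC/AUC machinery (a short sketch):

#ROC curve and AUC for the random forest predictions on the test set
pred.rf <- prediction(testp4, CreditTest$DEFAULT)
perf.rf <- performance(pred.rf, "tpr", "fpr")
plot(perf.rf)
abline(a = 0, b = 1)
performance(pred.rf, measure = "auc")@y.values[[1]]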