#load all packages required for the project
require(corrplot)
## Loading required package: corrplot
## corrplot 0.84 loaded
require(lattice)
## Loading required package: lattice
require(MASS)
## Loading required package: MASS
require(leaps)
## Loading required package: leaps
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
require(DAAG)
## Loading required package: DAAG
##
## Attaching package: 'DAAG'
## The following object is masked from 'package:MASS':
##
## hills
require(pROC)
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
##
## auc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
require(ROCR)
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
require(ggplot2)
## Loading required package: ggplot2
require(class)
## Loading required package: class
require(chemometrics)
## Loading required package: chemometrics
## Loading required package: rpart
require(randomForest)
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
require(car)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:DAAG':
##
## vif
require(Amelia)
## Loading required package: Amelia
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
require(mlbench)
## Loading required package: mlbench
#importing Dataset
library(readr)
credit_data <- read_csv("~/Masters of Predictive Analytics/Predictive Behaviour Modelling in Business/credit_data.txt")
## Parsed with column specification:
## cols(
## CARDHLDR = col_integer(),
## DEFAULT = col_integer(),
## AGE = col_double(),
## ACADMOS = col_integer(),
## ADEPCNT = col_integer(),
## MAJORDRG = col_integer(),
## MINORDRG = col_integer(),
## OWNRENT = col_integer(),
## INCOME = col_double(),
## SELFEMPL = col_integer(),
## INCPER = col_double(),
## EXP_INC = col_double(),
## SPENDING = col_double(),
## LOGSPEND = col_double()
## )
attach(credit_data)
View(credit_data)
#Check Variable Names
names(credit_data)
## [1] "CARDHLDR" "DEFAULT" "AGE" "ACADMOS" "ADEPCNT" "MAJORDRG"
## [7] "MINORDRG" "OWNRENT" "INCOME" "SELFEMPL" "INCPER" "EXP_INC"
## [13] "SPENDING" "LOGSPEND"
summary(credit_data)
## CARDHLDR DEFAULT AGE ACADMOS
## Min. :0.0000 Min. :0.00000 Min. : 0.00 Min. : 0.00
## 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:25.67 1st Qu.: 12.00
## Median :1.0000 Median :0.00000 Median :31.50 Median : 30.00
## Mean :0.7809 Mean :0.07409 Mean :33.47 Mean : 55.32
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:39.33 3rd Qu.: 72.00
## Max. :1.0000 Max. :1.00000 Max. :88.67 Max. :576.00
##
## ADEPCNT MAJORDRG MINORDRG OWNRENT
## Min. :0.000 Min. : 0.0000 Min. : 0.0000 Min. :0.000
## 1st Qu.:0.000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.000
## Median :1.000 Median : 0.0000 Median : 0.0000 Median :0.000
## Mean :1.017 Mean : 0.4628 Mean : 0.2905 Mean :0.456
## 3rd Qu.:2.000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:1.000
## Max. :9.000 Max. :22.0000 Max. :11.0000 Max. :1.000
##
## INCOME SELFEMPL INCPER EXP_INC
## Min. : 50 Min. :0.00000 Min. : 362.5 Min. :0.0000882
## 1st Qu.:1667 1st Qu.:0.00000 1st Qu.: 12000.0 1st Qu.:0.0027060
## Median :2167 Median :0.00000 Median : 19000.0 Median :0.0392857
## Mean :2510 Mean :0.05794 Mean : 21719.7 Mean :0.0709737
## 3rd Qu.:2917 3rd Qu.:0.00000 3rd Qu.: 27658.7 3rd Qu.:0.0956546
## Max. :8333 Max. :1.00000 Max. :150000.0 Max. :2.0377278
##
## SPENDING LOGSPEND
## Min. : 0.111 Min. :-2.197
## 1st Qu.: 58.753 1st Qu.: 4.073
## Median : 139.992 Median : 4.942
## Mean : 226.983 Mean : 4.729
## 3rd Qu.: 284.440 3rd Qu.: 5.651
## Max. :4810.309 Max. : 8.479
## NA's :2945 NA's :2945
#Check for missing data
#The x-axis shows attributes and the y-axis shows instances. Horizontal lines indicate missing data for an instance, vertical blocks represent missing data for an attribute.
library(Amelia)
library(mlbench)
missmap(credit_data, col=c("blue", "red"), legend=FALSE)
## Warning in if (class(obj) == "amelia") {: the condition has length > 1 and
## only the first element will be used
## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'imputations'.
Missing Data on Logspend and spending
#Data Analysis
library(car)
scatterplot(AGE~ADEPCNT, data=credit_data, main="Scatterplot with boxplot")
There appears to be errors in the data as there are people aged less than 18 with multiple dependencaies. We will remove these outliers
#Remove outliers of Age less than 18
credit_data1 <- subset(credit_data ,AGE>18)
#Updated Scatterplot with AGE greater than 18
scatterplot(AGE~ADEPCNT, data=credit_data1, main="Scatterplot with boxplot")
boxplot(AGE~ credit_data1$DEFAULT, data=credit_data1, ylab="Age(years)", xlab=" Default Status ( 0 = No-Default, 1 = Default)", main="Relationship Between Age & Default Status")
Most Defaults just under 30yr olds
plot(INCOME, MAJORDRG, data = credit_data1, main = 'Realtionship of Income Levels to loan payments
that are 60 days overdue', col = 'cyan4')
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not
## a graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
The higher the income, the less MajorDRG
mix <- table(OWNRENT, credit_data$DEFAULT)/13390*100
plot(mix, data = credit_data1, main= "Own and Rent who have Defaults and who doesnt", col = 'blue')
## Warning: In mosaicplot.default(x, xlab = xlab, ylab = ylab, ...) :
## extra argument 'data' will be disregarded
mix
##
## OWNRENT 0 1
## 0 50.029873 4.592980
## 1 42.935026 2.845407
Mostly Renters have defaulted on their credit cards
Let’s start calculating the correlation between each pair of numeric variables. These pair-wise correlations can be plotted in a correlation matrix plot to given an idea of which variables change together.
library(corrplot)
correlations <- cor(credit_data1[,1:14])
corrplot(correlations, method="circle")
For the above, dark blue is strong positive correlation. The diagonal is the autocorrelation. Dark orange implies a moderate to strong negative correlation.
#Correlation
pairs(credit_data1, col=credit_data1$DEFAULT)
#As there is 2945 missing values from logspend and spending of 13444 observations, we will impute them with the variable average
#First logspend
credit_data2 = transform(credit_data1, LOGSPEND = ifelse(is.na(LOGSPEND), mean(LOGSPEND, na.rm=TRUE), LOGSPEND))
View(credit_data2)
#Now we impute for spending
credit_data3 = transform(credit_data2, SPENDING = ifelse(is.na(SPENDING), mean(SPENDING, na.rm=TRUE), SPENDING))
View(credit_data3)
#Lets split the data into training and test data
install.packages("caret")
## Installing package into 'C:/Users/rhysa/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(caret)
set.seed(4)
Creditindex<-createDataPartition(credit_data3$DEFAULT, p= 0.8)[[1]]
CreditTrain<- credit_data3[Creditindex, ]
CreditTest<- credit_data3[-Creditindex, ]
#Build logistic regression
#All variables
log_model <- glm(CreditTrain$DEFAULT~.,family='binomial'(logit),data=CreditTrain)
summary(log_model)
##
## Call:
## glm(formula = CreditTrain$DEFAULT ~ ., family = binomial(logit),
## data = CreditTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.67375 -0.47180 -0.36464 -0.00008 3.11379
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.817e+01 2.083e+02 -0.087 0.930492
## CARDHLDR 1.801e+01 2.083e+02 0.086 0.931078
## AGE -1.162e-02 4.649e-03 -2.499 0.012449 *
## ACADMOS 4.679e-04 6.567e-04 0.713 0.476131
## ADEPCNT 3.291e-02 5.078e-02 0.648 0.516974
## MAJORDRG 2.534e-01 7.812e-02 3.244 0.001180 **
## MINORDRG 2.074e-01 5.512e-02 3.762 0.000169 ***
## OWNRENT -3.057e-01 8.874e-02 -3.445 0.000572 ***
## INCOME -2.277e-04 6.396e-05 -3.559 0.000372 ***
## SELFEMPL 2.469e-03 1.831e-01 0.013 0.989244
## INCPER -9.389e-06 5.602e-06 -1.676 0.093753 .
## EXP_INC 2.740e+00 8.695e-01 3.151 0.001626 **
## SPENDING -5.956e-04 4.333e-04 -1.375 0.169250
## LOGSPEND -2.428e-01 3.258e-02 -7.454 9.07e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5720.1 on 10711 degrees of freedom
## Residual deviance: 5062.6 on 10698 degrees of freedom
## AIC: 5090.6
##
## Number of Fisher Scoring iterations: 18
#Intercept Only
log_model0 <- glm(CreditTrain$DEFAULT~ 1,family='binomial'(logit),data=CreditTrain)
## odds ratios only
exp(coef(log_model))
## (Intercept) CARDHLDR AGE ACADMOS ADEPCNT
## 1.289226e-08 6.652501e+07 9.884480e-01 1.000468e+00 1.033454e+00
## MAJORDRG MINORDRG OWNRENT INCOME SELFEMPL
## 1.288390e+00 1.230421e+00 7.366141e-01 9.997724e-01 1.002472e+00
## INCPER EXP_INC SPENDING LOGSPEND
## 9.999906e-01 1.548899e+01 9.994045e-01 7.843948e-01
#Number of Defaults to No-defaults
library(gmodels)
##
## Attaching package: 'gmodels'
## The following object is masked from 'package:pROC':
##
## ci
CrossTable(CreditTrain$DEFAULT)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 10712
##
##
## | 0 | 1 |
## |-----------|-----------|
## | 9906 | 806 |
## | 0.925 | 0.075 |
## |-----------|-----------|
##
##
##
##
glmforward<-step(log_model0, scope=formula(log_model), direction="forward", trace=0)
summary(glmforward)
##
## Call:
## glm(formula = CreditTrain$DEFAULT ~ CARDHLDR + INCOME + LOGSPEND +
## MINORDRG + EXP_INC + OWNRENT + INCPER + MAJORDRG + AGE +
## SPENDING, family = binomial(logit), data = CreditTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.66312 -0.47187 -0.36540 -0.00008 3.12975
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.817e+01 2.077e+02 -0.087 0.930276
## CARDHLDR 1.804e+01 2.077e+02 0.087 0.930775
## INCOME -2.093e-04 5.601e-05 -3.736 0.000187 ***
## LOGSPEND -2.439e-01 3.252e-02 -7.500 6.39e-14 ***
## MINORDRG 2.088e-01 5.508e-02 3.791 0.000150 ***
## EXP_INC 2.735e+00 8.676e-01 3.153 0.001616 **
## OWNRENT -2.871e-01 8.632e-02 -3.326 0.000879 ***
## INCPER -1.213e-05 3.710e-06 -3.269 0.001078 **
## MAJORDRG 2.585e-01 7.791e-02 3.319 0.000904 ***
## AGE -1.029e-02 4.373e-03 -2.353 0.018615 *
## SPENDING -5.924e-04 4.324e-04 -1.370 0.170675
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5720.1 on 10711 degrees of freedom
## Residual deviance: 5063.5 on 10701 degrees of freedom
## AIC: 5085.5
##
## Number of Fisher Scoring iterations: 18
predictions_test <- predict(glmforward, newdata = CreditTest, type = "response")
range(predictions_test) #Look at the range of the object "predictions_all_small"
## [1] 2.220488e-10 5.698558e-01
require(lattice)
histogram(~ predictions_test | CreditTrain$DEFAULT, type = "count", xlab = "Probability of Default",strip = strip.custom(factor.levels = c("True outcome: No Default", "True Outcome: Default")), nint = 10, col = "royalblue")
histogram(~I (1- predictions_test) | CreditTrain$DEFAULT, type = "count", xlab = "Probability of Default",strip = strip.custom(factor.levels = c("True outcome: No Default", "True Outcome: Default")), nint = 10, col = "royalblue")
plot(calibration(CreditTest$DEFAULT ~ predictions_test, cuts = 10))
## Error in calibration.formula(CreditTest$DEFAULT ~ predictions_test, cuts = 10): the left-hand side of the formula must be a factor of classes
#Confusion Matrix
fwd.pred <- ifelse(predictions_test>0.5,1,0) #Make a binary predictions-vector using a cut-off of 50%
CreditTest1 <- data.frame(CreditTest, fwd.pred)
conf <-table(CreditTest1$DEFAULT, fwd.pred,dnn = c("Predicted", "Actual"))
print(conf)
## Actual
## Predicted 0 1
## 0 2490 1
## 1 187 0
TP <- conf[1, 1]
FN <- conf[1, 2]
FP <- conf[2,1]
TN <- conf[2,2]
# Calculate and print the accuracy: acc
acc.fwd = (TP+TN)/(TP+FN+FP+TN)
print(acc.fwd)
## [1] 0.9297984
# Calculate and print out the precision: prec
prec = (TP)/(TP+FP)
print(prec)
## [1] 0.9301457
# Calculate and print out the recall: rec (Sensitvity)
rec.fwd = (TP)/(TP+FN)
print(rec.fwd)
## [1] 0.9995986
library(ROCR)
library(pROC)
p <- predict(log_model, newdata = CreditTest, type = "response")
pr <- prediction(p, CreditTest$DEFAULT)
perf <- performance(pr, measure = "tpr", x.measure = "fpr")
par(pty = "s")
plot(perf)
abline(a=0, b= 1)
par(pty = "s")
plot(roc(CreditTest$DEFAULT, predictions_test), legacy.axes = TRUE)
#Area under the Curve
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.7436676
#Classification Tree
library(rpart)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:car':
##
## Predict
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(rpart.plot)
library(partykit)
## Loading required package: libcoin
##
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner,
## node_surv, node_terminal, varimp
attach(CreditTrain)
## The following objects are masked from credit_data:
##
## ACADMOS, ADEPCNT, AGE, CARDHLDR, DEFAULT, EXP_INC, INCOME,
## INCPER, LOGSPEND, MAJORDRG, MINORDRG, OWNRENT, SELFEMPL,
## SPENDING
credit.rpart <- rpart(DEFAULT~., data = credit_data, cp = 0.0023)
par(pty="s")
creditplot <- rpart.plot(credit.rpart, tweak = 1)
plot(creditplot, uniform= TRUE, xlab = "Variables", ylab = "0= No Default / 1=Default", main = "Classification Tree for Default Characteristics")
## Warning in plot.window(...): "uniform" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "uniform" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "uniform" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "uniform" is
## not a graphical parameter
## Warning in box(...): "uniform" is not a graphical parameter
## Warning in title(...): "uniform" is not a graphical parameter
text(creditplot, use.n=TRUE, all = TRUE, cex = 0.5)
## Warning in text.default(creditplot, use.n = TRUE, all = TRUE, cex = 0.5):
## "use.n" is not a graphical parameter
## Warning in text.default(creditplot, use.n = TRUE, all = TRUE, cex = 0.5):
## "all" is not a graphical parameter
#prune the tree
pfit <- prune(credit.rpart, cp = credit.rpart$cptable[which.min(credit.rpart$cptable[,"xerror"]), "CP"])
#plot Tree
plot(pfit, uniform=TRUE, main = "Pruned Classification Tree for Default Characterisics")
text(pfit, use.n=TRUE, all= TRUE, cex = 0.5)
#prediction of pruned tree
pred.prune <- predict(pfit)
pred_cutoff <- ifelse(pred.prune>0.5,1,0) #Make binary predictions-vector using a cut-off of 50%
credittree <- data.frame(CreditTest, pred_cutoff)
## Error in data.frame(CreditTest, pred_cutoff): arguments imply differing number of rows: 2678, 13444
conf.tree <- table(credittree$Default, pred_cutoff,dnn = c("Predicted", "Actual"))
## Error in table(credittree$Default, pred_cutoff, dnn = c("Predicted", "Actual")): object 'credittree' not found
print(conf.tree)
## Error in print(conf.tree): object 'conf.tree' not found
TP.tree <- conf.tree[1, 1]
## Error in eval(expr, envir, enclos): object 'conf.tree' not found
FN.tree <- conf.tree[1, 2]
## Error in eval(expr, envir, enclos): object 'conf.tree' not found
FP.tree <- conf.tree[2,1]
## Error in eval(expr, envir, enclos): object 'conf.tree' not found
TN.tree <- conf.tree[2,2]
## Error in eval(expr, envir, enclos): object 'conf.tree' not found
# Calculate and print the accuracy: acc
acc.tree = (TP.tree+TN.tree)/(TP.tree+FN.tree+FP.tree+TN.tree)
## Error in eval(expr, envir, enclos): object 'TP.tree' not found
print(acc.tree)
## Error in print(acc.tree): object 'acc.tree' not found
# Calculate and print out the recall: rec (Sensitvity)
rec.tree = (TP.tree)/(TP.tree+FN.tree)
## Error in eval(expr, envir, enclos): object 'TP.tree' not found
print(rec.tree)
## Error in print(rec.tree): object 'rec.tree' not found
library(ROCR)
#Generate ROC Curve using predback and test data set
credit.roc <- predict(pfit, CreditTest)
pred6 <- prediction(credit.roc, CreditTest$DEFAULT)
perf6 <- performance(pred6, "tpr", "fpr")
plot(perf6)
abline(a=0, b= 1)
plot(roc(CreditTest$DEFAULT, credit.roc), legacy.axes = TRUE)
#LoanTrain = LoanTrain[,-2]#Method to remove variables that arent strong.Removing it for ease of data transformation
#Random Forest
library(randomForest)
credit.rf <- randomForest(DEFAULT~.,data= CreditTrain)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
print(credit.rf) # view results
##
## Call:
## randomForest(formula = DEFAULT ~ ., data = CreditTrain)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.06850596
## % Var explained: 1.55
#plot variable importance
varImpPlot(credit.rf)
#Gini Index
testp4<-predict(credit.rf, CreditTest)
confusionMatrix(testp4, reference = CreditTest$DEFAULT, positive = 0, dnn = c("Predicted", "Actual"))
## Error: `data` and `reference` should be factors with the same levels.