library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
loan = read.csv("loan_data.csv")
loan = loan[,-1]
head(loan, n = 5)
##   SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age
## 1                1                               0.7661  45
## 2                0                               0.9572  40
## 3                0                               0.6582  38
## 4                0                               0.2338  30
## 5                0                               0.9072  49
##   NumberOfTime30.59DaysPastDueNotWorse   DebtRatio MonthlyIncome
## 1                                    2 0.802982129          9120
## 2                                    0 0.121876201          2600
## 3                                    1 0.085113375          3042
## 4                                    0 0.036049682          3300
## 5                                    1 0.024925695         63588
##   NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
## 1                              13                       0
## 2                               4                       0
## 3                               2                       1
## 4                               5                       0
## 5                               7                       0
##   NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
## 1                            6                                    0
## 2                            0                                    0
## 3                            0                                    0
## 4                            0                                    0
## 5                            1                                    0
##   NumberOfDependents
## 1                  2
## 2                  1
## 3                  0
## 4                  0
## 5                  0
colnames(loan) = tolower(colnames(loan))
str(loan)
## 'data.frame':    5003 obs. of  11 variables:
##  $ seriousdlqin2yrs                    : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ revolvingutilizationofunsecuredlines: num  0.766 0.957 0.658 0.234 0.907 ...
##  $ age                                 : int  45 40 38 30 49 74 57 39 27 57 ...
##  $ numberoftime30.59dayspastduenotworse: int  2 0 1 0 1 0 0 0 0 0 ...
##  $ debtratio                           : Factor w/ 4673 levels "0","0.00012837",..: 3581 781 603 377 331 2415 4513 1327 4439 3261 ...
##  $ monthlyincome                       : int  9120 2600 3042 3300 63588 3500 NA 3500 NA 23684 ...
##  $ numberofopencreditlinesandloans     : int  13 4 2 5 7 3 8 8 2 9 ...
##  $ numberoftimes90dayslate             : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ numberrealestateloansorlines        : int  6 0 0 0 1 1 3 0 0 4 ...
##  $ numberoftime60.89dayspastduenotworse: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ numberofdependents                  : int  2 1 0 0 0 1 0 0 NA 2 ...
summary(loan)
##  seriousdlqin2yrs revolvingutilizationofunsecuredlines      age      
##  Min.   :0.000    Min.   :   0                         Min.   :21.0  
##  1st Qu.:0.000    1st Qu.:   0                         1st Qu.:41.0  
##  Median :0.000    Median :   0                         Median :52.0  
##  Mean   :0.061    Mean   :   4                         Mean   :52.1  
##  3rd Qu.:0.000    3rd Qu.:   1                         3rd Qu.:63.0  
##  Max.   :1.000    Max.   :6324                         Max.   :99.0  
##                                                                      
##  numberoftime30.59dayspastduenotworse   debtratio    monthlyincome   
##  Min.   : 0.00                        0      : 131   Min.   :     0  
##  1st Qu.: 0.00                        2      :   9   1st Qu.:  3456  
##  Median : 0.00                        1      :   8   Median :  5333  
##  Mean   : 0.36                        10     :   7   Mean   :  6556  
##  3rd Qu.: 0.00                        13     :   7   3rd Qu.:  8168  
##  Max.   :98.00                        3      :   7   Max.   :208333  
##                                       (Other):4834   NA's   :970     
##  numberofopencreditlinesandloans numberoftimes90dayslate
##  Min.   : 0.00                   Min.   : 0.00          
##  1st Qu.: 5.00                   1st Qu.: 0.00          
##  Median : 8.00                   Median : 0.00          
##  Mean   : 8.39                   Mean   : 0.19          
##  3rd Qu.:11.00                   3rd Qu.: 0.00          
##  Max.   :46.00                   Max.   :98.00          
##                                  NA's   :1              
##  numberrealestateloansorlines numberoftime60.89dayspastduenotworse
##  Min.   : 0.00                Min.   : 0.00                       
##  1st Qu.: 0.00                1st Qu.: 0.00                       
##  Median : 1.00                Median : 0.00                       
##  Mean   : 1.02                Mean   : 0.17                       
##  3rd Qu.: 2.00                3rd Qu.: 0.00                       
##  Max.   :17.00                Max.   :98.00                       
##  NA's   :1                    NA's   :1                           
##  numberofdependents
##  Min.   :0.00      
##  1st Qu.:0.00      
##  Median :0.00      
##  Mean   :0.74      
##  3rd Qu.:1.00      
##  Max.   :8.00      
##  NA's   :143
loan$debtratio = as.integer(loan$debtratio)

Turns out chapter 12 of the ebook is actually called “Outlier detection” and has some thoughts on removing outliers.

The most intuitive in my opinion is using the five number summary - min, Q1, median, Q3 and max. The interquartile range is defined as Q3-Q1. Any object that is 1.5 * IQR smaller than Q1 or 1.5 * IQR larger Q3 is treated as an outlier because 99.3% of of objects lie within this range.

I created a simple function for calculating the range. It prints the upper and lower bounds.

outlierRange = function(quartile1, quartile3, g = 1.5) {
  range = (quartile3 - quartile1) * g
  upperBound = quartile3 + range
  lowerBound = quartile1 + range
  print (quartile3 + range) 
  print (quartile1 - range)
}

Seriousdlqin2yrs

summary(loan$seriousdlqin2yrs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.061   0.000   1.000
qplot(data = loan, x = seriousdlqin2yrs)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-3

This actually looks fine.

Age

summary(loan$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    21.0    41.0    52.0    52.1    63.0    99.0
qplot(data = loan, x = age)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-4

ggplot(loan, aes(y=age, x = 1)) + geom_boxplot()

plot of chunk unnamed-chunk-4

outlierRange(41, 63, 1.5)
## [1] 96
## [1] 8

We can see here that There are some ages at the upper end of the range. Is it likely that a 99 year old applied for a loan? The boxplot identifies two outliers at the top end of the range. Add 33 to 63 and subtract from 41 - our range is 96 to 8. Therefore we remove anything above 96.

loan = filter(loan, age < 97)
#We lose 2 observations.

Debtratio

#Monthly debt payments, alimony,living costs divided by monthy gross income. 
summary(loan$debtratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1110    2360    2340    3600    4670
str(loan$debtratio)
##  int [1:5001] 3581 781 603 377 331 2415 4513 1327 4439 3261 ...
qplot(data = loan, x = debtratio)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-6

ggplot(loan, aes(y=debtratio, x = 1)) + geom_boxplot()

plot of chunk unnamed-chunk-6

This one looks fine as far as I can tell, although the flat distribution is suspect. Hard to know without more information.

Number of open credit lines and loans.

summary(loan$numberofopencreditlinesandloans) #46 seems a bit excessive!
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    5.00    8.00    8.39   11.00   46.00
qplot(data = loan, x = numberofopencreditlinesandloans)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-7

ggplot(loan, aes(y=numberofopencreditlinesandloans, x = 1)) + geom_boxplot() 

plot of chunk unnamed-chunk-7

outlierRange(5, 11, 1.5)
## [1] 20
## [1] -4
loan = filter(loan, numberofopencreditlinesandloans < 21) 

The boxplot outlines a number of outliers. We filter them out.

Number real estate loans or lines

summary(loan$numberrealestateloansorlines)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   1.000   0.991   2.000  15.000       1
qplot(data = loan, x = numberrealestateloansorlines)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-8

ggplot(loan, aes(y=numberrealestateloansorlines, x = 1)) + geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-8

#1 Na indicates missing data. 15 seems excessive. Boxplot highlights some outliers.
outlierRange(0, 2, 1.5)
## [1] 5
## [1] -3
#Anything above 5 is removed.
loan = filter(loan, numberrealestateloansorlines < 6)

Number of dependents

summary(loan$numberofdependents)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00    0.74    1.00    8.00     138
qplot(data = loan, x = numberofdependents)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-9

ggplot(loan, aes(y=numberofdependents, x = 1)) + geom_boxplot()
## Warning: Removed 138 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-9

I’m inclined to leave this one as is. Two reasons - N/A’s probably indicate zero (as in, no dependents).I’ve left them for the moment as this would require consultation. Secondly, even though 8 dependents technically lies outside our outsider rule, our rule would also rule out anything above 2.

Revolving utilization of unsecured lines

summary(loan$revolvingutilizationofunsecuredlines) #6324 seems like a lot.
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       4       1    6320
qplot(data = loan, x = revolvingutilizationofunsecuredlines)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-10

ggplot(loan, aes(y=revolvingutilizationofunsecuredlines, x = 1)) + geom_boxplot()

plot of chunk unnamed-chunk-10

This is acually a percentage - anything over 1 is suspicious in my opinion.. Looking at the boxplot, I initially remove anything over 1000.

loan = filter(loan, revolvingutilizationofunsecuredlines < 1000)
#new boxplot
ggplot(loan, aes(y=revolvingutilizationofunsecuredlines, x = 1)) + geom_boxplot()

plot of chunk unnamed-chunk-11 Although i find it hard to believe someone could technically be overdrawn to 150%, I guess it could happen. I stick with the outlier Range rule.

summary(loan$revolvingutilizationofunsecuredlines)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0297  0.1620  0.3260  0.5650  2.8400
outlierRange(.02969, .56500, 1.5)
## [1] 1.368
## [1] -0.7733
#Top end of the range is 1.367965
loan = filter(loan, revolvingutilizationofunsecuredlines < 1.37)

Number of time 30-59 days past due but not worse

summary(loan$numberoftime30.59dayspastduenotworse) #98 times>!>
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00    0.36    0.00   98.00
qplot(data = loan, x = numberoftime30.59dayspastduenotworse)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-13

ggplot(loan, aes(y=numberoftime30.59dayspastduenotworse, x = 1)) + geom_boxplot()

plot of chunk unnamed-chunk-13

So the outlier range doesnt work here because we have a 3rd quartile of zero and a 1Q of zero. This is because of the huge outlier. Remove outlier and try again.

loan = filter(loan, numberoftime30.59dayspastduenotworse < 50)
summary(loan$numberoftime30.59dayspastduenotworse)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.256   0.000  12.000
qplot(data = loan, x = numberoftime30.59dayspastduenotworse)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-14

ggplot(loan, aes(y=numberoftime30.59dayspastduenotworse, x = 1)) + geom_boxplot()

plot of chunk unnamed-chunk-14

max(loan$numberoftime30.59dayspastduenotworse)
## [1] 12

Going to to leave it there. 12 as an upper bound seems reasonable to me.

Number of times 90 days late

summary(loan$numberoftimes90dayslate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.097   0.000  10.000
qplot(data = loan, x = numberoftimes90dayslate)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-15

ggplot(loan, aes(y=numberoftimes90dayslate, x = 1)) + geom_boxplot()

plot of chunk unnamed-chunk-15

Again ten is outside our range but probably reasonable.

Number of times 60-89 days past due but not worse

summary(loan$numberoftime60.89dayspastduenotworse)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.068   0.000   5.000
qplot(data = loan, x = numberoftime60.89dayspastduenotworse)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-16

ggplot(loan, aes(y=numberoftime60.89dayspastduenotworse, x = 1)) + geom_boxplot()

plot of chunk unnamed-chunk-16 Fine.

Monthly income

summary(loan$monthlyincome) #lots of N/A's
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0    3400    5250    6430    8000  208000     949
qplot(data = loan, x = monthlyincome)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-17

ggplot(loan, aes(y=monthlyincome, x = 1)) + geom_boxplot()
## Warning: Removed 949 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-17

Some outliers to the upside and a number of NA’s. Note that the boxplot disregards the NA’s. the NA’s are important here as they have “informative missingness” - an NA likely indicates that the income deliberately wasn’t given rather than it wasn’t recorded.

Remove outliers using our outlier Rule.

outlierRange(3400, 8000, 1.5)
## [1] 14900
## [1] -3500
#6900 is the upper bound
loan = filter(loan, monthlyincome<=6900)

And I’m spent.