library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
loan = read.csv("loan_data.csv")
loan = loan[,-1]
head(loan, n = 5)
## SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age
## 1 1 0.7661 45
## 2 0 0.9572 40
## 3 0 0.6582 38
## 4 0 0.2338 30
## 5 0 0.9072 49
## NumberOfTime30.59DaysPastDueNotWorse DebtRatio MonthlyIncome
## 1 2 0.802982129 9120
## 2 0 0.121876201 2600
## 3 1 0.085113375 3042
## 4 0 0.036049682 3300
## 5 1 0.024925695 63588
## NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
## 1 13 0
## 2 4 0
## 3 2 1
## 4 5 0
## 5 7 0
## NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
## 1 6 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 1 0
## NumberOfDependents
## 1 2
## 2 1
## 3 0
## 4 0
## 5 0
colnames(loan) = tolower(colnames(loan))
str(loan)
## 'data.frame': 5003 obs. of 11 variables:
## $ seriousdlqin2yrs : int 1 0 0 0 0 0 0 0 0 0 ...
## $ revolvingutilizationofunsecuredlines: num 0.766 0.957 0.658 0.234 0.907 ...
## $ age : int 45 40 38 30 49 74 57 39 27 57 ...
## $ numberoftime30.59dayspastduenotworse: int 2 0 1 0 1 0 0 0 0 0 ...
## $ debtratio : Factor w/ 4673 levels "0","0.00012837",..: 3581 781 603 377 331 2415 4513 1327 4439 3261 ...
## $ monthlyincome : int 9120 2600 3042 3300 63588 3500 NA 3500 NA 23684 ...
## $ numberofopencreditlinesandloans : int 13 4 2 5 7 3 8 8 2 9 ...
## $ numberoftimes90dayslate : int 0 0 1 0 0 0 0 0 0 0 ...
## $ numberrealestateloansorlines : int 6 0 0 0 1 1 3 0 0 4 ...
## $ numberoftime60.89dayspastduenotworse: int 0 0 0 0 0 0 0 0 0 0 ...
## $ numberofdependents : int 2 1 0 0 0 1 0 0 NA 2 ...
summary(loan)
## seriousdlqin2yrs revolvingutilizationofunsecuredlines age
## Min. :0.000 Min. : 0 Min. :21.0
## 1st Qu.:0.000 1st Qu.: 0 1st Qu.:41.0
## Median :0.000 Median : 0 Median :52.0
## Mean :0.061 Mean : 4 Mean :52.1
## 3rd Qu.:0.000 3rd Qu.: 1 3rd Qu.:63.0
## Max. :1.000 Max. :6324 Max. :99.0
##
## numberoftime30.59dayspastduenotworse debtratio monthlyincome
## Min. : 0.00 0 : 131 Min. : 0
## 1st Qu.: 0.00 2 : 9 1st Qu.: 3456
## Median : 0.00 1 : 8 Median : 5333
## Mean : 0.36 10 : 7 Mean : 6556
## 3rd Qu.: 0.00 13 : 7 3rd Qu.: 8168
## Max. :98.00 3 : 7 Max. :208333
## (Other):4834 NA's :970
## numberofopencreditlinesandloans numberoftimes90dayslate
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 5.00 1st Qu.: 0.00
## Median : 8.00 Median : 0.00
## Mean : 8.39 Mean : 0.19
## 3rd Qu.:11.00 3rd Qu.: 0.00
## Max. :46.00 Max. :98.00
## NA's :1
## numberrealestateloansorlines numberoftime60.89dayspastduenotworse
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 1.00 Median : 0.00
## Mean : 1.02 Mean : 0.17
## 3rd Qu.: 2.00 3rd Qu.: 0.00
## Max. :17.00 Max. :98.00
## NA's :1 NA's :1
## numberofdependents
## Min. :0.00
## 1st Qu.:0.00
## Median :0.00
## Mean :0.74
## 3rd Qu.:1.00
## Max. :8.00
## NA's :143
loan$debtratio = as.integer(loan$debtratio)
Turns out chapter 12 of the ebook is actually called “Outlier detection” and has some thoughts on removing outliers.
The most intuitive in my opinion is using the five number summary - min, Q1, median, Q3 and max. The interquartile range is defined as Q3-Q1. Any object that is 1.5 * IQR smaller than Q1 or 1.5 * IQR larger Q3 is treated as an outlier because 99.3% of of objects lie within this range.
I created a simple function for calculating the range. It prints the upper and lower bounds.
outlierRange = function(quartile1, quartile3, g = 1.5) {
range = (quartile3 - quartile1) * g
upperBound = quartile3 + range
lowerBound = quartile1 + range
print (quartile3 + range)
print (quartile1 - range)
}
summary(loan$seriousdlqin2yrs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.061 0.000 1.000
qplot(data = loan, x = seriousdlqin2yrs)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
This actually looks fine.
summary(loan$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.0 41.0 52.0 52.1 63.0 99.0
qplot(data = loan, x = age)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=age, x = 1)) + geom_boxplot()
outlierRange(41, 63, 1.5)
## [1] 96
## [1] 8
We can see here that There are some ages at the upper end of the range. Is it likely that a 99 year old applied for a loan? The boxplot identifies two outliers at the top end of the range. Add 33 to 63 and subtract from 41 - our range is 96 to 8. Therefore we remove anything above 96.
loan = filter(loan, age < 97)
#We lose 2 observations.
#Monthly debt payments, alimony,living costs divided by monthy gross income.
summary(loan$debtratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1110 2360 2340 3600 4670
str(loan$debtratio)
## int [1:5001] 3581 781 603 377 331 2415 4513 1327 4439 3261 ...
qplot(data = loan, x = debtratio)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=debtratio, x = 1)) + geom_boxplot()
This one looks fine as far as I can tell, although the flat distribution is suspect. Hard to know without more information.
summary(loan$numberofopencreditlinesandloans) #46 seems a bit excessive!
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 5.00 8.00 8.39 11.00 46.00
qplot(data = loan, x = numberofopencreditlinesandloans)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=numberofopencreditlinesandloans, x = 1)) + geom_boxplot()
outlierRange(5, 11, 1.5)
## [1] 20
## [1] -4
loan = filter(loan, numberofopencreditlinesandloans < 21)
The boxplot outlines a number of outliers. We filter them out.
summary(loan$numberrealestateloansorlines)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 1.000 0.991 2.000 15.000 1
qplot(data = loan, x = numberrealestateloansorlines)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=numberrealestateloansorlines, x = 1)) + geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
#1 Na indicates missing data. 15 seems excessive. Boxplot highlights some outliers.
outlierRange(0, 2, 1.5)
## [1] 5
## [1] -3
#Anything above 5 is removed.
loan = filter(loan, numberrealestateloansorlines < 6)
Number of dependents
summary(loan$numberofdependents)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.00 0.74 1.00 8.00 138
qplot(data = loan, x = numberofdependents)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=numberofdependents, x = 1)) + geom_boxplot()
## Warning: Removed 138 rows containing non-finite values (stat_boxplot).
I’m inclined to leave this one as is. Two reasons - N/A’s probably indicate zero (as in, no dependents).I’ve left them for the moment as this would require consultation. Secondly, even though 8 dependents technically lies outside our outsider rule, our rule would also rule out anything above 2.
summary(loan$revolvingutilizationofunsecuredlines) #6324 seems like a lot.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 4 1 6320
qplot(data = loan, x = revolvingutilizationofunsecuredlines)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=revolvingutilizationofunsecuredlines, x = 1)) + geom_boxplot()
This is acually a percentage - anything over 1 is suspicious in my opinion.. Looking at the boxplot, I initially remove anything over 1000.
loan = filter(loan, revolvingutilizationofunsecuredlines < 1000)
#new boxplot
ggplot(loan, aes(y=revolvingutilizationofunsecuredlines, x = 1)) + geom_boxplot()
Although i find it hard to believe someone could technically be overdrawn to 150%, I guess it could happen. I stick with the outlier Range rule.
summary(loan$revolvingutilizationofunsecuredlines)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0297 0.1620 0.3260 0.5650 2.8400
outlierRange(.02969, .56500, 1.5)
## [1] 1.368
## [1] -0.7733
#Top end of the range is 1.367965
loan = filter(loan, revolvingutilizationofunsecuredlines < 1.37)
summary(loan$numberoftime30.59dayspastduenotworse) #98 times>!>
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 0.36 0.00 98.00
qplot(data = loan, x = numberoftime30.59dayspastduenotworse)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=numberoftime30.59dayspastduenotworse, x = 1)) + geom_boxplot()
So the outlier range doesnt work here because we have a 3rd quartile of zero and a 1Q of zero. This is because of the huge outlier. Remove outlier and try again.
loan = filter(loan, numberoftime30.59dayspastduenotworse < 50)
summary(loan$numberoftime30.59dayspastduenotworse)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.256 0.000 12.000
qplot(data = loan, x = numberoftime30.59dayspastduenotworse)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=numberoftime30.59dayspastduenotworse, x = 1)) + geom_boxplot()
max(loan$numberoftime30.59dayspastduenotworse)
## [1] 12
Going to to leave it there. 12 as an upper bound seems reasonable to me.
summary(loan$numberoftimes90dayslate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.097 0.000 10.000
qplot(data = loan, x = numberoftimes90dayslate)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=numberoftimes90dayslate, x = 1)) + geom_boxplot()
Again ten is outside our range but probably reasonable.
summary(loan$numberoftime60.89dayspastduenotworse)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.068 0.000 5.000
qplot(data = loan, x = numberoftime60.89dayspastduenotworse)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=numberoftime60.89dayspastduenotworse, x = 1)) + geom_boxplot()
Fine.
summary(loan$monthlyincome) #lots of N/A's
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 3400 5250 6430 8000 208000 949
qplot(data = loan, x = monthlyincome)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(loan, aes(y=monthlyincome, x = 1)) + geom_boxplot()
## Warning: Removed 949 rows containing non-finite values (stat_boxplot).
Some outliers to the upside and a number of NA’s. Note that the boxplot disregards the NA’s. the NA’s are important here as they have “informative missingness” - an NA likely indicates that the income deliberately wasn’t given rather than it wasn’t recorded.
Remove outliers using our outlier Rule.
outlierRange(3400, 8000, 1.5)
## [1] 14900
## [1] -3500
#6900 is the upper bound
loan = filter(loan, monthlyincome<=6900)
And I’m spent.