setwd("~/R/Data Analysis/Data Analysis Project 1")
load("loansData.rda")
ls()
## [1] "loansData"
source("~/Dropbox/R_functions/eda.R")
first.look(loansData)
## ----- str: look at data types and values -------
## 'data.frame': 2500 obs. of 14 variables:
## $ Amount.Requested : int 20000 19200 35000 10000 12000 6000 10000 33500 14675 7000 ...
## $ Amount.Funded.By.Investors : num 20000 19200 35000 9975 12000 ...
## $ Interest.Rate : Factor w/ 430 levels "","10.00%","10.01%",..: 409 60 335 430 49 166 398 220 130 379 ...
## $ Loan.Length : Factor w/ 3 levels "","36 months",..: 2 2 3 2 2 2 2 3 2 2 ...
## $ Loan.Purpose : Factor w/ 15 levels "","car","credit_card",..: 4 4 4 4 3 11 4 3 3 3 ...
## $ Debt.To.Income.Ratio : Factor w/ 3500 levels "","0.01%","0.02%",..: 690 2136 1681 630 1078 1305 1909 670 1992 3210 ...
## $ State : Factor w/ 51 levels "","AK","AL","AR",..: 41 44 6 18 32 8 21 20 6 6 ...
## $ Home.Ownership : Factor w/ 6 levels "","MORTGAGE",..: 2 2 2 2 6 5 6 2 6 6 ...
## $ Monthly.Income : num 6542 4583 11500 3833 3195 ...
## $ FICO.Range : Factor w/ 43 levels "","640-644","645-649",..: 21 17 12 13 13 8 18 15 11 17 ...
## $ Open.CREDIT.Lines : int 14 12 14 10 11 17 10 12 9 8 ...
## $ Revolving.CREDIT.Balance : int 14272 11140 21977 9346 14469 10391 15957 27874 7246 7612 ...
## $ Inquiries.in.the.Last.6.Months: int 2 1 1 0 0 2 0 0 1 0 ...
## $ Employment.Length : Factor w/ 13 levels "","< 1 year",..: 2 5 5 8 12 6 4 4 11 6 ...
## NULL
##
## ----- sample size: 2500
## ----- sample size of complete cases: 2498
## ----- difference: 2
##
## ----- names: variable names
## [1] "Amount.Requested" "Amount.Funded.By.Investors"
## [3] "Interest.Rate" "Loan.Length"
## [5] "Loan.Purpose" "Debt.To.Income.Ratio"
## [7] "State" "Home.Ownership"
## [9] "Monthly.Income" "FICO.Range"
## [11] "Open.CREDIT.Lines" "Revolving.CREDIT.Balance"
## [13] "Inquiries.in.the.Last.6.Months" "Employment.Length"
##
## ----- head: first six rows
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 81174 20000 20000 8.90%
## 99592 19200 19200 12.12%
## 80059 35000 35000 21.98%
## 15825 10000 9975 9.99%
## 33182 12000 12000 11.71%
## 62403 6000 6000 15.31%
## Loan.Length Loan.Purpose Debt.To.Income.Ratio State
## 81174 36 months debt_consolidation 14.90% SC
## 99592 36 months debt_consolidation 28.36% TX
## 80059 60 months debt_consolidation 23.81% CA
## 15825 36 months debt_consolidation 14.30% KS
## 33182 36 months credit_card 18.78% NJ
## 62403 36 months other 20.05% CT
## Home.Ownership Monthly.Income FICO.Range Open.CREDIT.Lines
## 81174 MORTGAGE 6542 735-739 14
## 99592 MORTGAGE 4583 715-719 12
## 80059 MORTGAGE 11500 690-694 14
## 15825 MORTGAGE 3833 695-699 10
## 33182 RENT 3195 695-699 11
## 62403 OWN 4892 670-674 17
## Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 81174 14272 2
## 99592 11140 1
## 80059 21977 1
## 15825 9346 0
## 33182 14469 0
## 62403 10391 2
## Employment.Length
## 81174 < 1 year
## 99592 2 years
## 80059 2 years
## 15825 5 years
## 33182 9 years
## 62403 3 years
##
## ----- summary: statistics for each variable
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## Min. : 1000 Min. : 0 12.12% : 122
## 1st Qu.: 6000 1st Qu.: 6000 7.90% : 119
## Median :10000 Median :10000 13.11% : 115
## Mean :12406 Mean :12002 15.31% : 76
## 3rd Qu.:17000 3rd Qu.:16000 14.09% : 72
## Max. :35000 Max. :35000 14.33% : 69
## (Other):1927
## Loan.Length Loan.Purpose Debt.To.Income.Ratio
## : 0 debt_consolidation:1307 0% : 8
## 36 months:1952 credit_card : 444 12.54% : 6
## 60 months: 548 other : 201 12.20% : 5
## home_improvement : 152 12.85% : 5
## major_purchase : 101 14.22% : 5
## small_business : 87 14.66% : 5
## (Other) : 208 (Other):2466
## State Home.Ownership Monthly.Income FICO.Range
## CA : 433 : 0 Min. : 588 670-674: 171
## NY : 255 MORTGAGE:1148 1st Qu.: 3500 675-679: 166
## TX : 174 NONE : 1 Median : 5000 680-684: 157
## FL : 169 OTHER : 5 Mean : 5689 695-699: 153
## IL : 101 OWN : 200 3rd Qu.: 6800 665-669: 145
## GA : 98 RENT :1146 Max. :102750 690-694: 140
## (Other):1270 NA's :1 (Other):1568
## Open.CREDIT.Lines Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## Min. : 2.0 Min. : 0 Min. :0.000
## 1st Qu.: 7.0 1st Qu.: 5586 1st Qu.:0.000
## Median : 9.0 Median : 10962 Median :0.000
## Mean :10.1 Mean : 15245 Mean :0.906
## 3rd Qu.:13.0 3rd Qu.: 18889 3rd Qu.:1.000
## Max. :38.0 Max. :270800 Max. :9.000
## NA's :2 NA's :2 NA's :2
## Employment.Length
## 10+ years:653
## < 1 year :250
## 2 years :244
## 3 years :235
## 5 years :202
## 4 years :192
## (Other) :724
loansData.clean = loansData
loansData.clean$Interest.Rate = as.numeric(sub("%", "", loansData.clean$Interest.Rate))/100
str(loansData$Interest.Rate)
## Factor w/ 430 levels "","10.00%","10.01%",..: 409 60 335 430 49 166 398 220 130 379 ...
str(loansData.clean$Interest.Rate)
## num [1:2500] 0.089 0.1212 0.2198 0.0999 0.1171 ...
loansData$Interest.Rate[1:10]
## [1] 8.90% 12.12% 21.98% 9.99% 11.71% 15.31% 7.90% 17.14% 14.33% 6.91%
## 430 Levels: 10.00% 10.01% 10.08% 10.14% 10.16% 10.20% 10.25% ... 9.99%
loansData.clean$Interest.Rate[1:10]
## [1] 0.0890 0.1212 0.2198 0.0999 0.1171 0.1531 0.0790 0.1714 0.1433 0.0691
# get the factor descriptions for each range
FICO.levels = levels(loansData.clean$FICO.Range)
# assign the index in FICO.levels as the numeric representation add a
# variable for (FICO.levels)^2 for use in a quadratic model
for (i in seq(length(loansData.clean$FICO.Range))) {
loansData.clean$FICO.numeric[i] = which(FICO.levels == loansData.clean$FICO.Range[i])
}
loansData.clean$FICO.numeric2 = loansData.clean$FICO.numeric^2
# fico.order = order(loansData.clean$FICO.numeric)
# print(loansData.clean[fico.order, c('FICO.Range','FICO.numeric')])
summary(loansData.clean$FICO.numeric)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 10.0 14.0 15.2 19.0 40.0
png("~/R/Data Analysis/Data Analysis Project 1/project_figures/FICOdistribution.png")
plot(loansData.clean$FICO.Range, main = "How FICO Scores are distributed", ylab = "How Many People Have This Score?",
ann = FALSE)
dev.off()
## pdf
## 2
plot(loansData.clean$FICO.Range, main = "How FICO Scores are distributed", ylab = "How Many People Have This Score?",
ann = FALSE)
library(ggplot2)
library(scales)
FICOvsIntRate.png = ggplot(data = loansData.clean, aes(x = FICO.numeric, y = Interest.Rate)) +
geom_point(shape = 1) + geom_smooth(method = lm, formula = y ~ poly(x, 2),
se = FALSE, colour = "red") +
scale_x_discrete(name = "FICO Score", breaks = seq(2, 43, by = 4), labels = FICO.levels[seq(2,
43, by = 4)]) +
scale_y_continuous(name = "Interest Rate on a Loan (%)", labels = percent) +
ggtitle("How Your FICO Score affects the Interest Rate you pay") + theme(plot.title = element_text(face = "bold"))
FICOvsIntRate.png
print(FICO.levels)
## [1] "" "640-644" "645-649" "650-654" "655-659" "660-664" "665-669"
## [8] "670-674" "675-679" "680-684" "685-689" "690-694" "695-699" "700-704"
## [15] "705-709" "710-714" "715-719" "720-724" "725-729" "730-734" "735-739"
## [22] "740-744" "745-749" "750-754" "755-759" "760-764" "765-769" "770-774"
## [29] "775-779" "780-784" "785-789" "790-794" "795-799" "800-804" "805-809"
## [36] "810-814" "815-819" "820-824" "825-829" "830-834" "835-839" "840-844"
## [43] "845-850"
loansData.clean$Debt.To.Income.Ratio = as.numeric(sub("%", "", loansData.clean$Debt.To.Income.Ratio))/100
str(loansData$Debt.To.Income.Ratio)
## Factor w/ 3500 levels "","0.01%","0.02%",..: 690 2136 1681 630 1078 1305 1909 670 1992 3210 ...
str(loansData.clean$Debt.To.Income.Ratio)
## num [1:2500] 0.149 0.284 0.238 0.143 0.188 ...
loansData$Debt.To.Income.Ratio[1:10]
## [1] 14.90% 28.36% 23.81% 14.30% 18.78% 20.05% 26.09% 14.70% 26.92% 7.10%
## 3500 Levels: 0.01% 0.02% 0.03% 0.04% 0.05% 0.06% 0.07% 0.08% ... 9%
loansData.clean$Debt.To.Income.Ratio[1:10]
## [1] 0.1490 0.2836 0.2381 0.1430 0.1878 0.2005 0.2609 0.1470 0.2692 0.0710
loansData.clean$Debt.To.Income.Ratio.Cuts = cut(loansData.clean$Debt.To.Income.Ratio,
breaks = hist(loansData.clean$Debt.To.Income.Ratio)$breaks, include.lowest = TRUE)
loansData.clean$Loan.Length = factor(loansData.clean$Loan.Length)
str(loansData$Loan.Length)
## Factor w/ 3 levels "","36 months",..: 2 2 3 2 2 2 2 3 2 2 ...
str(loansData.clean$Loan.Length)
## Factor w/ 2 levels "36 months","60 months": 1 1 2 1 1 1 1 2 1 1 ...
loansData$Loan.Length[1:10]
## [1] 36 months 36 months 60 months 36 months 36 months 36 months 36 months
## [8] 60 months 36 months 36 months
## Levels: 36 months 60 months
loansData.clean$Loan.Length[1:10]
## [1] 36 months 36 months 60 months 36 months 36 months 36 months 36 months
## [8] 60 months 36 months 36 months
## Levels: 36 months 60 months
table(loansData.clean$Loan.Length)
##
## 36 months 60 months
## 1952 548
loansData.clean$Loan.Purpose = factor(loansData.clean$Loan.Purpose)
str(loansData$Loan.Purpose)
## Factor w/ 15 levels "","car","credit_card",..: 4 4 4 4 3 11 4 3 3 3 ...
str(loansData.clean$Loan.Purpose)
## Factor w/ 14 levels "car","credit_card",..: 3 3 3 3 2 10 3 2 2 2 ...
loansData$Loan.Purpose[1:10]
## [1] debt_consolidation debt_consolidation debt_consolidation
## [4] debt_consolidation credit_card other
## [7] debt_consolidation credit_card credit_card
## [10] credit_card
## 15 Levels: car credit_card debt_consolidation ... wedding
loansData.clean$Loan.Purpose[1:10]
## [1] debt_consolidation debt_consolidation debt_consolidation
## [4] debt_consolidation credit_card other
## [7] debt_consolidation credit_card credit_card
## [10] credit_card
## 14 Levels: car credit_card debt_consolidation ... wedding
table(loansData.clean$Loan.Purpose)
##
## car credit_card debt_consolidation
## 50 444 1307
## educational home_improvement house
## 15 152 20
## major_purchase medical moving
## 101 30 29
## other renewable_energy small_business
## 201 4 87
## vacation wedding
## 21 39
loansData.clean$State = factor(loansData.clean$State)
str(loansData$State)
## Factor w/ 51 levels "","AK","AL","AR",..: 41 44 6 18 32 8 21 20 6 6 ...
str(loansData.clean$State)
## Factor w/ 46 levels "AK","AL","AR",..: 37 39 5 16 28 7 19 18 5 5 ...
loansData$State[1:10]
## [1] SC TX CA KS NJ CT MA LA CA CA
## 51 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY ... WY
loansData.clean$State[1:10]
## [1] SC TX CA KS NJ CT MA LA CA CA
## 46 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA IL IN KS KY LA MA ... WY
table(loansData.clean$State)
##
## AK AL AR AZ CA CO CT DC DE FL GA HI IA IL IN KS KY LA
## 11 38 13 46 433 61 50 11 8 169 98 12 1 101 3 21 23 22
## MA MD MI MN MO MS MT NC NH NJ NM NV NY OH OK OR PA RI
## 73 68 45 38 33 1 7 64 15 94 13 32 255 71 21 30 96 15
## SC SD TX UT VA VT WA WI WV WY
## 28 4 174 16 78 5 58 26 15 4
statelist = c("AK", "AL", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
"HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI",
"MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND",
"OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA",
"WA", "WV", "WI", "WY")
statelist[!(statelist %in% levels(loansData.clean$State))]
## [1] "ID" "ME" "NE" "ND" "TN"
loansData.clean$Home.Ownership = factor(loansData.clean$Home.Ownership)
str(loansData$Home.Ownership)
## Factor w/ 6 levels "","MORTGAGE",..: 2 2 2 2 6 5 6 2 6 6 ...
str(loansData.clean$Home.Ownership)
## Factor w/ 5 levels "MORTGAGE","NONE",..: 1 1 1 1 5 4 5 1 5 5 ...
loansData$Home.Ownership[1:10]
## [1] MORTGAGE MORTGAGE MORTGAGE MORTGAGE RENT OWN RENT
## [8] MORTGAGE RENT RENT
## Levels: MORTGAGE NONE OTHER OWN RENT
loansData.clean$Home.Ownership[1:10]
## [1] MORTGAGE MORTGAGE MORTGAGE MORTGAGE RENT OWN RENT
## [8] MORTGAGE RENT RENT
## Levels: MORTGAGE NONE OTHER OWN RENT
table(loansData.clean$Home.Ownership)
##
## MORTGAGE NONE OTHER OWN RENT
## 1148 1 5 200 1146
loansData.clean$FICO.Range = factor(loansData.clean$FICO.Range)
str(loansData$FICO.Range)
## Factor w/ 43 levels "","640-644","645-649",..: 21 17 12 13 13 8 18 15 11 17 ...
str(loansData.clean$FICO.Range)
## Factor w/ 38 levels "640-644","645-649",..: 20 16 11 12 12 7 17 14 10 16 ...
loansData$FICO.Range[1:10]
## [1] 735-739 715-719 690-694 695-699 695-699 670-674 720-724 705-709
## [9] 685-689 715-719
## 43 Levels: 640-644 645-649 650-654 655-659 660-664 665-669 ... 845-850
loansData.clean$FICO.Range[1:10]
## [1] 735-739 715-719 690-694 695-699 695-699 670-674 720-724 705-709
## [9] 685-689 715-719
## 38 Levels: 640-644 645-649 650-654 655-659 660-664 665-669 ... 830-834
levels(loansData$FICO.Range)[!(levels(loansData$FICO.Range) %in% levels(loansData.clean$FICO.Range))]
## [1] "" "825-829" "835-839" "840-844" "845-850"
table(loansData.clean$FICO.Range)
##
## 640-644 645-649 650-654 655-659 660-664 665-669 670-674 675-679 680-684
## 5 3 1 4 125 145 171 166 157
## 685-689 690-694 695-699 700-704 705-709 710-714 715-719 720-724 725-729
## 138 140 153 131 134 112 93 114 94
## 730-734 735-739 740-744 745-749 750-754 755-759 760-764 765-769 770-774
## 94 65 53 54 61 46 46 36 17
## 775-779 780-784 785-789 790-794 795-799 800-804 805-809 810-814 815-819
## 22 28 19 20 13 13 11 8 6
## 820-824 830-834
## 1 1
loansData.clean$Employment.Length = factor(loansData.clean$Employment.Length)
str(loansData$Employment.Length)
## Factor w/ 13 levels "","< 1 year",..: 2 5 5 8 12 6 4 4 11 6 ...
str(loansData.clean$Employment.Length)
## Factor w/ 12 levels "< 1 year","1 year",..: 1 4 4 7 11 5 3 3 10 5 ...
loansData$Employment.Length[1:10]
## [1] < 1 year 2 years 2 years 5 years 9 years 3 years 10+ years
## [8] 10+ years 8 years 3 years
## 13 Levels: < 1 year 1 year 10+ years 2 years 3 years 4 years ... n/a
loansData.clean$Employment.Length[1:10]
## [1] < 1 year 2 years 2 years 5 years 9 years 3 years 10+ years
## [8] 10+ years 8 years 3 years
## 12 Levels: < 1 year 1 year 10+ years 2 years 3 years 4 years ... n/a
table(loansData.clean$Employment.Length)
##
## < 1 year 1 year 10+ years 2 years 3 years 4 years 5 years
## 250 177 653 244 235 192 202
## 6 years 7 years 8 years 9 years n/a
## 163 127 108 72 77
loansData.clean$Revolving.CREDIT.Balance.Cuts = cut(loansData.clean$Revolving.CREDIT.Balance,
breaks = hist(loansData.clean$Revolving.CREDIT.Balance)$breaks, include.lowest = TRUE)
# Monthly income has some high values
## create Sturges breaks
```r
table(cut(loansData.clean$Monthly.Income, breaks = 6))
##
## (486,1.75e+04] (1.75e+04,3.46e+04] (3.46e+04,5.17e+04]
## 2467 29 1
## (5.17e+04,6.87e+04] (6.87e+04,8.58e+04] (8.58e+04,1.03e+05]
## 1 0 1
loansData.clean[which(loansData.clean$Monthly.Income > 40000), ]
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 18439 4500 4500 0.0691
## 54487 33000 33000 0.0762
## Loan.Length Loan.Purpose Debt.To.Income.Ratio State
## 18439 36 months major_purchase 0.0058 MD
## 54487 36 months debt_consolidation 0.0026 NY
## Home.Ownership Monthly.Income FICO.Range Open.CREDIT.Lines
## 18439 MORTGAGE 65000 725-729 7
## 54487 MORTGAGE 102750 805-809 7
## Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 18439 3015 4
## 54487 95435 0
## Employment.Length FICO.numeric FICO.numeric2
## 18439 7 years 19 361
## 54487 10+ years 35 1225
## Debt.To.Income.Ratio.Cuts Revolving.CREDIT.Balance.Cuts
## 18439 [0,0.02] [0,2e+04]
## 54487 [0,0.02] (8e+04,1e+05]
loansData.clean$Monthly.Income.Cuts = cut(loansData.clean$Monthly.Income, breaks = hist(loansData.clean$Monthly.Income)$breaks,
include.lowest = TRUE)
loansData.clean$Amount.Funded.By.Investors.Cuts = cut(loansData.clean$Amount.Funded.By.Investors,
breaks = hist(loansData.clean$Amount.Funded.By.Investors)$breaks, include.lowest = TRUE)
loansData.clean$Amount.Requested.Cuts = cut(loansData.clean$Amount.Requested,
breaks = hist(loansData.clean$Amount.Requested)$breaks, include.lowest = TRUE)
**************
# look again
```r
# c('Interest.Rate', 'Amount.Requested','Amount.Requested.Cuts',
# 'Amount.Funded.By.Investors','Amount.Funded.By.Investors.Cuts',
# 'Loan.Length', 'Loan.Purpose',
# 'Debt.To.Income.Ratio','Debt.To.Income.Ratio.Cuts', 'State',
# 'Home.Ownership', 'Monthly.Income','Monthly.Income.Cuts', 'FICO.Range',
# 'Open.CREDIT.Lines',
# 'Revolving.CREDIT.Balance','Revolving.CREDIT.Balance.Cuts',
# 'Inquiries.in.the.Last.6.Months', 'Employment.Length')
first.look(loansData.clean, pairs_columns = c("Loan.Purpose", "State", "Home.Ownership",
"Open.CREDIT.Lines", "Inquiries.in.the.Last.6.Months", "Employment.Length",
"Revolving.CREDIT.Balance.Cuts", "Monthly.Income.Cuts", "Amount.Funded.By.Investors.Cuts",
"Amount.Requested.Cuts", "Debt.To.Income.Ratio.Cuts", "Loan.Length", "FICO.Range",
"Interest.Rate"))
## ----- str: look at data types and values -------
## 'data.frame': 2500 obs. of 21 variables:
## $ Amount.Requested : int 20000 19200 35000 10000 12000 6000 10000 33500 14675 7000 ...
## $ Amount.Funded.By.Investors : num 20000 19200 35000 9975 12000 ...
## $ Interest.Rate : num 0.089 0.1212 0.2198 0.0999 0.1171 ...
## $ Loan.Length : Factor w/ 2 levels "36 months","60 months": 1 1 2 1 1 1 1 2 1 1 ...
## $ Loan.Purpose : Factor w/ 14 levels "car","credit_card",..: 3 3 3 3 2 10 3 2 2 2 ...
## $ Debt.To.Income.Ratio : num 0.149 0.284 0.238 0.143 0.188 ...
## $ State : Factor w/ 46 levels "AK","AL","AR",..: 37 39 5 16 28 7 19 18 5 5 ...
## $ Home.Ownership : Factor w/ 5 levels "MORTGAGE","NONE",..: 1 1 1 1 5 4 5 1 5 5 ...
## $ Monthly.Income : num 6542 4583 11500 3833 3195 ...
## $ FICO.Range : Factor w/ 38 levels "640-644","645-649",..: 20 16 11 12 12 7 17 14 10 16 ...
## $ Open.CREDIT.Lines : int 14 12 14 10 11 17 10 12 9 8 ...
## $ Revolving.CREDIT.Balance : int 14272 11140 21977 9346 14469 10391 15957 27874 7246 7612 ...
## $ Inquiries.in.the.Last.6.Months : int 2 1 1 0 0 2 0 0 1 0 ...
## $ Employment.Length : Factor w/ 12 levels "< 1 year","1 year",..: 1 4 4 7 11 5 3 3 10 5 ...
## $ FICO.numeric : int 21 17 12 13 13 8 18 15 11 17 ...
## $ FICO.numeric2 : num 441 289 144 169 169 64 324 225 121 289 ...
## $ Debt.To.Income.Ratio.Cuts : Factor w/ 18 levels "[0,0.02]","(0.02,0.04]",..: 8 15 12 8 10 11 14 8 14 4 ...
## $ Revolving.CREDIT.Balance.Cuts : Factor w/ 14 levels "[0,2e+04]","(2e+04,4e+04]",..: 1 1 2 1 1 1 1 2 1 1 ...
## $ Monthly.Income.Cuts : Factor w/ 11 levels "[0,1e+04]","(1e+04,2e+04]",..: 1 1 2 1 1 1 1 2 1 1 ...
## $ Amount.Funded.By.Investors.Cuts: Factor w/ 19 levels "[-2e+03,0]","(0,2e+03]",..: 11 11 19 6 7 4 6 18 9 5 ...
## $ Amount.Requested.Cuts : Factor w/ 18 levels "[0,2e+03]","(2e+03,4e+03]",..: 10 10 18 5 6 3 5 17 8 4 ...
## NULL
##
## ----- sample size: 2500
## ----- sample size of complete cases: 2498
## ----- difference: 2
##
## ----- names: variable names
## [1] "Amount.Requested" "Amount.Funded.By.Investors"
## [3] "Interest.Rate" "Loan.Length"
## [5] "Loan.Purpose" "Debt.To.Income.Ratio"
## [7] "State" "Home.Ownership"
## [9] "Monthly.Income" "FICO.Range"
## [11] "Open.CREDIT.Lines" "Revolving.CREDIT.Balance"
## [13] "Inquiries.in.the.Last.6.Months" "Employment.Length"
## [15] "FICO.numeric" "FICO.numeric2"
## [17] "Debt.To.Income.Ratio.Cuts" "Revolving.CREDIT.Balance.Cuts"
## [19] "Monthly.Income.Cuts" "Amount.Funded.By.Investors.Cuts"
## [21] "Amount.Requested.Cuts"
##
## ----- head: first six rows
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 81174 20000 20000 0.0890
## 99592 19200 19200 0.1212
## 80059 35000 35000 0.2198
## 15825 10000 9975 0.0999
## 33182 12000 12000 0.1171
## 62403 6000 6000 0.1531
## Loan.Length Loan.Purpose Debt.To.Income.Ratio State
## 81174 36 months debt_consolidation 0.1490 SC
## 99592 36 months debt_consolidation 0.2836 TX
## 80059 60 months debt_consolidation 0.2381 CA
## 15825 36 months debt_consolidation 0.1430 KS
## 33182 36 months credit_card 0.1878 NJ
## 62403 36 months other 0.2005 CT
## Home.Ownership Monthly.Income FICO.Range Open.CREDIT.Lines
## 81174 MORTGAGE 6542 735-739 14
## 99592 MORTGAGE 4583 715-719 12
## 80059 MORTGAGE 11500 690-694 14
## 15825 MORTGAGE 3833 695-699 10
## 33182 RENT 3195 695-699 11
## 62403 OWN 4892 670-674 17
## Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 81174 14272 2
## 99592 11140 1
## 80059 21977 1
## 15825 9346 0
## 33182 14469 0
## 62403 10391 2
## Employment.Length FICO.numeric FICO.numeric2
## 81174 < 1 year 21 441
## 99592 2 years 17 289
## 80059 2 years 12 144
## 15825 5 years 13 169
## 33182 9 years 13 169
## 62403 3 years 8 64
## Debt.To.Income.Ratio.Cuts Revolving.CREDIT.Balance.Cuts
## 81174 (0.14,0.16] [0,2e+04]
## 99592 (0.28,0.3] [0,2e+04]
## 80059 (0.22,0.24] (2e+04,4e+04]
## 15825 (0.14,0.16] [0,2e+04]
## 33182 (0.18,0.2] [0,2e+04]
## 62403 (0.2,0.22] [0,2e+04]
## Monthly.Income.Cuts Amount.Funded.By.Investors.Cuts
## 81174 [0,1e+04] (1.8e+04,2e+04]
## 99592 [0,1e+04] (1.8e+04,2e+04]
## 80059 (1e+04,2e+04] (3.4e+04,3.6e+04]
## 15825 [0,1e+04] (8e+03,1e+04]
## 33182 [0,1e+04] (1e+04,1.2e+04]
## 62403 [0,1e+04] (4e+03,6e+03]
## Amount.Requested.Cuts
## 81174 (1.8e+04,2e+04]
## 99592 (1.8e+04,2e+04]
## 80059 (3.4e+04,3.6e+04]
## 15825 (8e+03,1e+04]
## 33182 (1e+04,1.2e+04]
## 62403 (4e+03,6e+03]
##
## ----- summary: statistics for each variable
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## Min. : 1000 Min. : 0 Min. :0.0542
## 1st Qu.: 6000 1st Qu.: 6000 1st Qu.:0.1016
## Median :10000 Median :10000 Median :0.1311
## Mean :12406 Mean :12002 Mean :0.1307
## 3rd Qu.:17000 3rd Qu.:16000 3rd Qu.:0.1580
## Max. :35000 Max. :35000 Max. :0.2489
##
## Loan.Length Loan.Purpose Debt.To.Income.Ratio
## 36 months:1952 debt_consolidation:1307 Min. :0.0000
## 60 months: 548 credit_card : 444 1st Qu.:0.0975
## other : 201 Median :0.1532
## home_improvement : 152 Mean :0.1538
## major_purchase : 101 3rd Qu.:0.2067
## small_business : 87 Max. :0.3491
## (Other) : 208
## State Home.Ownership Monthly.Income FICO.Range
## CA : 433 MORTGAGE:1148 Min. : 588 670-674: 171
## NY : 255 NONE : 1 1st Qu.: 3500 675-679: 166
## TX : 174 OTHER : 5 Median : 5000 680-684: 157
## FL : 169 OWN : 200 Mean : 5689 695-699: 153
## IL : 101 RENT :1146 3rd Qu.: 6800 665-669: 145
## GA : 98 Max. :102750 690-694: 140
## (Other):1270 NA's :1 (Other):1568
## Open.CREDIT.Lines Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## Min. : 2.0 Min. : 0 Min. :0.000
## 1st Qu.: 7.0 1st Qu.: 5586 1st Qu.:0.000
## Median : 9.0 Median : 10962 Median :0.000
## Mean :10.1 Mean : 15245 Mean :0.906
## 3rd Qu.:13.0 3rd Qu.: 18889 3rd Qu.:1.000
## Max. :38.0 Max. :270800 Max. :9.000
## NA's :2 NA's :2 NA's :2
## Employment.Length FICO.numeric FICO.numeric2 Debt.To.Income.Ratio.Cuts
## 10+ years:653 Min. : 2.0 Min. : 4 (0.14,0.16]: 257
## < 1 year :250 1st Qu.:10.0 1st Qu.: 100 (0.16,0.18]: 245
## 2 years :244 Median :14.0 Median : 196 (0.12,0.14]: 222
## 3 years :235 Mean :15.2 Mean : 279 (0.1,0.12] : 219
## 5 years :202 3rd Qu.:19.0 3rd Qu.: 361 (0.2,0.22] : 217
## 4 years :192 Max. :40.0 Max. :1600 (0.18,0.2] : 208
## (Other) :724 (Other) :1132
## Revolving.CREDIT.Balance.Cuts Monthly.Income.Cuts
## [0,2e+04] :1932 [0,1e+04] :2297
## (2e+04,4e+04]: 435 (1e+04,2e+04]: 186
## (4e+04,6e+04]: 70 (2e+04,3e+04]: 13
## (6e+04,8e+04]: 28 (3e+04,4e+04]: 1
## (8e+04,1e+05]: 12 (6e+04,7e+04]: 1
## (Other) : 21 (Other) : 1
## NA's : 2 NA's : 1
## Amount.Funded.By.Investors.Cuts Amount.Requested.Cuts
## (4e+03,6e+03] :366 (4e+03,6e+03] :365
## (8e+03,1e+04] :346 (8e+03,1e+04] :353
## (6e+03,8e+03] :280 (6e+03,8e+03] :277
## (1e+04,1.2e+04] :235 (1e+04,1.2e+04] :234
## (1.4e+04,1.6e+04]:207 (1.4e+04,1.6e+04]:214
## (2e+03,4e+03] :206 (2e+03,4e+03] :199
## (Other) :860 (Other) :858
grfpairs(loansData.clean[, c("Loan.Purpose", "State", "Home.Ownership", "Open.CREDIT.Lines",
"Inquiries.in.the.Last.6.Months", "Employment.Length", "Revolving.CREDIT.Balance",
"Monthly.Income", "Amount.Funded.By.Investors", "Amount.Requested", "Debt.To.Income.Ratio",
"Loan.Length", "FICO.Range", "Interest.Rate")])
loansData.clean[!complete.cases(loansData.clean), ]
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 101596 5000 4525 0.0743
## 101515 3500 225 0.1028
## Loan.Length Loan.Purpose Debt.To.Income.Ratio State Home.Ownership
## 101596 36 months other 0.01 NY NONE
## 101515 36 months other 0.10 NY RENT
## Monthly.Income FICO.Range Open.CREDIT.Lines
## 101596 NA 800-804 NA
## 101515 15000 685-689 NA
## Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 101596 NA NA
## 101515 NA NA
## Employment.Length FICO.numeric FICO.numeric2
## 101596 < 1 year 34 1156
## 101515 < 1 year 11 121
## Debt.To.Income.Ratio.Cuts Revolving.CREDIT.Balance.Cuts
## 101596 [0,0.02] <NA>
## 101515 (0.08,0.1] <NA>
## Monthly.Income.Cuts Amount.Funded.By.Investors.Cuts
## 101596 <NA> (4e+03,6e+03]
## 101515 (1e+04,2e+04] (0,2e+03]
## Amount.Requested.Cuts
## 101596 (4e+03,6e+03]
## 101515 (2e+03,4e+03]
# create a dataset with no NAs
loansData.complete = loansData.clean[complete.cases(loansData.clean), ]
# create trainLoans and testLoans
set.seed(3435)
trainIndicator = rbinom(nrow(loansData.clean), size = 1, prob = 0.5)
table(trainIndicator)
## trainIndicator
## 0 1
## 1265 1235
trainLoans = loansData.clean[trainIndicator == 1, ]
testLoans = loansData.clean[trainIndicator == 0, ]
dim(trainLoans)
## [1] 1235 21
dim(testLoans)
## [1] 1265 21
# get the date and time to save
loansData.create.date = date()
# save everything in one place
save(loansData.clean, loansData.complete, trainLoans, testLoans, FICO.levels,
loansData.create.date, file = "loansData.clean.rda")
print(str(.Platform))
## List of 8
## $ OS.type : chr "windows"
## $ file.sep : chr "/"
## $ dynlib.ext: chr ".dll"
## $ GUI : chr "RTerm"
## $ endian : chr "little"
## $ pkgType : chr "win.binary"
## $ path.sep : chr ";"
## $ r_arch : chr "x64"
## NULL
print(version)
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## system x86_64, mingw32
## status
## major 3
## minor 0.2
## year 2013
## month 09
## day 25
## svn rev 63987
## language R
## version.string R version 3.0.2 (2013-09-25)
## nickname Frisbee Sailing
print(sessionInfo(), locale = FALSE)
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_0.2.3 ggplot2_0.9.3.1 knitr_1.5
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.2-4 dichromat_2.0-0 digest_0.6.3
## [4] evaluate_0.5.1 formatR_0.10 grid_3.0.2
## [7] gtable_0.1.2 labeling_0.2 MASS_7.3-29
## [10] munsell_0.4.2 plyr_1.8 proto_0.3-10
## [13] RColorBrewer_1.0-5 reshape2_1.2.2 stringr_0.6.2
## [16] tools_3.0.2
print(Sys.time())
## [1] "2013-11-07 10:35:18 EST"