Programming in R: Independent Study
J Herdmann
Winter 2013
sessionInfo()
## R version 2.15.1 (2012-06-22)
## Platform: x86_64-apple-darwin9.8.0/x86_64 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.1
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.3 evaluate_0.4.3 formatR_0.7 stringr_0.6.2
## [5] tools_2.15.1
library(gbm)
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loaded gbm 2.0-8
library(stringr)
library(Hmisc)
## Hmisc library by Frank E Harrell Jr
##
## Type library(help='Hmisc'), ?Overview, or ?Hmisc.Overview') to see overall
## documentation.
##
## NOTE:Hmisc no longer redefines [.factor to drop unused levels when
## subsetting. To get the old behavior of Hmisc type dropUnusedLevels().
## Attaching package: 'Hmisc'
## The following object(s) are masked from 'package:survival':
##
## untangle.specials
## The following object(s) are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
getwd()
## [1] "/Users/user/Desktop/loansData"
# Point the session at the project folder; relative output paths used later
# (e.g. "FinalPlot.png") resolve against this directory.
setwd("/Users/user/Desktop/loansData")
# Restore the objects saved in the .rda file. The rest of the script relies on
# this providing a data frame named `loansData`.
load("~/Desktop/loansData/loansData.rda")
# Short alias used only by the inspection calls that follow; the cleaning
# steps further down modify `loansData`, not `x`.
x <- loansData
# Number of rows and columns in the raw data.
dim(x)
## [1] 2500 14
Dimensions
names(x)
## [1] "Amount.Requested" "Amount.Funded.By.Investors"
## [3] "Interest.Rate" "Loan.Length"
## [5] "Loan.Purpose" "Debt.To.Income.Ratio"
## [7] "State" "Home.Ownership"
## [9] "Monthly.Income" "FICO.Range"
## [11] "Open.CREDIT.Lines" "Revolving.CREDIT.Balance"
## [13] "Inquiries.in.the.Last.6.Months" "Employment.Length"
Variable Names
str(x)
## 'data.frame': 2500 obs. of 14 variables:
## $ Amount.Requested : int 20000 19200 35000 10000 12000 6000 10000 33500 14675 7000 ...
## $ Amount.Funded.By.Investors : num 20000 19200 35000 9975 12000 ...
## $ Interest.Rate : Factor w/ 430 levels "","10.00%","10.01%",..: 409 60 335 430 49 166 398 220 130 379 ...
## $ Loan.Length : Factor w/ 3 levels "","36 months",..: 2 2 3 2 2 2 2 3 2 2 ...
## $ Loan.Purpose : Factor w/ 15 levels "","car","credit_card",..: 4 4 4 4 3 11 4 3 3 3 ...
## $ Debt.To.Income.Ratio : Factor w/ 3500 levels "","0.01%","0.02%",..: 690 2136 1681 630 1078 1305 1909 670 1992 3210 ...
## $ State : Factor w/ 51 levels "","AK","AL","AR",..: 41 44 6 18 32 8 21 20 6 6 ...
## $ Home.Ownership : Factor w/ 6 levels "","MORTGAGE",..: 2 2 2 2 6 5 6 2 6 6 ...
## $ Monthly.Income : num 6542 4583 11500 3833 3195 ...
## $ FICO.Range : Factor w/ 43 levels "","640-644","645-649",..: 21 17 12 13 13 8 18 15 11 17 ...
## $ Open.CREDIT.Lines : int 14 12 14 10 11 17 10 12 9 8 ...
## $ Revolving.CREDIT.Balance : int 14272 11140 21977 9346 14469 10391 15957 27874 7246 7612 ...
## $ Inquiries.in.the.Last.6.Months: int 2 1 1 0 0 2 0 0 1 0 ...
## $ Employment.Length : Factor w/ 13 levels "","< 1 year",..: 2 5 5 8 12 6 4 4 11 6 ...
head(x)
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 81174 20000 20000 8.90%
## 99592 19200 19200 12.12%
## 80059 35000 35000 21.98%
## 15825 10000 9975 9.99%
## 33182 12000 12000 11.71%
## 62403 6000 6000 15.31%
## Loan.Length Loan.Purpose Debt.To.Income.Ratio State
## 81174 36 months debt_consolidation 14.90% SC
## 99592 36 months debt_consolidation 28.36% TX
## 80059 60 months debt_consolidation 23.81% CA
## 15825 36 months debt_consolidation 14.30% KS
## 33182 36 months credit_card 18.78% NJ
## 62403 36 months other 20.05% CT
## Home.Ownership Monthly.Income FICO.Range Open.CREDIT.Lines
## 81174 MORTGAGE 6542 735-739 14
## 99592 MORTGAGE 4583 715-719 12
## 80059 MORTGAGE 11500 690-694 14
## 15825 MORTGAGE 3833 695-699 10
## 33182 RENT 3195 695-699 11
## 62403 OWN 4892 670-674 17
## Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 81174 14272 2
## 99592 11140 1
## 80059 21977 1
## 15825 9346 0
## 33182 14469 0
## 62403 10391 2
## Employment.Length
## 81174 < 1 year
## 99592 2 years
## 80059 2 years
## 15825 5 years
## 33182 9 years
## 62403 3 years
Summarized Previews of Data
sapply(x[1, ], class)
## Amount.Requested Amount.Funded.By.Investors
## "integer" "numeric"
## Interest.Rate Loan.Length
## "factor" "factor"
## Loan.Purpose Debt.To.Income.Ratio
## "factor" "factor"
## State Home.Ownership
## "factor" "factor"
## Monthly.Income FICO.Range
## "numeric" "factor"
## Open.CREDIT.Lines Revolving.CREDIT.Balance
## "integer" "integer"
## Inquiries.in.the.Last.6.Months Employment.Length
## "integer" "factor"
Specifies Variable's Data Classes
# Debt-to-income values arrive as text such as "14.90%": strip the literal
# percent sign and store the column as plain numbers.
loansData$Debt.To.Income.Ratio <- as.numeric(
    gsub("%", "", loansData$Debt.To.Income.Ratio, fixed = TRUE)
)
Drops the “%” character and converts the observations from “factors” to “numeric”
# Interest rates arrive as text such as "8.90%": strip the literal percent
# sign and store the column as plain numbers.
loansData$Interest.Rate <- as.numeric(
    gsub("%", "", loansData$Interest.Rate, fixed = TRUE)
)
Drops the “%” character and converts the observations from “factors” to “numeric”
sum(is.na(loansData))
## [1] 7
# Names of the columns that contain at least one NA. Indexing the vector of
# column names (instead of subsetting the data frame and asking for its
# names) keeps working when only a single column has missing values; in that
# case `loansData[, mask]` would collapse to a bare vector and `names()` would
# no longer return the column name.
names(loansData)[colSums(is.na(loansData)) > 0]
## [1] "Monthly.Income" "Open.CREDIT.Lines"
## [3] "Revolving.CREDIT.Balance" "Inquiries.in.the.Last.6.Months"
Checks for missing values (NA's) and identifies how many there are and the names of the columns they are found in
# Mean-impute the four columns reported as containing NAs: every missing
# entry is replaced by the mean of the observed values in the same column.
na_cols <- c("Monthly.Income", "Open.CREDIT.Lines", "Revolving.CREDIT.Balance",
    "Inquiries.in.the.Last.6.Months")
loansData[na_cols] <- lapply(loansData[na_cols], function(values) {
    values[is.na(values)] <- mean(values, na.rm = TRUE)
    values
})
Replaces the NA's with mean values of their corresponding columns
sum(is.na(loansData))
## [1] 0
No missing values remain
# FICO.Range holds text such as "735-739". Split it at the dash into a lower
# and an upper bound (columns X1 and X2).
SplitFICO <- data.frame(str_split_fixed(loansData$FICO.Range, "-", 2))
# Convert both bounds to numbers; going through as.character() yields the
# printed value rather than an internal factor code.
SplitFICO[] <- lapply(SplitFICO, function(bound) {
    as.numeric(as.character(bound))
})
# The midpoint of each range becomes the single FICO score used for analysis,
# appended to the data as a column named FICO.Mean.
FICO.Mean <- rowMeans(SplitFICO)
loansData <- data.frame(loansData, FICO.Mean)
To use FICO score data for analysis the range was split and an average value was calculated for each
# Keep every column except the raw FICO range text and the amount funded by
# investors.
loansData <- loansData[setdiff(names(loansData),
    c("FICO.Range", "Amount.Funded.By.Investors"))]
Removes the 'FICO.Range' and 'Amount.Funded.By.Investors' columns, since these will have no effect on Interest Rates
# Hold out a test set: draw 75% of the row indices at random for training and
# keep the remaining rows for testing.
positions <- sample(nrow(loansData), size = floor((nrow(loansData)/4) * 3))
training <- loansData[positions, ]
# The test set must be the rows NOT sampled into `positions`. Indexing with
# `positions` again would make the test set a copy of the training set and
# turn the reported error into an in-sample figure.
testing <- loansData[setdiff(seq_len(nrow(loansData)), positions), ]
Creates the training and testing data sets from a random sample of 75% of the rows, scaled down for simplicity
# Set the observed rates aside and remove them from the test predictors so
# the model's predictions can be scored against them afterwards.
actualValues <- testing$Interest.Rate
testing$Interest.Rate <- NULL
# Fit a boosted regression of Interest.Rate on all remaining columns with a
# Gaussian loss, 1000 trees and 5-fold cross-validation. Arguments are
# spelled out in full (`data =`, `cv.folds =`) rather than relying on
# positional fall-through and partial matching of `cv.fold`.
gbmMod <- gbm(Interest.Rate ~ ., data = training, distribution = "gaussian",
    n.trees = 1000, shrinkage = 0.01, interaction.depth = 7,
    bag.fraction = 0.9, cv.folds = 5, n.minobsinnode = 50)
## CV: 1
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 17.7221 16.0974 0.0100 0.2785
## 2 17.4737 15.8915 0.0100 0.2429
## 3 17.2291 15.6899 0.0100 0.2310
## 4 16.9929 15.4908 0.0100 0.2296
## 5 16.7619 15.2850 0.0100 0.2176
## 6 16.5339 15.0974 0.0100 0.2089
## 7 16.3111 14.9128 0.0100 0.2279
## 8 16.0898 14.7274 0.0100 0.2823
## 9 15.8756 14.5395 0.0100 0.2115
## 10 15.6646 14.3663 0.0100 0.2216
## 20 13.7544 12.7711 0.0100 0.1845
## 40 10.8274 10.2924 0.0100 0.0927
## 60 8.7604 8.5596 0.0100 0.0816
## 80 7.2983 7.3134 0.0100 0.0521
## 100 6.2425 6.4290 0.0100 0.0362
## 120 5.4682 5.7541 0.0100 0.0313
## 140 4.8854 5.2471 0.0100 0.0201
## 160 4.4300 4.8594 0.0100 0.0211
## 180 4.0692 4.5592 0.0100 0.0075
## 200 3.7802 4.3258 0.0100 0.0125
## 220 3.5452 4.1443 0.0100 0.0092
## 240 3.3568 3.9979 0.0100 0.0007
## 260 3.1952 3.8799 0.0100 0.0019
## 280 3.0623 3.7890 0.0100 0.0026
## 300 2.9428 3.7135 0.0100 0.0023
## 320 2.8416 3.6513 0.0100 -0.0002
## 340 2.7533 3.6073 0.0100 -0.0014
## 360 2.6737 3.5700 0.0100 0.0006
## 380 2.5999 3.5409 0.0100 -0.0016
## 400 2.5344 3.5227 0.0100 0.0003
## 420 2.4760 3.5071 0.0100 -0.0013
## 440 2.4230 3.4946 0.0100 -0.0014
## 460 2.3742 3.4855 0.0100 -0.0020
## 480 2.3287 3.4762 0.0100 -0.0027
## 500 2.2845 3.4742 0.0100 -0.0027
## 520 2.2471 3.4739 0.0100 -0.0007
## 540 2.2110 3.4734 0.0100 0.0003
## 560 2.1753 3.4718 0.0100 -0.0013
## 580 2.1421 3.4776 0.0100 -0.0004
## 600 2.1099 3.4792 0.0100 -0.0022
## 620 2.0816 3.4797 0.0100 -0.0014
## 640 2.0544 3.4806 0.0100 -0.0005
## 660 2.0259 3.4816 0.0100 -0.0031
## 680 2.0001 3.4804 0.0100 -0.0009
## 700 1.9744 3.4846 0.0100 -0.0010
## 720 1.9506 3.4917 0.0100 -0.0006
## 740 1.9261 3.4952 0.0100 -0.0006
## 760 1.9060 3.4978 0.0100 -0.0006
## 780 1.8850 3.5004 0.0100 -0.0024
## 800 1.8638 3.5042 0.0100 0.0001
## 820 1.8442 3.5122 0.0100 -0.0010
## 840 1.8258 3.5122 0.0100 -0.0010
## 860 1.8081 3.5139 0.0100 -0.0019
## 880 1.7901 3.5200 0.0100 -0.0011
## 900 1.7722 3.5254 0.0100 -0.0020
## 920 1.7549 3.5281 0.0100 -0.0011
## 940 1.7370 3.5324 0.0100 -0.0018
## 960 1.7212 3.5395 0.0100 -0.0007
## 980 1.7051 3.5450 0.0100 -0.0019
## 1000 1.6890 3.5500 0.0100 -0.0010
##
## CV: 2
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 17.2418 17.9806 0.0100 0.2671
## 2 16.9979 17.7297 0.0100 0.2053
## 3 16.7606 17.4829 0.0100 0.2253
## 4 16.5304 17.2436 0.0100 0.2171
## 5 16.3036 17.0082 0.0100 0.2033
## 6 16.0797 16.7755 0.0100 0.2208
## 7 15.8620 16.5411 0.0100 0.1948
## 8 15.6478 16.3191 0.0100 0.2015
## 9 15.4353 16.1018 0.0100 0.2151
## 10 15.2294 15.8898 0.0100 0.2136
## 20 13.3673 13.9710 0.0100 0.1658
## 40 10.5388 11.1184 0.0100 0.1329
## 60 8.5504 9.1511 0.0100 0.0749
## 80 7.1352 7.7615 0.0100 0.0408
## 100 6.1209 6.7520 0.0100 0.0366
## 120 5.3661 6.0265 0.0100 0.0225
## 140 4.7922 5.4937 0.0100 0.0177
## 160 4.3561 5.1158 0.0100 0.0174
## 180 3.9960 4.7724 0.0100 0.0133
## 200 3.7141 4.5249 0.0100 0.0058
## 220 3.4820 4.3401 0.0100 0.0054
## 240 3.2998 4.2026 0.0100 0.0036
## 260 3.1414 4.0978 0.0100 0.0033
## 280 3.0053 3.9990 0.0100 -0.0006
## 300 2.8920 3.9215 0.0100 -0.0011
## 320 2.7975 3.8567 0.0100 -0.0003
## 340 2.7155 3.8111 0.0100 0.0006
## 360 2.6433 3.7762 0.0100 0.0002
## 380 2.5778 3.7346 0.0100 -0.0014
## 400 2.5151 3.7034 0.0100 -0.0025
## 420 2.4612 3.6866 0.0100 -0.0009
## 440 2.4111 3.6616 0.0100 -0.0028
## 460 2.3646 3.6441 0.0100 0.0005
## 480 2.3224 3.6307 0.0100 -0.0030
## 500 2.2796 3.6183 0.0100 -0.0016
## 520 2.2417 3.6103 0.0100 -0.0018
## 540 2.2042 3.6003 0.0100 -0.0004
## 560 2.1690 3.5898 0.0100 0.0003
## 580 2.1373 3.5860 0.0100 -0.0002
## 600 2.1062 3.5823 0.0100 -0.0022
## 620 2.0760 3.5796 0.0100 -0.0002
## 640 2.0482 3.5736 0.0100 -0.0008
## 660 2.0227 3.5723 0.0100 -0.0001
## 680 1.9979 3.5705 0.0100 -0.0005
## 700 1.9737 3.5634 0.0100 -0.0012
## 720 1.9514 3.5610 0.0100 -0.0006
## 740 1.9305 3.5600 0.0100 -0.0014
## 760 1.9095 3.5601 0.0100 -0.0017
## 780 1.8889 3.5611 0.0100 -0.0010
## 800 1.8709 3.5588 0.0100 -0.0020
## 820 1.8505 3.5592 0.0100 -0.0011
## 840 1.8290 3.5597 0.0100 -0.0023
## 860 1.8121 3.5617 0.0100 -0.0012
## 880 1.7934 3.5604 0.0100 -0.0020
## 900 1.7748 3.5625 0.0100 -0.0011
## 920 1.7558 3.5617 0.0100 -0.0025
## 940 1.7389 3.5651 0.0100 -0.0010
## 960 1.7227 3.5647 0.0100 -0.0014
## 980 1.7065 3.5672 0.0100 -0.0020
## 1000 1.6908 3.5696 0.0100 -0.0009
##
## CV: 3
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 17.7269 16.0833 0.0100 0.2659
## 2 17.4825 15.8504 0.0100 0.2449
## 3 17.2443 15.6303 0.0100 0.2228
## 4 17.0094 15.4072 0.0100 0.2209
## 5 16.7800 15.1940 0.0100 0.2583
## 6 16.5555 14.9818 0.0100 0.2257
## 7 16.3350 14.7820 0.0100 0.1857
## 8 16.1204 14.5781 0.0100 0.1925
## 9 15.9100 14.3778 0.0100 0.2594
## 10 15.7001 14.1785 0.0100 0.1841
## 20 13.8029 12.4300 0.0100 0.1682
## 40 10.9111 9.8491 0.0100 0.1173
## 60 8.8683 8.0417 0.0100 0.1023
## 80 7.4091 6.7797 0.0100 0.0687
## 100 6.3498 5.8582 0.0100 0.0353
## 120 5.5677 5.2113 0.0100 0.0233
## 140 4.9736 4.7320 0.0100 0.0163
## 160 4.5015 4.3541 0.0100 0.0135
## 180 4.1405 4.0783 0.0100 0.0123
## 200 3.8553 3.8535 0.0100 0.0096
## 220 3.6248 3.6922 0.0100 0.0074
## 240 3.4320 3.5625 0.0100 0.0038
## 260 3.2691 3.4645 0.0100 0.0049
## 280 3.1307 3.3825 0.0100 -0.0000
## 300 3.0133 3.3270 0.0100 0.0047
## 320 2.9094 3.2834 0.0100 0.0009
## 340 2.8175 3.2524 0.0100 -0.0003
## 360 2.7316 3.2324 0.0100 -0.0008
## 380 2.6549 3.2136 0.0100 0.0000
## 400 2.5858 3.2038 0.0100 -0.0005
## 420 2.5258 3.1952 0.0100 -0.0022
## 440 2.4698 3.1935 0.0100 -0.0009
## 460 2.4181 3.1890 0.0100 -0.0016
## 480 2.3696 3.1872 0.0100 -0.0002
## 500 2.3239 3.1870 0.0100 -0.0020
## 520 2.2840 3.1882 0.0100 -0.0022
## 540 2.2454 3.1889 0.0100 -0.0002
## 560 2.2103 3.1982 0.0100 -0.0007
## 580 2.1754 3.2027 0.0100 -0.0007
## 600 2.1423 3.2075 0.0100 -0.0018
## 620 2.1119 3.2143 0.0100 -0.0008
## 640 2.0838 3.2174 0.0100 -0.0003
## 660 2.0554 3.2197 0.0100 -0.0018
## 680 2.0282 3.2245 0.0100 -0.0012
## 700 2.0030 3.2345 0.0100 -0.0020
## 720 1.9791 3.2390 0.0100 -0.0015
## 740 1.9556 3.2469 0.0100 -0.0009
## 760 1.9330 3.2551 0.0100 -0.0007
## 780 1.9122 3.2618 0.0100 -0.0024
## 800 1.8903 3.2682 0.0100 -0.0014
## 820 1.8699 3.2703 0.0100 -0.0012
## 840 1.8492 3.2774 0.0100 -0.0007
## 860 1.8308 3.2824 0.0100 -0.0008
## 880 1.8110 3.2863 0.0100 -0.0004
## 900 1.7921 3.2892 0.0100 -0.0016
## 920 1.7753 3.2959 0.0100 -0.0018
## 940 1.7579 3.3029 0.0100 -0.0010
## 960 1.7403 3.3061 0.0100 -0.0012
## 980 1.7241 3.3140 0.0100 -0.0016
## 1000 1.7086 3.3167 0.0100 -0.0009
##
## CV: 4
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 17.2978 17.7847 0.0100 0.1887
## 2 17.0576 17.5617 0.0100 0.2381
## 3 16.8226 17.3350 0.0100 0.2540
## 4 16.5919 17.1128 0.0100 0.2229
## 5 16.3646 16.8928 0.0100 0.2646
## 6 16.1405 16.6762 0.0100 0.2331
## 7 15.9195 16.4695 0.0100 0.1889
## 8 15.7069 16.2706 0.0100 0.2025
## 9 15.4970 16.0639 0.0100 0.1941
## 10 15.2908 15.8654 0.0100 0.2367
## 20 13.4306 14.1062 0.0100 0.2035
## 40 10.6036 11.3857 0.0100 0.1152
## 60 8.6231 9.4207 0.0100 0.0815
## 80 7.2183 7.9651 0.0100 0.0316
## 100 6.2000 6.8955 0.0100 0.0421
## 120 5.4426 6.0906 0.0100 0.0282
## 140 4.8678 5.4978 0.0100 0.0269
## 160 4.4244 5.0563 0.0100 0.0105
## 180 4.0803 4.7079 0.0100 0.0093
## 200 3.7994 4.4269 0.0100 0.0065
## 220 3.5694 4.2135 0.0100 0.0066
## 240 3.3789 4.0478 0.0100 0.0048
## 260 3.2252 3.9152 0.0100 0.0056
## 280 3.0885 3.8004 0.0100 0.0013
## 300 2.9710 3.7135 0.0100 -0.0008
## 320 2.8712 3.6552 0.0100 0.0001
## 340 2.7858 3.6046 0.0100 0.0008
## 360 2.7080 3.5758 0.0100 0.0024
## 380 2.6365 3.5502 0.0100 -0.0029
## 400 2.5674 3.5229 0.0100 0.0016
## 420 2.5072 3.5058 0.0100 -0.0035
## 440 2.4521 3.4920 0.0100 0.0007
## 460 2.4030 3.4812 0.0100 -0.0010
## 480 2.3558 3.4722 0.0100 -0.0017
## 500 2.3121 3.4702 0.0100 -0.0027
## 520 2.2717 3.4698 0.0100 -0.0018
## 540 2.2344 3.4597 0.0100 -0.0013
## 560 2.1979 3.4581 0.0100 -0.0006
## 580 2.1631 3.4549 0.0100 -0.0008
## 600 2.1321 3.4533 0.0100 -0.0005
## 620 2.1023 3.4517 0.0100 -0.0027
## 640 2.0745 3.4502 0.0100 -0.0021
## 660 2.0463 3.4547 0.0100 -0.0006
## 680 2.0199 3.4540 0.0100 -0.0020
## 700 1.9940 3.4589 0.0100 -0.0026
## 720 1.9694 3.4601 0.0100 -0.0027
## 740 1.9465 3.4652 0.0100 -0.0007
## 760 1.9238 3.4640 0.0100 -0.0010
## 780 1.9027 3.4625 0.0100 -0.0017
## 800 1.8822 3.4699 0.0100 -0.0010
## 820 1.8611 3.4740 0.0100 -0.0005
## 840 1.8404 3.4757 0.0100 -0.0011
## 860 1.8216 3.4787 0.0100 -0.0006
## 880 1.8044 3.4810 0.0100 -0.0016
## 900 1.7859 3.4866 0.0100 -0.0016
## 920 1.7693 3.4910 0.0100 -0.0008
## 940 1.7522 3.4941 0.0100 -0.0023
## 960 1.7357 3.4962 0.0100 -0.0017
## 980 1.7201 3.4934 0.0100 -0.0009
## 1000 1.7036 3.4944 0.0100 -0.0006
##
## CV: 5
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 16.9606 19.1592 0.0100 0.2492
## 2 16.7328 18.9087 0.0100 0.2254
## 3 16.5085 18.6607 0.0100 0.1917
## 4 16.2906 18.4106 0.0100 0.2723
## 5 16.0758 18.1666 0.0100 0.2025
## 6 15.8650 17.9309 0.0100 0.2454
## 7 15.6561 17.6980 0.0100 0.2308
## 8 15.4528 17.4671 0.0100 0.2034
## 9 15.2542 17.2468 0.0100 0.2040
## 10 15.0570 17.0252 0.0100 0.1962
## 20 13.2830 14.9910 0.0100 0.1526
## 40 10.5647 11.8655 0.0100 0.0872
## 60 8.6469 9.6551 0.0100 0.0773
## 80 7.2685 8.0642 0.0100 0.0484
## 100 6.2652 6.9085 0.0100 0.0340
## 120 5.5237 6.0475 0.0100 0.0281
## 140 4.9512 5.4200 0.0100 0.0134
## 160 4.5120 4.9377 0.0100 0.0118
## 180 4.1610 4.5496 0.0100 0.0051
## 200 3.8828 4.2561 0.0100 0.0061
## 220 3.6521 4.0204 0.0100 0.0069
## 240 3.4661 3.8587 0.0100 0.0040
## 260 3.3049 3.7167 0.0100 0.0018
## 280 3.1641 3.6004 0.0100 0.0024
## 300 3.0419 3.5115 0.0100 -0.0004
## 320 2.9388 3.4419 0.0100 -0.0020
## 340 2.8488 3.3818 0.0100 -0.0010
## 360 2.7678 3.3314 0.0100 0.0007
## 380 2.6962 3.2822 0.0100 -0.0030
## 400 2.6285 3.2433 0.0100 0.0009
## 420 2.5677 3.2164 0.0100 -0.0013
## 440 2.5081 3.1885 0.0100 -0.0011
## 460 2.4576 3.1626 0.0100 -0.0030
## 480 2.4106 3.1417 0.0100 0.0009
## 500 2.3656 3.1184 0.0100 -0.0024
## 520 2.3223 3.0999 0.0100 -0.0007
## 540 2.2818 3.0874 0.0100 -0.0020
## 560 2.2450 3.0762 0.0100 -0.0016
## 580 2.2088 3.0650 0.0100 -0.0011
## 600 2.1762 3.0571 0.0100 -0.0006
## 620 2.1432 3.0447 0.0100 -0.0020
## 640 2.1103 3.0364 0.0100 0.0001
## 660 2.0815 3.0352 0.0100 -0.0013
## 680 2.0528 3.0358 0.0100 -0.0005
## 700 2.0265 3.0347 0.0100 -0.0005
## 720 2.0010 3.0374 0.0100 -0.0007
## 740 1.9762 3.0387 0.0100 -0.0021
## 760 1.9505 3.0375 0.0100 -0.0017
## 780 1.9263 3.0423 0.0100 -0.0005
## 800 1.9045 3.0440 0.0100 -0.0001
## 820 1.8827 3.0445 0.0100 -0.0012
## 840 1.8629 3.0411 0.0100 -0.0012
## 860 1.8412 3.0431 0.0100 -0.0008
## 880 1.8225 3.0467 0.0100 -0.0011
## 900 1.8033 3.0520 0.0100 -0.0019
## 920 1.7859 3.0554 0.0100 -0.0012
## 940 1.7681 3.0554 0.0100 -0.0014
## 960 1.7508 3.0592 0.0100 -0.0024
## 980 1.7347 3.0589 0.0100 -0.0002
## 1000 1.7173 3.0636 0.0100 -0.0021
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 17.3923 nan 0.0100 0.2434
## 2 17.1551 nan 0.0100 0.2336
## 3 16.9214 nan 0.0100 0.2401
## 4 16.6918 nan 0.0100 0.2184
## 5 16.4656 nan 0.0100 0.2161
## 6 16.2442 nan 0.0100 0.2459
## 7 16.0272 nan 0.0100 0.2285
## 8 15.8154 nan 0.0100 0.2060
## 9 15.6041 nan 0.0100 0.1864
## 10 15.3997 nan 0.0100 0.1966
## 20 13.5472 nan 0.0100 0.1978
## 40 10.7152 nan 0.0100 0.1270
## 60 8.7159 nan 0.0100 0.0558
## 80 7.2870 nan 0.0100 0.0423
## 100 6.2562 nan 0.0100 0.0435
## 120 5.4929 nan 0.0100 0.0248
## 140 4.9199 nan 0.0100 0.0204
## 160 4.4792 nan 0.0100 0.0144
## 180 4.1366 nan 0.0100 0.0146
## 200 3.8537 nan 0.0100 0.0135
## 220 3.6344 nan 0.0100 0.0054
## 240 3.4514 nan 0.0100 0.0036
## 260 3.2988 nan 0.0100 0.0011
## 280 3.1712 nan 0.0100 0.0022
## 300 3.0563 nan 0.0100 -0.0009
## 320 2.9586 nan 0.0100 -0.0016
## 340 2.8697 nan 0.0100 0.0014
## 360 2.7945 nan 0.0100 -0.0006
## 380 2.7272 nan 0.0100 -0.0008
## 400 2.6654 nan 0.0100 0.0018
## 420 2.6104 nan 0.0100 0.0004
## 440 2.5596 nan 0.0100 -0.0011
## 460 2.5087 nan 0.0100 -0.0017
## 480 2.4627 nan 0.0100 -0.0017
## 500 2.4234 nan 0.0100 -0.0013
## 520 2.3850 nan 0.0100 -0.0004
## 540 2.3483 nan 0.0100 -0.0013
## 560 2.3137 nan 0.0100 -0.0007
## 580 2.2813 nan 0.0100 -0.0006
## 600 2.2515 nan 0.0100 -0.0011
## 620 2.2226 nan 0.0100 -0.0007
## 640 2.1935 nan 0.0100 -0.0015
## 660 2.1677 nan 0.0100 -0.0009
## 680 2.1420 nan 0.0100 -0.0009
## 700 2.1166 nan 0.0100 -0.0003
## 720 2.0946 nan 0.0100 -0.0009
## 740 2.0730 nan 0.0100 -0.0017
## 760 2.0514 nan 0.0100 -0.0007
## 780 2.0312 nan 0.0100 -0.0004
## 800 2.0109 nan 0.0100 -0.0010
## 820 1.9917 nan 0.0100 0.0003
## 840 1.9732 nan 0.0100 -0.0005
## 860 1.9565 nan 0.0100 -0.0018
## 880 1.9358 nan 0.0100 -0.0010
## 900 1.9203 nan 0.0100 -0.0003
## 920 1.9031 nan 0.0100 -0.0003
## 940 1.8879 nan 0.0100 -0.0016
## 960 1.8732 nan 0.0100 -0.0004
## 980 1.8562 nan 0.0100 -0.0010
## 1000 1.8401 nan 0.0100 -0.0018
Applies gradient boosting using a generalized boosted regression model (gbm), which is better suited to handling non-linear, multivariate data
# Pick the number of boosting iterations to use, judged by the
# cross-validation results stored in the fitted model (method = "cv").
best.iter <- gbm.perf(gbmMod, method = "cv")
Finds the best number of boosting iterations according to cross-validation
# Relative influence of each predictor, computed from the first `best.iter`
# trees. `n.trees` must be passed by name: the second positional argument of
# summary() for a gbm object is `cBars` (how many bars to draw), so passing
# `best.iter` positionally would only change the plot and the influence would
# still be computed from all trees.
ListScores <- summary(gbmMod, n.trees = best.iter)
Summarizes the relative importance of each attribute in the fitted model
# Give the two summary columns descriptive names, then print the table.
names(ListScores)[1:2] <- c("attribute", "importance")
ListScores
## attribute importance
## FICO.Mean FICO.Mean 60.89937
## Loan.Length Loan.Length 15.92223
## Amount.Requested Amount.Requested 12.02739
## State State 4.42039
## Open.CREDIT.Lines Open.CREDIT.Lines 2.12498
## Inquiries.in.the.Last.6.Months Inquiries.in.the.Last.6.Months 2.12495
## Employment.Length Employment.Length 0.87534
## Loan.Purpose Loan.Purpose 0.65035
## Debt.To.Income.Ratio Debt.To.Income.Ratio 0.36767
## Monthly.Income Monthly.Income 0.28913
## Revolving.CREDIT.Balance Revolving.CREDIT.Balance 0.23217
## Home.Ownership Home.Ownership 0.06604
Variable contribution table showing the contribution (%) to predicting interest rates. Based on this table, FICO Mean, Loan Length and Amount Requested are the major contributors and will be further analyzed.
# Predict interest rates for the test rows using the selected number of
# trees.
result <- predict(gbmMod, newdata = testing, n.trees = best.iter,
    type = "response")
# Root mean squared prediction error, expressed as a percentage of the mean
# observed interest rate.
rmse <- sqrt(mean((actualValues - result)^2))
rMSEpercent <- rmse/mean(actualValues) * 100
rMSEpercent
## [1] 11.29
Performs a Root Mean Squared Error (RMSE) calculation using the training and test data sets of randomly sampled observations, expressed as a percentage of the mean observed interest rate. Using predicted interest rates and actual observed rates, the relative RMSE was calculated and found to be approximately 11%. This fairly high margin of error may be attributed to the limited amount of data used for the training and test samples, but because the three attributes showed such a significantly higher importance (combined was almost 90%) than the others, there should not be any major errors factored into the concluding analysis.
# Open an 800x800 PNG device and arrange the panels in a 2x2 grid with
# compact inner margins and extra outer margin at the top.
png(filename = "FinalPlot.png", width = 800, height = 800)
par(mfrow = c(2, 2), mar = c(3, 3, 2, 1), oma = c(0, 0, 3, 1))

# Grouping factors used to colour the points: interest rate cut into 7
# groups and amount requested cut into 5 groups (the latter is used by
# Figure C).
interestCut <- cut2(loansData$Interest.Rate, g = 7)
amtRequestedCut <- cut2(loansData$Amount.Requested, g = 5)

# Simple linear fit of interest rate on FICO score, drawn over the scatter.
lmInterest <- lm(loansData$Interest.Rate ~ loansData$FICO.Mean)

# Figure A: interest rate against FICO score, coloured by interest-rate group.
plot(x = loansData$FICO.Mean, y = loansData$Interest.Rate,
    main = "Figure A: Color grouped by interest rate",
    xlab = "", ylab = "", pch = 19, col = interestCut)
abline(lmInterest, col = "blue", lwd = 3)

# Axis titles are placed by hand: the y title sits just above the largest
# rate, the x title at the largest FICO score.
mtext("Interest Rate", side = 2, las = 1, line = -3,
    at = 1.05 * max(loansData$Interest.Rate))
mtext("FICO Mean", side = 1, line = 2, at = max(loansData$FICO.Mean))
Displays the seven levels of interest rates defined by the Lending Club in colored grouping that shows that higher FICO scores correspond with lower interest rates, while lower FICO scores correspond with higher interest rates
# Figure B: interest rate against FICO score, coloured by loan length (the
# factor's integer codes select the colours). The stray empty argument that
# preceded `main` has been removed.
plot(loansData$FICO.Mean, loansData$Interest.Rate, pch = 19,
    col = loansData$Loan.Length,
    main = "Figure B: Color grouped by loan length", ylab = "", xlab = "")
# Linear model with a FICO-by-loan-length interaction.
lmLength <- lm(loansData$Interest.Rate ~ loansData$FICO.Mean * loansData$Loan.Length)
# Use coef() instead of `$coeff`, which only worked through partial matching
# of `$coefficients`.
lengthCoef <- coef(lmLength)
# Intercept and slope for the non-baseline loan length: baseline terms
# (1, 2) plus the loan-length offset (3) and the interaction term (4).
abline(c(lengthCoef[1] + lengthCoef[3], lengthCoef[2] + lengthCoef[4]),
    col = "blue", lwd = 3)
# Axis titles are placed by hand: the y title sits just above the largest
# rate, the x title at the largest FICO score.
mtext("Interest Rate", side = 2, las = 1, line = -3, at = 1.05 * max(loansData$Interest.Rate))
mtext("FICO Mean", side = 1, line = 2, at = max(loansData$FICO.Mean))
Displays the two levels of loan length (green is 60 months and red is 36 months) versus FICO mean and interest rate. In general, those who apply for a longer loan length (green) will have a higher interest rate than those whose loan length is shorter (red).
# Figure C: interest rate against FICO score, coloured by amount-requested
# group. The stray empty argument that preceded `main` has been removed.
plot(loansData$FICO.Mean, loansData$Interest.Rate, pch = 19,
    col = amtRequestedCut,
    main = "Figure C: Color grouped by amount requested", ylab = "", xlab = "")
# Full three-way interaction model; it is not drawn on the panel, only
# summarised afterwards.
lmAmt <- lm(loansData$Interest.Rate ~ loansData$FICO.Mean * loansData$Loan.Length *
    loansData$Amount.Requested)
# Axis titles are placed by hand: the y title sits just above the largest
# rate, the x title at the largest FICO score.
mtext("Interest Rate", side = 2, las = 1, line = -3, at = 1.05 * max(loansData$Interest.Rate))
mtext("FICO Mean", side = 1, line = 2, at = max(loansData$FICO.Mean))
# Close the PNG device opened for "FinalPlot.png" so the file is finalised on
# disk; this is the last panel drawn and no other close call follows.
dev.off()
Displays the five groupings of amount requested with the highest amounts shown in light blue and the lowest amounts in black versus FICO mean and interest rate. Generally those who requested higher amounts did so at a higher interest rate than those who applied for lesser amounts.
# Coefficient table and fit statistics for the model of interest rate on
# FICO mean, loan length and amount requested with all their interactions.
summary(lmAmt)
##
## Call:
## lm(formula = loansData$Interest.Rate ~ loansData$FICO.Mean *
## loansData$Loan.Length * loansData$Amount.Requested)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.433 -1.404 -0.122 1.222 9.963
##
## Coefficients:
## Estimate
## (Intercept) 6.81e+01
## loansData$FICO.Mean -8.10e-02
## loansData$Loan.Length60 months 3.15e+00
## loansData$Amount.Requested 3.60e-04
## loansData$FICO.Mean:loansData$Loan.Length60 months -6.90e-04
## loansData$FICO.Mean:loansData$Amount.Requested -3.28e-07
## loansData$Loan.Length60 months:loansData$Amount.Requested 4.79e-04
## loansData$FICO.Mean:loansData$Loan.Length60 months:loansData$Amount.Requested -6.22e-07
## Std. Error
## (Intercept) 1.70e+00
## loansData$FICO.Mean 2.40e-03
## loansData$Loan.Length60 months 4.72e+00
## loansData$Amount.Requested 1.31e-04
## loansData$FICO.Mean:loansData$Loan.Length60 months 6.65e-03
## loansData$FICO.Mean:loansData$Amount.Requested 1.83e-07
## loansData$Loan.Length60 months:loansData$Amount.Requested 2.64e-04
## loansData$FICO.Mean:loansData$Loan.Length60 months:loansData$Amount.Requested 3.71e-07
## t value
## (Intercept) 40.03
## loansData$FICO.Mean -33.78
## loansData$Loan.Length60 months 0.67
## loansData$Amount.Requested 2.75
## loansData$FICO.Mean:loansData$Loan.Length60 months -0.10
## loansData$FICO.Mean:loansData$Amount.Requested -1.79
## loansData$Loan.Length60 months:loansData$Amount.Requested 1.82
## loansData$FICO.Mean:loansData$Loan.Length60 months:loansData$Amount.Requested -1.68
## Pr(>|t|)
## (Intercept) <2e-16
## loansData$FICO.Mean <2e-16
## loansData$Loan.Length60 months 0.505
## loansData$Amount.Requested 0.006
## loansData$FICO.Mean:loansData$Loan.Length60 months 0.917
## loansData$FICO.Mean:loansData$Amount.Requested 0.073
## loansData$Loan.Length60 months:loansData$Amount.Requested 0.069
## loansData$FICO.Mean:loansData$Loan.Length60 months:loansData$Amount.Requested 0.093
##
## (Intercept) ***
## loansData$FICO.Mean ***
## loansData$Loan.Length60 months
## loansData$Amount.Requested **
## loansData$FICO.Mean:loansData$Loan.Length60 months
## loansData$FICO.Mean:loansData$Amount.Requested .
## loansData$Loan.Length60 months:loansData$Amount.Requested .
## loansData$FICO.Mean:loansData$Loan.Length60 months:loansData$Amount.Requested .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.1 on 2492 degrees of freedom
## Multiple R-squared: 0.749, Adjusted R-squared: 0.748
## F-statistic: 1.06e+03 on 7 and 2492 DF, p-value: <2e-16
# ANOVA table for a linear model of FICO.Mean on every other column of
# loansData; terms are listed in the column order of the data frame.
anova(lm(FICO.Mean ~ ., loansData))
## Analysis of Variance Table
##
## Response: FICO.Mean
## Df Sum Sq Mean Sq F value Pr(>F)
## Amount.Requested 1 20858 20858 57.60 4.6e-14 ***
## Interest.Rate 1 1870365 1870365 5164.89 < 2e-16 ***
## Loan.Length 1 193158 193158 533.39 < 2e-16 ***
## Loan.Purpose 13 53168 4090 11.29 < 2e-16 ***
## Debt.To.Income.Ratio 1 13269 13269 36.64 1.6e-09 ***
## State 45 17890 398 1.10 0.3040
## Home.Ownership 4 4689 1172 3.24 0.0117 *
## Monthly.Income 1 1237 1237 3.42 0.0647 .
## Open.CREDIT.Lines 1 3793 3793 10.47 0.0012 **
## Revolving.CREDIT.Balance 1 341 341 0.94 0.3320
## Inquiries.in.the.Last.6.Months 1 10458 10458 28.88 8.4e-08 ***
## Employment.Length 11 2221 202 0.56 0.8641
## Residuals 2418 875632 362
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for a linear model of Interest.Rate on every other column of
# loansData; terms are listed in the column order of the data frame.
anova(lm(Interest.Rate ~ ., loansData))
## Analysis of Variance Table
##
## Response: Interest.Rate
## Df Sum Sq Mean Sq F value Pr(>F)
## Amount.Requested 1 4819 4819 1160.28 < 2e-16 ***
## Loan.Length 1 4318 4318 1039.71 < 2e-16 ***
## Loan.Purpose 13 1338 103 24.78 < 2e-16 ***
## Debt.To.Income.Ratio 1 650 650 156.59 < 2e-16 ***
## State 45 705 16 3.77 1.1e-15 ***
## Home.Ownership 4 559 140 33.66 < 2e-16 ***
## Monthly.Income 1 45 45 10.81 0.001 **
## Open.CREDIT.Lines 1 2 2 0.50 0.480
## Revolving.CREDIT.Balance 1 8 8 1.98 0.160
## Inquiries.in.the.Last.6.Months 1 1396 1396 336.10 < 2e-16 ***
## Employment.Length 11 107 10 2.35 0.007 **
## FICO.Mean 1 19636 19636 4727.84 < 2e-16 ***
## Residuals 2418 10042 4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1