Data Analysis Project 1

Step 4: Build the model on training data

George Fisher george@georgefisher.com

Observations

  1. I cut the dataset into training and test halves for the purpose of verifying my modeling results
  2. I built a simple quadratic model using just the FICO variable
  3. I built an elaborate multiple linear regression model, starting with every variable provided and winnowing the list down to maximize the adjusted R2 and the F statistic and to minimize the error measures (residual standard error and test-set Mean Squared Error).

The simple model provides a very accurate depiction of the relationship between FICO and Interest.Rate, but the Elaborate model has the greater predictive power: when run against the test half of the dataset, its Mean Squared Error was about 52% lower than that of the simple model (0.0003851 vs. 0.0007985, as computed in the comparison at the end of this step).


create the subset for training data

# fix the RNG seed so the same train/test split is drawn on every run
set.seed(1234)
# row indices of the training half; the remaining rows form the test half
# and are addressed later with negative indexing ([-train])
train = sample(x = nrow(loansData.complete), size = nrow(loansData.complete)/2)

My thanks to Benjamin De Baets in post https://class.coursera.org/dataanalysis-002/forum/thread?thread_id=310

create the base model

create the data.frame to model with

If you say lm(Y ~., data=df) you will get a model of Y against all the variables in the data.frame “df”.

It is much handier to deal with a large number of regression variables this way, and it allows you to add and remove variables programmatically rather than by hand, which quickly becomes unbearably tedious.

# numeric predictors that seed the modeling data.frame
base.list = c(
    "Debt.To.Income.Ratio",
    "Open.CREDIT.Lines",
    "Revolving.CREDIT.Balance",
    "FICO.numeric",
    "FICO.numeric2",
    "Inquiries.in.the.Last.6.Months"
)
# list-style column selection: picks the six columns and keeps a data.frame
model.data.frame = loansData.complete[base.list]

My thanks to http://stats.stackexchange.com/questions/29477/how-to-write-a-linear-model-formula-with-100-variables-in-r

create indicator variables

this time, we will try the base model with all the indicators

# Append indicator columns for each categorical variable to model.data.frame.
# lm_create.indicators is a project helper not shown here; judging by the
# echoed "indicators created" lines it returns one D.<level> column per level
# of the factor it is given (TODO confirm the exact return type).
#
# The disabled line after each group would remove the last column of
# model.data.frame, i.e. the last indicator just added. Because it is
# commented out, every level of every factor is kept; this is presumably why
# the base-model summary reports coefficients "not defined because of
# singularities" (NA rows such as D.60months and D.wedding).

# Loan.Length
model.data.frame = cbind(model.data.frame, lm_create.indicators(loansData.complete$Loan.Length, 
    echo = TRUE, ret = TRUE))
## 
## indicators created: D.36months, D.60months
# model.data.frame = model.data.frame[,-ncol(model.data.frame)]

# Loan.Purpose
model.data.frame = cbind(model.data.frame, lm_create.indicators(loansData.complete$Loan.Purpose, 
    echo = TRUE, ret = TRUE))
## 
## indicators created: D.car, D.credit_card, D.debt_consolidation, D.educational, D.home_improvement, D.house, D.major_purchase, D.medical, D.moving, D.other, D.renewable_energy, D.small_business, D.vacation, D.wedding
# model.data.frame = model.data.frame[,-ncol(model.data.frame)]

# State
model.data.frame = cbind(model.data.frame, lm_create.indicators(loansData.complete$State, 
    echo = TRUE, ret = TRUE))
## 
## indicators created: D.AK, D.AL, D.AR, D.AZ, D.CA, D.CO, D.CT, D.DC, D.DE, D.FL, D.GA, D.HI, D.IA, D.IL, D.IN, D.KS, D.KY, D.LA, D.MA, D.MD, D.MI, D.MN, D.MO, D.MS, D.MT, D.NC, D.NH, D.NJ, D.NM, D.NV, D.NY, D.OH, D.OK, D.OR, D.PA, D.RI, D.SC, D.SD, D.TX, D.UT, D.VA, D.VT, D.WA, D.WI, D.WV, D.WY
# model.data.frame = model.data.frame[,-ncol(model.data.frame)]

# Home.Ownership
model.data.frame = cbind(model.data.frame, lm_create.indicators(loansData.complete$Home.Ownership, 
    echo = TRUE, ret = TRUE))
## 
## indicators created: D.MORTGAGE, D.NONE, D.OTHER, D.OWN, D.RENT
# model.data.frame = model.data.frame[,-ncol(model.data.frame)]

# FICO.Range NOTE: factor replaced with numeric FICO.numeric
# (so no indicator columns are generated for FICO.Range; the two disabled
# statements below are kept for reference only)
# model.data.frame = cbind(model.data.frame,
# lm_create.indicators(loansData.complete$FICO.Range, echo=TRUE, ret=TRUE))
# model.data.frame = model.data.frame[,-ncol(model.data.frame)]

# Employment.Length
# (the echoed list includes D.na, so missing employment length appears to be
# treated as its own level)
model.data.frame = cbind(model.data.frame, lm_create.indicators(loansData.complete$Employment.Length, 
    echo = TRUE, ret = TRUE))
## 
## indicators created: D.LT1year, D.1year, D.10PLUSyears, D.2years, D.3years, D.4years, D.5years, D.6years, D.7years, D.8years, D.9years, D.na
# model.data.frame = model.data.frame[,-ncol(model.data.frame)]

sum the obviously collinear variables

# fold the requested and the funded amount into a single predictor instead of
# entering both columns separately
Loan.Amount = loansData.complete[["Amount.Requested"]] +
    loansData.complete[["Amount.Funded.By.Investors"]]
# append it as a new rightmost column named Loan.Amount
model.data.frame$Loan.Amount = Loan.Amount

add reciprocal of Monthly.Income

# monthly income enters the model as its reciprocal
Monthly.Income.Recip = 1/loansData.complete[["Monthly.Income"]]
# append it as a new rightmost column named Monthly.Income.Recip
model.data.frame$Monthly.Income.Recip = Monthly.Income.Recip

create the base model

# remember the complete list of candidate predictors before any pruning
base.variable.list = names(model.data.frame)

# regress the interest rate on every column of model.data.frame ("~ ."),
# using the training rows only
model.base = lm(
    loansData.complete$Interest.Rate ~ .,
    data = model.data.frame,
    subset = train
)

# lm_assumptions_summary(model.base)
summary(model.base)
## 
## Call:
## lm(formula = loansData.complete$Interest.Rate ~ ., data = model.data.frame, 
##     subset = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06076 -0.01122 -0.00182  0.01073  0.09084 
## 
## Coefficients: (6 not defined because of singularities)
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.39e-01   2.01e-02   11.88  < 2e-16 ***
## Debt.To.Income.Ratio            9.60e-03   8.51e-03    1.13    0.260    
## Open.CREDIT.Lines              -3.18e-04   1.44e-04   -2.21    0.027 *  
## Revolving.CREDIT.Balance       -1.72e-08   3.73e-08   -0.46    0.644    
## FICO.numeric                   -8.97e-03   3.64e-04  -24.68  < 2e-16 ***
## FICO.numeric2                   1.27e-04   1.00e-05   12.72  < 2e-16 ***
## Inquiries.in.the.Last.6.Months  3.07e-03   4.72e-04    6.52  1.1e-10 ***
## D.36months                     -3.25e-02   1.53e-03  -21.22  < 2e-16 ***
## D.60months                            NA         NA      NA       NA    
## D.car                          -5.32e-04   5.80e-03   -0.09    0.927    
## D.credit_card                   3.72e-04   4.60e-03    0.08    0.936    
## D.debt_consolidation            1.38e-03   4.47e-03    0.31    0.757    
## D.educational                   3.92e-03   8.10e-03    0.48    0.629    
## D.home_improvement              1.68e-03   5.01e-03    0.34    0.737    
## D.house                        -1.37e-03   8.46e-03   -0.16    0.871    
## D.major_purchase                3.81e-03   5.13e-03    0.74    0.458    
## D.medical                      -1.27e-03   7.07e-03   -0.18    0.858    
## D.moving                        1.45e-02   6.50e-03    2.22    0.026 *  
## D.other                         1.28e-02   4.75e-03    2.70    0.007 ** 
## D.renewable_energy             -2.71e-03   1.44e-02   -0.19    0.851    
## D.small_business                1.32e-02   5.24e-03    2.53    0.012 *  
## D.vacation                      1.59e-03   6.86e-03    0.23    0.817    
## D.wedding                             NA         NA      NA       NA    
## D.AK                            1.65e-03   2.04e-02    0.08    0.936    
## D.AL                           -6.25e-03   1.94e-02   -0.32    0.748    
## D.AR                           -9.83e-03   2.04e-02   -0.48    0.630    
## D.AZ                           -1.83e-03   1.93e-02   -0.09    0.924    
## D.CA                           -9.61e-03   1.89e-02   -0.51    0.612    
## D.CO                           -1.09e-02   1.92e-02   -0.57    0.570    
## D.CT                           -7.62e-03   1.93e-02   -0.40    0.693    
## D.DC                            1.25e-02   2.32e-02    0.54    0.592    
## D.DE                           -1.10e-02   2.11e-02   -0.52    0.602    
## D.FL                           -8.37e-03   1.90e-02   -0.44    0.660    
## D.GA                           -5.65e-03   1.91e-02   -0.30    0.767    
## D.HI                           -1.31e-03   2.02e-02   -0.06    0.948    
## D.IA                           -3.33e-02   3.37e-02   -0.99    0.323    
## D.IL                           -1.41e-02   1.91e-02   -0.74    0.461    
## D.IN                           -4.61e-02   2.68e-02   -1.72    0.086 .  
## D.KS                           -6.66e-04   2.08e-02   -0.03    0.974    
## D.KY                           -1.07e-02   1.94e-02   -0.55    0.580    
## D.LA                           -1.09e-02   2.01e-02   -0.54    0.588    
## D.MA                           -9.09e-03   1.91e-02   -0.48    0.635    
## D.MD                           -6.44e-03   1.92e-02   -0.33    0.738    
## D.MI                           -1.23e-02   1.93e-02   -0.64    0.523    
## D.MN                           -1.49e-02   1.93e-02   -0.77    0.441    
## D.MO                           -1.09e-02   1.93e-02   -0.57    0.572    
## D.MS                            2.34e-02   3.33e-02    0.70    0.482    
## D.MT                           -1.55e-02   2.11e-02   -0.73    0.463    
## D.NC                           -8.39e-03   1.92e-02   -0.44    0.662    
## D.NH                           -2.39e-02   2.04e-02   -1.17    0.242    
## D.NJ                           -8.72e-03   1.91e-02   -0.46    0.648    
## D.NM                           -8.44e-03   2.18e-02   -0.39    0.699    
## D.NV                           -6.26e-03   1.97e-02   -0.32    0.750    
## D.NY                           -9.59e-03   1.90e-02   -0.51    0.613    
## D.OH                           -8.32e-03   1.92e-02   -0.43    0.665    
## D.OK                           -9.80e-03   1.96e-02   -0.50    0.617    
## D.OR                            2.40e-03   1.95e-02    0.12    0.902    
## D.PA                           -1.18e-02   1.91e-02   -0.62    0.538    
## D.RI                           -9.68e-03   1.99e-02   -0.49    0.627    
## D.SC                           -6.45e-03   1.95e-02   -0.33    0.741    
## D.SD                           -2.14e-02   2.32e-02   -0.92    0.355    
## D.TX                           -1.95e-03   1.90e-02   -0.10    0.918    
## D.UT                           -2.14e-02   2.07e-02   -1.03    0.301    
## D.VA                           -5.32e-03   1.91e-02   -0.28    0.781    
## D.VT                            4.05e-04   2.34e-02    0.02    0.986    
## D.WA                           -6.96e-03   1.92e-02   -0.36    0.717    
## D.WI                           -9.09e-03   1.97e-02   -0.46    0.644    
## D.WV                           -6.97e-03   1.98e-02   -0.35    0.726    
## D.WY                                  NA         NA      NA       NA    
## D.MORTGAGE                     -9.06e-04   1.31e-03   -0.69    0.489    
## D.NONE                                NA         NA      NA       NA    
## D.OTHER                        -5.92e-03   1.97e-02   -0.30    0.764    
## D.OWN                           6.06e-04   2.12e-03    0.29    0.775    
## D.RENT                                NA         NA      NA       NA    
## D.LT1year                       1.08e-03   3.53e-03    0.31    0.760    
## D.1year                         2.36e-04   3.67e-03    0.06    0.949    
## D.10PLUSyears                  -2.29e-05   3.30e-03   -0.01    0.994    
## D.2years                        1.42e-03   3.53e-03    0.40    0.687    
## D.3years                        6.52e-04   3.53e-03    0.18    0.854    
## D.4years                        1.36e-03   3.68e-03    0.37    0.712    
## D.5years                        1.17e-03   3.59e-03    0.33    0.745    
## D.6years                        1.95e-03   3.76e-03    0.52    0.605    
## D.7years                        8.29e-05   3.96e-03    0.02    0.983    
## D.8years                        2.00e-03   4.00e-03    0.50    0.618    
## D.9years                       -4.04e-03   4.57e-03   -0.89    0.376    
## D.na                                  NA         NA      NA       NA    
## Loan.Amount                     8.80e-07   4.71e-08   18.68  < 2e-16 ***
## Monthly.Income.Recip            1.18e+01   5.19e+00    2.27    0.023 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0188 on 1167 degrees of freedom
## Multiple R-squared:  0.812,  Adjusted R-squared:  0.799 
## F-statistic: 62.3 on 81 and 1167 DF,  p-value: <2e-16

Iteratively remove the variables whose t-statistic p-values are too high, trying every significance level from 100% down to 1%

# Search over significance levels for the best reduced model.
#
# For every cut-off from 1.00 down to 0.01 the predictors flagged by
# lm_print.model.hi_p (project helper; presumably those whose t-statistic
# p-value in model.base is above the cut-off -- TODO confirm) are removed, the
# model is refitted on the training rows, and four criteria are tracked:
# highest adjusted R^2, highest F statistic, lowest residual standard error
# and lowest Mean Squared Error on the test rows. For each criterion the
# winning cut-off, its variable names and the other three statistics of that
# winning model are stored.

# start with a list of non-NA variables in the model ex the Intercept
# (coefficients reported as NA were not estimated, so they are not candidates)
starting.variable.list = names(coef(model.base)[!is.na(coef(model.base))])
starting.variable.list = starting.variable.list[!starting.variable.list == "(Intercept)"]

# variable values to search for we want to find the max values
r2max = 0
r2sig.level = 1
r2names = ""
r2maxRES = 0
r2maxF = 0
r2maxMSE = 0
Fmax = 0
Fsig.level = 1
Fnames = ""
Fmaxr2 = 0
FmaxRES = 0
FmaxMSE = 0
## we want to find the min values
RESmin = 10000
RESsig.level = 1
RESnames = ""
RESminr2 = 0
RESminF = 0
RESminMSE = 0
MSEmin = 10000
MSEsig.level = 1
MSEnames = ""
MSEminr2 = 0
MSEminF = 0
MSEminRES = 0

# for every significance level of the t-stat p-value from 100% to 1%
for (sig.level in seq(1, 0.01, by = -0.01)) {

    # find the list to exclude
    exclude.variable.list = lm_print.model.hi_p(model.base, echo = FALSE, ret = TRUE, 
        sig.level = sig.level)
    if (length(exclude.variable.list) == 0) 
        next

    # create the list ex the exclude.variable.list
    shortened.variable.list = starting.variable.list[!starting.variable.list %in% 
        exclude.variable.list]
    # nothing left to fit once every candidate has been excluded
    if (length(shortened.variable.list) == 0) 
        next

    # create the data.frame to model; drop = FALSE keeps a data.frame even
    # when only one predictor survives (otherwise it collapses to a vector
    # and the "~ ." formula below can no longer be expanded)
    smaller.model.data.frame = model.data.frame[, shortened.variable.list, drop = FALSE]

    # model that smaller data.frame on the training list subset
    smaller.model.base = lm(loansData.complete$Interest.Rate ~ ., data = smaller.model.data.frame, 
        subset = train)

    # compute each statistic once per candidate model instead of
    # re-evaluating summary() and predict() in every comparison
    smaller.model.summary = summary(smaller.model.base)
    smaller.model.RES = smaller.model.summary$sigma
    smaller.model.r2 = smaller.model.summary$adj.r.squared
    smaller.model.F = smaller.model.summary$fstatistic[1]
    # Mean Squared Error on the rows that were NOT used for fitting
    smaller.model.MSE = mean((loansData.complete$Interest.Rate - predict(smaller.model.base, 
        smaller.model.data.frame))[-train]^2)

    # find the variables of interest
    # (strict comparisons: on ties the first, i.e. highest, sig.level wins)
    if (smaller.model.RES < RESmin) {
        RESmin = smaller.model.RES
        RESsig.level = sig.level
        RESnames = shortened.variable.list
        RESminr2 = smaller.model.r2
        RESminF = smaller.model.F
        RESminMSE = smaller.model.MSE
    }
    if (smaller.model.r2 > r2max) {
        r2max = smaller.model.r2
        r2sig.level = sig.level
        r2names = shortened.variable.list
        r2maxRES = smaller.model.RES
        r2maxF = smaller.model.F
        r2maxMSE = smaller.model.MSE
    }
    if (smaller.model.F > Fmax) {
        Fmax = smaller.model.F
        Fsig.level = sig.level
        Fnames = shortened.variable.list
        Fmaxr2 = smaller.model.r2
        FmaxRES = smaller.model.RES
        FmaxMSE = smaller.model.MSE
    }
    if (smaller.model.MSE < MSEmin) {
        MSEmin = smaller.model.MSE
        MSEsig.level = sig.level
        MSEnames = shortened.variable.list
        MSEminr2 = smaller.model.r2
        MSEminF = smaller.model.F
        MSEminRES = smaller.model.RES
    }

}
winning.levels = c(r2max, r2sig.level, r2maxRES, r2maxF, r2maxMSE, Fmax, Fsig.level, 
    Fmaxr2, FmaxRES, FmaxMSE, RESmin, RESsig.level, RESminr2, RESminF, RESminMSE, 
    MSEmin, MSEsig.level, MSEminr2, MSEminF, MSEminRES)
names(winning.levels) = c("r2max", "r2sig.level", "r2maxRES", "r2maxF", "r2maxMSE", 
    "Fmax", "Fsig.level", "Fmaxr2", "FmaxRES", "FmaxMSE", "RESmin", "RESsig.level", 
    "RESminr2", "RESminF", "RESminMSE", "MSEmin", "MSEsig.level", "MSEminr2", 
    "MSEminF", "MSEminRES")

# one "name value" line per tracked statistic; seq_along() is base R and is
# safe for a zero-length vector, unlike 1:len(...)
for (i in seq_along(winning.levels)) cat(names(winning.levels)[i], format(winning.levels[i], 
    digits = 4), "\n")
## r2max 0.8036 
## r2sig.level 0.62 
## r2maxRES 0.0186 
## r2maxF 142.8 
## r2maxMSE 0.0003851 
## Fmax 798.2 
## Fsig.level 0.01 
## Fmaxr2 0.7931 
## FmaxRES 0.01909 
## FmaxMSE 0.0003629 
## RESmin 0.0186 
## RESsig.level 0.62 
## RESminr2 0.8036 
## RESminF 142.8 
## RESminMSE 0.0003851 
## MSEmin 0.0003619 
## MSEsig.level 0.24 
## MSEminr2 0.8002 
## MSEminF 455.5 
## MSEminRES 0.01876

Create a new model using the variable names from the significance level that maximized the adjusted R2

# list the candidate predictors that did not make it into the best
# adjusted-R^2 model
cat("\n-------- variables excluded (potential confounders)----------------\n")
## 
## -------- variables excluded (potential confounders)----------------
setdiff(starting.variable.list, r2names)
##  [1] "Revolving.CREDIT.Balance" "D.car"                   
##  [3] "D.credit_card"            "D.debt_consolidation"    
##  [5] "D.educational"            "D.home_improvement"      
##  [7] "D.house"                  "D.medical"               
##  [9] "D.renewable_energy"       "D.vacation"              
## [11] "D.AK"                     "D.AL"                    
## [13] "D.AR"                     "D.AZ"                    
## [15] "D.CT"                     "D.FL"                    
## [17] "D.GA"                     "D.HI"                    
## [19] "D.KS"                     "D.MA"                    
## [21] "D.MD"                     "D.NC"                    
## [23] "D.NJ"                     "D.NM"                    
## [25] "D.NV"                     "D.OH"                    
## [27] "D.OR"                     "D.RI"                    
## [29] "D.SC"                     "D.TX"                    
## [31] "D.VA"                     "D.VT"                    
## [33] "D.WA"                     "D.WI"                    
## [35] "D.WV"                     "D.OTHER"                 
## [37] "D.OWN"                    "D.LT1year"               
## [39] "D.1year"                  "D.10PLUSyears"           
## [41] "D.2years"                 "D.3years"                
## [43] "D.4years"                 "D.5years"                
## [45] "D.7years"

# the model
# keep only the predictors chosen by the adjusted-R^2 search; drop = FALSE
# guarantees a data.frame even if a single predictor was selected (a plain
# [, r2names] would collapse to a vector and break the "~ ." formula)
reduced.model.data.frame = model.data.frame[, r2names, drop = FALSE]

# refit on the training rows only
reduced.model = lm(loansData.complete$Interest.Rate ~ ., data = reduced.model.data.frame, 
    subset = train)

# test the assumptions underlying the model
lm_assumptions_summary(reduced.model)
## 
## 
## NOTE: in addition to this analysis, look at scatter.smooth plots of the residuals vs the main variables individually to see if quadratic transforms may be required
## 
## 
## Call:
## lm(formula = loansData.complete$Interest.Rate ~ ., data = reduced.model.data.frame, 
##     subset = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06211 -0.01144 -0.00163  0.01071  0.08928 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.34e-01   4.12e-03   56.75  < 2e-16 ***
## Debt.To.Income.Ratio            9.78e-03   8.10e-03    1.21   0.2274    
## Open.CREDIT.Lines              -3.22e-04   1.39e-04   -2.32   0.0204 *  
## FICO.numeric                   -9.00e-03   3.51e-04  -25.62  < 2e-16 ***
## FICO.numeric2                   1.27e-04   9.62e-06   13.25  < 2e-16 ***
## Inquiries.in.the.Last.6.Months  3.09e-03   4.58e-04    6.75  2.3e-11 ***
## D.36months                     -3.26e-02   1.47e-03  -22.23  < 2e-16 ***
## D.major_purchase                3.56e-03   2.73e-03    1.30   0.1938    
## D.moving                        1.36e-02   4.74e-03    2.87   0.0042 ** 
## D.other                         1.19e-02   1.93e-03    6.19  8.1e-10 ***
## D.small_business                1.31e-02   2.83e-03    4.64  3.9e-06 ***
## D.CA                           -3.55e-03   1.47e-03   -2.41   0.0161 *  
## D.CO                           -4.65e-03   3.39e-03   -1.37   0.1707    
## D.DC                            1.89e-02   1.33e-02    1.42   0.1554    
## D.DE                           -4.84e-03   9.37e-03   -0.52   0.6058    
## D.IA                           -3.61e-02   1.87e-02   -1.93   0.0539 .  
## D.IL                           -7.91e-03   2.75e-03   -2.87   0.0041 ** 
## D.IN                           -3.94e-02   1.87e-02   -2.11   0.0355 *  
## D.KY                           -4.46e-03   4.62e-03   -0.97   0.3345    
## D.LA                           -4.87e-03   6.66e-03   -0.73   0.4650    
## D.MI                           -6.36e-03   3.99e-03   -1.59   0.1112    
## D.MN                           -9.10e-03   4.16e-03   -2.19   0.0290 *  
## D.MO                           -5.02e-03   4.14e-03   -1.21   0.2254    
## D.MS                            2.27e-02   1.88e-02    1.21   0.2274    
## D.MT                           -9.70e-03   9.39e-03   -1.03   0.3020    
## D.NH                           -1.83e-02   7.69e-03   -2.38   0.0173 *  
## D.NY                           -3.49e-03   1.86e-03   -1.87   0.0611 .  
## D.OK                           -3.91e-03   5.06e-03   -0.77   0.4394    
## D.PA                           -5.74e-03   2.72e-03   -2.11   0.0352 *  
## D.SD                           -1.57e-02   1.32e-02   -1.19   0.2353    
## D.UT                           -1.54e-02   8.39e-03   -1.84   0.0660 .  
## D.MORTGAGE                     -9.00e-04   1.17e-03   -0.77   0.4426    
## D.6years                        1.00e-03   2.18e-03    0.46   0.6451    
## D.8years                        8.93e-04   2.54e-03    0.35   0.7255    
## D.9years                       -4.98e-03   3.33e-03   -1.50   0.1344    
## Loan.Amount                     8.88e-07   4.44e-08   20.01  < 2e-16 ***
## Monthly.Income.Recip            1.31e+01   4.89e+00    2.68   0.0076 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0186 on 1212 degrees of freedom
## Multiple R-squared:  0.809,  Adjusted R-squared:  0.804 
## F-statistic:  143 on 36 and 1212 DF,  p-value: <2e-16
## 
## 
## ----------F, F p, Adj R^2------------------------------
##   F_statistic F_statistic_p   adjusted_R2 
##      142.8192        0.0000        0.8036 
## [1] "F statistic p-value <= 0.05 indicates at least one predictor is predictive"
## 
## ---------p-values > 0.05------------------------------
## [1] "Below are listed, in descending order, the individual p-values > 0.05"
##                        Estimate Std. Error t value Pr(>|t|)
## D.8years              0.0008935   0.002544  0.3512  0.72551
## D.6years              0.0010025   0.002176  0.4607  0.64509
## D.DE                 -0.0048361   0.009367 -0.5163  0.60576
## D.LA                 -0.0048660   0.006658 -0.7309  0.46500
## D.MORTGAGE           -0.0009001   0.001172 -0.7681  0.44256
## D.OK                 -0.0039123   0.005058 -0.7735  0.43938
## D.KY                 -0.0044604   0.004620 -0.9654  0.33453
## D.MT                 -0.0096967   0.009391 -1.0325  0.30202
## D.SD                 -0.0157310   0.013249 -1.1873  0.23534
## Debt.To.Income.Ratio  0.0097812   0.008099  1.2077  0.22739
## D.MS                  0.0226581   0.018760  1.2078  0.22737
## D.MO                 -0.0050229   0.004141 -1.2130  0.22536
## D.major_purchase      0.0035558   0.002735  1.3002  0.19378
## D.CO                 -0.0046504   0.003393 -1.3707  0.17071
## D.DC                  0.0188610   0.013269  1.4215  0.15544
## D.9years             -0.0049824   0.003326 -1.4979  0.13441
## D.MI                 -0.0063573   0.003988 -1.5942  0.11116
## D.UT                 -0.0154389   0.008389 -1.8403  0.06597
## D.NY                 -0.0034922   0.001863 -1.8742  0.06114
## D.IA                 -0.0361494   0.018736 -1.9295  0.05391
## 
## ---------Cook's Distance------------------------------
## [1] "Cook's Distances less than 0.5 indicate no outlying Y's or Leveraged X's"
## 
## ---------Heteroskedasticity-----------------------
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 65.6, df = 36, p-value = 0.001849
## 
## [1] "Breusch-Pagan test indicates possible Heteroskedasticity"
## 
## --------Autocorrelation--------------------------
## [1] "Autocorrelation not indicated"
## 
## -------Multicollinearity if GT 10---------------
##  FICO.numeric FICO.numeric2 
##         20.51         20.44 
## [1] "Multicollinearity test generated an error"
## 
## --------Mean Zero?-------------------------------
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -3.400  -0.631  -0.094   0.000   0.582   4.880       3
## Warning: not plotting observations with leverage one:
##   71, 644, 1028

plot of chunk create reduced.model

## Warning: not plotting observations with leverage one:
##   71, 644, 1028

plot of chunk create reduced.model


# coefficient table of the reduced model; the printed rows run from the
# largest to the smallest estimate in absolute value
lm_print.ordered.coef(reduced.model, echo = TRUE, ret = FALSE)
##                                  Estimate Std. Error  t value   Pr(>|t|)
## Monthly.Income.Recip            1.307e+01  4.886e+00   2.6760  7.552e-03
## (Intercept)                     2.338e-01  4.120e-03  56.7476  0.000e+00
## D.IN                           -3.943e-02  1.873e-02  -2.1055  3.545e-02
## D.IA                           -3.615e-02  1.874e-02  -1.9295  5.391e-02
## D.36months                     -3.264e-02  1.468e-03 -22.2282  4.336e-92
## D.MS                            2.266e-02  1.876e-02   1.2078  2.274e-01
## D.DC                            1.886e-02  1.327e-02   1.4215  1.554e-01
## D.NH                           -1.833e-02  7.689e-03  -2.3842  1.727e-02
## D.SD                           -1.573e-02  1.325e-02  -1.1873  2.353e-01
## D.UT                           -1.544e-02  8.389e-03  -1.8403  6.597e-02
## D.moving                        1.360e-02  4.737e-03   2.8719  4.152e-03
## D.small_business                1.311e-02  2.827e-03   4.6371  3.917e-06
## D.other                         1.194e-02  1.927e-03   6.1932  8.060e-10
## Debt.To.Income.Ratio            9.781e-03  8.099e-03   1.2077  2.274e-01
## D.MT                           -9.697e-03  9.391e-03  -1.0325  3.020e-01
## D.MN                           -9.095e-03  4.160e-03  -2.1866  2.896e-02
## FICO.numeric                   -8.997e-03  3.512e-04 -25.6213 4.693e-116
## D.IL                           -7.907e-03  2.753e-03  -2.8725  4.143e-03
## D.MI                           -6.357e-03  3.988e-03  -1.5942  1.112e-01
## D.PA                           -5.744e-03  2.725e-03  -2.1080  3.523e-02
## D.MO                           -5.023e-03  4.141e-03  -1.2130  2.254e-01
## D.9years                       -4.982e-03  3.326e-03  -1.4979  1.344e-01
## D.LA                           -4.866e-03  6.658e-03  -0.7309  4.650e-01
## D.DE                           -4.836e-03  9.367e-03  -0.5163  6.058e-01
## D.CO                           -4.650e-03  3.393e-03  -1.3707  1.707e-01
## D.KY                           -4.460e-03  4.620e-03  -0.9654  3.345e-01
## D.OK                           -3.912e-03  5.058e-03  -0.7735  4.394e-01
## D.major_purchase                3.556e-03  2.735e-03   1.3002  1.938e-01
## D.CA                           -3.552e-03  1.474e-03  -2.4101  1.610e-02
## D.NY                           -3.492e-03  1.863e-03  -1.8742  6.114e-02
## Inquiries.in.the.Last.6.Months  3.089e-03  4.576e-04   6.7503  2.282e-11
## D.6years                        1.002e-03  2.176e-03   0.4607  6.451e-01
## D.MORTGAGE                     -9.001e-04  1.172e-03  -0.7681  4.426e-01
## D.8years                        8.935e-04  2.544e-03   0.3512  7.255e-01
## Open.CREDIT.Lines              -3.216e-04  1.385e-04  -2.3213  2.043e-02
## FICO.numeric2                   1.275e-04  9.620e-06  13.2522  1.560e-37
## Loan.Amount                     8.879e-07  4.437e-08  20.0124  3.308e-77

# print out the formula
# (project helper; the echoed string labels the left-hand side with EY and
# shows the coefficients rounded to three decimals, so very small ones such
# as Loan.Amount appear as 0)
lm_print.model.function(reduced.model, EY = "E(Interest.Rate)")
## [1] "E(Interest.Rate) = 0.234 + 0.01*Debt.To.Income.Ratio 0*Open.CREDIT.Lines -0.009*FICO.numeric + 0*FICO.numeric2 + 0.003*Inquiries.in.the.Last.6.Months -0.033*D.36months + 0.004*D.major_purchase + 0.014*D.moving + 0.012*D.other + 0.013*D.small_business -0.004*D.CA -0.005*D.CO + 0.019*D.DC -0.005*D.DE -0.036*D.IA -0.008*D.IL -0.039*D.IN -0.004*D.KY -0.005*D.LA -0.006*D.MI -0.009*D.MN -0.005*D.MO + 0.023*D.MS -0.01*D.MT -0.018*D.NH -0.003*D.NY -0.004*D.OK -0.006*D.PA -0.016*D.SD -0.015*D.UT -0.001*D.MORTGAGE + 0.001*D.6years + 0.001*D.8years -0.005*D.9years + 0*Loan.Amount + 13.074*Monthly.Income.Recip"

Create the simple quadratic model of FICO scores

# Quadratic model of the interest rate on the FICO score alone.
# It is fitted on the training rows only, like the elaborate model: this
# model is later scored on the [-train] rows, and fitting it on all rows
# would let it see the test data (an optimistic, leaked test MSE) and make
# its in-sample statistics refer to a different sample than reduced.model's.
# NOTE: the console output echoed in this document was produced by a fit on
# all rows and has to be regenerated after this change.
fico.model = lm(Interest.Rate ~ FICO.numeric + FICO.numeric2, data = loansData.complete, 
    subset = train)

# test the assumptions underlying the model
lm_assumptions_summary(fico.model)
## 
## 
## NOTE: in addition to this analysis, look at scatter.smooth plots of the residuals vs the main variables individually to see if quadratic transforms may be required
## 
## 
## Call:
## lm(formula = Interest.Rate ~ FICO.numeric + FICO.numeric2, data = loansData.complete)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.07626 -0.02055 -0.00515  0.01699  0.10661 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.26e-01   2.93e-03    76.9   <2e-16 ***
## FICO.numeric  -8.35e-03   3.59e-04   -23.3   <2e-16 ***
## FICO.numeric2  1.14e-04   9.68e-06    11.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0287 on 2495 degrees of freedom
## Multiple R-squared:  0.529,  Adjusted R-squared:  0.529 
## F-statistic: 1.4e+03 on 2 and 2495 DF,  p-value: <2e-16
## 
## 
## ----------F, F p, Adj R^2------------------------------
##   F_statistic F_statistic_p   adjusted_R2 
##     1402.6669        0.0000        0.5289 
## [1] "F statistic p-value <= 0.05 indicates at least one predictor is predictive"
## 
## ---------p-values > 0.05------------------------------
## [1] "Below are listed, in descending order, the individual p-values > 0.05"
## 
## ---------Cook's Distance------------------------------
## [1] "Cook's Distances less than 0.5 indicate no outlying Y's or Leveraged X's"
## 
## ---------Heteroskedasticity-----------------------
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 33.76, df = 2, p-value = 4.675e-08
## 
## [1] "Breusch-Pagan test indicates possible Heteroskedasticity"
## 
## --------Autocorrelation--------------------------
## [1] "Autocorrelation not indicated"
## 
## -------Multicollinearity if GT 10---------------
##  FICO.numeric FICO.numeric2 
##         19.13         19.13 
## [1] "Multicollinearity test generated an error"
## 
## --------Mean Zero?-------------------------------
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.660  -0.717  -0.180   0.000   0.593   3.730

plot of chunk quadratic for FICO plot of chunk quadratic for FICO


# coefficient table of the FICO-only model; the printed rows run from the
# largest to the smallest estimate in absolute value
lm_print.ordered.coef(fico.model, echo = TRUE, ret = FALSE)
##                 Estimate Std. Error t value   Pr(>|t|)
## (Intercept)    0.2255477  2.933e-03    76.9  0.000e+00
## FICO.numeric  -0.0083547  3.586e-04   -23.3 8.718e-109
## FICO.numeric2  0.0001143  9.685e-06    11.8  2.499e-31

# print out the formula
# (project helper; coefficients are echoed rounded to three decimals, which
# is why the FICO.numeric2 term shows as 0)
lm_print.model.function(fico.model, EY = "E(Interest.Rate)")
## [1] "E(Interest.Rate) = 0.226 -0.008*FICO.numeric + 0*FICO.numeric2"

Compare the reduced.model to the fico.model

# Print the fit statistics of the two models side by side (project helper).
# These are in-sample statistics; if the two models were fitted on different
# row sets they are not strictly comparable.
# NOTE(review): in the echoed "Coeffcient Statistics" the reduced.model
# figures shown for FICO.numeric (|t| = 1.208, p = 0.2274) equal the
# Debt.To.Income.Ratio row of the reduced.model summary, not its FICO.numeric
# row (t = -25.62). Presumably lm_compare pairs coefficients by position
# rather than by name -- verify before relying on that part of the output.
lm_compare(fico.model, reduced.model)
## Residual Standard Error 
##       fico.model          0.02868 
##       reduced.model       0.0186 
##       Decreased:      -0.01008 
##               reduced.model preferred 
## Adjusted R^2 
##       fico.model          0.5289 
##       reduced.model       0.8036 
##       Increased:      0.2747 
##               reduced.model preferred 
## F Statistic 
##       fico.model          1403 
##       reduced.model       142.8 
##       Decreased:      -1260 
##               fico.model preferred 
## F Statistic p-value 
##       fico.model          0 
##       reduced.model       0 
##       Unchanged:      0 
## 
## Coeffcient Statistics 
##   FICO.numeric  abs(t stat) 
##       fico.model          23.3 
##       reduced.model       1.208 
##       Decreased:      -22.09 
##               fico.model preferred 
##   FICO.numeric  t stat p-value 
##       fico.model          8.718e-109 
##       reduced.model       0.2274 
##       Increased:      0.2274

Compare the Mean Squared Error of both models on the test dataset

# Test-set MSE of the elaborate model: predict every row of the full data
# set, keep only the rows that were NOT sampled into `train`, and average
# the squared prediction errors. The bare name on the last line echoes it.
MSE.reduced = mean((loansData.complete$Interest.Rate[-train] -
    predict(reduced.model, loansData.complete)[-train])^2)
MSE.reduced
## [1] 0.0003851

# Test-set MSE of the simple quadratic FICO model, computed the same way:
# squared prediction error averaged over the rows outside `train`.
MSE.fico = mean((loansData.complete$Interest.Rate[-train] -
    predict(fico.model, loansData.complete)[-train])^2)
MSE.fico
## [1] 0.0007985

# Report whether the elaborate model beats the quadratic FICO model on the
# held-out rows, and by what percentage its test MSE is lower.
cat(
    "\n\nThe big reduced model produces a lower MSE than the simple quadratic FICO model ",
    MSE.reduced < MSE.fico,
    "\n",
    (MSE.fico - MSE.reduced)/MSE.fico * 100,
    "% lower"
)
## 
## 
## The big reduced model produces a lower MSE than the simple quadratic FICO model  TRUE 
##  51.77 % lower

Plot the simple quadratic and elaborate multilinear regression models

# thanks, as always, to Winston Chang, his book, page 94 and his website
# http://www.cookbook-r.com/Graphs/

# Scatter data for the elaborate-model figure: every second row of the full
# data set. Note this is an every-other-row thinning, not the train/test
# split used for fitting.
plot.data.frame = loansData.complete[seq(1, nrow(loansData.complete), by = 2), 
    ]

# Curve data for the elaborate model: the training-set fitted values, sorted
# from highest to lowest, are spread evenly over the FICO.numeric range 1..40.
# The grid length is taken from the model itself so that the two columns
# always line up; the previous hard-coded step (by = 0.03125) only worked
# because it happened to produce exactly as many points as there are fitted
# values, and would break for any other training-set size.
line.data.frame = data.frame(FICO.numeric = seq(1, 40, 
    length.out = length(fitted(reduced.model))))
line.data.frame$prediction = sort(fitted(reduced.model), decreasing = TRUE)

# Figure for the elaborate model: open-circle scatter of Interest.Rate against
# FICO.numeric (from plot.data.frame) with the sorted fitted values from
# line.data.frame drawn over it in red. The x axis is relabelled with every
# fourth FICO level and the y axis is formatted with percent labels.
elaborate.png = ggplot(data = plot.data.frame,
                       aes(x = FICO.numeric, y = Interest.Rate)) +
    geom_point(shape = 1) +
    geom_line(data = line.data.frame,
              aes(x = FICO.numeric, y = prediction),
              color = "red") +
    scale_x_discrete(name = "FICO Score",
                     breaks = seq(2, 43, by = 4),
                     labels = FICO.levels[seq(2, 43, by = 4)]) +
    scale_y_continuous(name = "Interest Rate on a Loan (%)",
                       labels = percent) +
    ggtitle("Many Factors Affect the Interest Rate you pay\nElaborate Model: 14 factors in a Multiple Linear Regression") +
    theme(plot.title = element_text(face = "bold"))


# Figure for the simple model: open-circle scatter of Interest.Rate against
# FICO.numeric over the full data set, with a red second-degree polynomial
# lm fit (no confidence band) drawn by geom_smooth. Axis labelling matches
# the elaborate-model figure.
simple.png = ggplot(data = loansData.complete,
                    aes(x = FICO.numeric, y = Interest.Rate)) +
    geom_point(shape = 1) +
    geom_smooth(method = lm,
                formula = y ~ poly(x, 2),
                se = FALSE,
                colour = "red") +
    scale_x_discrete(name = "FICO Score",
                     breaks = seq(2, 43, by = 4),
                     labels = FICO.levels[seq(2, 43, by = 4)]) +
    scale_y_continuous(name = "Interest Rate on a Loan (%)",
                       labels = percent) +
    ggtitle("How Your FICO Score affects the Interest Rate you pay\nSimple Model: only FICO Score considered") +
    theme(plot.title = element_text(face = "bold"))

# display the simple-model figure on its own (auto-printed ggplot object)
simple.png

plot of chunk plot the two models

# display the elaborate-model figure on its own (auto-printed ggplot object)
elaborate.png

plot of chunk plot the two models

# draw both figures together on one page; multiplot() is a helper defined
# outside this section
multiplot(simple.png, elaborate.png)

plot of chunk plot the two models


create the *.png and *.pdf versions of the graphs


# Write the combined two-panel figure to disk, once as PNG and once as PDF.
# multiplot() draws on the active graphics device by itself and returns NULL,
# so it is called directly; wrapping it in print() only echoed a stray "NULL"
# into the report. dev.off() closes each file device so the file is flushed.
png("~/R/Data Analysis/Data Analysis Project 1/project_figures/finalfigure.png")
multiplot(simple.png, elaborate.png)
dev.off()
## pdf 
##   2
pdf("~/R/Data Analysis/Data Analysis Project 1/project_figures/finalfigure.pdf")
multiplot(simple.png, elaborate.png)
dev.off()
## pdf 
##   2

Try the step() function

# Automated alternative to the hand-winnowed reduced.model: let step() pick
# the predictors by AIC, starting from model.base. trace = FALSE suppresses
# the step-by-step printout.
model.base.stepped = step(model.base, trace = FALSE)
# the result of step
summary(model.base.stepped)
## 
## Call:
## lm(formula = loansData.complete$Interest.Rate ~ Open.CREDIT.Lines + 
##     FICO.numeric + FICO.numeric2 + Inquiries.in.the.Last.6.Months + 
##     D.36months + D.moving + D.other + D.small_business + D.CA + 
##     D.DC + D.IA + D.IL + D.IN + D.MI + D.MN + D.NH + D.NY + D.PA + 
##     D.UT + D.9years + Loan.Amount + Monthly.Income.Recip, data = model.data.frame, 
##     subset = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06212 -0.01137 -0.00159  0.01068  0.08965 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.34e-01   3.99e-03   58.71  < 2e-16 ***
## Open.CREDIT.Lines              -2.61e-04   1.27e-04   -2.06   0.0399 *  
## FICO.numeric                   -9.02e-03   3.49e-04  -25.82  < 2e-16 ***
## FICO.numeric2                   1.28e-04   9.57e-06   13.33  < 2e-16 ***
## Inquiries.in.the.Last.6.Months  3.04e-03   4.54e-04    6.68  3.6e-11 ***
## D.36months                     -3.28e-02   1.45e-03  -22.58  < 2e-16 ***
## D.moving                        1.39e-02   4.71e-03    2.96   0.0032 ** 
## D.other                         1.17e-02   1.91e-03    6.16  9.9e-10 ***
## D.small_business                1.27e-02   2.81e-03    4.52  6.9e-06 ***
## D.CA                           -2.83e-03   1.43e-03   -1.98   0.0476 *  
## D.DC                            2.00e-02   1.32e-02    1.51   0.1306    
## D.IA                           -3.60e-02   1.87e-02   -1.92   0.0548 .  
## D.IL                           -7.12e-03   2.73e-03   -2.61   0.0092 ** 
## D.IN                           -3.90e-02   1.87e-02   -2.08   0.0373 *  
## D.MI                           -5.67e-03   3.97e-03   -1.43   0.1534    
## D.MN                           -8.44e-03   4.15e-03   -2.04   0.0419 *  
## D.NH                           -1.82e-02   7.66e-03   -2.38   0.0174 *  
## D.NY                           -2.71e-03   1.81e-03   -1.50   0.1346    
## D.PA                           -4.77e-03   2.70e-03   -1.77   0.0774 .  
## D.UT                           -1.55e-02   8.37e-03   -1.86   0.0636 .  
## D.9years                       -5.08e-03   3.31e-03   -1.53   0.1251    
## Loan.Amount                     8.74e-07   4.33e-08   20.18  < 2e-16 ***
## Monthly.Income.Recip            1.40e+01   4.69e+00    2.98   0.0030 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0186 on 1226 degrees of freedom
## Multiple R-squared:  0.807,  Adjusted R-squared:  0.804 
## F-statistic:  233 on 22 and 1226 DF,  p-value: <2e-16
# list the stepped model's coefficients whose p-value exceeds sig.level
# (shown from highest to lowest p-value in the output)
lm_print.model.hi_p(model.base.stepped, sig.level = 0.05, ret = FALSE, echo = TRUE)
##           Estimate Std. Error t value Pr(>|t|)
## D.MI     -0.005675   0.003972  -1.429  0.15335
## D.NY     -0.002713   0.001812  -1.497  0.13456
## D.DC      0.019967   0.013199   1.513  0.13060
## D.9years -0.005080   0.003310  -1.535  0.12507
## D.PA     -0.004766   0.002696  -1.768  0.07736
## D.UT     -0.015543   0.008371  -1.857  0.06360
## D.IA     -0.035963   0.018711  -1.922  0.05484
# my result: the hand-winnowed reduced.model, shown for side-by-side
# comparison with the step() model above
summary(reduced.model)
## 
## Call:
## lm(formula = loansData.complete$Interest.Rate ~ ., data = reduced.model.data.frame, 
##     subset = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06211 -0.01144 -0.00163  0.01071  0.08928 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.34e-01   4.12e-03   56.75  < 2e-16 ***
## Debt.To.Income.Ratio            9.78e-03   8.10e-03    1.21   0.2274    
## Open.CREDIT.Lines              -3.22e-04   1.39e-04   -2.32   0.0204 *  
## FICO.numeric                   -9.00e-03   3.51e-04  -25.62  < 2e-16 ***
## FICO.numeric2                   1.27e-04   9.62e-06   13.25  < 2e-16 ***
## Inquiries.in.the.Last.6.Months  3.09e-03   4.58e-04    6.75  2.3e-11 ***
## D.36months                     -3.26e-02   1.47e-03  -22.23  < 2e-16 ***
## D.major_purchase                3.56e-03   2.73e-03    1.30   0.1938    
## D.moving                        1.36e-02   4.74e-03    2.87   0.0042 ** 
## D.other                         1.19e-02   1.93e-03    6.19  8.1e-10 ***
## D.small_business                1.31e-02   2.83e-03    4.64  3.9e-06 ***
## D.CA                           -3.55e-03   1.47e-03   -2.41   0.0161 *  
## D.CO                           -4.65e-03   3.39e-03   -1.37   0.1707    
## D.DC                            1.89e-02   1.33e-02    1.42   0.1554    
## D.DE                           -4.84e-03   9.37e-03   -0.52   0.6058    
## D.IA                           -3.61e-02   1.87e-02   -1.93   0.0539 .  
## D.IL                           -7.91e-03   2.75e-03   -2.87   0.0041 ** 
## D.IN                           -3.94e-02   1.87e-02   -2.11   0.0355 *  
## D.KY                           -4.46e-03   4.62e-03   -0.97   0.3345    
## D.LA                           -4.87e-03   6.66e-03   -0.73   0.4650    
## D.MI                           -6.36e-03   3.99e-03   -1.59   0.1112    
## D.MN                           -9.10e-03   4.16e-03   -2.19   0.0290 *  
## D.MO                           -5.02e-03   4.14e-03   -1.21   0.2254    
## D.MS                            2.27e-02   1.88e-02    1.21   0.2274    
## D.MT                           -9.70e-03   9.39e-03   -1.03   0.3020    
## D.NH                           -1.83e-02   7.69e-03   -2.38   0.0173 *  
## D.NY                           -3.49e-03   1.86e-03   -1.87   0.0611 .  
## D.OK                           -3.91e-03   5.06e-03   -0.77   0.4394    
## D.PA                           -5.74e-03   2.72e-03   -2.11   0.0352 *  
## D.SD                           -1.57e-02   1.32e-02   -1.19   0.2353    
## D.UT                           -1.54e-02   8.39e-03   -1.84   0.0660 .  
## D.MORTGAGE                     -9.00e-04   1.17e-03   -0.77   0.4426    
## D.6years                        1.00e-03   2.18e-03    0.46   0.6451    
## D.8years                        8.93e-04   2.54e-03    0.35   0.7255    
## D.9years                       -4.98e-03   3.33e-03   -1.50   0.1344    
## Loan.Amount                     8.88e-07   4.44e-08   20.01  < 2e-16 ***
## Monthly.Income.Recip            1.31e+01   4.89e+00    2.68   0.0076 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0186 on 1212 degrees of freedom
## Multiple R-squared:  0.809,  Adjusted R-squared:  0.804 
## F-statistic:  143 on 36 and 1212 DF,  p-value: <2e-16
# list the reduced model's coefficients whose p-value exceeds sig.level
# (shown from highest to lowest p-value in the output)
lm_print.model.hi_p(reduced.model, sig.level = 0.05, ret = FALSE, echo = TRUE)
##                        Estimate Std. Error t value Pr(>|t|)
## D.8years              0.0008935   0.002544  0.3512  0.72551
## D.6years              0.0010025   0.002176  0.4607  0.64509
## D.DE                 -0.0048361   0.009367 -0.5163  0.60576
## D.LA                 -0.0048660   0.006658 -0.7309  0.46500
## D.MORTGAGE           -0.0009001   0.001172 -0.7681  0.44256
## D.OK                 -0.0039123   0.005058 -0.7735  0.43938
## D.KY                 -0.0044604   0.004620 -0.9654  0.33453
## D.MT                 -0.0096967   0.009391 -1.0325  0.30202
## D.SD                 -0.0157310   0.013249 -1.1873  0.23534
## Debt.To.Income.Ratio  0.0097812   0.008099  1.2077  0.22739
## D.MS                  0.0226581   0.018760  1.2078  0.22737
## D.MO                 -0.0050229   0.004141 -1.2130  0.22536
## D.major_purchase      0.0035558   0.002735  1.3002  0.19378
## D.CO                 -0.0046504   0.003393 -1.3707  0.17071
## D.DC                  0.0188610   0.013269  1.4215  0.15544
## D.9years             -0.0049824   0.003326 -1.4979  0.13441
## D.MI                 -0.0063573   0.003988 -1.5942  0.11116
## D.UT                 -0.0154389   0.008389 -1.8403  0.06597
## D.NY                 -0.0034922   0.001863 -1.8742  0.06114
## D.IA                 -0.0361494   0.018736 -1.9295  0.05391
# Side-by-side fit statistics for the step() model vs the hand-built
# reduced model.
# NOTE(review): the "Coeffcient Statistics" section appears to pair
# coefficients by position, not by name: the reduced.model values reported
# under Open.CREDIT.Lines (|t| = 1.208, p = 0.2274) are those of
# Debt.To.Income.Ratio, while Open.CREDIT.Lines in summary(reduced.model)
# has t = -2.32, p = 0.0204. Confirm against lm_compare's source.
lm_compare(model.base.stepped, reduced.model)
## Residual Standard Error 
##       model.base.stepped          0.01859 
##       reduced.model       0.0186 
##       Increased:      6.904e-06 
##               model.base.stepped preferred 
## Adjusted R^2 
##       model.base.stepped          0.8037 
##       reduced.model       0.8036 
##       Decreased:      -0.0001458 
##               model.base.stepped preferred 
## F Statistic 
##       model.base.stepped          233.3 
##       reduced.model       142.8 
##       Decreased:      -90.46 
##               model.base.stepped preferred 
## F Statistic p-value 
##       model.base.stepped          0 
##       reduced.model       0 
##       Unchanged:      0 
## 
## Coeffcient Statistics 
##   Open.CREDIT.Lines  abs(t stat) 
##       model.base.stepped          2.057 
##       reduced.model       1.208 
##       Decreased:      -0.849 
##               model.base.stepped preferred 
##   Open.CREDIT.Lines  t stat p-value 
##       model.base.stepped          0.03993 
##       reduced.model       0.2274 
##       Increased:      0.1875

Compare the stepped model and the reduced model on the test data

# Test-set MSE of the hand-built reduced model (same calculation as earlier
# in the report, repeated here next to the stepped model's figure).
MSE.reduced = mean((loansData.complete$Interest.Rate[-train] -
    predict(reduced.model, loansData.complete)[-train])^2)
MSE.reduced
## [1] 0.0003851

# Test-set MSE of the step() model: squared prediction error averaged over
# the rows that were not sampled into `train`.
MSE.stepped = mean((loansData.complete$Interest.Rate[-train] -
    predict(model.base.stepped, loansData.complete)[-train])^2)
MSE.stepped
## [1] 0.0003829

Check the regression assumptions for the step() model

# Run the assumption diagnostics (F test, high p-values, Cook's distance,
# Breusch-Pagan, autocorrelation, multicollinearity, residual summary and
# plots) on the stepped model.
# NOTE(review): the multicollinearity section prints values above 10 for
# FICO.numeric and FICO.numeric2 and then reports "generated an error"; the
# high values are expected for a variable and its square, but the error
# message looks spurious -- check the helper.
# NOTE(review): the "leverage one" warnings (observations 71, 644) and the two
# NA's in the residual summary are presumably rows that are the only training
# observation for a state dummy (D.IA and D.IN have std. errors of about the
# residual standard error) -- confirm.
lm_assumptions_summary(model.base.stepped)
## 
## 
## NOTE: in addition to this analysis, look at scatter.smooth plots of the residuals vs the main variables individually to see if quadratic transforms may be required
## 
## 
## Call:
## lm(formula = loansData.complete$Interest.Rate ~ Open.CREDIT.Lines + 
##     FICO.numeric + FICO.numeric2 + Inquiries.in.the.Last.6.Months + 
##     D.36months + D.moving + D.other + D.small_business + D.CA + 
##     D.DC + D.IA + D.IL + D.IN + D.MI + D.MN + D.NH + D.NY + D.PA + 
##     D.UT + D.9years + Loan.Amount + Monthly.Income.Recip, data = model.data.frame, 
##     subset = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06212 -0.01137 -0.00159  0.01068  0.08965 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.34e-01   3.99e-03   58.71  < 2e-16 ***
## Open.CREDIT.Lines              -2.61e-04   1.27e-04   -2.06   0.0399 *  
## FICO.numeric                   -9.02e-03   3.49e-04  -25.82  < 2e-16 ***
## FICO.numeric2                   1.28e-04   9.57e-06   13.33  < 2e-16 ***
## Inquiries.in.the.Last.6.Months  3.04e-03   4.54e-04    6.68  3.6e-11 ***
## D.36months                     -3.28e-02   1.45e-03  -22.58  < 2e-16 ***
## D.moving                        1.39e-02   4.71e-03    2.96   0.0032 ** 
## D.other                         1.17e-02   1.91e-03    6.16  9.9e-10 ***
## D.small_business                1.27e-02   2.81e-03    4.52  6.9e-06 ***
## D.CA                           -2.83e-03   1.43e-03   -1.98   0.0476 *  
## D.DC                            2.00e-02   1.32e-02    1.51   0.1306    
## D.IA                           -3.60e-02   1.87e-02   -1.92   0.0548 .  
## D.IL                           -7.12e-03   2.73e-03   -2.61   0.0092 ** 
## D.IN                           -3.90e-02   1.87e-02   -2.08   0.0373 *  
## D.MI                           -5.67e-03   3.97e-03   -1.43   0.1534    
## D.MN                           -8.44e-03   4.15e-03   -2.04   0.0419 *  
## D.NH                           -1.82e-02   7.66e-03   -2.38   0.0174 *  
## D.NY                           -2.71e-03   1.81e-03   -1.50   0.1346    
## D.PA                           -4.77e-03   2.70e-03   -1.77   0.0774 .  
## D.UT                           -1.55e-02   8.37e-03   -1.86   0.0636 .  
## D.9years                       -5.08e-03   3.31e-03   -1.53   0.1251    
## Loan.Amount                     8.74e-07   4.33e-08   20.18  < 2e-16 ***
## Monthly.Income.Recip            1.40e+01   4.69e+00    2.98   0.0030 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0186 on 1226 degrees of freedom
## Multiple R-squared:  0.807,  Adjusted R-squared:  0.804 
## F-statistic:  233 on 22 and 1226 DF,  p-value: <2e-16
## 
## 
## ----------F, F p, Adj R^2------------------------------
##   F_statistic F_statistic_p   adjusted_R2 
##      233.2823        0.0000        0.8037 
## [1] "F statistic p-value <= 0.05 indicates at least one predictor is predictive"
## 
## ---------p-values > 0.05------------------------------
## [1] "Below are listed, in descending order, the individual p-values > 0.05"
##           Estimate Std. Error t value Pr(>|t|)
## D.MI     -0.005675   0.003972  -1.429  0.15335
## D.NY     -0.002713   0.001812  -1.497  0.13456
## D.DC      0.019967   0.013199   1.513  0.13060
## D.9years -0.005080   0.003310  -1.535  0.12507
## D.PA     -0.004766   0.002696  -1.768  0.07736
## D.UT     -0.015543   0.008371  -1.857  0.06360
## D.IA     -0.035963   0.018711  -1.922  0.05484
## 
## ---------Cook's Distance------------------------------
## [1] "Cook's Distances less than 0.5 indicate no outlying Y's or Leveraged X's"
## 
## ---------Heteroskedasticity-----------------------
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 58.47, df = 22, p-value = 3.752e-05
## 
## [1] "Breusch-Pagan test indicates possible Heteroskedasticity"
## 
## --------Autocorrelation--------------------------
## [1] "Autocorrelation not indicated"
## 
## -------Multicollinearity if GT 10---------------
##  FICO.numeric FICO.numeric2 
##         20.29         20.24 
## [1] "Multicollinearity test generated an error"
## 
## --------Mean Zero?-------------------------------
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -3.400  -0.617  -0.098   0.000   0.579   4.900       2
## Warning: not plotting observations with leverage one:
##   71, 644

plot of chunk stepped assumptions

## Warning: not plotting observations with leverage one:
##   71, 644

plot of chunk stepped assumptions


Info about the system running this code

# Show the platform description. str() prints the structure itself and
# returns NULL invisibly, so it is called directly; wrapping it in print()
# only appended a stray "NULL" to the output.
str(.Platform)
## List of 8
##  $ OS.type   : chr "windows"
##  $ file.sep  : chr "/"
##  $ dynlib.ext: chr ".dll"
##  $ GUI       : chr "RTerm"
##  $ endian    : chr "little"
##  $ pkgType   : chr "win.binary"
##  $ path.sep  : chr ";"
##  $ r_arch    : chr "x64"
# R version and build details of the interpreter that produced this report
print(version)
##                _                           
## platform       x86_64-w64-mingw32          
## arch           x86_64                      
## os             mingw32                     
## system         x86_64, mingw32             
## status                                     
## major          3                           
## minor          0.2                         
## year           2013                        
## month          09                          
## day            25                          
## svn rev        63987                       
## language       R                           
## version.string R version 3.0.2 (2013-09-25)
## nickname       Frisbee Sailing
# attached and loaded package versions, for reproducibility; locale = FALSE
# leaves the locale settings out of the printout
print(sessionInfo(), locale = FALSE)
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## 
## attached base packages:
## [1] splines   grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] HH_2.3-42           multcomp_1.3-0      survival_2.37-4    
##  [4] mvtnorm_0.9-9996    latticeExtra_0.6-26 RColorBrewer_1.0-5 
##  [7] lattice_0.20-24     randomizeBE_0.3-1   lmtest_0.9-32      
## [10] zoo_1.7-10          scales_0.2.3        ggplot2_0.9.3.1    
## [13] knitr_1.5          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.2-4 dichromat_2.0-0  digest_0.6.3     evaluate_0.5.1  
##  [5] formatR_0.10     gtable_0.1.2     labeling_0.2     leaps_2.9       
##  [9] MASS_7.3-29      munsell_0.4.2    plyr_1.8         proto_0.3-10    
## [13] reshape_0.8.4    reshape2_1.2.2   sandwich_2.3-0   stringr_0.6.2   
## [17] tools_3.0.2      vcd_1.3-1
# timestamp of this run
print(Sys.time())
## [1] "2013-11-14 16:17:30 EST"