Data Analysis MOOC- Project1

Below is the R code and comments of data analysis project-1

library(gclus)
## Loading required package: cluster
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following object(s) are masked from 'package:ggplot2':
## 
##     %+%

dt <- read.csv("./loansData.csv")
a <- strsplit(as.character(dt$FICO.Range), "-")  #Splitting FICO RANGE STRING
j <- do.call("rbind", a)  # CONVERTS list created above to a dataframe
colnames(j) <- c("F1", "F2")  # Renaming columns of dataframe j
p <- cbind(dt, j)  # Making one data frame with all data
p[!complete.cases(p), ]  # This will give list of rows where missing data is present
##        Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 101596             5000                       4525         7.43%
## 101515             3500                        225        10.28%
##        Loan.Length Loan.Purpose Debt.To.Income.Ratio State Home.Ownership
## 101596   36 months        other                   1%    NY           NONE
## 101515   36 months        other                  10%    NY           RENT
##        Monthly.Income FICO.Range Open.CREDIT.Lines
## 101596             NA    800-804                NA
## 101515          15000    685-689                NA
##        Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 101596                       NA                             NA
## 101515                       NA                             NA
##        Employment.Length  F1  F2
## 101596          < 1 year 800 804
## 101515          < 1 year 685 689
# There are two rows in data framw with NAs
p <- p[complete.cases(p), ]  # Data frame with complete set of values

Below is to find numerical values of Interest Rate

m <- sub("%", "", p$Interest.Rate)  #repalces % in Interest Rate  and adds to dataframe 'm'
m <- as.numeric(m)  #converts string to numeric
q <- cbind(p, m)  # makes one data frame with interest rate added to original
head(q)
##       Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 81174            20000                      20000         8.90%
## 99592            19200                      19200        12.12%
## 80059            35000                      35000        21.98%
## 15825            10000                       9975         9.99%
## 33182            12000                      12000        11.71%
## 62403             6000                       6000        15.31%
##       Loan.Length       Loan.Purpose Debt.To.Income.Ratio State
## 81174   36 months debt_consolidation               14.90%    SC
## 99592   36 months debt_consolidation               28.36%    TX
## 80059   60 months debt_consolidation               23.81%    CA
## 15825   36 months debt_consolidation               14.30%    KS
## 33182   36 months        credit_card               18.78%    NJ
## 62403   36 months              other               20.05%    CT
##       Home.Ownership Monthly.Income FICO.Range Open.CREDIT.Lines
## 81174       MORTGAGE           6542    735-739                14
## 99592       MORTGAGE           4583    715-719                12
## 80059       MORTGAGE          11500    690-694                14
## 15825       MORTGAGE           3833    695-699                10
## 33182           RENT           3195    695-699                11
## 62403            OWN           4892    670-674                17
##       Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 81174                    14272                              2
## 99592                    11140                              1
## 80059                    21977                              1
## 15825                     9346                              0
## 33182                    14469                              0
## 62403                    10391                              2
##       Employment.Length  F1  F2     m
## 81174          < 1 year 735 739  8.90
## 99592           2 years 715 719 12.12
## 80059           2 years 690 694 21.98
## 15825           5 years 695 699  9.99
## 33182           9 years 695 699 11.71
## 62403           3 years 670 674 15.31
names(q)[names(q) == "m"] <- "Interest"  # Colname is updated with Interest

Below is to categorize Debt to Income Ratio

q$n <- sub("%", "", q$Debt.To.Income.Ratio)  #Replaces %in debt to income ratio
names(q)[names(q) == "n"] <- "DI_Ratio"
q$DI_Ratio <- as.numeric(q$DI_Ratio)
q$Category[q$DI_Ratio < 10] <- "A"  # categorizing DI ratio as A
q$Category[q$DI_Ratio >= 10 & q$DI_Ratio < 15] <- "B"  # categorizing DI ratio as B
q$Category[q$DI_Ratio >= 15 & q$DI_Ratio < 20] <- "C"  # categorizing DI ratio as C
q$Category[q$DI_Ratio >= 20] <- "D"  # categorizing DI ratio as D

Below is to find numerical vlaues of loadn length

q$LL <- sub(" months", "", q$Loan.Length)
names(q)[names(q) == "LL"] <- "LLen"
q$LLen <- as.numeric(q$LLen)

lapply(q, class)  # To understand type of each column
## $Amount.Requested
## [1] "integer"
## 
## $Amount.Funded.By.Investors
## [1] "numeric"
## 
## $Interest.Rate
## [1] "factor"
## 
## $Loan.Length
## [1] "factor"
## 
## $Loan.Purpose
## [1] "factor"
## 
## $Debt.To.Income.Ratio
## [1] "factor"
## 
## $State
## [1] "factor"
## 
## $Home.Ownership
## [1] "factor"
## 
## $Monthly.Income
## [1] "numeric"
## 
## $FICO.Range
## [1] "factor"
## 
## $Open.CREDIT.Lines
## [1] "integer"
## 
## $Revolving.CREDIT.Balance
## [1] "integer"
## 
## $Inquiries.in.the.Last.6.Months
## [1] "integer"
## 
## $Employment.Length
## [1] "factor"
## 
## $F1
## [1] "factor"
## 
## $F2
## [1] "factor"
## 
## $Interest
## [1] "numeric"
## 
## $DI_Ratio
## [1] "numeric"
## 
## $Category
## [1] "character"
## 
## $LLen
## [1] "numeric"

Graphs to understand data

attach(q)
boxplot(Interest ~ F1, data = q)

plot of chunk unnamed-chunk-4

plot(q$Amount.Requested, q$Interes)

plot of chunk unnamed-chunk-4

plot(q$F2, q$Interest)

plot of chunk unnamed-chunk-4

plot(q$Home.Ownership, q$Interest)

plot of chunk unnamed-chunk-4

plot(q$Inquiries.in.the.Last.6.Months, q$Interest)

plot of chunk unnamed-chunk-4

plot(q$Loan.Length, q$Interest)  # Higher the loan length more the interest

plot of chunk unnamed-chunk-4

plot(q$Open.CREDIT.Lines, q$Interest)

plot of chunk unnamed-chunk-4

plot(q$Employment.Length, q$Interest)

plot of chunk unnamed-chunk-4

a <- subset(q, q$Revolving.CREDIT.Balance < 50000)
plot(a$Revolving.CREDIT.Balance, a$Interest)  # lower the credit balance, lower the interest

plot of chunk unnamed-chunk-4

plot(q$Loan.Purpose, q$Interest)

plot of chunk unnamed-chunk-4

Graphs to understand impact of Debt_to_income_ratio

A <- subset(q, q$Category == "A")
B <- subset(q, q$Category == "B")
C <- subset(q, q$Category == "C")
D <- subset(q, q$Category == "D")
plot(D$Interest)  # plot of DI ratio >=20

plot of chunk unnamed-chunk-5

plot(C$Interest)  # plot of DI ratio <20 & >=15

plot of chunk unnamed-chunk-5

plot(B$Interest)  # plot of DI ratio <15 & >=10

plot of chunk unnamed-chunk-5

Correlation analysis to find out the relation between interest and other variables

cdf <- q[, c(1, 9, 11, 12, 13, 16, 17, 18, 20)]  # a new dataframe with variables of interest
cdf$F2 <- as.numeric(cdf$F2)  # Conversting FICO range upper value to a numeric to conduct cor

Summary statistics of cdf data frame with all numeric variables

descirbe(cdf)
## Error: could not find function "descirbe"

Correlation test of above is conducted to test null hypothesis that there is no significant correlation between Interest and FICO upper range.

cor.test(cdf$Interest, cdf$F2)
## 
##  Pearson's product-moment correlation
## 
## data:  cdf$Interest and cdf$F2 
## t = -50.27, df = 2496, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0 
## 95 percent confidence interval:
##  -0.7282 -0.6892 
## sample estimates:
##     cor 
## -0.7093

p-value of less than 0.05 suggests we should reject above null hypothesis and that there is significant correlation and cor.test gives 95% confidence interevals of correlation. Further correlation analysis is done to identify remaining variables.

Correlation matrix of cdf dataframe

cor(cdf)  # Generates coorelation matrix
##                                Amount.Requested Monthly.Income
## Amount.Requested                        1.00000        0.39118
## Monthly.Income                          0.39118        1.00000
## Open.CREDIT.Lines                       0.19594        0.17140
## Revolving.CREDIT.Balance                0.29337        0.35968
## Inquiries.in.the.Last.6.Months         -0.02956        0.03395
## F2                                      0.08334        0.12277
## Interest                                0.33183        0.01292
## DI_Ratio                                0.08129       -0.16234
## LLen                                    0.41230        0.07454
##                                Open.CREDIT.Lines Revolving.CREDIT.Balance
## Amount.Requested                         0.19594                 0.293365
## Monthly.Income                           0.17140                 0.359684
## Open.CREDIT.Lines                        1.00000                 0.290085
## Revolving.CREDIT.Balance                 0.29009                 1.000000
## Inquiries.in.the.Last.6.Months           0.11074                 0.012186
## F2                                      -0.08950                 0.002735
## Interest                                 0.09031                 0.061109
## DI_Ratio                                 0.37085                 0.189221
## LLen                                     0.04089                 0.055436
##                                Inquiries.in.the.Last.6.Months        F2
## Amount.Requested                                     -0.02956  0.083338
## Monthly.Income                                        0.03395  0.122766
## Open.CREDIT.Lines                                     0.11074 -0.089497
## Revolving.CREDIT.Balance                              0.01219  0.002735
## Inquiries.in.the.Last.6.Months                        1.00000 -0.092142
## F2                                                   -0.09214  1.000000
## Interest                                              0.16465 -0.709283
## DI_Ratio                                              0.01198 -0.216921
## LLen                                                  0.02384  0.012736
##                                Interest DI_Ratio    LLen
## Amount.Requested                0.33183  0.08129 0.41230
## Monthly.Income                  0.01292 -0.16234 0.07454
## Open.CREDIT.Lines               0.09031  0.37085 0.04089
## Revolving.CREDIT.Balance        0.06111  0.18922 0.05544
## Inquiries.in.the.Last.6.Months  0.16465  0.01198 0.02384
## F2                             -0.70928 -0.21692 0.01274
## Interest                        1.00000  0.17220 0.42351
## DI_Ratio                        0.17220  1.00000 0.02499
## LLen                            0.42351  0.02499 1.00000
round(cor(cdf), 2)  # Round to 2 decimal places
##                                Amount.Requested Monthly.Income
## Amount.Requested                           1.00           0.39
## Monthly.Income                             0.39           1.00
## Open.CREDIT.Lines                          0.20           0.17
## Revolving.CREDIT.Balance                   0.29           0.36
## Inquiries.in.the.Last.6.Months            -0.03           0.03
## F2                                         0.08           0.12
## Interest                                   0.33           0.01
## DI_Ratio                                   0.08          -0.16
## LLen                                       0.41           0.07
##                                Open.CREDIT.Lines Revolving.CREDIT.Balance
## Amount.Requested                            0.20                     0.29
## Monthly.Income                              0.17                     0.36
## Open.CREDIT.Lines                           1.00                     0.29
## Revolving.CREDIT.Balance                    0.29                     1.00
## Inquiries.in.the.Last.6.Months              0.11                     0.01
## F2                                         -0.09                     0.00
## Interest                                    0.09                     0.06
## DI_Ratio                                    0.37                     0.19
## LLen                                        0.04                     0.06
##                                Inquiries.in.the.Last.6.Months    F2
## Amount.Requested                                        -0.03  0.08
## Monthly.Income                                           0.03  0.12
## Open.CREDIT.Lines                                        0.11 -0.09
## Revolving.CREDIT.Balance                                 0.01  0.00
## Inquiries.in.the.Last.6.Months                           1.00 -0.09
## F2                                                      -0.09  1.00
## Interest                                                 0.16 -0.71
## DI_Ratio                                                 0.01 -0.22
## LLen                                                     0.02  0.01
##                                Interest DI_Ratio LLen
## Amount.Requested                   0.33     0.08 0.41
## Monthly.Income                     0.01    -0.16 0.07
## Open.CREDIT.Lines                  0.09     0.37 0.04
## Revolving.CREDIT.Balance           0.06     0.19 0.06
## Inquiries.in.the.Last.6.Months     0.16     0.01 0.02
## F2                                -0.71    -0.22 0.01
## Interest                           1.00     0.17 0.42
## DI_Ratio                           0.17     1.00 0.02
## LLen                               0.42     0.02 1.00

Above results suggest that there is high corection between Interest Rate and a. Amount.requested (0.3318) b. Inquiries.in.the.Last.6.Months (0.1646) c. DI_Ratio (0.172) d. LLen (0.423) e FICO vlaue (-0.7092) f.Open.CREDIT.Lines ( 0.09030695)

Below plot will graphically illustrate the impact of correlation

cdr <- abs(cor(cdf))
cd.col <- dmat.color(cdr)
cd.or <- order.single(cdr)
cpairs(cdf, cd.or, panel.colors = cd.col, gap = 0.5, main = "Correlation")

plot of chunk unnamed-chunk-10

Fitting regression equation with above varaibles ** UNSTANDARDIZED Regression**

summary(lm(cdf$Interest ~ cdf$F2))
## 
## Call:
## lm(formula = cdf$Interest ~ cdf$F2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.990 -2.136 -0.456  1.835 10.194 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 19.07189    0.13314   143.2   <2e-16 ***
## cdf$F2      -0.42350    0.00842   -50.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.95 on 2496 degrees of freedom
## Multiple R-squared: 0.503,   Adjusted R-squared: 0.503 
## F-statistic: 2.53e+03 on 1 and 2496 DF,  p-value: <2e-16
confint(lm(cdf$Interest ~ cdf$F2))
##             2.5 % 97.5 %
## (Intercept) 18.81 19.333
## cdf$F2      -0.44 -0.407

ggplot(cdf, aes(x = F2, y = Interest)) + geom_smooth(method = "lm") + geom_point()

plot of chunk unnamed-chunk-11

Summary shows that FICO values explaines 50% of varaiation for Interest. Because R square value si 0.50 To explain the remainaing variation, multiple regression with variables identified after correlation analysis is done ** UNSTANDARDIZED Regression**

Rg1 <- lm(cdf$Interest ~ cdf$Inquiries.in.the.Last.6.Months + cdf$DI_Ratio + 
    cdf$LLen + cdf$F2 + cdf$Open.CREDIT.Lines + cdf$Amount.Requested)
summary(Rg1)
## 
## Call:
## lm(formula = cdf$Interest ~ cdf$Inquiries.in.the.Last.6.Months + 
##     cdf$DI_Ratio + cdf$LLen + cdf$F2 + cdf$Open.CREDIT.Lines + 
##     cdf$Amount.Requested)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.570 -1.381 -0.168  1.205  9.982 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                         1.21e+01   2.31e-01   52.24  < 2e-16
## cdf$Inquiries.in.the.Last.6.Months  3.52e-01   3.39e-02   10.38  < 2e-16
## cdf$DI_Ratio                        1.23e-03   6.05e-03    0.20     0.84
## cdf$LLen                            1.34e-01   4.57e-03   29.41  < 2e-16
## cdf$F2                             -4.36e-01   6.10e-03  -71.61  < 2e-16
## cdf$Open.CREDIT.Lines              -5.03e-02   1.01e-02   -4.98  6.7e-07
## cdf$Amount.Requested                1.47e-04   5.96e-06   24.67  < 2e-16
##                                       
## (Intercept)                        ***
## cdf$Inquiries.in.the.Last.6.Months ***
## cdf$DI_Ratio                          
## cdf$LLen                           ***
## cdf$F2                             ***
## cdf$Open.CREDIT.Lines              ***
## cdf$Amount.Requested               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.06 on 2491 degrees of freedom
## Multiple R-squared: 0.757,   Adjusted R-squared: 0.757 
## F-statistic: 1.3e+03 on 6 and 2491 DF,  p-value: <2e-16
confint(Rg1)
##                                         2.5 %     97.5 %
## (Intercept)                        11.6054002 12.5106546
## cdf$Inquiries.in.the.Last.6.Months  0.2857053  0.4187286
## cdf$DI_Ratio                       -0.0106278  0.0130914
## cdf$LLen                            0.1253267  0.1432347
## cdf$F2                             -0.4484503 -0.4245450
## cdf$Open.CREDIT.Lines              -0.0701221 -0.0305200
## cdf$Amount.Requested                0.0001354  0.0001588

Summary shows that R-squared value is 0.757 Other variables that affect interest rate are a. Amount.requested (0.3318) b. Inquiries.in.the.Last.6.Months (0.1646) c. DI_Ratio (0.172) d. LLen (0.423) e.#$FICO vlaue (-0.7092) f.Open.CREDIT.Lines ( 0.09030695)

Howeve, to check whether no.of open credit lines really affect interest rate, one more regressiona analysis without Open.credit.lines is done. ** UNSTANDARDIZED Regression**

Rg2 <- lm(cdf$Interest ~ cdf$Inquiries.in.the.Last.6.Months + cdf$DI_Ratio + 
    cdf$LLen + cdf$F2 + cdf$Amount.Requested)
summary(Rg2)
## 
## Call:
## lm(formula = cdf$Interest ~ cdf$Inquiries.in.the.Last.6.Months + 
##     cdf$DI_Ratio + cdf$LLen + cdf$F2 + cdf$Amount.Requested)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.518 -1.363 -0.146  1.229  9.946 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                         1.17e+01   2.23e-01   52.61   <2e-16
## cdf$Inquiries.in.the.Last.6.Months  3.32e-01   3.38e-02    9.80   <2e-16
## cdf$DI_Ratio                       -9.39e-03   5.69e-03   -1.65    0.099
## cdf$LLen                            1.35e-01   4.58e-03   29.55   <2e-16
## cdf$F2                             -4.36e-01   6.12e-03  -71.19   <2e-16
## cdf$Amount.Requested                1.41e-04   5.88e-06   24.05   <2e-16
##                                       
## (Intercept)                        ***
## cdf$Inquiries.in.the.Last.6.Months ***
## cdf$DI_Ratio                       .  
## cdf$LLen                           ***
## cdf$F2                             ***
## cdf$Amount.Requested               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.07 on 2492 degrees of freedom
## Multiple R-squared: 0.755,   Adjusted R-squared: 0.755 
## F-statistic: 1.54e+03 on 5 and 2492 DF,  p-value: <2e-16
confint(Rg2)
##                                         2.5 %    97.5 %
## (Intercept)                        11.3093363 12.185073
## cdf$Inquiries.in.the.Last.6.Months  0.2652968  0.397959
## cdf$DI_Ratio                       -0.0205383  0.001766
## cdf$LLen                            0.1264324  0.144403
## cdf$F2                             -0.4479058 -0.423891
## cdf$Amount.Requested                0.0001299  0.000153

R-square value of 0.755 shows that Open.credit.lines does not signifcantly explain or account for interest rate, Hence variables that affect interest rate are a. Amount.requested (0.3318) b. Inquiries.in.the.Last.6.Months (0.1646) c. DI_Ratio (0.172) d. LLen (0.423) e.$FICO vlaue (-0.7092)

To visualize above model, solve the predicted scores as new variable and then plot


```r
cdf$predicted <- fitted(Rg2)
ggplot(cdf, aes(x = predicted, y = Interest)) + geom_smooth(method = "lm") + 
    geom_point()

plot of chunk unnamed-chunk-14

Further, residuals are computed to check the relationship between residuals and predicted values. Residuals should be orthogonal to predicted values.

cdf$er <- resid(Rg2)
ggplot(cdf, aes(x = predicted, y = er)) + geom_smooth(method = "lm") + geom_point()

plot of chunk unnamed-chunk-15

Muliple plots on single page are plotted here

# Split screens to 2 columns par(mfrow=c(1,2))
layout(t(1:2))
# Graph 1
cdr <- abs(cor(cdf))
cd.col <- dmat.color(cdr)
cd.or <- order.single(cdr)
cpairs(cdf, cd.or, panel.colors = cd.col, gap = 0.5, main = "Correlation")

plot of chunk unnamed-chunk-16

# Graph 2
cdf$predicted <- fitted(Rg2)
ggplot(cdf, aes(x = predicted, y = Interest)) + geom_smooth(method = "lm") + 
    geom_point() + opts(title = "Predicted Interest Rate vs  Actual Interest Rate")
## 'opts' is deprecated. Use 'theme' instead. (Deprecated; last used in version 0.9.1)
## Setting the plot title with opts(title="...") is deprecated.
##  Use labs(title="...") or ggtitle("...") instead. (Deprecated; last used in version 0.9.1)

plot of chunk unnamed-chunk-16