Below is the R code and comments of data analysis project-1
library(gclus)
## Loading required package: cluster
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
##
## The following object(s) are masked from 'package:ggplot2':
##
## %+%
dt <- read.csv("./loansData.csv")
a <- strsplit(as.character(dt$FICO.Range), "-") #Splitting FICO RANGE STRING
j <- do.call("rbind", a) # CONVERTS list created above to a dataframe
colnames(j) <- c("F1", "F2") # Renaming columns of dataframe j
p <- cbind(dt, j) # Making one data frame with all data
p[!complete.cases(p), ] # This will give list of rows where missing data is present
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 101596 5000 4525 7.43%
## 101515 3500 225 10.28%
## Loan.Length Loan.Purpose Debt.To.Income.Ratio State Home.Ownership
## 101596 36 months other 1% NY NONE
## 101515 36 months other 10% NY RENT
## Monthly.Income FICO.Range Open.CREDIT.Lines
## 101596 NA 800-804 NA
## 101515 15000 685-689 NA
## Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 101596 NA NA
## 101515 NA NA
## Employment.Length F1 F2
## 101596 < 1 year 800 804
## 101515 < 1 year 685 689
# There are two rows in data framw with NAs
p <- p[complete.cases(p), ] # Data frame with complete set of values
Below is to find numerical values of Interest Rate
m <- sub("%", "", p$Interest.Rate) #repalces % in Interest Rate and adds to dataframe 'm'
m <- as.numeric(m) #converts string to numeric
q <- cbind(p, m) # makes one data frame with interest rate added to original
head(q)
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 81174 20000 20000 8.90%
## 99592 19200 19200 12.12%
## 80059 35000 35000 21.98%
## 15825 10000 9975 9.99%
## 33182 12000 12000 11.71%
## 62403 6000 6000 15.31%
## Loan.Length Loan.Purpose Debt.To.Income.Ratio State
## 81174 36 months debt_consolidation 14.90% SC
## 99592 36 months debt_consolidation 28.36% TX
## 80059 60 months debt_consolidation 23.81% CA
## 15825 36 months debt_consolidation 14.30% KS
## 33182 36 months credit_card 18.78% NJ
## 62403 36 months other 20.05% CT
## Home.Ownership Monthly.Income FICO.Range Open.CREDIT.Lines
## 81174 MORTGAGE 6542 735-739 14
## 99592 MORTGAGE 4583 715-719 12
## 80059 MORTGAGE 11500 690-694 14
## 15825 MORTGAGE 3833 695-699 10
## 33182 RENT 3195 695-699 11
## 62403 OWN 4892 670-674 17
## Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 81174 14272 2
## 99592 11140 1
## 80059 21977 1
## 15825 9346 0
## 33182 14469 0
## 62403 10391 2
## Employment.Length F1 F2 m
## 81174 < 1 year 735 739 8.90
## 99592 2 years 715 719 12.12
## 80059 2 years 690 694 21.98
## 15825 5 years 695 699 9.99
## 33182 9 years 695 699 11.71
## 62403 3 years 670 674 15.31
names(q)[names(q) == "m"] <- "Interest" # Colname is updated with Interest
Below is to categorize Debt to Income Ratio
q$n <- sub("%", "", q$Debt.To.Income.Ratio) #Replaces %in debt to income ratio
names(q)[names(q) == "n"] <- "DI_Ratio"
q$DI_Ratio <- as.numeric(q$DI_Ratio)
q$Category[q$DI_Ratio < 10] <- "A" # categorizing DI ratio as A
q$Category[q$DI_Ratio >= 10 & q$DI_Ratio < 15] <- "B" # categorizing DI ratio as B
q$Category[q$DI_Ratio >= 15 & q$DI_Ratio < 20] <- "C" # categorizing DI ratio as C
q$Category[q$DI_Ratio >= 20] <- "D" # categorizing DI ratio as D
Below is to find numerical vlaues of loadn length
q$LL <- sub(" months", "", q$Loan.Length)
names(q)[names(q) == "LL"] <- "LLen"
q$LLen <- as.numeric(q$LLen)
lapply(q, class) # To understand type of each column
## $Amount.Requested
## [1] "integer"
##
## $Amount.Funded.By.Investors
## [1] "numeric"
##
## $Interest.Rate
## [1] "factor"
##
## $Loan.Length
## [1] "factor"
##
## $Loan.Purpose
## [1] "factor"
##
## $Debt.To.Income.Ratio
## [1] "factor"
##
## $State
## [1] "factor"
##
## $Home.Ownership
## [1] "factor"
##
## $Monthly.Income
## [1] "numeric"
##
## $FICO.Range
## [1] "factor"
##
## $Open.CREDIT.Lines
## [1] "integer"
##
## $Revolving.CREDIT.Balance
## [1] "integer"
##
## $Inquiries.in.the.Last.6.Months
## [1] "integer"
##
## $Employment.Length
## [1] "factor"
##
## $F1
## [1] "factor"
##
## $F2
## [1] "factor"
##
## $Interest
## [1] "numeric"
##
## $DI_Ratio
## [1] "numeric"
##
## $Category
## [1] "character"
##
## $LLen
## [1] "numeric"
Graphs to understand data
attach(q)
boxplot(Interest ~ F1, data = q)
plot(q$Amount.Requested, q$Interes)
plot(q$F2, q$Interest)
plot(q$Home.Ownership, q$Interest)
plot(q$Inquiries.in.the.Last.6.Months, q$Interest)
plot(q$Loan.Length, q$Interest) # Higher the loan length more the interest
plot(q$Open.CREDIT.Lines, q$Interest)
plot(q$Employment.Length, q$Interest)
a <- subset(q, q$Revolving.CREDIT.Balance < 50000)
plot(a$Revolving.CREDIT.Balance, a$Interest) # lower the credit balance, lower the interest
plot(q$Loan.Purpose, q$Interest)
Graphs to understand impact of Debt_to_income_ratio
A <- subset(q, q$Category == "A")
B <- subset(q, q$Category == "B")
C <- subset(q, q$Category == "C")
D <- subset(q, q$Category == "D")
plot(D$Interest) # plot of DI ratio >=20
plot(C$Interest) # plot of DI ratio <20 & >=15
plot(B$Interest) # plot of DI ratio <15 & >=10
Correlation analysis to find out the relation between interest and other variables
cdf <- q[, c(1, 9, 11, 12, 13, 16, 17, 18, 20)] # a new dataframe with variables of interest
cdf$F2 <- as.numeric(cdf$F2) # Conversting FICO range upper value to a numeric to conduct cor
Summary statistics of cdf data frame with all numeric variables
descirbe(cdf)
## Error: could not find function "descirbe"
Correlation test of above is conducted to test null hypothesis that there is no significant correlation between Interest and FICO upper range.
cor.test(cdf$Interest, cdf$F2)
##
## Pearson's product-moment correlation
##
## data: cdf$Interest and cdf$F2
## t = -50.27, df = 2496, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7282 -0.6892
## sample estimates:
## cor
## -0.7093
p-value of less than 0.05 suggests we should reject above null hypothesis and that there is significant correlation and cor.test gives 95% confidence interevals of correlation. Further correlation analysis is done to identify remaining variables.
Correlation matrix of cdf dataframe
cor(cdf) # Generates coorelation matrix
## Amount.Requested Monthly.Income
## Amount.Requested 1.00000 0.39118
## Monthly.Income 0.39118 1.00000
## Open.CREDIT.Lines 0.19594 0.17140
## Revolving.CREDIT.Balance 0.29337 0.35968
## Inquiries.in.the.Last.6.Months -0.02956 0.03395
## F2 0.08334 0.12277
## Interest 0.33183 0.01292
## DI_Ratio 0.08129 -0.16234
## LLen 0.41230 0.07454
## Open.CREDIT.Lines Revolving.CREDIT.Balance
## Amount.Requested 0.19594 0.293365
## Monthly.Income 0.17140 0.359684
## Open.CREDIT.Lines 1.00000 0.290085
## Revolving.CREDIT.Balance 0.29009 1.000000
## Inquiries.in.the.Last.6.Months 0.11074 0.012186
## F2 -0.08950 0.002735
## Interest 0.09031 0.061109
## DI_Ratio 0.37085 0.189221
## LLen 0.04089 0.055436
## Inquiries.in.the.Last.6.Months F2
## Amount.Requested -0.02956 0.083338
## Monthly.Income 0.03395 0.122766
## Open.CREDIT.Lines 0.11074 -0.089497
## Revolving.CREDIT.Balance 0.01219 0.002735
## Inquiries.in.the.Last.6.Months 1.00000 -0.092142
## F2 -0.09214 1.000000
## Interest 0.16465 -0.709283
## DI_Ratio 0.01198 -0.216921
## LLen 0.02384 0.012736
## Interest DI_Ratio LLen
## Amount.Requested 0.33183 0.08129 0.41230
## Monthly.Income 0.01292 -0.16234 0.07454
## Open.CREDIT.Lines 0.09031 0.37085 0.04089
## Revolving.CREDIT.Balance 0.06111 0.18922 0.05544
## Inquiries.in.the.Last.6.Months 0.16465 0.01198 0.02384
## F2 -0.70928 -0.21692 0.01274
## Interest 1.00000 0.17220 0.42351
## DI_Ratio 0.17220 1.00000 0.02499
## LLen 0.42351 0.02499 1.00000
round(cor(cdf), 2) # Round to 2 decimal places
## Amount.Requested Monthly.Income
## Amount.Requested 1.00 0.39
## Monthly.Income 0.39 1.00
## Open.CREDIT.Lines 0.20 0.17
## Revolving.CREDIT.Balance 0.29 0.36
## Inquiries.in.the.Last.6.Months -0.03 0.03
## F2 0.08 0.12
## Interest 0.33 0.01
## DI_Ratio 0.08 -0.16
## LLen 0.41 0.07
## Open.CREDIT.Lines Revolving.CREDIT.Balance
## Amount.Requested 0.20 0.29
## Monthly.Income 0.17 0.36
## Open.CREDIT.Lines 1.00 0.29
## Revolving.CREDIT.Balance 0.29 1.00
## Inquiries.in.the.Last.6.Months 0.11 0.01
## F2 -0.09 0.00
## Interest 0.09 0.06
## DI_Ratio 0.37 0.19
## LLen 0.04 0.06
## Inquiries.in.the.Last.6.Months F2
## Amount.Requested -0.03 0.08
## Monthly.Income 0.03 0.12
## Open.CREDIT.Lines 0.11 -0.09
## Revolving.CREDIT.Balance 0.01 0.00
## Inquiries.in.the.Last.6.Months 1.00 -0.09
## F2 -0.09 1.00
## Interest 0.16 -0.71
## DI_Ratio 0.01 -0.22
## LLen 0.02 0.01
## Interest DI_Ratio LLen
## Amount.Requested 0.33 0.08 0.41
## Monthly.Income 0.01 -0.16 0.07
## Open.CREDIT.Lines 0.09 0.37 0.04
## Revolving.CREDIT.Balance 0.06 0.19 0.06
## Inquiries.in.the.Last.6.Months 0.16 0.01 0.02
## F2 -0.71 -0.22 0.01
## Interest 1.00 0.17 0.42
## DI_Ratio 0.17 1.00 0.02
## LLen 0.42 0.02 1.00
Above results suggest that there is high corection between Interest Rate and a. Amount.requested (0.3318) b. Inquiries.in.the.Last.6.Months (0.1646) c. DI_Ratio (0.172) d. LLen (0.423) e FICO vlaue (-0.7092) f.Open.CREDIT.Lines ( 0.09030695)
Below plot will graphically illustrate the impact of correlation
cdr <- abs(cor(cdf))
cd.col <- dmat.color(cdr)
cd.or <- order.single(cdr)
cpairs(cdf, cd.or, panel.colors = cd.col, gap = 0.5, main = "Correlation")
Fitting regression equation with above varaibles ** UNSTANDARDIZED Regression**
summary(lm(cdf$Interest ~ cdf$F2))
##
## Call:
## lm(formula = cdf$Interest ~ cdf$F2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.990 -2.136 -0.456 1.835 10.194
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.07189 0.13314 143.2 <2e-16 ***
## cdf$F2 -0.42350 0.00842 -50.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.95 on 2496 degrees of freedom
## Multiple R-squared: 0.503, Adjusted R-squared: 0.503
## F-statistic: 2.53e+03 on 1 and 2496 DF, p-value: <2e-16
confint(lm(cdf$Interest ~ cdf$F2))
## 2.5 % 97.5 %
## (Intercept) 18.81 19.333
## cdf$F2 -0.44 -0.407
ggplot(cdf, aes(x = F2, y = Interest)) + geom_smooth(method = "lm") + geom_point()
Summary shows that FICO values explaines 50% of varaiation for Interest. Because R square value si 0.50 To explain the remainaing variation, multiple regression with variables identified after correlation analysis is done ** UNSTANDARDIZED Regression**
Rg1 <- lm(cdf$Interest ~ cdf$Inquiries.in.the.Last.6.Months + cdf$DI_Ratio +
cdf$LLen + cdf$F2 + cdf$Open.CREDIT.Lines + cdf$Amount.Requested)
summary(Rg1)
##
## Call:
## lm(formula = cdf$Interest ~ cdf$Inquiries.in.the.Last.6.Months +
## cdf$DI_Ratio + cdf$LLen + cdf$F2 + cdf$Open.CREDIT.Lines +
## cdf$Amount.Requested)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.570 -1.381 -0.168 1.205 9.982
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.21e+01 2.31e-01 52.24 < 2e-16
## cdf$Inquiries.in.the.Last.6.Months 3.52e-01 3.39e-02 10.38 < 2e-16
## cdf$DI_Ratio 1.23e-03 6.05e-03 0.20 0.84
## cdf$LLen 1.34e-01 4.57e-03 29.41 < 2e-16
## cdf$F2 -4.36e-01 6.10e-03 -71.61 < 2e-16
## cdf$Open.CREDIT.Lines -5.03e-02 1.01e-02 -4.98 6.7e-07
## cdf$Amount.Requested 1.47e-04 5.96e-06 24.67 < 2e-16
##
## (Intercept) ***
## cdf$Inquiries.in.the.Last.6.Months ***
## cdf$DI_Ratio
## cdf$LLen ***
## cdf$F2 ***
## cdf$Open.CREDIT.Lines ***
## cdf$Amount.Requested ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.06 on 2491 degrees of freedom
## Multiple R-squared: 0.757, Adjusted R-squared: 0.757
## F-statistic: 1.3e+03 on 6 and 2491 DF, p-value: <2e-16
confint(Rg1)
## 2.5 % 97.5 %
## (Intercept) 11.6054002 12.5106546
## cdf$Inquiries.in.the.Last.6.Months 0.2857053 0.4187286
## cdf$DI_Ratio -0.0106278 0.0130914
## cdf$LLen 0.1253267 0.1432347
## cdf$F2 -0.4484503 -0.4245450
## cdf$Open.CREDIT.Lines -0.0701221 -0.0305200
## cdf$Amount.Requested 0.0001354 0.0001588
Summary shows that R-squared value is 0.757 Other variables that affect interest rate are a. Amount.requested (0.3318) b. Inquiries.in.the.Last.6.Months (0.1646) c. DI_Ratio (0.172) d. LLen (0.423) e.#$FICO vlaue (-0.7092) f.Open.CREDIT.Lines ( 0.09030695)
Howeve, to check whether no.of open credit lines really affect interest rate, one more regressiona analysis without Open.credit.lines is done. ** UNSTANDARDIZED Regression**
Rg2 <- lm(cdf$Interest ~ cdf$Inquiries.in.the.Last.6.Months + cdf$DI_Ratio +
cdf$LLen + cdf$F2 + cdf$Amount.Requested)
summary(Rg2)
##
## Call:
## lm(formula = cdf$Interest ~ cdf$Inquiries.in.the.Last.6.Months +
## cdf$DI_Ratio + cdf$LLen + cdf$F2 + cdf$Amount.Requested)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.518 -1.363 -0.146 1.229 9.946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.17e+01 2.23e-01 52.61 <2e-16
## cdf$Inquiries.in.the.Last.6.Months 3.32e-01 3.38e-02 9.80 <2e-16
## cdf$DI_Ratio -9.39e-03 5.69e-03 -1.65 0.099
## cdf$LLen 1.35e-01 4.58e-03 29.55 <2e-16
## cdf$F2 -4.36e-01 6.12e-03 -71.19 <2e-16
## cdf$Amount.Requested 1.41e-04 5.88e-06 24.05 <2e-16
##
## (Intercept) ***
## cdf$Inquiries.in.the.Last.6.Months ***
## cdf$DI_Ratio .
## cdf$LLen ***
## cdf$F2 ***
## cdf$Amount.Requested ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.07 on 2492 degrees of freedom
## Multiple R-squared: 0.755, Adjusted R-squared: 0.755
## F-statistic: 1.54e+03 on 5 and 2492 DF, p-value: <2e-16
confint(Rg2)
## 2.5 % 97.5 %
## (Intercept) 11.3093363 12.185073
## cdf$Inquiries.in.the.Last.6.Months 0.2652968 0.397959
## cdf$DI_Ratio -0.0205383 0.001766
## cdf$LLen 0.1264324 0.144403
## cdf$F2 -0.4479058 -0.423891
## cdf$Amount.Requested 0.0001299 0.000153
R-square value of 0.755 shows that Open.credit.lines does not signifcantly explain or account for interest rate, Hence variables that affect interest rate are a. Amount.requested (0.3318) b. Inquiries.in.the.Last.6.Months (0.1646) c. DI_Ratio (0.172) d. LLen (0.423) e.$FICO vlaue (-0.7092)
To visualize above model, solve the predicted scores as new variable and then plot
```r
cdf$predicted <- fitted(Rg2)
ggplot(cdf, aes(x = predicted, y = Interest)) + geom_smooth(method = "lm") +
geom_point()
Further, residuals are computed to check the relationship between residuals and predicted values. Residuals should be orthogonal to predicted values.
cdf$er <- resid(Rg2)
ggplot(cdf, aes(x = predicted, y = er)) + geom_smooth(method = "lm") + geom_point()
Muliple plots on single page are plotted here
# Split screens to 2 columns par(mfrow=c(1,2))
layout(t(1:2))
# Graph 1
cdr <- abs(cor(cdf))
cd.col <- dmat.color(cdr)
cd.or <- order.single(cdr)
cpairs(cdf, cd.or, panel.colors = cd.col, gap = 0.5, main = "Correlation")
# Graph 2
cdf$predicted <- fitted(Rg2)
ggplot(cdf, aes(x = predicted, y = Interest)) + geom_smooth(method = "lm") +
geom_point() + opts(title = "Predicted Interest Rate vs Actual Interest Rate")
## 'opts' is deprecated. Use 'theme' instead. (Deprecated; last used in version 0.9.1)
## Setting the plot title with opts(title="...") is deprecated.
## Use labs(title="...") or ggtitle("...") instead. (Deprecated; last used in version 0.9.1)