Dataset
# Read the training loan records; the first row of the CSV supplies the
# column names.
td <- read.csv(
  file = "C:/Users/tansiahong/Desktop/MAster Data Science/sem 2/Programming DS/group/Loan-Prediction-with-R-master/loan_data_train.csv",
  header = TRUE
)
# Preview the first six rows as a sanity check of the parsed columns.
head(td, n = 6L)
## Loan_ID Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 1 LP001310 5000 5000 5.42%
## 2 5000 4900 5.79%
## 3 3000 2950 5.79%
## 4 4000 3882.78 5.79%
## 5 LP001333 4800 4775 5.79%
## 6 LP001562 4500 4500 5.99%
## Loan.Length Loan.Purpose Debt.To.Income.Ratio State Home.Ownership
## 1 36 months wedding 23.15% OH RENT
## 2 36 months other 9.18% NY RENT
## 3 36 months small_business 8.49% CO RENT
## 4 36 months moving 23.94% OH RENT
## 5 36 months car 11.71% NJ RENT
## 6 36 months debt_consolidation 21.46% NY RENT
## Monthly.Income FICO.Range Open.CREDIT.Lines Revolving.CREDIT.Balance
## 1 2833.33 750-754 7 4573
## 2 5520.00 750-754 10 7351
## 3 5416.67 745-749 13 10380
## 4 4666.67 760-764 7 2889
## 5 2800.00 735-739 6 9228
## 6 3644.83 765-769 8 1058
## Inquiries.in.the.Last.6.Months Employment.Length
## 1 0 3 years
## 2 0 7 years
## 3 1 1 year
## 4 2 9 years
## 5 0 3 years
## 6 2 < 1 year
# Per-column summary of td as loaded (before any type conversion).
summary(td)
## Loan_ID Amount.Requested Amount.Funded.By.Investors
## Length:2200 Length:2200 Length:2200
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Interest.Rate Loan.Length Loan.Purpose Debt.To.Income.Ratio
## Length:2200 Length:2200 Length:2200 Length:2200
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## State Home.Ownership Monthly.Income FICO.Range
## Length:2200 Length:2200 Min. : 588.5 Length:2200
## Class :character Class :character 1st Qu.: 3458.0 Class :character
## Mode :character Mode :character Median : 5000.0 Mode :character
## Mean : 5727.5
## 3rd Qu.: 6883.3
## Max. :102750.0
## NA's :3
## Open.CREDIT.Lines Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## Length:2200 Length:2200 Min. :0.0000
## Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Median :0.0000
## Mean :0.8985
## 3rd Qu.:1.0000
## Max. :9.0000
## NA's :3
## Employment.Length
## Length:2200
## Class :character
## Mode :character
##
##
##
##
# Show the storage type and leading values of every column in td.
str(td)
## 'data.frame': 2200 obs. of 15 variables:
## $ Loan_ID : chr "LP001310" "" "" "" ...
## $ Amount.Requested : chr "5000" "5000" "3000" "4000" ...
## $ Amount.Funded.By.Investors : chr "5000" "4900" "2950" "3882.78" ...
## $ Interest.Rate : chr "5.42%" "5.79%" "5.79%" "5.79%" ...
## $ Loan.Length : chr "36 months" "36 months" "36 months" "36 months" ...
## $ Loan.Purpose : chr "wedding" "other" "small_business" "moving" ...
## $ Debt.To.Income.Ratio : chr "23.15%" "9.18%" "8.49%" "23.94%" ...
## $ State : chr "OH" "NY" "CO" "OH" ...
## $ Home.Ownership : chr "RENT" "RENT" "RENT" "RENT" ...
## $ Monthly.Income : num 2833 5520 5417 4667 2800 ...
## $ FICO.Range : chr "750-754" "750-754" "745-749" "760-764" ...
## $ Open.CREDIT.Lines : chr "7" "10" "13" "7" ...
## $ Revolving.CREDIT.Balance : chr "4573" "7351" "10380" "2889" ...
## $ Inquiries.in.the.Last.6.Months: int 0 0 1 2 0 2 1 1 0 0 ...
## $ Employment.Length : chr "3 years" "7 years" "1 year" "9 years" ...
# Number of rows and columns in td.
dim(td)
## [1] 2200 15
# Count the NA entries in each column of td (one integer per column).
vapply(td, function(column) sum(is.na(column)), integer(1))
## Loan_ID Amount.Requested
## 0 1
## Amount.Funded.By.Investors Interest.Rate
## 1 0
## Loan.Length Loan.Purpose
## 1 1
## Debt.To.Income.Ratio State
## 1 1
## Home.Ownership Monthly.Income
## 0 3
## FICO.Range Open.CREDIT.Lines
## 0 4
## Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 3 3
## Employment.Length
## 0
# Percentage columns arrive as text such as "5.42%": strip the percent sign
# and parse the remainder as a number.
percent_columns <- c("Debt.To.Income.Ratio", "Interest.Rate")
td[percent_columns] <- lapply(
  td[percent_columns],
  function(values) as.numeric(gsub("\\%", "", values))
)

# The amount columns were read as character; convert them to numeric.
amount_columns <- c("Amount.Funded.By.Investors", "Amount.Requested")
td[amount_columns] <- lapply(td[amount_columns], as.numeric)

# Replace each missing value with the mean of the observed values in the same
# column. Interest.Rate is deliberately not in this list (the original code
# does not impute it).
mean_imputed_columns <- c(
  "Debt.To.Income.Ratio",
  "Amount.Requested",
  "Amount.Funded.By.Investors",
  "Monthly.Income",
  "Inquiries.in.the.Last.6.Months"
)
td[mean_imputed_columns] <- lapply(
  td[mean_imputed_columns],
  function(values) {
    values[is.na(values)] <- mean(values, na.rm = TRUE)
    values
  }
)

# Drop the columns that are not used by the regression models below.
unused_columns <- c(
  "Loan.Length", "State", "FICO.Range", "Open.CREDIT.Lines",
  "Revolving.CREDIT.Balance", "Employment.Length", "Home.Ownership",
  "Loan.Purpose"
)
td <- td[, !(names(td) %in% unused_columns), drop = FALSE]
# Re-count the NA entries per column after cleaning and imputation.
vapply(td, function(column) sum(is.na(column)), integer(1))
## Loan_ID Amount.Requested
## 0 0
## Amount.Funded.By.Investors Interest.Rate
## 0 0
## Debt.To.Income.Ratio Monthly.Income
## 0 0
## Inquiries.in.the.Last.6.Months
## 0
# Derived income columns: annualise the monthly income, then take its log.
td[["Annual.Income"]] <- td[["Monthly.Income"]] * 12
td[["logAnnual.Income"]] <- log(td[["Annual.Income"]])

# Log of the requested loan amount.
td[["logAmount.Requested"]] <- log(td[["Amount.Requested"]])

# Rescale the two percentage columns from percent units to fractions.
for (percent_column in c("Interest.Rate", "Debt.To.Income.Ratio")) {
  td[[percent_column]] <- td[[percent_column]] / 100
}

# Inspect the resulting column types.
str(td)
## 'data.frame': 2200 obs. of 10 variables:
## $ Loan_ID : chr "LP001310" "" "" "" ...
## $ Amount.Requested : num 5000 5000 3000 4000 4800 4500 16000 16000 12300 5800 ...
## $ Amount.Funded.By.Investors : num 5000 4900 2950 3883 4775 ...
## $ Interest.Rate : num 0.0542 0.0579 0.0579 0.0579 0.0579 0.0599 0.0603 0.0603 0.0603 0.0603 ...
## $ Debt.To.Income.Ratio : num 0.2315 0.0918 0.0849 0.2394 0.1171 ...
## $ Monthly.Income : num 2833 5520 5417 4667 2800 ...
## $ Inquiries.in.the.Last.6.Months: num 0 0 1 2 0 2 1 1 0 0 ...
## $ Annual.Income : num 34000 66240 65000 56000 33600 ...
## $ logAnnual.Income : num 10.4 11.1 11.1 10.9 10.4 ...
## $ logAmount.Requested : num 8.52 8.52 8.01 8.29 8.48 ...
# Per-column summary of td after cleaning and feature derivation.
summary(td)
## Loan_ID Amount.Requested Amount.Funded.By.Investors
## Length:2200 Min. : 1000 Min. : -0.01
## Class :character 1st Qu.: 6000 1st Qu.: 6000.00
## Mode :character Median :10000 Median :10000.00
## Mean :12496 Mean :12077.80
## 3rd Qu.:17000 3rd Qu.:16200.00
## Max. :35000 Max. :35000.00
## Interest.Rate Debt.To.Income.Ratio Monthly.Income
## Min. :0.0542 Min. :0.00000 Min. : 588.5
## 1st Qu.:0.1016 1st Qu.:0.09738 1st Qu.: 3458.0
## Median :0.1311 Median :0.15225 Median : 5000.0
## Mean :0.1304 Mean :0.15371 Mean : 5727.5
## 3rd Qu.:0.1580 3rd Qu.:0.20672 3rd Qu.: 6877.1
## Max. :0.2489 Max. :0.34910 Max. :102750.0
## Inquiries.in.the.Last.6.Months Annual.Income logAnnual.Income
## Min. :0.0000 Min. : 7062 Min. : 8.862
## 1st Qu.:0.0000 1st Qu.: 41496 1st Qu.:10.633
## Median :0.0000 Median : 60000 Median :11.002
## Mean :0.8985 Mean : 68730 Mean :10.991
## 3rd Qu.:1.0000 3rd Qu.: 82525 3rd Qu.:11.321
## Max. :9.0000 Max. :1233000 Max. :14.025
## logAmount.Requested
## Min. : 6.908
## 1st Qu.: 8.700
## Median : 9.210
## Mean : 9.213
## 3rd Qu.: 9.741
## Max. :10.463
This part uses linear regression on the full dataset to analyse the coefficients of the related variables.
# Interest Rate vs. Annual Income and Inquiries in the last 6 months
# (The heading above was fused onto the assignment line, which made the
# statement invalid R; it is now a comment on its own line.)
# Linear model fitted on the full data frame td.
m1 <- lm(Interest.Rate ~ Annual.Income + Inquiries.in.the.Last.6.Months, data = td)
# Print the fitted call and coefficients.
m1
##
## Call:
## lm(formula = Interest.Rate ~ Annual.Income + Inquiries.in.the.Last.6.Months,
## data = td)
##
## Coefficients:
## (Intercept) Annual.Income
## 1.260e-01 -1.133e-08
## Inquiries.in.the.Last.6.Months
## 5.788e-03
The result shows that, holding the number of inquiries constant, an increase of $1 in annual income is associated with an average decrease of about 1.133e-08 in the loan interest rate expressed as a fraction, i.e. roughly 0.0000011 percentage points. In other words, a higher income is associated with a lower loan interest rate.
# Amount Requested vs. Annual Income
# (The heading above was fused onto the assignment line, which made the
# statement invalid R; it is now a comment on its own line.)
# Linear model fitted on the full data frame td.
m2 <- lm(Amount.Requested ~ Annual.Income, data = td)
# Print the fitted call and coefficients.
m2
##
## Call:
## lm(formula = Amount.Requested ~ Annual.Income, data = td)
##
## Coefficients:
## (Intercept) Annual.Income
## 8.260e+03 6.165e-02
# Fix the RNG state so the random train/test split is reproducible.
set.seed(100)

# Draw 80% of the row numbers of td for training; the remaining rows form the
# test set.
trainingRowIndex <- sample.int(nrow(td), size = 0.80 * nrow(td))
trainingData <- td[trainingRowIndex, ]
testData <- td[-trainingRowIndex, ]

# Fit the interest-rate model on the training rows only.
lmMod <- lm(
  formula = Interest.Rate ~ logAnnual.Income + logAmount.Requested +
    Inquiries.in.the.Last.6.Months + Amount.Funded.By.Investors +
    Debt.To.Income.Ratio,
  data = trainingData
)

# Coefficient table, residual summary and fit statistics.
summary(lmMod)
##
## Call:
## lm(formula = Interest.Rate ~ logAnnual.Income + logAmount.Requested +
## Inquiries.in.the.Last.6.Months + Amount.Funded.By.Investors +
## Debt.To.Income.Ratio, data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.105444 -0.027417 -0.000356 0.025841 0.111537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.248e-01 2.961e-02 7.594 5.03e-14 ***
## logAnnual.Income -1.011e-02 1.994e-03 -5.072 4.36e-07 ***
## logAmount.Requested -2.938e-03 2.759e-03 -1.065 0.287
## Inquiries.in.the.Last.6.Months 6.377e-03 7.445e-04 8.565 < 2e-16 ***
## Amount.Funded.By.Investors 2.379e-06 2.573e-07 9.243 < 2e-16 ***
## Debt.To.Income.Ratio 6.072e-02 1.216e-02 4.996 6.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03746 on 1754 degrees of freedom
## Multiple R-squared: 0.1764, Adjusted R-squared: 0.1741
## F-statistic: 75.16 on 5 and 1754 DF, p-value: < 2.2e-16
# Predict the response of lmMod for every row of the held-out test set.
distPred <- predict(object = lmMod, newdata = testData)
From the model summary, the p-value of the overall model and the p-values of the predictors (except logAmount.Requested, p = 0.287) are below the significance level.
So the model is statistically significant.
# Also, the R-Sq and Adj R-Sq are comparable to the original model built on
# full data.
#
# Pair every test-set prediction with the observed value of the response.
# lmMod models Interest.Rate, so the observed Interest.Rate is the matching
# "actual"; the previous version used Amount.Requested here, which compared
# predicted interest rates against loan amounts.
# NOTE: the correlation matrix printed below was produced by the previous
# version of this statement; re-run to refresh it.
actuals_preds <- data.frame(
  actuals = testData$Interest.Rate,
  predicteds = distPred
)
# Correlation between observed and predicted values on the test set.
correlation_accuracy <- cor(actuals_preds)
correlation_accuracy
## actuals predicteds
## actuals 1.000000 0.776801
## predicteds 0.776801 1.000000
The correlation between the actual and predicted values in the matrix above is 0.777 (about 77.7%), which indicates a fairly strong linear association; note that this figure is a correlation coefficient rather than an accuracy. However, other factors, such as the loan purpose, can also affect the interest rate charged. All such factors need to be considered in order to reduce the lender's risk. Therefore, an increase in the interest rate based on the borrower’s income will affect both the approval of the amount requested by the borrower and their risk of borrowing.
# Scatter plot of actual vs. predicted values for the test set, with a fitted
# straight line (no confidence band) drawn through the points.
# The x aesthetic uses the `predicteds` column of actuals_preds rather than
# the global distPred vector, so the plot depends only on the data frame it
# is given. TRUE/FALSE are spelled out because T/F can be reassigned.
ggplot(actuals_preds, aes(x = predicteds, y = actuals)) +
  geom_point(aes(color = as.factor(actuals)), show.legend = FALSE) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Predicted vs Actual Values Using Test Dataset",
    x = "Predicted",
    y = "Actual"
  )
Classification
Dataset
# Read the loan-approval dataset used for the classification models.
data <- read.csv(
  file = "C:/Users/tansiahong/Desktop/MAster Data Science/sem 2/Programming DS/group/Loan-Prediction-with-R-master/loan_data_set.csv"
)
# Number of rows and columns.
dim(data)
## [1] 614 13
# Show the storage type and leading values of every column in data.
str(data)
## 'data.frame': 614 obs. of 13 variables:
## $ Loan_ID : chr "LP001002" "LP001003" "LP001005" "LP001006" ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Married : chr "No" "Yes" "Yes" "Yes" ...
## $ Dependents : chr "0" "1" "0" "0" ...
## $ Education : chr "Graduate" "Graduate" "Graduate" "Not Graduate" ...
## $ Self_Employed : chr "No" "No" "Yes" "No" ...
## $ ApplicantIncome : int 5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
## $ CoapplicantIncome: num 0 1508 0 2358 0 ...
## $ LoanAmount : int NA 128 66 120 141 267 95 158 168 349 ...
## $ Loan_Amount_Term : int 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : int 1 1 1 1 1 1 1 0 1 1 ...
## $ Property_Area : chr "Urban" "Rural" "Urban" "Urban" ...
## $ Loan_Status : chr "Y" "N" "Y" "Y" ...
# Per-column summary of data, including NA counts for the numeric columns.
summary(data)
## Loan_ID Gender Married Dependents
## Length:614 Length:614 Length:614 Length:614
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Education Self_Employed ApplicantIncome CoapplicantIncome
## Length:614 Length:614 Min. : 150 Min. : 0
## Class :character Class :character 1st Qu.: 2878 1st Qu.: 0
## Mode :character Mode :character Median : 3812 Median : 1188
## Mean : 5403 Mean : 1621
## 3rd Qu.: 5795 3rd Qu.: 2297
## Max. :81000 Max. :41667
##
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## Min. : 9.0 Min. : 12 Min. :0.0000 Length:614
## 1st Qu.:100.0 1st Qu.:360 1st Qu.:1.0000 Class :character
## Median :128.0 Median :360 Median :1.0000 Mode :character
## Mean :146.4 Mean :342 Mean :0.8422
## 3rd Qu.:168.0 3rd Qu.:360 3rd Qu.:1.0000
## Max. :700.0 Max. :480 Max. :1.0000
## NA's :22 NA's :14 NA's :50
## Loan_Status
## Length:614
## Class :character
## Mode :character
##
##
##
##
# Plot the missing-value proportions and patterns of the full dataset,
# keeping the aggr result in b.
b <- aggr(
  data,
  numbers = TRUE,
  col = c("light blue", "blue"),
  xlab = names(data),
  sortVars = TRUE,
  gap = 3,
  cex.axis = .5,
  ylab = c("Missing data", "Pattern"),
  only.miss = TRUE
)
##
## Variables sorted by number of missings:
## Variable Count
## Credit_History 0.08143322
## LoanAmount 0.03583062
## Loan_Amount_Term 0.02280130
## Loan_ID 0.00000000
## Gender 0.00000000
## Married 0.00000000
## Dependents 0.00000000
## Education 0.00000000
## Self_Employed 0.00000000
## ApplicantIncome 0.00000000
## CoapplicantIncome 0.00000000
## Property_Area 0.00000000
## Loan_Status 0.00000000
#summary(b[1])
# Row names (as numbers) of every record that has at least one NA.
rows_with_na <- rowSums(is.na(data)) > 0
na.index <- as.numeric(rownames(data)[rows_with_na])

# Reproducibly draw 345 of the complete records.
set.seed(42)
complete_row_ids <- as.numeric(row.names(na.omit(data)))
samp <- sample(complete_row_ids, 345)

# Training set: the sampled complete records plus all incomplete records.
# Test set: the complete records that were not sampled.
splitting.index <- c(samp, na.index)
train.data <- data[splitting.index, ]
test.data <- data[-splitting.index, ]
# Missing-value overview of the training set (combined plot, counts shown).
aggr(
  train.data,
  numbers = TRUE,
  col = c("light blue", "blue"),
  combined = TRUE,
  xlab = names(train.data),
  sortVars = TRUE,
  gap = 3,
  cex.axis = .5
)
##
## Variables sorted by number of missings:
## Variable Count
## Credit_History 50
## LoanAmount 22
## Loan_Amount_Term 14
## Loan_ID 0
## Gender 0
## Married 0
## Dependents 0
## Education 0
## Self_Employed 0
## ApplicantIncome 0
## CoapplicantIncome 0
## Property_Area 0
## Loan_Status 0
# Missing-value overview of the test set (combined plot, numbers hidden).
aggr(
  test.data,
  numbers = FALSE,
  col = c("light blue", "blue"),
  xlab = names(test.data),
  sortVars = TRUE,
  gap = 3,
  cex.axis = .5,
  combined = TRUE
)
##
## Variables sorted by number of missings:
## Variable Count
## Loan_ID 0
## Gender 0
## Married 0
## Dependents 0
## Education 0
## Self_Employed 0
## ApplicantIncome 0
## CoapplicantIncome 0
## LoanAmount 0
## Loan_Amount_Term 0
## Credit_History 0
## Property_Area 0
## Loan_Status 0
Data Pre-processing
# Impute the missing training values with mice using the "cart" method:
# 2 imputed datasets, 2 iterations each, fixed seed for reproducibility.
imputed_Data <- mice(
  train.data,
  m = 2,
  maxit = 2,
  method = "cart",
  seed = 500
)
##
## iter imp variable
## 1 1 LoanAmount Loan_Amount_Term Credit_History
## 1 2 LoanAmount Loan_Amount_Term Credit_History
## 2 1 LoanAmount Loan_Amount_Term Credit_History
## 2 2 LoanAmount Loan_Amount_Term Credit_History
# Use the second imputed dataset as the training data. The call is
# namespace-qualified so it cannot be masked by another package's complete().
train.data <- mice::complete(imputed_Data, 2)

# The response must be a factor for the binomial GLM and the later
# confusion matrices.
train.data$Loan_Status <- as.factor(train.data$Loan_Status)
test.data$Loan_Status <- as.factor(test.data$Loan_Status)

# Full logistic regression model.
lr_model <- glm(
  Loan_Status ~ Dependents + ApplicantIncome + CoapplicantIncome +
    LoanAmount + Loan_Amount_Term + Credit_History,
  data = train.data,
  family = binomial
)

# Stepwise selection by AIC. The argument was previously misspelled as
# `directiom`, which stepAIC silently absorbed through `...`; with no `scope`
# and no `direction`, the search then ran backward only. Spelling it correctly
# lets dropped terms be re-considered at each step.
# NOTE: the trace printed below was produced by the misspelled call; re-run
# to refresh it.
logistic.reg <- stepAIC(lr_model, direction = "both")
## Start: AIC=441.73
## Loan_Status ~ Dependents + ApplicantIncome + CoapplicantIncome +
## LoanAmount + Loan_Amount_Term + Credit_History
##
## Df Deviance AIC
## - Dependents 4 425.57 437.57
## - Loan_Amount_Term 1 422.61 440.61
## - ApplicantIncome 1 422.87 440.87
## - LoanAmount 1 423.06 441.06
## - CoapplicantIncome 1 423.29 441.29
## <none> 421.73 441.73
## - Credit_History 1 520.42 538.42
##
## Step: AIC=437.57
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History
##
## Df Deviance AIC
## - Loan_Amount_Term 1 426.33 436.33
## - ApplicantIncome 1 426.65 436.65
## - CoapplicantIncome 1 427.07 437.07
## - LoanAmount 1 427.15 437.15
## <none> 425.57 437.57
## - Credit_History 1 529.59 539.59
##
## Step: AIC=436.33
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Credit_History
##
## Df Deviance AIC
## - CoapplicantIncome 1 427.45 435.45
## - ApplicantIncome 1 427.90 435.90
## <none> 426.33 436.33
## - LoanAmount 1 428.54 436.54
## - Credit_History 1 530.27 538.27
##
## Step: AIC=435.45
## Loan_Status ~ ApplicantIncome + LoanAmount + Credit_History
##
## Df Deviance AIC
## <none> 427.45 435.45
## - ApplicantIncome 1 430.46 436.46
## - LoanAmount 1 431.70 437.70
## - Credit_History 1 530.57 536.57
# Coefficients of the model selected by stepAIC.
coef(logistic.reg)
## (Intercept) ApplicantIncome LoanAmount Credit_History
## -1.463295e+00 6.397482e-05 -3.756459e-03 3.017305e+00
# Fitted approval probabilities for the training rows, thresholded at 0.5.
prob.train <- predict(logistic.reg, newdata = train.data, type = "response")
prediction.train <- ifelse(prob.train > 0.5, "Y", "N")

# caret::confusionMatrix() expects a table with predictions in the rows and
# the reference (true) classes in the columns. The table was previously built
# the other way round, so sensitivity, specificity, predictive values and
# prevalence were computed against the predictions instead of the truth.
# The prediction factor is given both levels explicitly so the table stays
# 2x2 even if only one class is ever predicted.
# NOTE: the statistics printed below come from the transposed table; re-run
# to refresh them.
tab.train <- table(
  Prediction = factor(prediction.train, levels = c("N", "Y")),
  Reference = as.factor(train.data$Loan_Status)
)
confusionMatrix(tab.train, positive = "Y")
## Confusion Matrix and Statistics
##
## prediction.train
## N Y
## N 59 76
## Y 11 284
##
## Accuracy : 0.7977
## 95% CI : (0.7565, 0.8346)
## No Information Rate : 0.8372
## P-Value [Acc > NIR] : 0.9872
##
## Kappa : 0.4598
##
## Mcnemar's Test P-Value : 6.813e-12
##
## Sensitivity : 0.7889
## Specificity : 0.8429
## Pos Pred Value : 0.9627
## Neg Pred Value : 0.4370
## Prevalence : 0.8372
## Detection Rate : 0.6605
## Detection Prevalence : 0.6860
## Balanced Accuracy : 0.8159
##
## 'Positive' Class : Y
##
# Predicted approval probabilities for the test rows, thresholded at 0.5.
prob <- predict(logistic.reg, newdata = test.data, type = "response")
prediction <- ifelse(prob >= 0.5, "Y", "N")

# caret::confusionMatrix() expects a table with predictions in the rows and
# the reference (true) classes in the columns. The table was previously built
# the other way round, so sensitivity, specificity, predictive values and
# prevalence were computed against the predictions instead of the truth.
# The prediction factor is given both levels explicitly so the table stays
# 2x2 even if only one class is ever predicted.
# NOTE: the statistics printed below come from the transposed table; re-run
# to refresh them.
tab <- table(
  Prediction = factor(prediction, levels = c("N", "Y")),
  Reference = as.factor(test.data$Loan_Status)
)
performance <- confusionMatrix(tab, positive = "Y")
performance
## Confusion Matrix and Statistics
##
## prediction
## N Y
## N 25 32
## Y 2 125
##
## Accuracy : 0.8152
## 95% CI : (0.7515, 0.8685)
## No Information Rate : 0.8533
## P-Value [Acc > NIR] : 0.9373
##
## Kappa : 0.4946
##
## Mcnemar's Test P-Value : 6.577e-07
##
## Sensitivity : 0.7962
## Specificity : 0.9259
## Pos Pred Value : 0.9843
## Neg Pred Value : 0.4386
## Prevalence : 0.8533
## Detection Rate : 0.6793
## Detection Prevalence : 0.6902
## Balanced Accuracy : 0.8611
##
## 'Positive' Class : Y
##
# Overall accuracy of the logistic model on the test set.
accuracy <- performance$overall["Accuracy"]
accuracy
## Accuracy
## 0.8152174
# Sensitivity as reported by the confusion matrix object.
sensitivity <- performance$byClass["Sensitivity"]
sensitivity
## Sensitivity
## 0.7961783
# Specificity as reported by the confusion matrix object.
specificity <- performance$byClass["Specificity"]
specificity
## Specificity
## 0.9259259
# ROC curve of the logistic model on the test set: true Loan_Status against
# the predicted probability, plotted with the AUC printed on the figure.
roc(test.data$Loan_Status ~ prob, plot = TRUE, print.auc = TRUE)
##
## Call:
## roc.formula(formula = test.data$Loan_Status ~ prob, plot = TRUE, print.auc = TRUE)
##
## Data: prob in 57 controls (test.data$Loan_Status N) < 127 cases (test.data$Loan_Status Y).
## Area under the curve: 0.7105
# Random forest on the imputed training data, with variable importance kept.
# NOTE(review): `Loan_Status ~ .` uses every other column of train.data as a
# predictor, which includes Loan_ID if it is still present — confirm that is
# intended.
# (A stray backtick line that followed this statement has been removed; it
# opened an unterminated quoted name and made the script unparsable.)
rf <- randomForest(Loan_Status ~ ., data = train.data, importance = TRUE)

# Class probabilities for the training rows, thresholded at 0.5 per column.
# These are predictions on the same rows the forest was fitted to.
rf.prob.train <- predict(rf, newdata = train.data, type = "prob")
pred.train <- ifelse(rf.prob.train > 0.5, "Y", "N")

# Rows = predictions, columns = true classes. The second column is used as
# the predicted label (presumably the "Y" class column, since the factor
# levels sort as N, Y — confirm). Both levels are fixed explicitly so the
# table stays 2x2 even if only one class is ever predicted.
rf.tab.train <- table(
  factor(pred.train[, 2], levels = c("N", "Y")),
  train.data$Loan_Status
)
confusionMatrix(rf.tab.train, positive = "Y")
## Confusion Matrix and Statistics
##
##
## N Y
## N 135 0
## Y 0 295
##
## Accuracy : 1
## 95% CI : (0.9915, 1)
## No Information Rate : 0.686
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.000
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 1.000
## Prevalence : 0.686
## Detection Rate : 0.686
## Detection Prevalence : 0.686
## Balanced Accuracy : 1.000
##
## 'Positive' Class : Y
##
# Class probabilities for the test rows, thresholded at 0.5 per column.
rf.prob <- predict(rf, newdata = test.data, type = "prob")
pred <- ifelse(rf.prob > 0.5, "Y", "N")

# Rows = predictions, columns = true classes. The second column is used as
# the predicted label (presumably the "Y" class column, since the factor
# levels sort as N, Y — confirm). Both levels are fixed explicitly so the
# table stays 2x2 even if only one class is ever predicted.
rf.tab <- table(
  factor(pred[, 2], levels = c("N", "Y")),
  test.data$Loan_Status
)
rf.per <- confusionMatrix(rf.tab, positive = "Y")
rf.per
## Confusion Matrix and Statistics
##
##
## N Y
## N 27 8
## Y 30 119
##
## Accuracy : 0.7935
## 95% CI : (0.7277, 0.8495)
## No Information Rate : 0.6902
## P-Value [Acc > NIR] : 0.0011550
##
## Kappa : 0.4596
##
## Mcnemar's Test P-Value : 0.0006577
##
## Sensitivity : 0.9370
## Specificity : 0.4737
## Pos Pred Value : 0.7987
## Neg Pred Value : 0.7714
## Prevalence : 0.6902
## Detection Rate : 0.6467
## Detection Prevalence : 0.8098
## Balanced Accuracy : 0.7053
##
## 'Positive' Class : Y
##
# Overall accuracy of the random forest on the test set.
rf.acc <- rf.per$overall["Accuracy"]
rf.acc
## Accuracy
## 0.7934783
# Sensitivity of the random forest on the test set.
rf.sens <- rf.per$byClass["Sensitivity"]
rf.sens
## Sensitivity
## 0.9370079
# Specificity of the random forest on the test set.
rf.spec <- rf.per$byClass["Specificity"]
rf.spec
## Specificity
## 0.4736842
# ROC curve of the random forest on the test set, plotted with the AUC.
# The predictor is the first column of rf.prob — presumably the probability
# of class "N" (the printed direction "controls > cases" is consistent with
# that); confirm against colnames(rf.prob).
roc(test.data$Loan_Status ~ rf.prob[,1], plot = TRUE, print.auc = TRUE)
##
## Call:
## roc.formula(formula = test.data$Loan_Status ~ rf.prob[, 1], plot = TRUE, print.auc = TRUE)
##
## Data: rf.prob[, 1] in 57 controls (test.data$Loan_Status N) > 127 cases (test.data$Loan_Status Y).
## Area under the curve: 0.771