*Instalación de paquetes
## Instalación de librerías
#install.packages("dplyr", dependencies = TRUE)
#install.packages("rpart", dependencies = TRUE)
#install.packages("ggplot2", dependencies = TRUE)
#install.packages("rattle", dependencies = TRUE)
#install.packages("rpart.plot",dependencies = TRUE)
#install.packages("jmv")
#install.packages("readxl", dependencies = TRUE)
# Load the packages R
library(rpart)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(RColorBrewer)
library(class)
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(jmv)
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
setwd("~/Desktop/R-MINIG/Proyecto Final ")
Loan_FinalDC <- read_excel("~/Desktop/Proyecto/Loan_FinalDC.xlsx")
#View(Loan_FinalDC)
*El Dataset cuenta con 614 registros, donde las columnas cuantitativas son el ingreso del deudor, ingreso del codeudor,el monto del crédito.
jmv::descriptives(
data = Loan_FinalDC,
vars = vars( Dependents, ApplicantIncome, CoapplicantIncome, LoanAmount),
hist = TRUE,
violin = TRUE,
dot = TRUE,
mode = TRUE,
sum = TRUE,
variance = TRUE,
range = TRUE,
kurt = TRUE)
##
## DESCRIPTIVES
##
## Descriptives
## ───────────────────────────────────────────────────────────────────────────────────────────
## Dependents ApplicantIncome CoapplicantIncome LoanAmount
## ───────────────────────────────────────────────────────────────────────────────────────────
## N 614 614 614 614
## Missing 0 0 0 0
## Mean 0.827 5403 1638 147
## Median 0.00 3812 1221 128
## Mode 0.00 2500 0.00 120
## Sum 508 3317724 1005912 90546
## Variance 1.47 3.73e+7 8668426 7327
## Range 4.00 80850 41667 691
## Minimum 0.00 150 0.00 9.00
## Maximum 4.00 81000 41667 700
## Kurtosis 1.19 60.5 82.8 9.90
## Std. error kurtosis 0.197 0.197 0.197 0.197
## ───────────────────────────────────────────────────────────────────────────────────────────
set.seed(1)
# Shuffle the dataset, call the result shuffled
n <- nrow(Loan_FinalDC)
shuffled <- Loan_FinalDC[sample(n),]
train_indices <- 1:round(0.7 * n)
train <- shuffled[train_indices, ]
test_indices <- (round(0.7 * n) + 1):n
test <- shuffled[test_indices, ]
# Print the structure of train
str(train)
## Classes 'tbl_df', 'tbl' and 'data.frame': 430 obs. of 12 variables:
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Dependents : num 2 0 0 0 2 0 4 0 0 0 ...
## $ Education : chr "Graduate" "Graduate" "Graduate" "Graduate" ...
## $ Self_Employed : chr "No" "No" "No" "No" ...
## $ ApplicantIncome : num 4167 4758 9083 3593 2957 ...
## $ CoapplicantIncome: num 1447 0 0 4266 0 ...
## $ LoanAmount : num 158 158 228 132 81 145 150 65 296 144 ...
## $ Loan_Amount_Term : num 360 480 360 180 360 360 360 300 360 360 ...
## $ Credit_History : num 1 1 1 0 1 1 1 1 1 1 ...
## $ Property_Area : chr "Rural" "Semiurban" "Semiurban" "Rural" ...
## $ Loan_Status : num 1 1 1 0 1 1 0 0 1 1 ...
# Print the structure of test
str(test)
## Classes 'tbl_df', 'tbl' and 'data.frame': 184 obs. of 12 variables:
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ Married : chr "No" "Yes" "Yes" "No" ...
## $ Dependents : num 0 4 4 0 0 4 0 0 0 0 ...
## $ Education : chr "Graduate" "Graduate" "Not Graduate" "Not Graduate" ...
## $ Self_Employed : chr "No" "No" "Yes" "No" ...
## $ ApplicantIncome : num 3660 3400 7100 1907 4887 ...
## $ CoapplicantIncome: num 5064 2500 0 2365 0 ...
## $ LoanAmount : num 187 123 125 120 133 100 96 490 76 80 ...
## $ Loan_Amount_Term : num 360 360 60 360 360 360 360 180 360 360 ...
## $ Credit_History : num 1 0 1 1 1 1 1 1 1 1 ...
## $ Property_Area : chr "Semiurban" "Rural" "Urban" "Urban" ...
## $ Loan_Status : num 1 0 1 1 0 1 1 1 1 1 ...
# Fill in the model that has been learned.
tree <- rpart(Loan_Status ~ ., train, method = "class")
# Predict the outcome on the test set with tree: pred
pred <- predict(tree,test, type="class")
# Calculate the confusion matrix: conf
conf <- table(test$Loan_Status,pred)
# Print this confusion matrix
conf
## pred
## 0 1
## 0 31 17
## 1 19 117
#Calculo del Accuracy del modelo inicial
sum(diag(conf)) / sum(conf)
## [1] 0.8043478
# The shuffled dataset is already loaded into your workspace
shuffled <-Loan_FinalDC[sample(n),]
# Set random seed. Don't remove this line.
set.seed(1)
# Initialize the accs vector
accs <- rep(0,4)
for (i in 1:4) {
# These indices indicate the interval of the test set
indices <- (((i-1) * round((1/4)*nrow(shuffled))) + 1):((i*round((1/4) * nrow(shuffled))))
# Exclude them from the train set
train <- shuffled[-indices,]
# Include them in the test set
test <- shuffled[indices,]
# A model is learned using each training set
tree <- rpart(Loan_Status ~ ., train, method = "class")
# Make a prediction on the test set using tree
pred <- predict(tree, test, type = "class")
# Assign the confusion matrix to conf
conf <- table(test$Loan_Status,pred)
# Assign the accuracy of this model to the ith index in accs
accs[i] <- sum(diag(conf))/sum(conf)
}
*Resultado del promedio del accuracy —77%—
# Print out the mean of accs
mean(accs)
## [1] 0.7767644
# The shuffled dataset is already loaded into your workspace
shuffled <-Loan_FinalDC[sample(n),]
# Set random seed. Don't remove this line.
set.seed(1)
# Initialize the accs vector
accs <- rep(0,30)
for (i in 1:30) {
# These indices indicate the interval of the test set
indices <- (((i-1) * round((1/30)*nrow(shuffled))) + 1):((i*round((1/30) * nrow(shuffled))))
# Exclude them from the train set
train <- shuffled[-indices,]
# Include them in the test set
test <- shuffled[indices,]
# A model is learned using each training set
tree <- rpart(Loan_Status ~ ., train, method = "class")
# Make a prediction on the test set using tree
pred <- predict(tree, test, type = "class")
# Assign the confusion matrix to conf
conf <- table(test$Loan_Status,pred)
# Assign the accuracy of this model to the ith index in accs
accs[i] <- sum(diag(conf))/sum(conf)
}
*Resultado del promedio del accuracy –80%–
# Print out the mean of accs
mean(accs)
## [1] 0.8016667
# Set random seed
set.seed(1)
# Build a tree model
tree <- rpart(Loan_Status ~ ., train, method = "class")
# Plot the decision model
fancyRpartPlot(tree)
# Predict the values
pred <- predict(tree,test, type = "class")
# Construct the confusion matrix: conf
conf <- table(test$Loan_Status,pred)
# Print out the accuracy
sum(diag(conf)) / sum(conf)
## [1] 0.85
tree <- rpart(Loan_Status ~ ., train, method = "class", control = rpart.control(cp=0.00001))
# Draw the complex tree
fancyRpartPlot(tree)
# Prune the tree: pruned
pruned <- prune(tree, cp=0.01)
*Resultado del prune
# Draw pruned
fancyRpartPlot(pruned)
# Set random seed
set.seed(1)
# Build a tree on the training
tree <- rpart(Loan_Status ~ ., train, method = "class")
*Predicción de las probabilidades
# Predict probability values using the model: all_probs
all_probs <- predict(tree,test, type="prob")
# Print out all_probs
all_probs
## 0 1
## 1 0.2114625 0.78853755
## 2 0.2114625 0.78853755
## 3 0.2114625 0.78853755
## 4 0.2114625 0.78853755
## 5 0.2114625 0.78853755
## 6 0.2114625 0.78853755
## 7 0.2114625 0.78853755
## 8 0.2114625 0.78853755
## 9 0.2114625 0.78853755
## 10 0.2114625 0.78853755
## 11 0.2114625 0.78853755
## 12 0.2114625 0.78853755
## 13 0.2114625 0.78853755
## 14 0.2114625 0.78853755
## 15 0.2114625 0.78853755
## 16 0.9204545 0.07954545
## 17 0.2114625 0.78853755
## 18 0.2114625 0.78853755
## 19 0.2114625 0.78853755
## 20 0.2114625 0.78853755
# Select second column of all_probs: probs
probs <- all_probs[,2]
# Make a prediction object: pred
pred <- prediction(probs,test$Loan_Status)
# Make a performance object: perf
perf <- performance(pred,"tpr","fpr")
# Plot this curve
plot(perf)
# Make a prediction object: pred
pred <- prediction(probs,test$Loan_Status)
# Make a performance object: perf
perf <- performance(pred,"auc")
*Resultado del performance del modelo (AUC) –62%–
# Print out the AUC
perf
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.625
##
##
## Slot "alpha.values":
## list()
logistic <- glm(Loan_Status ~ . , data=Loan_FinalDC, family="binomial")
summary(logistic)
##
## Call:
## glm(formula = Loan_Status ~ ., family = "binomial", data = Loan_FinalDC)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2488 -0.3730 0.5359 0.7128 2.5061
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.525e+00 8.450e-01 -2.988 0.00281 **
## GenderMale 1.781e-02 2.971e-01 0.060 0.95220
## MarriedYes 5.621e-01 2.455e-01 2.290 0.02205 *
## Dependents 3.439e-02 9.715e-02 0.354 0.72338
## EducationNot Graduate -4.290e-01 2.590e-01 -1.656 0.09771 .
## Self_EmployedYes -6.725e-02 2.697e-01 -0.249 0.80307
## ApplicantIncome 1.771e-05 2.346e-05 0.755 0.45037
## CoapplicantIncome -4.220e-05 3.412e-05 -1.237 0.21609
## LoanAmount -2.728e-03 1.538e-03 -1.773 0.07616 .
## Loan_Amount_Term -8.929e-04 1.806e-03 -0.494 0.62102
## Credit_History 3.920e+00 4.196e-01 9.341 < 2e-16 ***
## Property_AreaSemiurban 8.675e-01 2.675e-01 3.243 0.00118 **
## Property_AreaUrban 1.762e-01 2.565e-01 0.687 0.49210
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 762.89 on 613 degrees of freedom
## Residual deviance: 560.13 on 601 degrees of freedom
## AIC: 586.13
##
## Number of Fisher Scoring iterations: 5
logistic <- glm(Loan_Status ~ Credit_History + Property_Area , data=Loan_FinalDC, family="binomial")
summary(logistic)
##
## Call:
## glm(formula = Loan_Status ~ Credit_History + Property_Area, family = "binomial",
## data = Loan_FinalDC)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9985 -0.3658 0.5402 0.7301 2.4373
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.9176 0.4338 -6.725 1.75e-11 ***
## Credit_History 3.8569 0.4137 9.323 < 2e-16 ***
## Property_AreaSemiurban 0.9117 0.2634 3.461 0.000539 ***
## Property_AreaUrban 0.2468 0.2493 0.990 0.322274
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 762.89 on 613 degrees of freedom
## Residual deviance: 574.71 on 610 degrees of freedom
## AIC: 582.71
##
## Number of Fisher Scoring iterations: 5