A análise de credito é feito da seguinte forma, pega a base de dados de clientes com o perfil parecido e criamos os estudos.
Convertendo as variaveis para o tipo fator(categorico)
to.factors <- function(df, variables){
for (variable in variables){
df[[variable]] <- as.factor(df[[variable]])
}
return(df)
}
scale.features <- function(df, variables){
for (variable in variables){
df[[variable]] <- scale(df[[variable]], center=T, scale=T)
}
return(df)
}
# Normalizando as variáveis
numeric.vars <- c("credit.duration.months", "age", "credit.amount")
credito <- scale.features(credito, numeric.vars)
# Variáveis do tipo fator
categorical.vars <- c('credit.rating', 'account.balance', 'previous.credit.payment.status',
'credit.purpose', 'savings', 'employment.duration', 'installment.rate',
'marital.status', 'guarantor', 'residence.duration', 'current.assets',
'other.credits', 'apartment.type', 'bank.credits', 'occupation',
'dependents', 'telephone', 'foreign.worker')
credito<- to.factors(df = credito, variables = categorical.vars)
indexes <- sample(1:nrow(credito), size = 0.6 * nrow(credito))
train.data <- credito[indexes,]
test.data <- credito[-indexes,]
Criar uma função para a seleção de variaveis
#Criamos a função para selecionar quais colunas vamos trabalhar. Nem sempre iremos
#trabalhar com toda a base de dados.
run.feature.selection <- function(num.iters=20, feature.vars, class.var){
set.seed(10)
variable.sizes <- 1:10
control <- rfeControl(functions = rfFuncs, method = "cv",
verbose = FALSE, returnResamp = "all",
number = num.iters)
results.rfe <- rfe(x = feature.vars, y = class.var,
sizes = variable.sizes,
rfeControl = control)
return(results.rfe)
}
rfe.results <- run.feature.selection(feature.vars = train.data[,-1],
class.var = train.data[,1])
rfe.results
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (20 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.6752 0.1641 0.06328 0.1883
## 2 0.7318 0.2892 0.08317 0.2167
## 3 0.7620 0.3613 0.08767 0.2475 *
## 4 0.7605 0.3897 0.10169 0.2653
## 5 0.7420 0.3293 0.08432 0.2278
## 6 0.7500 0.3614 0.07478 0.1937
## 7 0.7517 0.3647 0.08677 0.2330
## 8 0.7499 0.3530 0.07685 0.2246
## 9 0.7400 0.3477 0.08205 0.2223
## 10 0.7449 0.3582 0.08895 0.2334
## 20 0.7482 0.3305 0.07384 0.2017
##
## The top 3 variables (out of 3):
## account.balance, credit.duration.months, previous.credit.payment.status
varImp((rfe.results))
## Overall
## account.balance 19.165378
## credit.duration.months 12.785366
## previous.credit.payment.status 8.009212
## guarantor 7.152412
Ou seja, o credito é aprovado quando a pessoa tem uma bom saldo bancario.
# Biblioteca de utilitários para construção de gráficos
source("plot_utils.R")
test.feature.vars <- test.data[,-1]
test.class.var <- test.data[,1]
formula.init <- "credit.rating ~ ."
formula.init <- as.formula(formula.init)
lr.model <- glm(formula = formula.init, data = train.data, family = "binomial")
summary(lr.model)
##
## Call:
## glm(formula = formula.init, family = "binomial", data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5483 -0.7012 0.3736 0.6994 2.0943
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26765 1.00048 0.268 0.78907
## account.balance2 0.59474 0.27362 2.174 0.02974 *
## account.balance3 1.74332 0.28950 6.022 1.73e-09 ***
## credit.duration.months -0.41323 0.14126 -2.925 0.00344 **
## previous.credit.payment.status2 0.52134 0.39180 1.331 0.18331
## previous.credit.payment.status3 1.32046 0.41534 3.179 0.00148 **
## credit.purpose2 -0.96992 0.52714 -1.840 0.06577 .
## credit.purpose3 -0.93383 0.49192 -1.898 0.05765 .
## credit.purpose4 -1.52566 0.47845 -3.189 0.00143 **
## credit.amount -0.35316 0.16396 -2.154 0.03125 *
## savings2 0.03168 0.38728 0.082 0.93480
## savings3 0.17571 0.40340 0.436 0.66315
## savings4 0.58072 0.34144 1.701 0.08898 .
## employment.duration2 0.23854 0.29937 0.797 0.42555
## employment.duration3 0.99957 0.37346 2.676 0.00744 **
## employment.duration4 0.24954 0.35740 0.698 0.48506
## installment.rate2 -0.11883 0.38074 -0.312 0.75497
## installment.rate3 -0.62551 0.42598 -1.468 0.14199
## installment.rate4 -0.96172 0.37223 -2.584 0.00978 **
## marital.status3 0.60166 0.26148 2.301 0.02139 *
## marital.status4 0.13407 0.38605 0.347 0.72838
## guarantor2 0.49274 0.37005 1.332 0.18301
## residence.duration2 -0.51126 0.37895 -1.349 0.17729
## residence.duration3 -0.41727 0.41561 -1.004 0.31539
## residence.duration4 -0.28448 0.38321 -0.742 0.45788
## current.assets2 -0.30824 0.32506 -0.948 0.34298
## current.assets3 -0.29912 0.30489 -0.981 0.32656
## current.assets4 -0.58922 0.55127 -1.069 0.28514
## age 0.17606 0.13748 1.281 0.20034
## other.credits2 0.26818 0.27642 0.970 0.33195
## apartment.type2 0.39356 0.29361 1.340 0.18010
## apartment.type3 0.15187 0.60157 0.252 0.80069
## bank.credits2 -0.45329 0.30468 -1.488 0.13681
## occupation2 0.10354 0.72174 0.143 0.88593
## occupation3 0.19619 0.69606 0.282 0.77805
## occupation4 0.14443 0.74056 0.195 0.84537
## dependents2 -0.33488 0.32988 -1.015 0.31004
## telephone2 0.21892 0.26195 0.836 0.40331
## foreign.worker2 1.20275 0.77877 1.544 0.12248
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 741.31 on 599 degrees of freedom
## Residual deviance: 545.78 on 561 degrees of freedom
## AIC: 623.78
##
## Number of Fisher Scoring iterations: 5
lr.predictions <- predict(lr.model, test.data, type="response")
lr.predictions <- round(lr.predictions)
confusionMatrix(table(data = lr.predictions, reference = test.class.var), positive = '1')
## Confusion Matrix and Statistics
##
## reference
## data 0 1
## 0 56 32
## 1 59 253
##
## Accuracy : 0.7725
## 95% CI : (0.7282, 0.8127)
## No Information Rate : 0.7125
## P-Value [Acc > NIR] : 0.004056
##
## Kappa : 0.4029
##
## Mcnemar's Test P-Value : 0.006420
##
## Sensitivity : 0.8877
## Specificity : 0.4870
## Pos Pred Value : 0.8109
## Neg Pred Value : 0.6364
## Prevalence : 0.7125
## Detection Rate : 0.6325
## Detection Prevalence : 0.7800
## Balanced Accuracy : 0.6873
##
## 'Positive' Class : 1
##
Ele etsa com um bom nível de acuracia. Mas podemos melhorar.
## Feature selection
formula <- "credit.rating ~ ."
formula <- as.formula(formula)
control <- trainControl(method = "repeatedcv", number = 10, repeats = 2)
model <- train(formula, data = train.data, method = "glm", trControl = control)
importance <- varImp(model, scale = FALSE)
plot(importance)
#### Construindo o modelo com as variáveis selecionadas
formula.new <- "credit.rating ~ account.balance + credit.purpose + previous.credit.payment.status + savings + credit.duration.months"
formula.new <- as.formula(formula.new)
lr.model.new <- glm(formula = formula.new, data = train.data, family = "binomial")
summary(lr.model.new)
##
## Call:
## glm(formula = formula.new, family = "binomial", data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5973 -0.8527 0.4662 0.7801 1.9928
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.25230 0.50376 -0.501 0.616492
## account.balance2 0.52367 0.24405 2.146 0.031895 *
## account.balance3 1.67680 0.26401 6.351 2.14e-10 ***
## credit.purpose2 -0.65516 0.44440 -1.474 0.140411
## credit.purpose3 -0.53313 0.41123 -1.296 0.194829
## credit.purpose4 -1.00485 0.40875 -2.458 0.013957 *
## previous.credit.payment.status2 0.82577 0.34188 2.415 0.015718 *
## previous.credit.payment.status3 1.38351 0.36556 3.785 0.000154 ***
## savings2 0.18985 0.35342 0.537 0.591153
## savings3 0.16117 0.37238 0.433 0.665147
## savings4 0.63801 0.30879 2.066 0.038814 *
## credit.duration.months -0.57597 0.09983 -5.770 7.94e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 741.31 on 599 degrees of freedom
## Residual deviance: 593.05 on 588 degrees of freedom
## AIC: 617.05
##
## Number of Fisher Scoring iterations: 5
lr.predictions.new <- predict(lr.model.new, test.data, type = "response")
lr.predictions.new <- round(lr.predictions.new)
# Avaliando o modelo
confusionMatrix(table(data = lr.predictions.new, reference = test.class.var), positive = '1')
## Confusion Matrix and Statistics
##
## reference
## data 0 1
## 0 36 26
## 1 79 259
##
## Accuracy : 0.7375
## 95% CI : (0.6915, 0.78)
## No Information Rate : 0.7125
## P-Value [Acc > NIR] : 0.1468
##
## Kappa : 0.2572
##
## Mcnemar's Test P-Value : 3.881e-07
##
## Sensitivity : 0.9088
## Specificity : 0.3130
## Pos Pred Value : 0.7663
## Neg Pred Value : 0.5806
## Prevalence : 0.7125
## Detection Rate : 0.6475
## Detection Prevalence : 0.8450
## Balanced Accuracy : 0.6109
##
## 'Positive' Class : 1
##
Melhoramos para 245 o nível de acuracia.
# Criando curvas ROC
lr.model.best <- lr.model
lr.prediction.values <- predict(lr.model.best, test.feature.vars, type = "response")
predictions <- prediction(lr.prediction.values, test.class.var)
par(mfrow = c(1,2))
plot.roc.curve(predictions, title.text = "Curva ROC")
plot.pr.curve(predictions, title.text = "Curva Precision/Recall")
O que estiver abaixo da linha vermelha deve ser rejeitada (volta e
refaz). O que estiver acima da linha vermelha está bom.
Quanto mais para o canto esquerdo melhor.