Ir para Fase 9.

Antes de começar as predições, definimos a função que monta o layout da matriz de confusão.

NAIVE BAYES

A caption

Realiza-se a carga dos dados de treino e de teste.

# Clear every object (hidden ones included) from the workspace and move to the
# project directory so the relative file names below resolve.
rm(list = ls(all.names = TRUE))
setwd("~/OneDrive/r-files/TCC/")
# load() restores the objects stored in each .Rda file; the code below expects
# them to provide `treino` and `teste`.
load(file = "treino.Rda")
# NOTE(review): this section loads "teste_iguatemi.Rda", while the later
# sections load "teste.Rda" — confirm the different test set is intentional.
load(file = "teste_iguatemi.Rda")
head(treino)

head(teste)

Carrega-se a biblioteca que implementa o algoritmo Naive Bayes.

library(e1071)

Aplica-se o método para gerar o classificador.

# Train the Naive Bayes classifier: columns 4 to 12 of `treino` are the
# predictors and `perc_oc` is the class label.
classificador <- naiveBayes(x = treino[, 4:12], y = treino$perc_oc, laplace = 3)
# Show the fitted model (priors and per-class conditional tables).
print(classificador)

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = treino[, 4:12], y = treino$perc_oc, laplace = 3)
## 
## A-priori probabilities:
## treino$perc_oc
##   0   1 
## 0.6 0.4 
## 
## Conditional probabilities:
##               area
## treino$perc_oc      [,1]      [,2]
##              0 0.4859251 0.2879239
##              1 0.5684838 0.3396876
## 
##               ocorrencias
## treino$perc_oc     [,1]     [,2]
##              0 3319.605 1399.331
##              1 2883.039 1305.014
## 
##               alvara_km2
## treino$perc_oc     [,1]     [,2]
##              0 10405.01 9905.198
##              1  7364.23 7436.275
## 
##               distancia_medoide
## treino$perc_oc     [,1]     [,2]
##              0 314.0194 184.1401
##              1 399.5308 163.7092
## 
##               distancia_cemtroide
## treino$perc_oc     [,1]     [,2]
##              0 206.6915 92.08138
##              1 234.5654 94.61492
## 
##               distancia_exemplars
## treino$perc_oc     [,1]      [,2]
##              0 203.9196  94.30407
##              1 243.8121 105.81817
## 
##               alvaras
## treino$perc_oc     [,1]     [,2]
##              0 441.3070 418.9180
##              1 262.0526 233.4798
## 
##               comercio
## treino$perc_oc      [,1]      [,2]
##              0 0.2756409 0.1437719
##              1 0.3286748 0.1553855
## 
##               servico
## treino$perc_oc      [,1]      [,2]
##              0 0.7243591 0.1437719
##              1 0.6756499 0.1588840

# Inspect the `apriori` component of the fitted model (per-class counts).
classificador$apriori

## treino$perc_oc
##   0   1 
## 114  76

Realiza-se a predição dos resultados.

# Classify the test rows using the same predictor columns (4 to 12) that were
# used for training, then display the predicted labels.
previsoes <- predict(classificador, newdata = teste[, 4:12])
previsoes

## [1] 0 1 0 0 0 0 0 1
## Levels: 0 1

Cria-se a matriz de confusão e aplica-se a verificação dos resultados.

# Cross-tabulate predicted against actual classes; the argument names become
# the dimension names of the table.
cm <- table(Predito = previsoes, Atual = teste$perc_oc)
cm

##        Atual
## Predito 0 1
##       0 2 4
##       1 1 1

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

# Derive the caret confusion-matrix statistics from the contingency table.
cm_nb <- confusionMatrix(cm)
cm_nb

## Confusion Matrix and Statistics
## 
##        Atual
## Predito 0 1
##       0 2 4
##       1 1 1
##                                           
##                Accuracy : 0.375           
##                  95% CI : (0.0852, 0.7551)
##     No Information Rate : 0.625           
##     P-Value [Acc > NIR] : 0.9640          
##                                           
##                   Kappa : -0.1111         
##                                           
##  Mcnemar's Test P-Value : 0.3711          
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.2000          
##          Pos Pred Value : 0.3333          
##          Neg Pred Value : 0.5000          
##              Prevalence : 0.3750          
##          Detection Rate : 0.2500          
##    Detection Prevalence : 0.7500          
##       Balanced Accuracy : 0.4333          
##                                           
##        'Positive' Class : 0               
##

# Load the project's confusion-matrix layout helper and draw the matrix.
source("matriz_confusao.R")
matriz(cm_nb)

# Fourfold display of the 2x2 table.
fourfoldplot(cm_nb$table)

# Collapse the model into a labelled vector: model name, first `overall`
# entry and first four `byClass` entries. Mixing a string with numbers makes
# c() return a character vector, so the metrics are stored as text.
cm_nb <- c("Naive Bayes", cm_nb$overall[1], cm_nb$byClass[1:4])
# Start the comparison table with this model as its first column.
resultados <- as.data.frame(cm_nb)

REGRAS DE DECISÃO

A caption

# Work from the project directory and reload the raw training/test sets.
setwd("~/OneDrive/r-files/TCC/")
load(file = "treino.Rda")
load(file = "teste.Rda")
# Treat a missing `comercio` value in the training set as zero.
treino$comercio <- replace(treino$comercio, is.na(treino$comercio), 0)

Preparação dos datasets de treino e teste.

# Keep column 1 plus columns 4 to 12 of the training set (10 columns).
dt_treino = treino[, c(1, 4:12)]
# Move `perc_oc` to the last position: columns 2 to 10 first, then perc_oc.
# The appended column is unnamed in the call, so its name is derived from the
# expression; later code refers to it as `dt_teste.perc_oc` on the test side.
dt_treino = cbind(dt_treino[, 2:10], dt_treino$perc_oc)
head(dt_treino)

# Same selection and reordering for the test set.
dt_teste= teste[, c(1, 4:12)]
dt_teste = cbind(dt_teste[, 2:10], dt_teste$perc_oc)
head(dt_teste)

Transformação dos data frames em objetos do tipo Decision Table.

library(RoughSets)

## Loading required package: Rcpp

# Wrap both data frames as RoughSets decision tables; column 10 (the class
# moved to the end above) is the decision attribute.
dt_treino <- SF.asDecisionTable(dataset = dt_treino, decision.attr = 10)
dt_teste <- SF.asDecisionTable(dataset = dt_teste, decision.attr = 10)

Realiza a conversão de números em valores nominais. Esta abordagem é necessária para métodos que utilizam a teoria dos conjuntos aproximados e que calculam a relação de indiscernibilidade. Estes dados nominais servirão para definir valores de corte (fatores ordenados), que também requerem um atributo de decisão nominal.

# Compute the cut values on the training table, asking for 10 intervals per
# attribute, and show the resulting discretization model.
discretizados <- D.discretization.RST(dt_treino, nOfIntervals = 10)
discretizados

## $cut.values
## $cut.values$area
## [1] 0.2223374 0.3379639 0.3435523 0.3688524 0.3927846 0.5214642 0.5980715
## [8] 0.6977012 0.7514241
## 
## $cut.values$ocorrencias
## [1] 1857.0 1995.0 2143.0 2359.0 2572.0 3255.0 4380.0 4400.4 4685.0
## 
## $cut.values$alvara_km2
## [1]  2293.438  2661.598  3571.712  5809.913  6005.836  6358.560  8595.029
## [8] 12959.962 23190.807
## 
## $cut.values$distancia_medoide
## [1] 141.0276 194.2406 228.0667 265.5146 303.1181 351.1803 449.9052 544.4051
## [9] 597.0984
## 
## $cut.values$distancia_cemtroide
## [1]  79.70009 132.15562 174.63601 200.48180 218.50759 243.59899 272.39066
## [8] 303.66067 331.10901
## 
## $cut.values$distancia_exemplars
## [1]  89.85229 127.98123 168.43967 196.77732 219.36332 245.83364 269.82913
## [8] 308.47148 345.45784
## 
## $cut.values$alvaras
## [1]  81.7 134.0 169.0 197.2 251.5 309.0 394.6 588.8 753.2
## 
## $cut.values$comercio
## [1] 0.08595492 0.15685412 0.21257993 0.24976722 0.30781441 0.34059430
## [7] 0.37681999 0.41346642 0.49765897
## 
## $cut.values$servico
## [1] 0.5023410 0.5865336 0.6231800 0.6594057 0.6921856 0.7502328 0.7874201
## [8] 0.8431459 0.9140451
## 
## 
## $type.method
## [1] "unsupervised.quantiles"
## 
## $type.task
## [1] "discretization"
## 
## $model
## [1] "RST"
## 
## attr(,"class")
## [1] "Discretization" "list"

Realiza o typecast com base no modelo discretizado.

# Apply the cut values learned on the training table to both tables, so the
# test set is binned with the training intervals.
dt_treino <- SF.applyDecTable(dt_treino, discretizados)
dt_teste <- SF.applyDecTable(dt_teste, discretizados)
head(dt_teste)

head(dt_treino)

Realiza a classificação supervisionada por indução, de acordo com o algoritmo CN2.

# Induce the CN2 rule set from the discretized training table (K = 15).
classifier <- RI.CN2Rules.RST(dt_treino, K = 15)
print(classifier)

## A set consisting of  56  rules:
## 1. IF alvara_km2 is (1.3e+04,2.32e+04] THEN  is 0;
##      (supportSize=12; laplace=0.928571428571429) 
## 2. IF distancia_medoide is [-Inf,141] and distancia_cemtroide is (79.7,132] THEN  is 0;
##      (supportSize=6; laplace=0.875) 
## 3. IF alvara_km2 is (6.01e+03,6.36e+03] and distancia_medoide is (597, Inf] THEN  is 0;
##      (supportSize=5; laplace=0.857142857142857) 
## 4. IF ocorrencias is (4.4e+03,4.68e+03] and alvaras is (589,753] THEN  is 0;
##      (supportSize=5; laplace=0.857142857142857) 
## 5. IF alvara_km2 is (2.66e+03,3.57e+03] and area is (0.598,0.698] THEN  is 1;
##      (supportSize=7; laplace=0.888888888888889) 
## 6. IF distancia_medoide is (194,228] and area is (0.344,0.369] THEN  is 0;
##      (supportSize=5; laplace=0.857142857142857) 
## 7. IF alvara_km2 is (3.57e+03,5.81e+03] and alvaras is (197,252] THEN  is 1;
##      (supportSize=7; laplace=0.888888888888889) 
## 8. IF ocorrencias is (2.57e+03,3.26e+03] and distancia_medoide is (228,266] THEN  is 0;
##      (supportSize=5; laplace=0.857142857142857) 
## 9. IF distancia_medoide is (266,303] and area is (0.222,0.338] THEN  is 0;
##      (supportSize=4; laplace=0.833333333333333) 
## 10. IF area is (0.369,0.393] and distancia_medoide is (194,228] THEN  is 0;
##      (supportSize=4; laplace=0.833333333333333) 
## ... and 46 other rules.

Com base nas regras atribuídas ao classificador, é feita a classificação dos dados de teste.

# Classify the test table with the induced rules; column 10 (the decision
# attribute) is dropped from the input.
prediction <- predict(classifier, newdata = dt_teste[-10])
prediction

Cria-se a matriz de confusão e faz-se a verificação dos resultados.

# Flatten the prediction object to a plain vector and cross-tabulate it
# against the decision column of the test table.
mc <- table(Predito = unlist(prediction), Atual = dt_teste$dt_teste.perc_oc)
mc

##        Atual
## Predito  0  1
##       0 13  7
##       1  5  5

library(caret)
# Mosaic plot of the decision-rules contingency table.
plot(mc)

# caret statistics for the decision-rules model.
cm_rd <- confusionMatrix(mc)
cm_rd

## Confusion Matrix and Statistics
## 
##        Atual
## Predito  0  1
##       0 13  7
##       1  5  5
##                                          
##                Accuracy : 0.6            
##                  95% CI : (0.406, 0.7734)
##     No Information Rate : 0.6            
##     P-Value [Acc > NIR] : 0.5785         
##                                          
##                   Kappa : 0.1429         
##                                          
##  Mcnemar's Test P-Value : 0.7728         
##                                          
##             Sensitivity : 0.7222         
##             Specificity : 0.4167         
##          Pos Pred Value : 0.6500         
##          Neg Pred Value : 0.5000         
##              Prevalence : 0.6000         
##          Detection Rate : 0.4333         
##    Detection Prevalence : 0.6667         
##       Balanced Accuracy : 0.5694         
##                                          
##        'Positive' Class : 0              
##

# Load the project's confusion-matrix layout helper and draw the matrix.
source("matriz_confusao.R")
matriz(cm_rd)

# Fourfold display of the 2x2 table.
fourfoldplot(cm_rd$table)

# Labelled summary vector (model name, first `overall` entry, first four
# `byClass` entries); c() turns the numbers into text because of the label.
cm_rd <- c("Regras de Decisão", cm_rd$overall[1], cm_rd$byClass[1:4])
# Append this model as a new column of the comparison table.
resultados <- as.data.frame(cbind(resultados, cm_rd))

REGRESSÃO LOGÍSTICA

A caption

# Reload the raw training and test sets for the logistic regression section.
load(file = "treino.Rda")
head(treino)

load(file = "teste.Rda")
head(teste)

# Treat a missing `comercio` value in the training set as zero.
treino$comercio <- replace(treino$comercio, is.na(treino$comercio), 0)

# Keep column 1 together with columns 4 to 12 in both sets.
treino <- treino[, c(1, 4:12)]
teste <- teste[, c(1, 4:12)]
head(treino)

head(teste)

# Binomial GLM of perc_oc on every remaining column, allowing up to 50
# fitting iterations.
classificador <- glm(
  formula = perc_oc ~ .,
  family = binomial,
  data = treino,
  control = list(maxit = 50)
)

# Predicted probabilities for the test rows (first column dropped from the
# input).
probabilidades <- predict(classificador, newdata = teste[, -1], type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

# Turn probabilities into class labels with a 0.5 cut-off; ifelse() keeps the
# row names carried by `probabilidades`.
previsoes <- ifelse(probabilidades > 0.5, 1, 0)
previsoes

##   9  15  19  27  44  48  60  64  75  78  88  96 100 101 106 116 134 138 
##   0   1   0   1   1   1   0   0   0   0   1   0   0   0   0   1   0   0 
## 143 144 152 153 189 193 194 197 202 214 216 218 
##   0   0   0   0   0   0   0   0   0   0   1   1

# Cross-tabulate predicted against actual classes; the argument names become
# the dimension names of the table.
cm <- table(Predito = previsoes, Atual = teste$perc_oc)
cm

##        Atual
## Predito  0  1
##       0 15  7
##       1  3  5

library(caret)
# Plot the contingency table built for the logistic regression (`cm`).
# `mc` must not be used here: it still holds the table of a different model
# computed in an earlier section.
plot(cm)

# caret statistics for the logistic regression model.
cm_rl <- confusionMatrix(cm)
cm_rl

## Confusion Matrix and Statistics
## 
##        Atual
## Predito  0  1
##       0 15  7
##       1  3  5
##                                           
##                Accuracy : 0.6667          
##                  95% CI : (0.4719, 0.8271)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 0.2915          
##                                           
##                   Kappa : 0.2647          
##                                           
##  Mcnemar's Test P-Value : 0.3428          
##                                           
##             Sensitivity : 0.8333          
##             Specificity : 0.4167          
##          Pos Pred Value : 0.6818          
##          Neg Pred Value : 0.6250          
##              Prevalence : 0.6000          
##          Detection Rate : 0.5000          
##    Detection Prevalence : 0.7333          
##       Balanced Accuracy : 0.6250          
##                                           
##        'Positive' Class : 0               
##

# Load the project's confusion-matrix layout helper and draw the matrix.
source("matriz_confusao.R")
matriz(cm_rl)

# Fourfold display of the 2x2 table.
fourfoldplot(cm_rl$table)

# Labelled summary vector (model name, first `overall` entry, first four
# `byClass` entries); c() turns the numbers into text because of the label.
cm_rl <- c("Regressão Logística", cm_rl$overall[1], cm_rl$byClass[1:4])
# Append this model as a new column of the comparison table.
resultados <- as.data.frame(cbind(resultados, cm_rl))

REGRESSÃO LINEAR

# Strongly penalize scientific notation when printing numbers.
options(scipen = 999)
setwd("~/OneDrive/r-files//TCC")
# Reload the training set and drop the two occupancy columns.
load(file = "treino.Rda")
treino$ocupacao <- NULL
treino$faixas_ocupacao <- NULL
head(treino)

dim(treino)

## [1] 190  10

# Treat a missing `comercio` value in the training set as zero.
treino$comercio <- replace(treino$comercio, is.na(treino$comercio), 0)
# Reload the test set and drop the same two occupancy columns.
load(file = "teste.Rda")
teste$ocupacao <- NULL
teste$faixas_ocupacao <- NULL
head(teste)

dim(teste)

## [1] 30 10

# Columns used by the linear model. `distancia_exemplars` is listed so that
# every predictor receives the same conversion as the others.
colunas_numericas <- c(
  "perc_oc", "area", "ocorrencias", "alvara_km2", "distancia_medoide",
  "distancia_cemtroide", "distancia_exemplars", "alvaras", "comercio",
  "servico"
)

# Convert a column to numeric through its text representation, so a factor
# yields its labels rather than its internal integer codes.
para_numerico <- function(coluna) as.numeric(as.character(coluna))

treino[colunas_numericas] <- lapply(treino[colunas_numericas], para_numerico)
# na.rm = TRUE is required: the mean of a column that contains NA is itself
# NA, which would "fill" the gaps with NA again.
treino$comercio[is.na(treino$comercio)] <- mean(treino$comercio, na.rm = TRUE)

teste[colunas_numericas] <- lapply(teste[colunas_numericas], para_numerico)
teste$comercio[is.na(teste$comercio)] <- mean(teste$comercio, na.rm = TRUE)

# Ordinary least squares of perc_oc (now numeric 0/1) on all other columns.
linearMod <- lm(perc_oc ~ ., data = treino)
summary(linearMod)

## 
## Call:
## lm(formula = perc_oc ~ ., data = treino)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9260 -0.3747 -0.2190  0.4625  0.9805 
## 
## Coefficients: (1 not defined because of singularities)
##                         Estimate   Std. Error t value Pr(>|t|)  
## (Intercept)         -0.080505162  0.217437803  -0.370   0.7116  
## area                 0.167956582  0.152419140   1.102   0.2720  
## ocorrencias          0.000038101  0.000051212   0.744   0.4578  
## alvara_km2          -0.000003586  0.000008676  -0.413   0.6798  
## distancia_medoide    0.000455034  0.000233509   1.949   0.0529 .
## distancia_cemtroide -0.001447604  0.000830130  -1.744   0.0829 .
## distancia_exemplars  0.001761692  0.000747266   2.358   0.0195 *
## alvaras             -0.000175937  0.000133326  -1.320   0.1886  
## comercio             0.477882208  0.236070047   2.024   0.0444 *
## servico                       NA           NA      NA       NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4677 on 181 degrees of freedom
## Multiple R-squared:  0.1318, Adjusted R-squared:  0.09342 
## F-statistic: 3.434 on 8 and 181 DF,  p-value: 0.001052

# Print numbers with 2 significant digits from here on (global option).
options(digits = 2)
# Predict the test rows with the linear model (first column dropped from the
# input).
predicoes <- predict.lm(linearMod, teste[, -1])

## Warning in predict.lm(linearMod, teste[, -1]): prediction from a rank-
## deficient fit may be misleading

# Turn the fitted values into class labels with a 0.5 cut-off; ifelse() keeps
# the row names carried by `predicoes`.
predicoes <- ifelse(predicoes > 0.5, 1, 0)
predicoes

##   9  15  19  27  44  48  60  64  75  78  88  96 100 101 106 116 134 138 
##   0   1   0   1   1   1   0   0   0   0   1   0   0   0   0   1   0   0 
## 143 144 152 153 189 193 194 197 202 214 216 218 
##   0   0   0   0   0   0   1   0   0   0   1   1

# Cross-tabulate predicted against actual classes; the argument names become
# the dimension names of the table.
cm <- table(Predito = predicoes, Atual = teste$perc_oc)
cm

##        Atual
## Predito  0  1
##       0 14  7
##       1  4  5

library(caret)
# Plot the contingency table built for the linear regression (`cm`).
# `mc` must not be used here: it still holds the table of a different model
# computed in an earlier section.
plot(cm)

# caret statistics for the linear regression model.
cm_rlin <- confusionMatrix(cm)
cm_rlin

## Confusion Matrix and Statistics
## 
##        Atual
## Predito  0  1
##       0 14  7
##       1  4  5
##                                         
##                Accuracy : 0.633         
##                  95% CI : (0.439, 0.801)
##     No Information Rate : 0.6           
##     P-Value [Acc > NIR] : 0.431         
##                                         
##                   Kappa : 0.203         
##                                         
##  Mcnemar's Test P-Value : 0.546         
##                                         
##             Sensitivity : 0.778         
##             Specificity : 0.417         
##          Pos Pred Value : 0.667         
##          Neg Pred Value : 0.556         
##              Prevalence : 0.600         
##          Detection Rate : 0.467         
##    Detection Prevalence : 0.700         
##       Balanced Accuracy : 0.597         
##                                         
##        'Positive' Class : 0             
##

# Load the project's confusion-matrix layout helper and draw the matrix.
source("matriz_confusao.R")
matriz(cm_rlin)

# Fourfold display of the 2x2 table.
fourfoldplot(cm_rlin$table)

# Labelled summary vector (model name, first `overall` entry, first four
# `byClass` entries); c() turns the numbers into text because of the label.
cm_rlin <- c("Regressão Linear", cm_rlin$overall[1], cm_rlin$byClass[1:4])
# Append this model as a new column of the comparison table.
resultados <- as.data.frame(cbind(resultados, cm_rlin))

Árvores de Decisão

A caption

# Reload the raw training and test sets for the decision-tree section.
load(file = "treino.Rda")
load(file = "teste.Rda")
head(treino)

head(teste)

# Treat a missing `comercio` value in the training set as zero.
treino$comercio <- replace(treino$comercio, is.na(treino$comercio), 0)

library(rpart)
library(rpart.plot)

# Drop columns 2 and 3 and make the target a factor so that rpart grows a
# classification tree.
treino[, 2:3] <- NULL
treino$perc_oc <- as.factor(treino$perc_oc)
head(treino)

# Grow the tree on every remaining column and print its splits.
classifier2 <- rpart(formula = perc_oc ~ ., data = treino)
print(classifier2)

## n= 190 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 190 76 0 (0.60 0.40)  
##    2) distancia_medoide< 3.1e+02 97 24 0 (0.75 0.25)  
##      4) distancia_exemplars< 3.1e+02 88 18 0 (0.80 0.20) *
##      5) distancia_exemplars>=3.1e+02 9  3 1 (0.33 0.67) *
##    3) distancia_medoide>=3.1e+02 93 41 1 (0.44 0.56)  
##      6) distancia_medoide>=6.9e+02 8  1 0 (0.87 0.12) *
##      7) distancia_medoide< 6.9e+02 85 34 1 (0.40 0.60)  
##       14) comercio< 0.37 58 28 1 (0.48 0.52)  
##         28) distancia_medoide< 5.3e+02 28 10 0 (0.64 0.36)  
##           56) area< 0.72 21  5 0 (0.76 0.24) *
##           57) area>=0.72 7  2 1 (0.29 0.71) *
##         29) distancia_medoide>=5.3e+02 30 10 1 (0.33 0.67) *
##       15) comercio>=0.37 27  6 1 (0.22 0.78) *

# Draw the fitted tree.
rpart.plot(classifier2)

# Drop columns 2 and 3 from the test set as well.
teste[, 2:3] <- NULL

head(teste)

# Default prediction for a classification tree: one probability column per
# class (first column of the test set dropped from the input).
previsoes <- predict(classifier2, newdata = teste[, -1])
previsoes

##        0    1
## 9   0.87 0.12
## 15  0.22 0.78
## 19  0.76 0.24
## 27  0.33 0.67
## 44  0.29 0.71
## 48  0.29 0.71
## 60  0.80 0.20
## 64  0.80 0.20
## 75  0.33 0.67
## 78  0.33 0.67
## 88  0.80 0.20
## 96  0.80 0.20
## 100 0.80 0.20
## 101 0.80 0.20
## 106 0.80 0.20
## 116 0.80 0.20
## 134 0.80 0.20
## 138 0.80 0.20
## 143 0.80 0.20
## 144 0.80 0.20
## 152 0.80 0.20
## 153 0.80 0.20
## 189 0.76 0.24
## 193 0.76 0.24
## 194 0.33 0.67
## 197 0.80 0.20
## 202 0.80 0.20
## 214 0.76 0.24
## 216 0.76 0.24
## 218 0.22 0.78

# Predict again, this time asking for the class label instead of the class
# probabilities, and preview the first values.
previsoes <- predict(classifier2, newdata = teste, type = "class")
head(previsoes)

##  9 15 19 27 44 48 
##  0  1  0  1  1  1 
## Levels: 0 1

# Cross-tabulate predicted against actual classes; the argument names become
# the dimension names of the table.
mc <- table(Predito = previsoes, Atual = teste$perc_oc)
print(mc)

##        Atual
## Predito  0  1
##       0 13  9
##       1  5  3

library(caret)
# Mosaic plot of the decision-tree contingency table.
plot(mc)

# caret statistics for the decision-tree model.
cm_ad <- confusionMatrix(mc)
cm_ad

## Confusion Matrix and Statistics
## 
##        Atual
## Predito  0  1
##       0 13  9
##       1  5  3
##                                         
##                Accuracy : 0.533         
##                  95% CI : (0.343, 0.717)
##     No Information Rate : 0.6           
##     P-Value [Acc > NIR] : 0.825         
##                                         
##                   Kappa : -0.029        
##                                         
##  Mcnemar's Test P-Value : 0.423         
##                                         
##             Sensitivity : 0.722         
##             Specificity : 0.250         
##          Pos Pred Value : 0.591         
##          Neg Pred Value : 0.375         
##              Prevalence : 0.600         
##          Detection Rate : 0.433         
##    Detection Prevalence : 0.733         
##       Balanced Accuracy : 0.486         
##                                         
##        'Positive' Class : 0             
##

# Load the project's confusion-matrix layout helper and draw the matrix.
source("matriz_confusao.R")
matriz(cm_ad)

# Fourfold display of the 2x2 table.
fourfoldplot(cm_ad$table)

# Labelled summary vector (model name, first `overall` entry, first four
# `byClass` entries); c() turns the numbers into text because of the label.
cm_ad <- c("Arvores de Decisão", cm_ad$overall[1], cm_ad$byClass[1:4])
# Append this model as a new column of the comparison table.
resultados <- as.data.frame(cbind(resultados, cm_ad))

Floresta Randômica

# Reload the raw training and test sets for the random-forest section.
load(file = "treino.Rda")
head(treino)

load(file = "teste.Rda")
head(teste)

# Treat a missing `comercio` value in the training set as zero.
treino$comercio <- replace(treino$comercio, is.na(treino$comercio), 0)

A caption

# Columns converted to numeric through their text representation, so a factor
# yields its labels rather than its internal integer codes.
colunas_via_texto <- c(
  "perc_oc", "ocupacao", "area", "ocorrencias", "alvara_km2",
  "distancia_medoide", "distancia_cemtroide", "distancia_exemplars",
  "alvaras", "comercio", "servico"
)

# Apply the numeric conversions to one data set and return it.
#   dados:   data frame holding the columns named in `colunas` plus
#            `faixas_ocupacao`.
#   colunas: names of the columns converted via as.character().
# `faixas_ocupacao` is converted directly with as.numeric(), as before.
# NOTE(review): if it is a factor this yields the level codes — confirm that
# is what is wanted.
converter_colunas <- function(dados, colunas = colunas_via_texto) {
  dados[colunas] <- lapply(
    dados[colunas],
    function(coluna) as.numeric(as.character(coluna))
  )
  dados$faixas_ocupacao <- as.numeric(dados$faixas_ocupacao)
  dados
}

teste <- converter_colunas(teste)
treino <- converter_colunas(treino)

# Standardize the predictors (columns 4 to 12) using the TRAINING statistics
# only. Scaling the test set with its own mean/sd would put the two sets on
# different scales, so split thresholds learned on `treino` would not line up
# with the values in `teste`.
treino_escalado <- scale(treino[, 4:12])
teste[, 4:12] <- scale(
  teste[, 4:12],
  center = attr(treino_escalado, "scaled:center"),
  scale = attr(treino_escalado, "scaled:scale")
)
treino[, 4:12] <- treino_escalado

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

# A factor response makes randomForest run in classification mode.
treino$perc_oc <- as.factor(treino$perc_oc)
treino$faixas_ocupacao <- as.factor(treino$faixas_ocupacao)
treino$ocupacao <- as.factor(treino$ocupacao)

# Fit a forest of 13 trees on predictor columns 4 to 12.
# NOTE(review): no random seed is set before this call, so repeated runs can
# give different forests — confirm whether reproducibility is required.
classifier <- randomForest(x = treino[, 4:12], y = treino$perc_oc, ntree = 13)
print(classifier)

## 
## Call:
##  randomForest(x = treino[, 4:12], y = treino$perc_oc, ntree = 13) 
##                Type of random forest: classification
##                      Number of trees: 13
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 36%
## Confusion matrix:
##    0  1 class.error
## 0 81 33        0.29
## 1 36 40        0.47

# Classify the test rows using the same predictor columns (4 to 12).
previsoes <- predict(classifier, newdata = teste[, 4:12])
previsoes

##   9  15  19  27  44  48  60  64  75  78  88  96 100 101 106 116 134 138 
##   0   0   0   0   1   1   0   0   0   0   1   0   0   0   0   1   0   0 
## 143 144 152 153 189 193 194 197 202 214 216 218 
##   0   0   0   0   0   0   0   0   0   1   1   1 
## Levels: 0 1

# Cross-tabulate predicted against actual classes; the argument names become
# the dimension names of the table.
mc <- table(Predito = previsoes, Atual = teste$perc_oc)
mc

##        Atual
## Predito  0  1
##       0 17  6
##       1  1  6

library(caret)
# Mosaic plot of the random-forest contingency table.
plot(mc)

# caret statistics for the random-forest model.
cm_fr <- confusionMatrix(mc)
cm_fr

## Confusion Matrix and Statistics
## 
##        Atual
## Predito  0  1
##       0 17  6
##       1  1  6
##                                         
##                Accuracy : 0.767         
##                  95% CI : (0.577, 0.901)
##     No Information Rate : 0.6           
##     P-Value [Acc > NIR] : 0.0435        
##                                         
##                   Kappa : 0.478         
##                                         
##  Mcnemar's Test P-Value : 0.1306        
##                                         
##             Sensitivity : 0.944         
##             Specificity : 0.500         
##          Pos Pred Value : 0.739         
##          Neg Pred Value : 0.857         
##              Prevalence : 0.600         
##          Detection Rate : 0.567         
##    Detection Prevalence : 0.767         
##       Balanced Accuracy : 0.722         
##                                         
##        'Positive' Class : 0             
##

# Load the project's confusion-matrix layout helper and draw the matrix.
source("matriz_confusao.R")
matriz(cm_fr)

# Fourfold display of the 2x2 table.
fourfoldplot(cm_fr$table)

# Labelled summary vector (model name, first `overall` entry, first four
# `byClass` entries); c() turns the numbers into text because of the label.
cm_fr <- c("Floresta Randômica", cm_fr$overall[1], cm_fr$byClass[1:4])
# Append this model as a new column of the comparison table.
resultados <- as.data.frame(cbind(resultados, cm_fr))

REDE NEURAL ARTIFICIAL

A caption

# Strongly penalize scientific notation when printing numbers.
options(scipen = 999)
# Reload the training set and drop the two occupancy columns.
load(file = "treino.Rda")
treino$ocupacao <- NULL
treino$faixas_ocupacao <- NULL
head(treino)

dim(treino)

## [1] 190  10

# Treat a missing `comercio` value in the training set as zero.
treino$comercio <- replace(treino$comercio, is.na(treino$comercio), 0)
# Reload the test set and drop the same two occupancy columns.
load(file = "teste.Rda")
teste$ocupacao <- NULL
teste$faixas_ocupacao <- NULL
head(teste)

dim(teste)

## [1] 30 10

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

# Connect to the local H2O cluster used for the neural network.
# NOTE(review): nthreads = -1 is understood to mean "use all available
# cores" — confirm against the H2O documentation.
h2o.init(nthreads = -1)

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         37 minutes 49 seconds 
##     H2O cluster timezone:       America/Sao_Paulo 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.22.1.1 
##     H2O cluster version age:    6 months and 8 days !!! 
##     H2O cluster name:           H2O_started_from_R_fagnersuteldemoura_hep385 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   2.00 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.2 (2018-12-20)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is too old (6 months and 8 days)!
## Please download and install the latest version from http://h2o.ai/download/

# Train a feed-forward network on the training frame: target `perc_oc`,
# ReLU ("Rectifier") activation, two hidden layers of 22 and 30 units,
# 1000 epochs. The data frame is uploaded to H2O with as.h2o().
classifier <- h2o.deeplearning(
  y = "perc_oc",
  training_frame = as.h2o(treino),
  activation = "Rectifier",
  hidden = c(22, 30),
  epochs = 1000
)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |=================================================================| 100%

# Show the trained network's layer summary and training-set metrics.
print(classifier)

## Model Details:
## ==============
## 
## H2OBinomialModel: deeplearning
## Model ID:  DeepLearning_model_R_1562507513506_25 
## Status of Neuron Layers: predicting perc_oc, 2-class classification, bernoulli distribution, CrossEntropy loss, 972 weights/biases, 16.2 KB, 190,000 training samples, mini-batch size 1
##   layer units      type dropout       l1       l2 mean_rate rate_rms
## 1     1     9     Input  0.00 %       NA       NA        NA       NA
## 2     2    22 Rectifier  0.00 % 0.000000 0.000000  0.000709 0.000974
## 3     3    30 Rectifier  0.00 % 0.000000 0.000000  0.002032 0.002552
## 4     4     2   Softmax      NA 0.000000 0.000000  0.002374 0.002061
##   momentum mean_weight weight_rms mean_bias bias_rms
## 1       NA          NA         NA        NA       NA
## 2 0.000000   -0.062958   0.378726  0.552060 0.199394
## 3 0.000000   -0.012868   0.301501  1.021265 0.077520
## 4 0.000000   -0.296918   1.274050  0.001083 0.018389
## 
## 
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
## 
## MSE:  0.05
## RMSE:  0.22
## LogLoss:  0.18
## Mean Per-Class Error:  0.044
## AUC:  0.98
## pr_auc:  0.97
## Gini:  0.97
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##          0  1    Error    Rate
## 0      107  7 0.061404  =7/114
## 1        2 74 0.026316   =2/76
## Totals 109 81 0.047368  =9/190
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.105517 0.942675  80
## 2                       max f2  0.105517 0.961039  80
## 3                 max f0point5  0.526731 0.941011  69
## 4                 max accuracy  0.105517 0.952632  80
## 5                max precision  0.999990 1.000000   0
## 6                   max recall  0.003417 1.000000 114
## 7              max specificity  0.999990 1.000000   0
## 8             max absolute_mcc  0.105517 0.903716  80
## 9   max min_per_class_accuracy  0.174170 0.947368  77
## 10 max mean_per_class_accuracy  0.105517 0.956140  80
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

# Score the test rows (first column dropped) after uploading them to H2O.
predicoes <- h2o.predict(classifier, newdata = as.h2o(teste[, -1]))

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

# Preview the H2O prediction frame (predicted label plus p0/p1 columns).
predicoes

##   predict         p0     p1
## 1       1 0.44147370 0.5585
## 2       0 0.99182652 0.0082
## 3       0 0.99545383 0.0045
## 4       1 0.72081741 0.2792
## 5       1 0.00000077 1.0000
## 6       1 0.00000653 1.0000
## 
## [30 rows x 3 columns]

# Bring the predictions back into a regular data frame and cross-tabulate the
# predicted label against the actual class.
predicoes <- as.data.frame(predicoes)
cm <- table(Predito = predicoes$predict, Atual = teste$perc_oc)
cm

##        Atual
## Predito  0  1
##       0 14  3
##       1  4  9

library(caret)
# Plot the contingency table built for the neural network (`cm`).
# `mc` must not be used here: it still holds the table of a different model
# computed in an earlier section.
plot(cm)

# caret statistics for the neural-network model.
cm_rna <- confusionMatrix(cm)
cm_rna

## Confusion Matrix and Statistics
## 
##        Atual
## Predito  0  1
##       0 14  3
##       1  4  9
##                                         
##                Accuracy : 0.767         
##                  95% CI : (0.577, 0.901)
##     No Information Rate : 0.6           
##     P-Value [Acc > NIR] : 0.0435        
##                                         
##                   Kappa : 0.521         
##                                         
##  Mcnemar's Test P-Value : 1.0000        
##                                         
##             Sensitivity : 0.778         
##             Specificity : 0.750         
##          Pos Pred Value : 0.824         
##          Neg Pred Value : 0.692         
##              Prevalence : 0.600         
##          Detection Rate : 0.467         
##    Detection Prevalence : 0.567         
##       Balanced Accuracy : 0.764         
##                                         
##        'Positive' Class : 0             
##

# Draw the confusion matrix with matriz() (presumably defined in the sourced
# script) and as a fourfold plot.
source("matriz_confusao.R")
matriz(cm_rna)

fourfoldplot(cm_rna$table)

# From here on `cm_rna` is a character vector: the model label, the first
# `overall` entry and the first four `byClass` entries. It is appended to
# `resultados` as a new column.
cm_rna <- c(
  "Rede Neural Artificial",
  cm_rna$overall[1],
  cm_rna$byClass[1:4]
)
resultados <- as.data.frame(cbind(resultados, cm_rna))

SUPPORT VECTOR MACHINE

A caption

# Reload the training set and remove the two occupancy columns so that they
# are not picked up by the `perc_oc ~ .` formula used below.
load(file = "treino.Rda")
treino[["ocupacao"]] <- NULL
treino[["faixas_ocupacao"]] <- NULL
head(treino)

dim(treino)

## [1] 190  10

# Missing `comercio` values in the training set are replaced with zero.
treino$comercio[is.na(treino$comercio)] <- 0
# NOTE(review): the Naive Bayes section at the top of the file loads
# "teste_iguatemi.Rda", while this section loads "teste.Rda" — confirm which
# test set is intended.
load(file = "teste.Rda")
teste[["ocupacao"]] <- NULL
teste[["faixas_ocupacao"]] <- NULL
head(teste)

dim(teste)

## [1] 30 10

library(e1071)

# Fit a C-classification SVM with a radial kernel, using every other column
# of `treino` as a predictor of `perc_oc`.
classifier <- svm(
  formula = perc_oc ~ .,
  data = treino,
  type = "C-classification",
  kernel = "radial"
)
print(classifier)

## 
## Call:
## svm(formula = perc_oc ~ ., data = treino, type = "C-classification", 
##     kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.11 
## 
## Number of Support Vectors:  159

# Predict the class of every test row; the first column of `teste` is left
# out of the data handed to the model.
predicao <- predict(classifier, newdata = teste[, -1])
predicao

##   9  15  19  27  44  48  60  64  75  78  88  96 100 101 106 116 134 138 
##   0   1   0   1   1   1   0   0   0   0   1   0   0   0   0   1   0   0 
## 143 144 152 153 189 193 194 197 202 214 216 218 
##   0   0   0   0   0   0   0   0   0   1   1   1 
## Levels: 0 1

# Cross-tabulate the predicted class (rows) against the observed `perc_oc`
# (columns).
cm <- table(
  predicao,
  teste[["perc_oc"]],
  dnn = c("Predito", "Atual")
)
cm

##        Atual
## Predito  0  1
##       0 15  6
##       1  3  6

library(caret)
# NOTE(review): `mc` is not defined in the visible part of this file —
# confirm it exists before this call.
plot(mc)

# Compute accuracy and the per-class statistics from the contingency table.
cm_svm <- confusionMatrix(cm)
cm_svm

## Confusion Matrix and Statistics
## 
##        Atual
## Predito  0  1
##       0 15  6
##       1  3  6
##                                         
##                Accuracy : 0.7           
##                  95% CI : (0.506, 0.853)
##     No Information Rate : 0.6           
##     P-Value [Acc > NIR] : 0.176         
##                                         
##                   Kappa : 0.348         
##                                         
##  Mcnemar's Test P-Value : 0.505         
##                                         
##             Sensitivity : 0.833         
##             Specificity : 0.500         
##          Pos Pred Value : 0.714         
##          Neg Pred Value : 0.667         
##              Prevalence : 0.600         
##          Detection Rate : 0.500         
##    Detection Prevalence : 0.700         
##       Balanced Accuracy : 0.667         
##                                         
##        'Positive' Class : 0             
##

# Draw the confusion matrix with matriz() (presumably defined in the sourced
# script) and as a fourfold plot.
source("matriz_confusao.R")
matriz(cm_svm)

fourfoldplot(cm_svm$table)

# From here on `cm_svm` is a character vector: the model label, the first
# `overall` entry and the first four `byClass` entries. It is appended to
# `resultados` as a new column.
cm_svm <- c(
  "Suport Vetor Machine",
  cm_svm$overall[1],
  cm_svm$byClass[1:4]
)
resultados <- as.data.frame(cbind(resultados, cm_svm))

APRENDIZADO POR INSTÂNCIAS - KNN

A caption

# Reload the training set and remove the two occupancy columns.
load(file = "treino.Rda")
treino[["ocupacao"]] <- NULL
treino[["faixas_ocupacao"]] <- NULL
head(treino)

# Missing `comercio` values in the training set are replaced with zero.
treino$comercio[is.na(treino$comercio)] <- 0
# NOTE(review): the Naive Bayes section at the top of the file loads
# "teste_iguatemi.Rda", while this section loads "teste.Rda" — confirm which
# test set is intended.
load(file = "teste.Rda")
teste[["ocupacao"]] <- NULL
teste[["faixas_ocupacao"]] <- NULL
head(teste)

dim(teste)

## [1] 30 10

dim(treino)

## [1] 190  10

# Re-encode each of the ten listed columns as a factor built from its
# character representation, first in the test set and then in the training
# set.
# NOTE(review): the Naive Bayes output at the top of the file reports means
# and standard deviations for these predictors, so they look numeric; turning
# them into factors before a distance-based classifier is unusual — confirm
# that this is intended.
colunas_modelo <- c(
  "perc_oc", "area", "ocorrencias", "alvara_km2", "distancia_medoide",
  "distancia_cemtroide", "distancia_exemplars", "alvaras", "comercio",
  "servico"
)
para_fator <- function(coluna) factor(as.character(coluna))

teste[colunas_modelo] <- lapply(teste[colunas_modelo], para_fator)

treino[colunas_modelo] <- lapply(treino[colunas_modelo], para_fator)

# Move the class label to the last position: columns 2 to 10 come first and
# `perc_oc` becomes column 10. The cbind() expressions are kept verbatim
# because the new column's name is derived from the expression text.
treino <- cbind(treino[, 2:10], treino$perc_oc)
teste <- cbind(teste[, 2:10], teste$perc_oc)

library(class)

# k-nearest-neighbours with k = 10. Column 10 holds the class label (moved
# there just above); the remaining columns are the predictors.
# NOTE(review): class::knn documents that ties are broken at random, so
# reruns may differ unless a seed is fixed — consider set.seed().
classifier <- knn(
  train = treino[, -10],
  test = teste[, -10],
  cl = treino[, 10],
  k = 10
)
classifier

##  [1] 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1
## Levels: 0 1

# Cross-tabulate the predicted class (rows) against the observed label held
# in column 10 of `teste` (columns).
cm <- table(
  classifier,
  teste[, 10],
  dnn = c("Predito", "Atual")
)
cm

##        Atual
## Predito  0  1
##       0 18  4
##       1  0  8

library(caret)
# NOTE(review): `mc` is not defined in the visible part of this file —
# confirm it exists before this call.
plot(mc)

# Compute accuracy and the per-class statistics from the contingency table.
cm_knn <- confusionMatrix(cm)
cm_knn

## Confusion Matrix and Statistics
## 
##        Atual
## Predito  0  1
##       0 18  4
##       1  0  8
##                                         
##                Accuracy : 0.867         
##                  95% CI : (0.693, 0.962)
##     No Information Rate : 0.6           
##     P-Value [Acc > NIR] : 0.00151       
##                                         
##                   Kappa : 0.706         
##                                         
##  Mcnemar's Test P-Value : 0.13361       
##                                         
##             Sensitivity : 1.000         
##             Specificity : 0.667         
##          Pos Pred Value : 0.818         
##          Neg Pred Value : 1.000         
##              Prevalence : 0.600         
##          Detection Rate : 0.600         
##    Detection Prevalence : 0.733         
##       Balanced Accuracy : 0.833         
##                                         
##        'Positive' Class : 0             
##

# Draw the confusion matrix with matriz() (presumably defined in the sourced
# script) and as a fourfold plot.
source("matriz_confusao.R")
matriz(cm_knn)

fourfoldplot(cm_knn$table)

# From here on `cm_knn` is a character vector: the model label, the first
# `overall` entry and the first four `byClass` entries. It is appended to
# `resultados` as a new column.
cm_knn <- c(
  "K-Nearest Neighbors",
  cm_knn$overall[1],
  cm_knn$byClass[1:4]
)
resultados <- as.data.frame(cbind(resultados, cm_knn))

# The first row of `resultados` holds the model labels: promote it to the
# column names and drop it from the data.
names(resultados) <- as.matrix(resultados[1, ])
resultados <- resultados[-1, ]
# Convert the remaining text back to numbers. `as.is = TRUE` is passed
# explicitly: type.convert() warns when the caller omits it, and it keeps any
# value that fails to parse as character instead of turning it into a factor.
resultados[] <- lapply(
  resultados,
  function(x) type.convert(as.character(x), as.is = TRUE)
)

# Short labels for the nine classifiers; they are applied by position to the
# columns of `resultados`.
nomes <- c("N.B.", "R.D.", "R.Lo", "R.Li", "A.D.", "F.R.", "RNA", "SVM", "KNN")
nomes

## [1] "N.B." "R.D." "R.Lo" "R.Li" "A.D." "F.R." "RNA"  "SVM"  "KNN"

# Transpose the first five metric rows so that each classifier becomes a row
# and each metric a column. unname() drops the dimnames so the data frame
# starts with default column and row names, which are replaced below.
plot_df <- data.frame(
  unname(t(as.matrix(resultados[1:5, ]))),
  stringsAsFactors = FALSE
)

# Add the label and a numeric position used as the x coordinate in the plot.
plot_df <- cbind(plot_df, nomes, seq_along(nomes))

names(plot_df) <- c("AC", "SE", "ES", "PPV", "NPV", "TESTE", "CLASSE")
plot_df

# Line chart with one coloured series per metric; the x axis is the
# classifier position, relabelled with the short names in `nomes`.
ggplot(data = plot_df, mapping = aes(x = CLASSE)) +
  ylim(0.15, 1) +
  geom_line(aes(y = AC, colour = "Acurácia")) +
  geom_line(aes(y = SE, colour = "Sensibilidade")) +
  geom_line(aes(y = ES, colour = "Especificidade")) +
  geom_line(aes(y = PPV, colour = "Pos. Pred. Val.")) +
  geom_line(aes(y = NPV, colour = "Neg. Pred. Val.")) +
  labs(
    title = "Avaliação dos testes de aprendizado - Iguatemi",
    x = "Testes Aplicados",
    y = "Desempenho"
  ) +
  scale_x_continuous(breaks = 1:9, labels = nomes, limits = c(1, 9))

# Longer labels for the nine classifiers; they are applied by position to the
# columns of `resultados`. The second label used to read "Reg .Dedisão.".
nomes <- c(
  "N.Bayes.", "Reg. Decisão.", "R.Log", "R.Lin", "Arv.Decisão",
  "Flor. Randômica", "RNA", "MSV", "KNN"
)
nomes

## [1] "N.Bayes."        "Reg. Decisão."   "R.Log"           "R.Lin"          
## [5] "Arv.Decisão"     "Flor. Randômica" "RNA"             "MSV"            
## [9] "KNN"

# Transpose the first three metric rows so that each classifier becomes a row
# and each metric a column. unname() drops the dimnames so the data frame
# starts with default column and row names, which are replaced below.
plot_df <- data.frame(
  unname(t(as.matrix(resultados[1:3, ]))),
  stringsAsFactors = FALSE
)

# Add the label and a numeric position used as the x coordinate in the plot.
plot_df <- cbind(plot_df, nomes, seq_along(nomes))

names(plot_df) <- c("AC", "SE", "ES", "TESTE", "CLASSE")
plot_df

# Line chart restricted to accuracy, sensitivity and specificity, with a
# titled legend and x-axis labels rotated by 45 degrees.
ggplot(data = plot_df, mapping = aes(x = CLASSE)) +
  ylim(0.15, 1) +
  geom_line(aes(y = AC, colour = "Acurácia")) +
  geom_line(aes(y = SE, colour = "Sensibilidade")) +
  geom_line(aes(y = ES, colour = "Especificidade")) +
  labs(
    title = "Avaliação dos testes de aprendizado - Iguatemi",
    x = "Testes Aplicados",
    y = "Desempenho",
    color = "Medidas de Desempenho\n"
  ) +
  scale_x_continuous(breaks = 1:9, labels = nomes, limits = c(1, 9)) +
  theme(axis.text.x = element_text(angle = 45))

# Multiply the scores of each of the nine model columns by 100.
colunas_modelos <- c(
  "Naive Bayes", "Regras de Decisão", "Regressão Logística",
  "Regressão Linear", "Arvores de Decisão", "Floresta Randômica",
  "Rede Neural Artificial", "Suport Vetor Machine", "K-Nearest Neighbors"
)
resultados[colunas_modelos] <- resultados[colunas_modelos] * 100
resultados

library(fmsb)
# radarchart() reads the first two rows as the maximum and minimum of every
# variable (column), so both rows need one value per column of `resultados`.
# They used to be built with length 5 (the number of rows), which only worked
# because rbind() recycled the constant.
data <- rbind(
  rep(100, ncol(resultados)),
  rep(0, ncol(resultados)),
  resultados
)

# Default rendering first, then the customised version below.
radarchart(data)

# One border colour and one translucent fill per plotted row (five series).
colors_border <- c(
  rgb(0.2, 0.5, 0.5, 0.9), rgb(0.8, 0.2, 0.5, 0.9), rgb(0.7, 0.5, 0.1, 0.9),
  rgb(0.5, 0.2, 0.1, 0.9), rgb(0.7, 0.9, 0.9, 0.9)
)
colors_in <- c(
  rgb(0.2, 0.5, 0.5, 0.1), rgb(0.8, 0.2, 0.5, 0.1), rgb(0.7, 0.5, 0.1, 0.1),
  rgb(0.5, 0.2, 0.1, 0.1), rgb(0.7, 0.9, 0.9, 0.1)
)
radarchart(
  data,
  axistype = 1,
  # custom polygon
  pcol = colors_border, pfcol = colors_in, plwd = 2, plty = 1,
  # custom the grid
  cglcol = "grey", cglty = 1, axislabcol = "red",
  caxislabels = seq(0, 100, 25), cglwd = 0.8,
  # custom labels
  vlcex = 0.99
)
# Legend entries are the row names of the plotted rows, i.e. everything
# except the max/min rows.
legend(
  x = 1.35, y = 1.4,
  legend = rownames(data[-c(1, 2), ]),
  bty = "n", pch = 20, col = colors_border,
  text.col = "Black", cex = 1.2, pt.cex = 3
)

Testes de Aprendizado aplicados a novos dados

Ir para Fase 9.

Antes de começar as predições setamos a função para montar o layout da Matriz de confusão.

NAIVE BAYES

REGRAS DE DECISÃO

REGRESSÃO LOGÍSTICA

REGRESSÃO LINEAR

Árvores de Decisão

Floresta Randômica

REDE NEURAL ARTIFICIAL

SUPPORT VECTOR MACHINE

APRENDIZADO POR INSTÂNCIAS - KNN

Ir para Portfólio.