A caption
Realiza-se a carga dos dados de treino e teste.
# Clear every object in the workspace (hidden ones included) and move to the
# project folder so the relative .Rda paths below resolve.
rm(list = ls(all.names = TRUE))
setwd("~/OneDrive/r-files/TCC/")
# Restore the saved objects; the code that follows expects them to provide
# `treino` and `teste` (presumably stored under those names -- confirm).
load("treino.Rda")
load("teste_iguatemi.Rda")
# Preview the first rows of each set.
head(treino)
head(teste)
Carrega-se a biblioteca de aplicação do algoritmo Naive Bayes.
library(e1071)
Aplica-se o método para gerar o classificador.
# Fit a Naive Bayes classifier using columns 4 to 12 of `treino` as predictors
# and `perc_oc` as the class label, then print the fitted model.
# NOTE(review): the printed model shows two numbers per class for every
# predictor, which suggests all predictors are numeric; in e1071 Laplace
# smoothing concerns categorical predictors, so `laplace = 3` presumably has
# no effect here -- confirm.
classificador = naiveBayes(x = treino[, 4:12], y = treino$perc_oc, laplace = 3)
print(classificador)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = treino[, 4:12], y = treino$perc_oc, laplace = 3)
##
## A-priori probabilities:
## treino$perc_oc
## 0 1
## 0.6 0.4
##
## Conditional probabilities:
## area
## treino$perc_oc [,1] [,2]
## 0 0.4859251 0.2879239
## 1 0.5684838 0.3396876
##
## ocorrencias
## treino$perc_oc [,1] [,2]
## 0 3319.605 1399.331
## 1 2883.039 1305.014
##
## alvara_km2
## treino$perc_oc [,1] [,2]
## 0 10405.01 9905.198
## 1 7364.23 7436.275
##
## distancia_medoide
## treino$perc_oc [,1] [,2]
## 0 314.0194 184.1401
## 1 399.5308 163.7092
##
## distancia_cemtroide
## treino$perc_oc [,1] [,2]
## 0 206.6915 92.08138
## 1 234.5654 94.61492
##
## distancia_exemplars
## treino$perc_oc [,1] [,2]
## 0 203.9196 94.30407
## 1 243.8121 105.81817
##
## alvaras
## treino$perc_oc [,1] [,2]
## 0 441.3070 418.9180
## 1 262.0526 233.4798
##
## comercio
## treino$perc_oc [,1] [,2]
## 0 0.2756409 0.1437719
## 1 0.3286748 0.1553855
##
## servico
## treino$perc_oc [,1] [,2]
## 0 0.7243591 0.1437719
## 1 0.6756499 0.1588840
classificador$apriori
## treino$perc_oc
## 0 1
## 114 76
Realiza-se a predição dos resultados.
previsoes = predict(classificador, newdata = teste[, 4:12])
previsoes
## [1] 0 1 0 0 0 0 0 1
## Levels: 0 1
Cria-se a matriz de confusão e aplica-se a verificação dos resultados.
cm = table(previsoes, teste$perc_oc, dnn=c("Predito", "Atual"))
cm
## Atual
## Predito 0 1
## 0 2 4
## 1 1 1
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
cm_nb = confusionMatrix(cm)
cm_nb
## Confusion Matrix and Statistics
##
## Atual
## Predito 0 1
## 0 2 4
## 1 1 1
##
## Accuracy : 0.375
## 95% CI : (0.0852, 0.7551)
## No Information Rate : 0.625
## P-Value [Acc > NIR] : 0.9640
##
## Kappa : -0.1111
##
## Mcnemar's Test P-Value : 0.3711
##
## Sensitivity : 0.6667
## Specificity : 0.2000
## Pos Pred Value : 0.3333
## Neg Pred Value : 0.5000
## Prevalence : 0.3750
## Detection Rate : 0.2500
## Detection Prevalence : 0.7500
## Balanced Accuracy : 0.4333
##
## 'Positive' Class : 0
##
# Draw the confusion matrix with the project helper `matriz()` (defined in
# matriz_confusao.R) and as a base-R fourfold plot.
source("matriz_confusao.R")
matriz(cm_nb)
fourfoldplot(cm_nb$table)
# Replace the confusionMatrix object by a labelled vector: model name,
# accuracy and the first four per-class metrics. It becomes the first column
# of the `resultados` comparison table.
cm_nb <- c(
  "Naive Bayes",
  cm_nb$overall["Accuracy"],
  cm_nb$byClass[c("Sensitivity", "Specificity",
                  "Pos Pred Value", "Neg Pred Value")]
)
resultados <- as.data.frame(cm_nb)
A caption
# Work from the project folder and reload the train/test objects for the
# decision-rules section.
setwd("~/OneDrive/r-files/TCC/")
load("treino.Rda")
load("teste.Rda")
# Treat a missing `comercio` value in the training data as zero.
treino$comercio <- replace(treino$comercio, is.na(treino$comercio), 0)
Preparação dos datasets de treino e teste.
# Keep column 1 and columns 4 to 12 of the training set (10 columns), then
# rebuild the frame as columns 2 to 10 followed by `perc_oc`, so the decision
# attribute ends up in the 10th (last) position.
# NOTE(review): this assumes column 1 is `perc_oc` -- confirm.
dt_treino = treino[, c(1, 4:12)]
dt_treino = cbind(dt_treino[, 2:10], dt_treino$perc_oc)
head(dt_treino)
# Same selection and reordering for the test set.
dt_teste= teste[, c(1, 4:12)]
dt_teste = cbind(dt_teste[, 2:10], dt_teste$perc_oc)
head(dt_teste)
Transformação dos dataframes em objetos do tipo Decision Table.
library(RoughSets)
## Loading required package: Rcpp
dt_treino = SF.asDecisionTable(dataset = dt_treino, decision.attr = 10)
dt_teste = SF.asDecisionTable(dataset = dt_teste, decision.attr = 10)
Realiza a conversão de números em valores nominais. Esta abordagem é necessária para métodos que utilizam a teoria dos conjuntos aproximados e que calculam a relação de indiscernibilidade. Estes dados nominais servirão para definir valores de corte (fatores ordenados), que também requerem um atributo de decisão nominal.
discretizados = D.discretization.RST(dt_treino, nOfIntervals = 10)
discretizados
## $cut.values
## $cut.values$area
## [1] 0.2223374 0.3379639 0.3435523 0.3688524 0.3927846 0.5214642 0.5980715
## [8] 0.6977012 0.7514241
##
## $cut.values$ocorrencias
## [1] 1857.0 1995.0 2143.0 2359.0 2572.0 3255.0 4380.0 4400.4 4685.0
##
## $cut.values$alvara_km2
## [1] 2293.438 2661.598 3571.712 5809.913 6005.836 6358.560 8595.029
## [8] 12959.962 23190.807
##
## $cut.values$distancia_medoide
## [1] 141.0276 194.2406 228.0667 265.5146 303.1181 351.1803 449.9052 544.4051
## [9] 597.0984
##
## $cut.values$distancia_cemtroide
## [1] 79.70009 132.15562 174.63601 200.48180 218.50759 243.59899 272.39066
## [8] 303.66067 331.10901
##
## $cut.values$distancia_exemplars
## [1] 89.85229 127.98123 168.43967 196.77732 219.36332 245.83364 269.82913
## [8] 308.47148 345.45784
##
## $cut.values$alvaras
## [1] 81.7 134.0 169.0 197.2 251.5 309.0 394.6 588.8 753.2
##
## $cut.values$comercio
## [1] 0.08595492 0.15685412 0.21257993 0.24976722 0.30781441 0.34059430
## [7] 0.37681999 0.41346642 0.49765897
##
## $cut.values$servico
## [1] 0.5023410 0.5865336 0.6231800 0.6594057 0.6921856 0.7502328 0.7874201
## [8] 0.8431459 0.9140451
##
##
## $type.method
## [1] "unsupervised.quantiles"
##
## $type.task
## [1] "discretization"
##
## $model
## [1] "RST"
##
## attr(,"class")
## [1] "Discretization" "list"
Realiza o typecast com base no modelo discretizado.
dt_treino = SF.applyDecTable(dt_treino, discretizados)
dt_teste= SF.applyDecTable(dt_teste, discretizados)
head(dt_teste)
head(dt_treino)
Realiza a classificação supervisionada por indução de acordo com o algoritmo CN2.
classifier = RI.CN2Rules.RST(dt_treino, K = 15)
print(classifier)
## A set consisting of 56 rules:
## 1. IF alvara_km2 is (1.3e+04,2.32e+04] THEN is 0;
## (supportSize=12; laplace=0.928571428571429)
## 2. IF distancia_medoide is [-Inf,141] and distancia_cemtroide is (79.7,132] THEN is 0;
## (supportSize=6; laplace=0.875)
## 3. IF alvara_km2 is (6.01e+03,6.36e+03] and distancia_medoide is (597, Inf] THEN is 0;
## (supportSize=5; laplace=0.857142857142857)
## 4. IF ocorrencias is (4.4e+03,4.68e+03] and alvaras is (589,753] THEN is 0;
## (supportSize=5; laplace=0.857142857142857)
## 5. IF alvara_km2 is (2.66e+03,3.57e+03] and area is (0.598,0.698] THEN is 1;
## (supportSize=7; laplace=0.888888888888889)
## 6. IF distancia_medoide is (194,228] and area is (0.344,0.369] THEN is 0;
## (supportSize=5; laplace=0.857142857142857)
## 7. IF alvara_km2 is (3.57e+03,5.81e+03] and alvaras is (197,252] THEN is 1;
## (supportSize=7; laplace=0.888888888888889)
## 8. IF ocorrencias is (2.57e+03,3.26e+03] and distancia_medoide is (228,266] THEN is 0;
## (supportSize=5; laplace=0.857142857142857)
## 9. IF distancia_medoide is (266,303] and area is (0.222,0.338] THEN is 0;
## (supportSize=4; laplace=0.833333333333333)
## 10. IF area is (0.369,0.393] and distancia_medoide is (194,228] THEN is 0;
## (supportSize=4; laplace=0.833333333333333)
## ... and 46 other rules.
Com base nas regras atribuídas ao classificador, é feita a classificação dos dados de teste.
prediction = predict(classifier, newdata = dt_teste[-10])
prediction
Cria-se a tabela da matriz de confusão e faz-se a verificação dos seus resultados.
mc = table(unlist(prediction), dt_teste$dt_teste.perc_oc, dnn=c("Predito", "Atual"))
mc
## Atual
## Predito 0 1
## 0 13 7
## 1 5 5
library(caret)
plot(mc)
cm_rd = confusionMatrix(mc)
cm_rd
## Confusion Matrix and Statistics
##
## Atual
## Predito 0 1
## 0 13 7
## 1 5 5
##
## Accuracy : 0.6
## 95% CI : (0.406, 0.7734)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.5785
##
## Kappa : 0.1429
##
## Mcnemar's Test P-Value : 0.7728
##
## Sensitivity : 0.7222
## Specificity : 0.4167
## Pos Pred Value : 0.6500
## Neg Pred Value : 0.5000
## Prevalence : 0.6000
## Detection Rate : 0.4333
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.5694
##
## 'Positive' Class : 0
##
# Draw the confusion matrix with the project helper `matriz()` and as a
# fourfold plot.
source("matriz_confusao.R")
matriz(cm_rd)
fourfoldplot(cm_rd$table)
# Reduce the confusionMatrix object to model name + accuracy + four per-class
# metrics and append it to `resultados` as a new column.
cm_rd <- c(
  "Regras de Decisão",
  cm_rd$overall["Accuracy"],
  cm_rd$byClass[c("Sensitivity", "Specificity",
                  "Pos Pred Value", "Neg Pred Value")]
)
resultados <- as.data.frame(cbind(resultados, cm_rd))
A caption
# Reload the raw train/test sets for the logistic-regression section.
load("treino.Rda")
head(treino)
load("teste.Rda")
head(teste)
# Treat a missing `comercio` value in the training data as zero.
treino$comercio <- replace(treino$comercio, is.na(treino$comercio), 0)
# Keep column 1 plus columns 4 to 12 in both sets.
treino <- treino[, c(1, 4:12)]
teste <- teste[, c(1, 4:12)]
head(treino)
head(teste)
# Binomial GLM of `perc_oc` on every remaining column, allowing up to 50
# fitting iterations.
classificador <- glm(
  formula = perc_oc ~ .,
  family = binomial,
  data = treino,
  control = list(maxit = 50)
)
# Predicted probabilities for the test rows; the first column is dropped
# before prediction.
probabilidades <- predict(
  classificador,
  newdata = teste[, -1],
  type = "response"
)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
previsoes = ifelse(probabilidades > 0.5, 1, 0)
previsoes
## 9 15 19 27 44 48 60 64 75 78 88 96 100 101 106 116 134 138
## 0 1 0 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0
## 143 144 152 153 189 193 194 197 202 214 216 218
## 0 0 0 0 0 0 0 0 0 0 1 1
cm = table(previsoes, teste$perc_oc, dnn=c("Predito", "Atual"))
cm
## Atual
## Predito 0 1
## 0 15 7
## 1 3 5
library(caret)
# Plot the confusion matrix of THIS model (`cm`). `mc` is a table left over
# from an earlier model's section, so plotting it would show the wrong data.
plot(cm)
# Compute accuracy and per-class statistics with caret and print them.
cm_rl <- confusionMatrix(cm)
cm_rl
## Confusion Matrix and Statistics
##
## Atual
## Predito 0 1
## 0 15 7
## 1 3 5
##
## Accuracy : 0.6667
## 95% CI : (0.4719, 0.8271)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.2915
##
## Kappa : 0.2647
##
## Mcnemar's Test P-Value : 0.3428
##
## Sensitivity : 0.8333
## Specificity : 0.4167
## Pos Pred Value : 0.6818
## Neg Pred Value : 0.6250
## Prevalence : 0.6000
## Detection Rate : 0.5000
## Detection Prevalence : 0.7333
## Balanced Accuracy : 0.6250
##
## 'Positive' Class : 0
##
# Draw the confusion matrix with the project helper `matriz()` and as a
# fourfold plot.
source("matriz_confusao.R")
matriz(cm_rl)
fourfoldplot(cm_rl$table)
# Reduce the confusionMatrix object to model name + accuracy + four per-class
# metrics and append it to `resultados` as a new column.
cm_rl <- c(
  "Regressão Logística",
  cm_rl$overall["Accuracy"],
  cm_rl$byClass[c("Sensitivity", "Specificity",
                  "Pos Pred Value", "Neg Pred Value")]
)
resultados <- as.data.frame(cbind(resultados, cm_rl))
options(scipen=999)
setwd("~/OneDrive/r-files//TCC")
load(file = "treino.Rda")
treino$ocupacao = NULL
treino$faixas_ocupacao = NULL
head(treino)
dim(treino)
## [1] 190 10
treino$comercio[is.na(treino$comercio)] <- 0
load(file = "teste.Rda")
teste$ocupacao = NULL
teste$faixas_ocupacao = NULL
head(teste)
dim(teste)
## [1] 30 10
# Columns coerced to numeric before fitting the linear model. Going through
# as.character() first makes a factor column yield its printed values rather
# than its internal level codes.
colunas_numericas <- c(
  "perc_oc", "area", "ocorrencias", "alvara_km2", "distancia_medoide",
  "distancia_cemtroide", "alvaras", "comercio", "servico"
)
para_numero <- function(x) as.numeric(as.character(x))
treino[colunas_numericas] <- lapply(treino[colunas_numericas], para_numero)
teste[colunas_numericas] <- lapply(teste[colunas_numericas], para_numero)
# Fill missing `comercio` values with the mean of the observed ones.
# na.rm = TRUE is essential: without it mean() returns NA whenever a value is
# missing, so the NAs would simply be replaced by NA again.
treino$comercio[is.na(treino$comercio)] <- mean(treino$comercio, na.rm = TRUE)
teste$comercio[is.na(teste$comercio)] <- mean(teste$comercio, na.rm = TRUE)
linearMod <- lm(perc_oc ~ ., data=treino)
summary(linearMod)
##
## Call:
## lm(formula = perc_oc ~ ., data = treino)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9260 -0.3747 -0.2190 0.4625 0.9805
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.080505162 0.217437803 -0.370 0.7116
## area 0.167956582 0.152419140 1.102 0.2720
## ocorrencias 0.000038101 0.000051212 0.744 0.4578
## alvara_km2 -0.000003586 0.000008676 -0.413 0.6798
## distancia_medoide 0.000455034 0.000233509 1.949 0.0529 .
## distancia_cemtroide -0.001447604 0.000830130 -1.744 0.0829 .
## distancia_exemplars 0.001761692 0.000747266 2.358 0.0195 *
## alvaras -0.000175937 0.000133326 -1.320 0.1886
## comercio 0.477882208 0.236070047 2.024 0.0444 *
## servico NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4677 on 181 degrees of freedom
## Multiple R-squared: 0.1318, Adjusted R-squared: 0.09342
## F-statistic: 3.434 on 8 and 181 DF, p-value: 0.001052
options(digits=2)
predicoes = predict.lm(linearMod, teste[, -1])
## Warning in predict.lm(linearMod, teste[, -1]): prediction from a rank-
## deficient fit may be misleading
predicoes = ifelse(predicoes > 0.5, 1, 0)
predicoes
## 9 15 19 27 44 48 60 64 75 78 88 96 100 101 106 116 134 138
## 0 1 0 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0
## 143 144 152 153 189 193 194 197 202 214 216 218
## 0 0 0 0 0 0 1 0 0 0 1 1
cm = table(predicoes, teste$perc_oc, dnn=c("Predito", "Atual"))
cm
## Atual
## Predito 0 1
## 0 14 7
## 1 4 5
library(caret)
# Plot the confusion matrix of THIS model (`cm`). `mc` is a table left over
# from an earlier model's section, so plotting it would show the wrong data.
plot(cm)
# Compute accuracy and per-class statistics with caret and print them.
cm_rlin <- confusionMatrix(cm)
cm_rlin
## Confusion Matrix and Statistics
##
## Atual
## Predito 0 1
## 0 14 7
## 1 4 5
##
## Accuracy : 0.633
## 95% CI : (0.439, 0.801)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.431
##
## Kappa : 0.203
##
## Mcnemar's Test P-Value : 0.546
##
## Sensitivity : 0.778
## Specificity : 0.417
## Pos Pred Value : 0.667
## Neg Pred Value : 0.556
## Prevalence : 0.600
## Detection Rate : 0.467
## Detection Prevalence : 0.700
## Balanced Accuracy : 0.597
##
## 'Positive' Class : 0
##
# Draw the confusion matrix with the project helper `matriz()` and as a
# fourfold plot.
source("matriz_confusao.R")
matriz(cm_rlin)
fourfoldplot(cm_rlin$table)
# Reduce the confusionMatrix object to model name + accuracy + four per-class
# metrics and append it to `resultados` as a new column.
cm_rlin <- c(
  "Regressão Linear",
  cm_rlin$overall["Accuracy"],
  cm_rlin$byClass[c("Sensitivity", "Specificity",
                    "Pos Pred Value", "Neg Pred Value")]
)
resultados <- as.data.frame(cbind(resultados, cm_rlin))
A caption
# Reload the raw train/test sets for the decision-tree section.
load("treino.Rda")
load("teste.Rda")
head(treino)
head(teste)
# Treat a missing `comercio` value in the training data as zero.
treino$comercio <- replace(treino$comercio, is.na(treino$comercio), 0)
library(rpart)
library(rpart.plot)
# Drop the 2nd and 3rd columns so they are not used as predictors.
treino[2:3] <- NULL
# A factor response makes rpart grow a classification tree.
treino$perc_oc <- as.factor(treino$perc_oc)
head(treino)
# Grow the tree on every remaining column and print its splits.
classifier2 <- rpart(formula = perc_oc ~ ., data = treino)
print(classifier2)
## n= 190
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 190 76 0 (0.60 0.40)
## 2) distancia_medoide< 3.1e+02 97 24 0 (0.75 0.25)
## 4) distancia_exemplars< 3.1e+02 88 18 0 (0.80 0.20) *
## 5) distancia_exemplars>=3.1e+02 9 3 1 (0.33 0.67) *
## 3) distancia_medoide>=3.1e+02 93 41 1 (0.44 0.56)
## 6) distancia_medoide>=6.9e+02 8 1 0 (0.87 0.12) *
## 7) distancia_medoide< 6.9e+02 85 34 1 (0.40 0.60)
## 14) comercio< 0.37 58 28 1 (0.48 0.52)
## 28) distancia_medoide< 5.3e+02 28 10 0 (0.64 0.36)
## 56) area< 0.72 21 5 0 (0.76 0.24) *
## 57) area>=0.72 7 2 1 (0.29 0.71) *
## 29) distancia_medoide>=5.3e+02 30 10 1 (0.33 0.67) *
## 15) comercio>=0.37 27 6 1 (0.22 0.78) *
rpart.plot(classifier2)
teste[,c(2:3)] = NULL
head(teste)
previsoes = predict(classifier2, newdata = teste[,-1])
previsoes
## 0 1
## 9 0.87 0.12
## 15 0.22 0.78
## 19 0.76 0.24
## 27 0.33 0.67
## 44 0.29 0.71
## 48 0.29 0.71
## 60 0.80 0.20
## 64 0.80 0.20
## 75 0.33 0.67
## 78 0.33 0.67
## 88 0.80 0.20
## 96 0.80 0.20
## 100 0.80 0.20
## 101 0.80 0.20
## 106 0.80 0.20
## 116 0.80 0.20
## 134 0.80 0.20
## 138 0.80 0.20
## 143 0.80 0.20
## 144 0.80 0.20
## 152 0.80 0.20
## 153 0.80 0.20
## 189 0.76 0.24
## 193 0.76 0.24
## 194 0.33 0.67
## 197 0.80 0.20
## 202 0.80 0.20
## 214 0.76 0.24
## 216 0.76 0.24
## 218 0.22 0.78
previsoes = predict(classifier2, newdata = teste, type = 'class')
head(previsoes)
## 9 15 19 27 44 48
## 0 1 0 1 1 1
## Levels: 0 1
mc = table(previsoes, teste$perc_oc, dnn=c("Predito", "Atual"))
print(mc)
## Atual
## Predito 0 1
## 0 13 9
## 1 5 3
library(caret)
plot(mc)
cm_ad = confusionMatrix(mc)
cm_ad
## Confusion Matrix and Statistics
##
## Atual
## Predito 0 1
## 0 13 9
## 1 5 3
##
## Accuracy : 0.533
## 95% CI : (0.343, 0.717)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.825
##
## Kappa : -0.029
##
## Mcnemar's Test P-Value : 0.423
##
## Sensitivity : 0.722
## Specificity : 0.250
## Pos Pred Value : 0.591
## Neg Pred Value : 0.375
## Prevalence : 0.600
## Detection Rate : 0.433
## Detection Prevalence : 0.733
## Balanced Accuracy : 0.486
##
## 'Positive' Class : 0
##
# Draw the confusion matrix with the project helper `matriz()` and as a
# fourfold plot.
source("matriz_confusao.R")
matriz(cm_ad)
fourfoldplot(cm_ad$table)
# Reduce the confusionMatrix object to model name + accuracy + four per-class
# metrics and append it to `resultados` as a new column.
cm_ad <- c(
  "Arvores de Decisão",
  cm_ad$overall["Accuracy"],
  cm_ad$byClass[c("Sensitivity", "Specificity",
                  "Pos Pred Value", "Neg Pred Value")]
)
resultados <- as.data.frame(cbind(resultados, cm_ad))
load(file = "treino.Rda")
head(treino)
load(file = "teste.Rda")
head(teste)
treino$comercio[is.na(treino$comercio)] <- 0
A caption
# Columns converted to numeric through their text representation, so a factor
# column yields its printed values instead of its level codes.
colunas_via_texto <- c(
  "perc_oc", "ocupacao", "area", "ocorrencias", "alvara_km2",
  "distancia_medoide", "distancia_cemtroide", "distancia_exemplars",
  "alvaras", "comercio", "servico"
)
para_numero <- function(x) as.numeric(as.character(x))

teste[colunas_via_texto] <- lapply(teste[colunas_via_texto], para_numero)
# `faixas_ocupacao` is converted directly, without the as.character() step.
teste$faixas_ocupacao <- as.numeric(teste$faixas_ocupacao)

treino[colunas_via_texto] <- lapply(treino[colunas_via_texto], para_numero)
treino$faixas_ocupacao <- as.numeric(treino$faixas_ocupacao)
# Standardise the predictors (columns 4 to 12) of both sets with the mean and
# standard deviation of the TRAINING data. Scaling the test set with its own
# statistics would map the same raw value to different scaled values in train
# and test, so the thresholds learned on `treino` would not line up.
treino_padronizado <- scale(treino[, 4:12])
teste[, 4:12] <- scale(
  teste[, 4:12],
  center = attr(treino_padronizado, "scaled:center"),
  scale = attr(treino_padronizado, "scaled:scale")
)
treino[, 4:12] <- treino_padronizado
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Turn the response and the two occupancy columns into factors; a factor
# response makes randomForest build a classification forest.
colunas_fator <- c("perc_oc", "faixas_ocupacao", "ocupacao")
treino[colunas_fator] <- lapply(treino[colunas_fator], as.factor)
# Forest of 13 trees on columns 4 to 12.
# NOTE(review): no seed is set before this call, so results vary between runs.
classifier <- randomForest(x = treino[, 4:12], y = treino$perc_oc, ntree = 13)
print(classifier)
##
## Call:
## randomForest(x = treino[, 4:12], y = treino$perc_oc, ntree = 13)
## Type of random forest: classification
## Number of trees: 13
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 36%
## Confusion matrix:
## 0 1 class.error
## 0 81 33 0.29
## 1 36 40 0.47
previsoes = predict(classifier, newdata = teste[, 4:12])
previsoes
## 9 15 19 27 44 48 60 64 75 78 88 96 100 101 106 116 134 138
## 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0
## 143 144 152 153 189 193 194 197 202 214 216 218
## 0 0 0 0 0 0 0 0 0 1 1 1
## Levels: 0 1
mc = table(previsoes, teste$perc_oc, dnn=c("Predito", "Atual"))
mc
## Atual
## Predito 0 1
## 0 17 6
## 1 1 6
library(caret)
plot(mc)
cm_fr = confusionMatrix(mc)
cm_fr
## Confusion Matrix and Statistics
##
## Atual
## Predito 0 1
## 0 17 6
## 1 1 6
##
## Accuracy : 0.767
## 95% CI : (0.577, 0.901)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.0435
##
## Kappa : 0.478
##
## Mcnemar's Test P-Value : 0.1306
##
## Sensitivity : 0.944
## Specificity : 0.500
## Pos Pred Value : 0.739
## Neg Pred Value : 0.857
## Prevalence : 0.600
## Detection Rate : 0.567
## Detection Prevalence : 0.767
## Balanced Accuracy : 0.722
##
## 'Positive' Class : 0
##
# Draw the confusion matrix with the project helper `matriz()` and as a
# fourfold plot.
source("matriz_confusao.R")
matriz(cm_fr)
fourfoldplot(cm_fr$table)
# Reduce the confusionMatrix object to model name + accuracy + four per-class
# metrics and append it to `resultados` as a new column.
cm_fr <- c(
  "Floresta Randômica",
  cm_fr$overall["Accuracy"],
  cm_fr$byClass[c("Sensitivity", "Specificity",
                  "Pos Pred Value", "Neg Pred Value")]
)
resultados <- as.data.frame(cbind(resultados, cm_fr))
A caption
options(scipen=999)
load(file = "treino.Rda")
treino$ocupacao = NULL
treino$faixas_ocupacao = NULL
head(treino)
dim(treino)
## [1] 190 10
treino$comercio[is.na(treino$comercio)] <- 0
load(file = "teste.Rda")
teste$ocupacao = NULL
teste$faixas_ocupacao = NULL
head(teste)
dim(teste)
## [1] 30 10
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.init(nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 37 minutes 49 seconds
## H2O cluster timezone: America/Sao_Paulo
## H2O data parsing timezone: UTC
## H2O cluster version: 3.22.1.1
## H2O cluster version age: 6 months and 8 days !!!
## H2O cluster name: H2O_started_from_R_fagnersuteldemoura_hep385
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.00 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.2 (2018-12-20)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is too old (6 months and 8 days)!
## Please download and install the latest version from http://h2o.ai/download/
# Train an H2O deep-learning model predicting `perc_oc` from the uploaded
# training frame: two hidden layers (22 and 30 units) with Rectifier
# activation, trained for 1000 epochs.
classifier <- h2o.deeplearning(
  y = "perc_oc",
  training_frame = as.h2o(treino),
  activation = "Rectifier",
  hidden = c(22, 30),
  epochs = 1000
)
##
|
| | 0%
|
|=================================================================| 100%
##
|
| | 0%
|
|========================= | 39%
|
|========================================================= | 88%
|
|=================================================================| 100%
print(classifier)
## Model Details:
## ==============
##
## H2OBinomialModel: deeplearning
## Model ID: DeepLearning_model_R_1562507513506_25
## Status of Neuron Layers: predicting perc_oc, 2-class classification, bernoulli distribution, CrossEntropy loss, 972 weights/biases, 16.2 KB, 190,000 training samples, mini-batch size 1
## layer units type dropout l1 l2 mean_rate rate_rms
## 1 1 9 Input 0.00 % NA NA NA NA
## 2 2 22 Rectifier 0.00 % 0.000000 0.000000 0.000709 0.000974
## 3 3 30 Rectifier 0.00 % 0.000000 0.000000 0.002032 0.002552
## 4 4 2 Softmax NA 0.000000 0.000000 0.002374 0.002061
## momentum mean_weight weight_rms mean_bias bias_rms
## 1 NA NA NA NA NA
## 2 0.000000 -0.062958 0.378726 0.552060 0.199394
## 3 0.000000 -0.012868 0.301501 1.021265 0.077520
## 4 0.000000 -0.296918 1.274050 0.001083 0.018389
##
##
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
##
## MSE: 0.05
## RMSE: 0.22
## LogLoss: 0.18
## Mean Per-Class Error: 0.044
## AUC: 0.98
## pr_auc: 0.97
## Gini: 0.97
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 107 7 0.061404 =7/114
## 1 2 74 0.026316 =2/76
## Totals 109 81 0.047368 =9/190
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.105517 0.942675 80
## 2 max f2 0.105517 0.961039 80
## 3 max f0point5 0.526731 0.941011 69
## 4 max accuracy 0.105517 0.952632 80
## 5 max precision 0.999990 1.000000 0
## 6 max recall 0.003417 1.000000 114
## 7 max specificity 0.999990 1.000000 0
## 8 max absolute_mcc 0.105517 0.903716 80
## 9 max min_per_class_accuracy 0.174170 0.947368 77
## 10 max mean_per_class_accuracy 0.105517 0.956140 80
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
predicoes = h2o.predict(classifier, newdata = as.h2o(teste[, -1]))
##
|
| | 0%
|
|=================================================================| 100%
##
|
| | 0%
|
|=================================================================| 100%
predicoes
## predict p0 p1
## 1 1 0.44147370 0.5585
## 2 0 0.99182652 0.0082
## 3 0 0.99545383 0.0045
## 4 1 0.72081741 0.2792
## 5 1 0.00000077 1.0000
## 6 1 0.00000653 1.0000
##
## [30 rows x 3 columns]
predicoes = as.data.frame(predicoes)
cm = table(predicoes$predict, teste$perc_oc, dnn=c("Predito", "Atual"))
cm
## Atual
## Predito 0 1
## 0 14 3
## 1 4 9
library(caret)
# Plot the confusion matrix of THIS model (`cm`). `mc` is a table left over
# from an earlier model's section, so plotting it would show the wrong data.
plot(cm)
# Compute accuracy and per-class statistics with caret and print them.
cm_rna <- confusionMatrix(cm)
cm_rna
## Confusion Matrix and Statistics
##
## Atual
## Predito 0 1
## 0 14 3
## 1 4 9
##
## Accuracy : 0.767
## 95% CI : (0.577, 0.901)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.0435
##
## Kappa : 0.521
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.778
## Specificity : 0.750
## Pos Pred Value : 0.824
## Neg Pred Value : 0.692
## Prevalence : 0.600
## Detection Rate : 0.467
## Detection Prevalence : 0.567
## Balanced Accuracy : 0.764
##
## 'Positive' Class : 0
##
# Draw the confusion matrix with the project helper `matriz()` and as a
# fourfold plot.
source("matriz_confusao.R")
matriz(cm_rna)
fourfoldplot(cm_rna$table)
# Reduce the confusionMatrix object to model name + accuracy + four per-class
# metrics and append it to `resultados` as a new column.
cm_rna <- c(
  "Rede Neural Artificial",
  cm_rna$overall["Accuracy"],
  cm_rna$byClass[c("Sensitivity", "Specificity",
                   "Pos Pred Value", "Neg Pred Value")]
)
resultados <- as.data.frame(cbind(resultados, cm_rna))
A caption
load(file = "treino.Rda")
treino$ocupacao = NULL
treino$faixas_ocupacao = NULL
head(treino)
dim(treino)
## [1] 190 10
treino$comercio[is.na(treino$comercio)] <- 0
load(file = "teste.Rda")
teste$ocupacao = NULL
teste$faixas_ocupacao = NULL
head(teste)
dim(teste)
## [1] 30 10
library(e1071)
classifier = svm(formula = perc_oc ~., data = treino, type = 'C-classification', kernel = 'radial')
print(classifier)
##
## Call:
## svm(formula = perc_oc ~ ., data = treino, type = "C-classification",
## kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.11
##
## Number of Support Vectors: 159
predicao = predict(classifier, newdata = teste[, -1])
predicao
## 9 15 19 27 44 48 60 64 75 78 88 96 100 101 106 116 134 138
## 0 1 0 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0
## 143 144 152 153 189 193 194 197 202 214 216 218
## 0 0 0 0 0 0 0 0 0 1 1 1
## Levels: 0 1
cm = table(predicao, teste$perc_oc, dnn=c("Predito", "Atual"))
cm
## Atual
## Predito 0 1
## 0 15 6
## 1 3 6
library(caret)
# Plot the confusion matrix of THIS model (`cm`). `mc` is a table left over
# from an earlier model's section, so plotting it would show the wrong data.
plot(cm)
# Compute accuracy and per-class statistics with caret and print them.
cm_svm <- confusionMatrix(cm)
cm_svm
## Confusion Matrix and Statistics
##
## Atual
## Predito 0 1
## 0 15 6
## 1 3 6
##
## Accuracy : 0.7
## 95% CI : (0.506, 0.853)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.176
##
## Kappa : 0.348
##
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.833
## Specificity : 0.500
## Pos Pred Value : 0.714
## Neg Pred Value : 0.667
## Prevalence : 0.600
## Detection Rate : 0.500
## Detection Prevalence : 0.700
## Balanced Accuracy : 0.667
##
## 'Positive' Class : 0
##
# Draw the confusion matrix with the project helper `matriz()` and as a
# fourfold plot.
source("matriz_confusao.R")
matriz(cm_svm)
fourfoldplot(cm_svm$table)
# Reduce the confusionMatrix object to model name + accuracy + four per-class
# metrics and append it to `resultados` as a new column.
cm_svm <- c(
  "Suport Vetor Machine",
  cm_svm$overall["Accuracy"],
  cm_svm$byClass[c("Sensitivity", "Specificity",
                   "Pos Pred Value", "Neg Pred Value")]
)
resultados <- as.data.frame(cbind(resultados, cm_svm))
A caption
load(file = "treino.Rda")
treino$ocupacao = NULL
treino$faixas_ocupacao = NULL
head(treino)
treino$comercio[is.na(treino$comercio)] <- 0
load(file = "teste.Rda")
teste$ocupacao = NULL
teste$faixas_ocupacao = NULL
head(teste)
dim(teste)
## [1] 30 10
dim(treino)
## [1] 190 10
# Re-encode the response and every predictor as a factor built from the
# column's text representation, in both sets.
# NOTE(review): class::knn() works on Euclidean distances, so these factors
# are presumably coerced back to numbers inside knn(); confirm the factor
# round trip is intended and whether the predictors should be scaled first.
colunas_knn <- c(
  "perc_oc", "area", "ocorrencias", "alvara_km2", "distancia_medoide",
  "distancia_cemtroide", "distancia_exemplars", "alvaras", "comercio",
  "servico"
)
como_fator <- function(x) factor(as.character(x))
teste[colunas_knn] <- lapply(teste[colunas_knn], como_fator)
treino[colunas_knn] <- lapply(treino[colunas_knn], como_fator)
# Move the class label to the last (10th) column: columns 2 to 10 first, then
# `perc_oc`.
treino <- cbind(treino[, 2:10], treino$perc_oc)
teste <- cbind(teste[, 2:10], teste$perc_oc)
library(class)
classifier = knn(train = treino[, -10], test = teste[, -10],
cl =treino[, 10], k = 10)
classifier
## [1] 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1
## Levels: 0 1
cm = table(classifier, teste[, 10], dnn=c("Predito", "Atual"))
cm
## Atual
## Predito 0 1
## 0 18 4
## 1 0 8
library(caret)
# Plot the confusion matrix of THIS model (`cm`). `mc` is a table left over
# from an earlier model's section, so plotting it would show the wrong data.
plot(cm)
# Compute accuracy and per-class statistics with caret and print them.
cm_knn <- confusionMatrix(cm)
cm_knn
## Confusion Matrix and Statistics
##
## Atual
## Predito 0 1
## 0 18 4
## 1 0 8
##
## Accuracy : 0.867
## 95% CI : (0.693, 0.962)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.00151
##
## Kappa : 0.706
##
## Mcnemar's Test P-Value : 0.13361
##
## Sensitivity : 1.000
## Specificity : 0.667
## Pos Pred Value : 0.818
## Neg Pred Value : 1.000
## Prevalence : 0.600
## Detection Rate : 0.600
## Detection Prevalence : 0.733
## Balanced Accuracy : 0.833
##
## 'Positive' Class : 0
##
# Draw the confusion matrix with the project helper `matriz()` and as a
# fourfold plot.
source("matriz_confusao.R")
matriz(cm_knn)
fourfoldplot(cm_knn$table)
# Reduce the confusionMatrix object to model name + accuracy + four per-class
# metrics and append it to `resultados` as a new column.
cm_knn <- c(
  "K-Nearest Neighbors",
  cm_knn$overall["Accuracy"],
  cm_knn$byClass[c("Sensitivity", "Specificity",
                   "Pos Pred Value", "Neg Pred Value")]
)
resultados <- as.data.frame(cbind(resultados, cm_knn))
# The first row of `resultados` holds the model labels: promote it to the
# column names and drop it, leaving one row per metric.
names(resultados) <- as.matrix(resultados[1, ])
resultados <- resultados[-1, ]
# Convert the remaining text cells back to numbers. `as.is = TRUE` is given
# explicitly because type.convert() warns on R >= 4.0 when it is omitted; the
# cells are numeric strings, so the converted values are the same either way.
resultados[] <- lapply(
  resultados,
  function(x) type.convert(as.character(x), as.is = TRUE)
)
nomes = c("N.B.", "R.D.", "R.Lo", "R.Li", "A.D.", "F.R.", "RNA", "SVM", "KNN")
nomes
## [1] "N.B." "R.D." "R.Lo" "R.Li" "A.D." "F.R." "RNA" "SVM" "KNN"
# Build the plotting table: one row per model, one column per metric.
# `resultados` stores metrics in rows and models in columns, so its first five
# rows are transposed. This replaces the cbind/unlist/matrix construction,
# which hard-coded the number of models (9), used `F` for FALSE and started
# with an empty data frame that was overwritten immediately.
plot_df <- as.data.frame(t(as.matrix(resultados[1:5, ])),
                         stringsAsFactors = FALSE)
rownames(plot_df) <- NULL
# Add the short model labels and their position on the x axis.
plot_df <- cbind(plot_df, nomes, seq_along(nomes))
names(plot_df) <- c("AC", "SE", "ES", "PPV", "NPV", "TESTE", "CLASSE")
plot_df
ggplot(plot_df, aes(CLASSE)) +
ylim(0.15, 1.0) +
geom_line(aes(y = AC, colour = "Acurácia")) +
geom_line(aes(y = SE, colour = "Sensibilidade")) +
geom_line(aes(y = ES, colour = "Especificidade")) +
geom_line(aes(y = PPV, colour = "Pos. Pred. Val.")) +
geom_line(aes(y = NPV, colour = "Neg. Pred. Val.")) +
labs(title="Avaliação dos testes de aprendizado - Iguatemi",
x ="Testes Aplicados", y = "Desempenho") +
scale_x_continuous(breaks = 1:9, labels = nomes, limits = c(1, 9))
nomes = c("N.Bayes.", "Reg .Dedisão.", "R.Log", "R.Lin", "Arv.Decisão", "Flor. Randômica", "RNA", "MSV", "KNN")
nomes
## [1] "N.Bayes." "Reg .Dedisão." "R.Log" "R.Lin"
## [5] "Arv.Decisão" "Flor. Randômica" "RNA" "MSV"
## [9] "KNN"
# Build the plotting table: one row per model, one column per metric.
# `resultados` stores metrics in rows and models in columns, so its first
# three rows are transposed. This replaces the cbind/unlist/matrix
# construction, which hard-coded the number of models (9), used `F` for FALSE
# and started with an empty data frame that was overwritten immediately.
plot_df <- as.data.frame(t(as.matrix(resultados[1:3, ])),
                         stringsAsFactors = FALSE)
rownames(plot_df) <- NULL
# Add the model labels and their position on the x axis.
plot_df <- cbind(plot_df, nomes, seq_along(nomes))
names(plot_df) <- c("AC", "SE", "ES", "TESTE", "CLASSE")
plot_df
ggplot(plot_df, aes(CLASSE)) +
ylim(0.15, 1.0) +
geom_line(aes(y = AC, colour = "Acurácia")) +
geom_line(aes(y = SE, colour = "Sensibilidade")) +
geom_line(aes(y = ES, colour = "Especificidade")) +
labs(title="Avaliação dos testes de aprendizado - Iguatemi",
x ="Testes Aplicados", y = "Desempenho", color = "Medidas de Desempenho\n") +
scale_x_continuous(breaks = 1:9, labels = nomes, limits = c(1, 9)) +
theme(axis.text.x = element_text(angle = 45))
# Express every model's metrics as percentages (0-100) for the radar chart.
modelos <- c(
  "Naive Bayes", "Regras de Decisão", "Regressão Logística",
  "Regressão Linear", "Arvores de Decisão", "Floresta Randômica",
  "Rede Neural Artificial", "Suport Vetor Machine", "K-Nearest Neighbors"
)
resultados[modelos] <- lapply(resultados[modelos], function(x) x * 100)
resultados
library(fmsb)
# radarchart() reads row 1 as the maximum and row 2 as the minimum of each
# axis, and every column of `resultados` (one per model) is an axis. The two
# bound rows therefore need one entry per column; the previous fixed length of
# 5 only matched the 9 columns through recycling.
data <- rbind(
  rep(100, ncol(resultados)),
  rep(0, ncol(resultados)),
  resultados
)
radarchart(data)
colors_border=c( rgb(0.2,0.5,0.5,0.9), rgb(0.8,0.2,0.5,0.9) , rgb(0.7,0.5,0.1,0.9) , rgb(0.5,0.2,0.1,0.9) , rgb(0.7,0.9,0.9,0.9) )
colors_in=c( rgb(0.2,0.5,0.5,0.1), rgb(0.8,0.2,0.5,0.1) , rgb(0.7,0.5,0.1,0.1) , rgb(0.5,0.2,0.1,0.1) , rgb(0.7,0.9,0.9,0.1) )
radarchart( data , axistype=1 ,
#custom polygon
pcol=colors_border, pfcol=colors_in, plwd=2 , plty=1,
#custom the grid
cglcol="grey", cglty=1, axislabcol="red", caxislabels=seq(0,100,25), cglwd=0.8,
#custom labels
vlcex=0.99
)
legend(x=1.35, y=1.4, legend = rownames(data[-c(1,2),]), bty = "n", pch=20 , col=colors_border ,
text.col = "Black", cex=1.2, pt.cex=3)