DNI <- as.integer(params$dni)
library(readxl)
db.accidente <- read_excel(
'./Todas las bases.xlsx',
sheet = 'accidente',
col_names = TRUE
)[,1:5]
New names:
* `` -> ...6
* `` -> ...7
* `` -> ...8
* `` -> ...9
db.seguros <- read_excel(
'./Todas las bases.xlsx',
sheet = 'seguros',
col_names = TRUE
)[,1:8]
New names:
* `` -> ...9
* `` -> ...10
We obtain the data subsets according to the assignment's constraints:
get_data <- function (datos, ratio) {
n <- round(ratio * nrow(datos))
set.seed(DNI)
cuales <- sample(
1:nrow(datos),
size = n,
replace = FALSE
)
return(datos[cuales,])
}
db.accidente.data <- get_data(db.accidente, 0.9)
db.seguros.data <- get_data(db.seguros, 0.75)
# Check for rows with missing values
print(db.accidente.data[rowSums(is.na(db.accidente.data)) > 0,])
db.accidente.data$vehiculo <- NULL
db.accidente.cols.vars <- setdiff(colnames(db.accidente.data), c("grave"))
library(factoextra)
accidente.pca.cov <- prcomp(db.accidente.data[,db.accidente.cols.vars], center = TRUE, scale. = FALSE)
accidente.pca.cor <- prcomp(db.accidente.data[,db.accidente.cols.vars], center = TRUE, scale. = TRUE)
summary(accidente.pca.cov)
Importance of components:
PC1 PC2 PC3
Standard deviation 24.0567 13.3304 3.81237
Proportion of Variance 0.7507 0.2305 0.01885
Cumulative Proportion 0.7507 0.9811 1.00000
summary(accidente.pca.cor)
Importance of components:
PC1 PC2 PC3
Standard deviation 1.2432 0.9127 0.7883
Proportion of Variance 0.5152 0.2777 0.2072
Cumulative Proportion 0.5152 0.7929 1.0000
fviz_eig(accidente.pca.cov)
fviz_eig(accidente.pca.cor)
accidente.grave <- db.accidente.data[db.accidente.data$grave == 1,][,db.accidente.cols.vars]
accidente.no_grave <- db.accidente.data[db.accidente.data$grave == 0,][,db.accidente.cols.vars]
accidente.mean.conjunta <- apply(db.accidente.data[,db.accidente.cols.vars], 2, mean)
accidente.mean.grave <- apply(accidente.grave, 2, mean)
accidente.mean.grave.matrix <- as.matrix(accidente.mean.grave)
accidente.mean.no_grave <- apply(accidente.no_grave, 2, mean)
accidente.mean.no_grave.matrix <- as.matrix(accidente.mean.no_grave)
accidente.means.data <- cbind(
rep(1:length(db.accidente.cols.vars), 2),
rbind(accidente.mean.grave.matrix, accidente.mean.no_grave.matrix),
c(rep("grave", length(db.accidente.cols.vars)), rep("no grave", length(db.accidente.cols.vars)))
)
colnames(accidente.means.data) <- c('vars', 'means', 'grave')
accidente.means.data <- data.frame(accidente.means.data)
accidente.means.data$vars <- as.numeric(accidente.means.data$vars)
accidente.means.data$means <- as.numeric(accidente.means.data$means)
ggplot(accidente.means.data, aes(x = vars, y = means, colour = grave)) +
geom_line() +
scale_x_discrete(
limits = as.character(1:length(db.accidente.cols.vars)),
labels = db.accidente.cols.vars
) +
xlab("Variables") +
ylab("Medias")
require(reshape2)
db.accidente.data.m <- melt(db.accidente.data, id.var = 'grave')
db.accidente.data.m$grave <- as.character(db.accidente.data.m$grave)
require(ggplot2)
ggplot(data = db.accidente.data.m, aes(x=variable, y=value)) +
geom_boxplot(aes(fill=grave)) +
facet_wrap(~variable, scales = "free")
We compare the mean vectors (H0: the two mean vectors are equal; p < 0.05 ⇒ reject the hypothesis of equal means).
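For reference, with group sizes $n_1$ and $n_2$ and pooled covariance $S_p$, the two-sample statistic is

$$T^2 = \frac{n_1 n_2}{n_1 + n_2}\,(\bar{x}_1 - \bar{x}_2)^\top S_p^{-1}\,(\bar{x}_1 - \bar{x}_2), \qquad \frac{n_1 + n_2 - p - 1}{p\,(n_1 + n_2 - 2)}\,T^2 \sim F_{p,\; n_1 + n_2 - p - 1} \ \text{under } H_0,$$

which with $p = 3$ variables and $n_1 + n_2 = 36$ observations gives the 3 and 32 degrees of freedom reported below.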
library(Hotelling)
fit <- hotelling.test(.~grave, data=db.accidente.data)
fit
Test stat: 6.9394
Numerator df: 3
Denominator df: 32
P-value: 0.0009969
library(mvnormtest)
library(car)
par(mfrow=c(2,2))
for (var in db.accidente.cols.vars) {
print(var)
print(mshapiro.test(t(db.accidente.data[[var]])))
qqPlot(
db.accidente.data[[var]],
xlab = 'Cuantiles normales',
ylab = var,
col = 'green',
pch = 20,
col.lines = 'royalblue',
lwd = 1
)
}
[1] "antigüedad"
Shapiro-Wilk normality test
data: Z
W = 0.87457, p-value = 0.0007387
[1] "edad conductor"
Shapiro-Wilk normality test
data: Z
W = 0.91839, p-value = 0.0113
[1] "potencia"
Shapiro-Wilk normality test
data: Z
W = 0.86769, p-value = 0.0004997
Multivariate Shapiro-Wilk normality test (H0: the distribution is normal; p < 0.05 ⇒ reject the normality hypothesis):
mshapiro.test(t(db.accidente.data[, db.accidente.cols.vars]))
Shapiro-Wilk normality test
data: Z
W = 0.91757, p-value = 0.01069
mshapiro.test(t(db.accidente.data[db.accidente.data$grave == 0, db.accidente.cols.vars]))
Shapiro-Wilk normality test
data: Z
W = 0.88713, p-value = 0.01657
mshapiro.test(t(db.accidente.data[db.accidente.data$grave == 1, db.accidente.cols.vars]))
Shapiro-Wilk normality test
data: Z
W = 0.90161, p-value = 0.119
Box's M test (H0: homogeneous variance-covariance matrices, i.e., homoscedasticity; p < 0.05 ⇒ reject homogeneity).
This test is sensitive to departures from normality; a robust alternative is a multivariate version of Levene's test, sketched below.
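A minimal sketch of that alternative, assuming the usual deviation-based construction: replace each observation by its absolute deviation from its group median, then compare the resulting mean vectors with Hotelling's test (equal mean deviations indicate homogeneous dispersion).

# Multivariate Levene-type test (sketch): absolute deviations from the
# group-wise medians, compared with Hotelling's test
desv <- db.accidente.data
for (v in db.accidente.cols.vars) {
  desv[[v]] <- abs(desv[[v]] - ave(desv[[v]], desv$grave, FUN = median))
}
hotelling.test(. ~ grave, data = desv)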
library(biotools)
boxM(data = db.accidente.data[, db.accidente.cols.vars], grouping = db.accidente.data$grave)
Box's M-test for Homogeneity of Covariance Matrices
data: db.accidente.data[, db.accidente.cols.vars]
Chi-Sq (approx.) = 18.163, df = 6, p-value = 0.005837
library(corpcor)
library(corrplot)
accidente.cor <- cor(db.accidente.data[,db.accidente.cols.vars])
corrplot(accidente.cor, tl.cex = 0.7, cl.cex = 0.7, tl.col = "royalblue")
accidente.cor.no_grave <- cor(db.accidente.data[db.accidente.data$grave == 0, db.accidente.cols.vars])
accidente.cor.grave <- cor(db.accidente.data[db.accidente.data$grave == 1, db.accidente.cols.vars])
par(mfrow = c(1, 2))
corrplot(accidente.cor.no_grave, tl.cex = 0.7, cl.cex = 0.7, tl.col = "royalblue")
corrplot(accidente.cor.grave, tl.cex = 0.7, cl.cex = 0.7, tl.col = "royalblue")
split_sets <- function (datos, ratio) {
if (missing(ratio)) {
ratio <- 2 / 3
}
n <- round(ratio * nrow(datos))
set.seed(DNI)
cuales <- sample(
1:nrow(datos),
size = n,
replace = FALSE
)
return(list(
'training' = datos[cuales,],
'trainingIndexes' = cuales,
'validation' = datos[-cuales,],
'validationIndexes' = -cuales
))
}
db.accidente.data.splits <- split_sets(db.accidente.data)
training <- db.accidente.data.splits$training
LDA is only valid when the groups are multivariate normal with a common covariance matrix; the tests above cast doubt on both assumptions here, but we fit it as a baseline.
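Under those assumptions, LDA assigns an observation $x$ to the class $g$ that maximizes the linear score

$$\delta_g(x) = x^\top \Sigma^{-1} \mu_g - \tfrac{1}{2}\,\mu_g^\top \Sigma^{-1} \mu_g + \log \pi_g,$$

where $\Sigma$ is the common covariance matrix and $\pi_g$ the prior probability of class $g$.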
models.lda <- lda(
formula = grave ~ antigüedad + `edad conductor` + potencia,
data = training
)
models.lda
Call:
lda(grave ~ antigüedad + `edad conductor` + potencia, data = training)
Prior probabilities of groups:
0 1
0.5833333 0.4166667
Group means:
antigüedad `edad conductor` potencia
0 6.0 48.35714 79.28571
1 5.6 30.90000 78.20000
Coefficients of linear discriminants:
LD1
antigüedad 0.10362744
`edad conductor` -0.09868078
potencia 0.01844371
Prediction error on the validation set:
validation <- db.accidente.data.splits$validation
models.lda.predict <- predict(
object = models.lda,
newdata = validation[, db.accidente.cols.vars],
method = 'predictive'
)
table(
validation$grave,
models.lda.predict$class,
dnn = c("Real", "Predicha")
)
Predicha
Real 0 1
0 6 2
1 0 4
models.lda.predict.error <- mean(validation$grave != models.lda.predict$class) * 100
models.lda.predict.error
[1] 16.66667
library(caret)
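# Note: confusionMatrix(data, reference) expects the predicted classes as its
# first argument; here the observed classes are passed as `data`, so the
# reported Sensitivity/Specificity and the predictive values are interchanged
# relative to the conventional ordering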
models.lda.predict.confusionMatrix <- confusionMatrix(as.factor(validation$grave), models.lda.predict$class)
models.lda.predict.confusionMatrix
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 6 2
1 0 4
Accuracy : 0.8333
95% CI : (0.5159, 0.9791)
No Information Rate : 0.5
P-Value [Acc > NIR] : 0.01929
Kappa : 0.6667
Mcnemar's Test P-Value : 0.47950
Sensitivity : 1.0000
Specificity : 0.6667
Pos Pred Value : 0.7500
Neg Pred Value : 1.0000
Prevalence : 0.5000
Detection Rate : 0.5000
Detection Prevalence : 0.6667
Balanced Accuracy : 0.8333
'Positive' Class : 0
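QDA drops the common-covariance assumption: each class keeps its own $\Sigma_g$ and the score becomes quadratic in $x$,

$$\delta_g(x) = -\tfrac{1}{2}\log\lvert \Sigma_g \rvert - \tfrac{1}{2}\,(x - \mu_g)^\top \Sigma_g^{-1} (x - \mu_g) + \log \pi_g.$$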
models.qda <- qda(
formula = grave ~ antigüedad + `edad conductor` + potencia,
data = training
)
models.qda
Call:
qda(grave ~ antigüedad + `edad conductor` + potencia, data = training)
Prior probabilities of groups:
0 1
0.5833333 0.4166667
Group means:
antigüedad `edad conductor` potencia
0 6.0 48.35714 79.28571
1 5.6 30.90000 78.20000
models.qda.predict <- predict(
object = models.qda,
newdata = validation[, db.accidente.cols.vars]
)
table(
validation$grave,
models.qda.predict$class,
dnn = c("Real", "Predicha")
)
Predicha
Real 0 1
0 5 3
1 1 3
models.qda.predict.error <- mean(validation$grave != models.qda.predict$class) * 100
models.qda.predict.error
[1] 33.33333
models.qda.predict.confusionMatrix <- confusionMatrix(as.factor(validation$grave), models.qda.predict$class)
models.qda.predict.confusionMatrix
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 5 3
1 1 3
Accuracy : 0.6667
95% CI : (0.3489, 0.9008)
No Information Rate : 0.5
P-Value [Acc > NIR] : 0.1938
Kappa : 0.3333
Mcnemar's Test P-Value : 0.6171
Sensitivity : 0.8333
Specificity : 0.5000
Pos Pred Value : 0.6250
Neg Pred Value : 0.7500
Prevalence : 0.5000
Detection Rate : 0.4167
Detection Prevalence : 0.6667
Balanced Accuracy : 0.6667
'Positive' Class : 0
# **Per-class partitions**
# library(klaR)
#
db.accidente.data.frame <- data.frame(
"Antiguedad" = db.accidente.data$antigüedad,
"EdadConductor" = db.accidente.data$`edad conductor`,
"Potencia" = db.accidente.data$potencia,
"Grave" = factor(db.accidente.data$grave)
)
#
# partimat(
# Grave ~ Antiguedad + EdadConductor + Potencia,
# data = db.accidente.data.frame,
# method = 'qda',
# image.colors = c('cadetblue1', 'plum2'),
# col.mean = 'royalblue',
# subset = db.accidente.data.splits$trainingIndexes
# )
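The robust classifier below replaces the class means and covariances of the quadratic rule with Minimum Covariance Determinant (MCD) estimates $\hat{\mu}_g$, $\hat{\Sigma}_g$ obtained from cov.rob, assigning each validation case to the class with the smaller robust squared Mahalanobis distance

$$d_g^2(x) = (x - \hat{\mu}_g)^\top \hat{\Sigma}_g^{-1} (x - \hat{\mu}_g).$$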
library(MASS)
models.robust.cov.noGrave <- cov.rob(
training[training$grave == 0, db.accidente.cols.vars],
method = 'mcd',
nsamp = 'best'
)
models.robust.cov.grave <- cov.rob(
training[training$grave == 1, db.accidente.cols.vars],
method = 'mcd',
nsamp = 'best'
)
# Robust squared Mahalanobis distance of each validation case to each class,
# using the MCD centers and covariances estimated above
models.robust.D2.noGrave <- mahalanobis(
validation[, db.accidente.cols.vars],
center = models.robust.cov.noGrave$center,
cov = models.robust.cov.noGrave$cov
)
models.robust.D2.grave <- mahalanobis(
validation[, db.accidente.cols.vars],
center = models.robust.cov.grave$center,
cov = models.robust.cov.grave$cov
)
# Assign each case to the class with the smaller robust distance
models.robust.predict <- ifelse(models.robust.D2.noGrave < models.robust.D2.grave, 0, 1)
table(
validation$grave,
models.robust.predict,
dnn = c("Real", "Predicha")
)
Predicha
Real 0 1
0 5 3
1 3 1
models.robust.predict.error <- mean(validation$grave != models.robust.predict) * 100
models.robust.predict.error
[1] 50
models.robust.predict.confusionMatrix <- confusionMatrix(as.factor(validation$grave), as.factor(models.robust.predict))
models.robust.predict.confusionMatrix
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 5 3
1 3 1
Accuracy : 0.5
95% CI : (0.2109, 0.7891)
No Information Rate : 0.6667
P-Value [Acc > NIR] : 0.9336
Kappa : -0.125
Mcnemar's Test P-Value : 1.0000
Sensitivity : 0.6250
Specificity : 0.2500
Pos Pred Value : 0.6250
Neg Pred Value : 0.2500
Prevalence : 0.6667
Detection Rate : 0.4167
Detection Prevalence : 0.6667
Balanced Accuracy : 0.4375
'Positive' Class : 0
library(e1071)
db.accidente.data.frame.std <- data.frame(scale(subset(db.accidente.data.frame, select = -Grave)))
db.accidente.data.frame.std$Grave <- db.accidente.data.frame$Grave
models.svm <- svm(
Grave ~ Antiguedad + EdadConductor + Potencia,
data = db.accidente.data.frame.std[db.accidente.data.splits$trainingIndexes,],
type = 'C-classification',
kernel = 'radial',
cost = 10,
gamma = .1
)
models.svm.predict <- predict(models.svm, db.accidente.data.frame.std[db.accidente.data.splits$validationIndexes,])
models.svm.predictClass <- as.factor(as.integer(models.svm.predict) - 1)
table(
validation$grave,
models.svm.predict,
dnn = c("Real", "Predicha")
)
Predicha
Real 0 1
0 8 0
1 3 1
models.svm.predict.error <- mean(validation$grave != models.svm.predictClass) * 100
models.svm.predict.error
[1] 25
models.svm.predict.confusionMatrix <- confusionMatrix(as.factor(validation$grave), models.svm.predictClass)
models.svm.predict.confusionMatrix
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 8 0
1 3 1
Accuracy : 0.75
95% CI : (0.4281, 0.9451)
No Information Rate : 0.9167
P-Value [Acc > NIR] : 0.9862
Kappa : 0.3077
Mcnemar's Test P-Value : 0.2482
Sensitivity : 0.7273
Specificity : 1.0000
Pos Pred Value : 1.0000
Neg Pred Value : 0.2500
Prevalence : 0.9167
Detection Rate : 0.6667
Detection Prevalence : 0.6667
Balanced Accuracy : 0.8636
'Positive' Class : 0
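The cost = 10 and gamma = 0.1 above were fixed by hand. A cross-validated grid search could choose them instead; a minimal sketch, with an illustrative grid that is an assumption rather than part of the original analysis:

# Grid-search cost/gamma by cross-validation (grid values are assumptions)
set.seed(DNI)
models.svm.tuned <- tune.svm(
Grave ~ Antiguedad + EdadConductor + Potencia,
data = db.accidente.data.frame.std[db.accidente.data.splits$trainingIndexes, ],
gamma = 10^(-2:1),
cost = 10^(0:2)
)
models.svm.tuned$best.parameters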
library(gridExtra)
grid.arrange(
ggplot(
db.accidente.data.frame.std,
aes(x = EdadConductor, y = Antiguedad)
) +
geom_point(aes(color = Grave)),
ggplot(
db.accidente.data.frame.std,
aes(x = EdadConductor, y = Potencia)
) +
geom_point(aes(color = Grave)),
ggplot(
db.accidente.data.frame.std,
aes(x = Antiguedad, y = Potencia)
) +
geom_point(aes(color = Grave)),
nrow = 2
)
library(ggbiplot)
ggbiplot(
accidente.pca.cor,
choices = 1:2,
obs.scale = 1,
var.scale = 1,
alpha = 0.5,
groups = as.factor(db.accidente.data$grave)
) +
theme(legend.direction = "horizontal", legend.position = "top")
library(plotly)
plot_ly(
x = db.accidente.data.frame.std$EdadConductor,
y = db.accidente.data.frame.std$Antiguedad,
z = db.accidente.data.frame.std$Potencia,
type = "scatter3d",
mode = "markers",
color = db.accidente.data.frame.std$Grave
)
See Código 9.9 (341).
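Each univariate fit below models the probability of a serious accident as a logistic function of one predictor,

$$P(\text{grave} = 1 \mid x) = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x)}},$$

and classifies as serious when the fitted probability exceeds 0.5.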
par(mfcol = c(2,2))
for (var in db.accidente.cols.vars) {
print(var)
modelo_logistico <- glm(
grave ~ var,
data = data.frame(
var = training[[var]],
grave = training$grave
),
family = 'binomial'
)
print(summary(modelo_logistico))
plot(
training[[var]],
training$grave,
col = 'royalblue',
xlab = var,
ylab = 'P(grave)'
)
curve(
predict(
modelo_logistico,
data.frame(var = x),
type = 'response'
),
add = TRUE,
col = 'violet',
lwd = 2.5
)
predicciones <- ifelse(
test = modelo_logistico$fitted.values > 0.5,
yes = 1,
no = 0
)
print(predicciones)
print(
table(
training$grave,
predicciones,
dnn = c('Observaciones', 'Predicciones')
)
)
print(
mean(training$grave != predicciones) * 100
)
}
[1] "antigüedad"
Call:
glm(formula = grave ~ var, family = "binomial", data = data.frame(var = training[[var]],
grave = training$grave))
Deviance Residuals:
Min 1Q Median 3Q Max
-1.0815 -1.0454 -0.9708 1.2935 1.3645
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.20764 0.70056 -0.296 0.767
var -0.02222 0.09805 -0.227 0.821
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 32.601 on 23 degrees of freedom
Residual deviance: 32.550 on 22 degrees of freedom
AIC: 36.55
Number of Fisher Scoring iterations: 4
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Predicciones
Observaciones 0
0 14
1 10
[1] 41.66667
[1] "edad conductor"
Call:
glm(formula = grave ~ var, family = "binomial", data = data.frame(var = training[[var]],
grave = training$grave))
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9986 -0.7359 -0.2701 0.7339 1.7965
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.97611 1.78493 2.228 0.0259 *
var -0.11183 0.04719 -2.370 0.0178 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 32.601 on 23 degrees of freedom
Residual deviance: 23.182 on 22 degrees of freedom
AIC: 27.182
Number of Fisher Scoring iterations: 5
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 1 0
Predicciones
Observaciones 0 1
0 11 3
1 4 6
[1] 29.16667
[1] "potencia"
Call:
glm(formula = grave ~ var, family = "binomial", data = data.frame(var = training[[var]],
grave = training$grave))
Deviance Residuals:
Min 1Q Median 3Q Max
-1.052 -1.044 -1.016 1.320 1.363
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.196686 1.380563 -0.142 0.887
var -0.001775 0.016744 -0.106 0.916
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 32.601 on 23 degrees of freedom
Residual deviance: 32.590 on 22 degrees of freedom
AIC: 36.59
Number of Fisher Scoring iterations: 4
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Predicciones
Observaciones 0
0 14
1 10
[1] 41.66667
models.lg <- glm(
as.factor(grave) ~ antigüedad + `edad conductor` + potencia,
data = training,
family = 'binomial'
)
summary(models.lg)
Call:
glm(formula = as.factor(grave) ~ antigüedad + `edad conductor` +
potencia, family = "binomial", data = training)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.1820 -0.5342 -0.2028 0.7442 1.5620
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.10967 2.16626 1.436 0.1511
antigüedad 0.17703 0.16130 1.098 0.2724
`edad conductor` -0.16321 0.06457 -2.527 0.0115 *
potencia 0.02491 0.02385 1.044 0.2964
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 32.601 on 23 degrees of freedom
Residual deviance: 20.210 on 20 degrees of freedom
AIC: 28.21
Number of Fisher Scoring iterations: 5
models.lg.predict <- predict(
object = models.lg,
newdata = validation[, db.accidente.cols.vars],
type = 'response'
)
models.lg.predictClass <- ifelse(models.lg.predict > 0.5, 1, 0)
table(
validation$grave,
models.lg.predictClass,
dnn = c("Real", "Predicha")
)
Predicha
Real 0 1
0 7 1
1 1 3
models.lg.predict.confusionMatrix <- confusionMatrix(as.factor(validation$grave), as.factor(models.lg.predictClass))
models.lg.predict.confusionMatrix
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 7 1
1 1 3
Accuracy : 0.8333
95% CI : (0.5159, 0.9791)
No Information Rate : 0.6667
P-Value [Acc > NIR] : 0.1811
Kappa : 0.625
Mcnemar's Test P-Value : 1.0000
Sensitivity : 0.8750
Specificity : 0.7500
Pos Pred Value : 0.8750
Neg Pred Value : 0.7500
Prevalence : 0.6667
Detection Rate : 0.5833
Detection Prevalence : 0.6667
Balanced Accuracy : 0.8125
'Positive' Class : 0
We try `edad conductor` on its own:
models.lg2 <- glm(
as.factor(grave) ~ `edad conductor`,
data = training,
family = 'binomial'
)
summary(models.lg2)
Call:
glm(formula = as.factor(grave) ~ `edad conductor`, family = "binomial",
data = training)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9986 -0.7359 -0.2701 0.7339 1.7965
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.97611 1.78493 2.228 0.0259 *
`edad conductor` -0.11183 0.04719 -2.370 0.0178 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 32.601 on 23 degrees of freedom
Residual deviance: 23.182 on 22 degrees of freedom
AIC: 27.182
Number of Fisher Scoring iterations: 5
models.lg2.predict <- predict(
object = models.lg2,
newdata = validation[, db.accidente.cols.vars],
type = 'response'
)
models.lg2.predictClass <- ifelse(models.lg2.predict > 0.5, 1, 0)
table(
validation$grave,
models.lg2.predictClass,
dnn = c("Real", "Predicha")
)
Predicha
Real 0 1
0 6 2
1 1 3
models.lg2.predict.confusionMatrix <- confusionMatrix(as.factor(validation$grave), factor(models.lg2.predictClass, c(0, 1)))
models.lg2.predict.confusionMatrix
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 6 2
1 1 3
Accuracy : 0.75
95% CI : (0.4281, 0.9451)
No Information Rate : 0.5833
P-Value [Acc > NIR] : 0.1916
Kappa : 0.4706
Mcnemar's Test P-Value : 1.0000
Sensitivity : 0.8571
Specificity : 0.6000
Pos Pred Value : 0.7500
Neg Pred Value : 0.7500
Prevalence : 0.5833
Detection Rate : 0.5000
Detection Prevalence : 0.6667
Balanced Accuracy : 0.7286
'Positive' Class : 0
resultados.accidente <- data.frame(
modelo = c(
'lda',
'qda',
'robust',
'svm',
'log(edad conductor)',
'log'
),
exactitud = c(
0.8333,
0.6667,
0.5,
0.75,
0.75,
0.8333
),
sensibilidad = c(
1,
0.8333,
0.6250,
0.7273,
0.8571,
0.8750
),
especificidad = c(
0.6667,
0.5000,
0.2500,
1,
0.6,
0.7500
)
)
ggplot(resultados.accidente, aes(x = modelo)) +
geom_line(aes(y = exactitud, group = 1, color = 'exactitud')) +
geom_point(aes(y = exactitud, group = 1, color = 'exactitud')) +
geom_line(aes(y = sensibilidad, group = 2, color = 'sensibilidad')) +
geom_point(aes(y = sensibilidad, group = 2, color = 'sensibilidad')) +
geom_line(aes(y = especificidad, group = 3, color = 'especificidad')) +
geom_point(aes(y = especificidad, group = 3, color = 'especificidad')) +
labs(color = 'métrica') +
ylab('valor')
# Check for rows with missing values
print(db.seguros.data[rowSums(is.na(db.seguros.data)) > 0,])
db.seguros.data.m <- melt(db.seguros.data)
No id variables; using all as measure variables
ggplot(data = db.seguros.data.m, aes(x=variable, y=value)) +
geom_boxplot(aes()) +
facet_wrap(~variable, scales = "free")
par(mfcol = c(3,3))
for (k in colnames(db.seguros.data)) {
hist(
db.seguros.data[[k]],
probability = TRUE,
main = k,
breaks = 10
)
x0 <- seq(
min(db.seguros.data[[k]]),
max(db.seguros.data[[k]]),
length.out = 50
)
lines(
x0,
dnorm(
x0,
mean(db.seguros.data[[k]]),
sd(db.seguros.data[[k]])
),
col = "red",
lwd = 2
)
grid()
}
Normality:
par(mfcol = c(2,4))
for (var in colnames(db.seguros.data)) {
print(var)
print(mshapiro.test(t(db.seguros.data[[var]])))
qqPlot(
db.seguros.data[[var]],
xlab = 'Cuantiles normales',
ylab = var,
col = 'green',
pch = 20,
col.lines = 'royalblue',
lwd = 1
)
grid()
}
[1] "edad"
Shapiro-Wilk normality test
data: Z
W = 0.94301, p-value < 2.2e-16
[1] "sexo"
Shapiro-Wilk normality test
data: Z
W = 0.63648, p-value < 2.2e-16
[1] "BMI"
Shapiro-Wilk normality test
data: Z
W = 0.99278, p-value = 0.0001045
[1] "hijos"
Shapiro-Wilk normality test
data: Z
W = 0.6201, p-value < 2.2e-16
[1] "fuma"
Shapiro-Wilk normality test
data: Z
W = 0.81612, p-value < 2.2e-16
[1] "region"
Shapiro-Wilk normality test
data: Z
W = 0.49732, p-value < 2.2e-16
[1] "cargos"
Shapiro-Wilk normality test
data: Z
W = 0.8606, p-value < 2.2e-16
[1] "primadelseguro"
Shapiro-Wilk normality test
data: Z
W = 0.8143, p-value < 2.2e-16
We standardize the data:
db.seguros.data.std <- data.frame(scale(db.seguros.data))
db.seguros.data.std.m <- melt(db.seguros.data.std)
No id variables; using all as measure variables
ggplot(data = db.seguros.data.std.m, aes(x=variable, y=value)) +
geom_boxplot(aes()) +
facet_wrap(~variable, scales = "free")
seguros.cor <- cor(db.seguros.data)
corrplot(seguros.cor, tl.cex = 0.7, cl.cex = 0.7, tl.col = "royalblue")
seguros.pca.cov <- prcomp(db.seguros.data, center = TRUE, scale. = FALSE)
seguros.pca.cor <- prcomp(db.seguros.data, center = TRUE, scale. = TRUE)
summary(seguros.pca.cov)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 1.224e+04 260.75618 13.49 5.845 1.201 0.9754 0.4986 0.2117
Proportion of Variance 9.995e-01 0.00045 0.00 0.000 0.000 0.0000 0.0000 0.0000
Cumulative Proportion 9.995e-01 1.00000 1.00 1.000 1.000 1.0000 1.0000 1.0000
summary(seguros.pca.cor)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 1.3816 1.2309 1.0474 1.0005 0.9848 0.9123 0.74030 0.35758
Proportion of Variance 0.2386 0.1894 0.1371 0.1251 0.1212 0.1040 0.06851 0.01598
Cumulative Proportion 0.2386 0.4280 0.5651 0.6902 0.8115 0.9155 0.98402 1.00000
fviz_eig(seguros.pca.cov)
fviz_eig(seguros.pca.cor)
See: https://uc-r.github.io/kmeans_clustering
library(cluster)
library(pracma)
esc01 <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
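Two rescalings are used in this section: the min-max scaling implemented by esc01 and the z-score standardization applied earlier through scale,

$$x' = \frac{x - \min x}{\max x - \min x}, \qquad z = \frac{x - \bar{x}}{s}.$$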
metrica <- function(datA_esc, kmax, nstart, f) {
sil <- array()
# within-cluster sum of squares
wss <- array()
datA_dist <- dist(
datA_esc,
method = "euclidean",
diag = FALSE,
upper = FALSE,
p = 2
)
for (i in 2:kmax) {
if (f == "kmeans") { # centroid-based: classic k-means
CL <- kmeans(
datA_esc,
centers = i,
nstart = nstart,
iter.max = 100 # iteration limit; kmax is a cluster count, not an iteration cap
)
wss[i] <- CL$tot.withinss
CL_sil <- silhouette(CL$cluster, datA_dist)
sil[i] <- summary(CL_sil)$avg.width
}
if (f == "pam") { # medoid-based: note this method is very slow
CL <- pam(
x = datA_esc,
k = i,
diss = FALSE,
metric = "euclidean"
)
wss[i] <- CL$objective[1]
sil[i] <- CL$silinfo$avg.width
}
}
return(data.frame(wss, sil))
}
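The two indicators returned by metrica are the total within-cluster sum of squares and the average silhouette width,

$$\mathrm{WSS} = \sum_{k=1}^{K} \sum_{x_i \in C_k} \lVert x_i - \bar{x}_k \rVert^2, \qquad s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}},$$

where $a(i)$ is the mean distance from observation $i$ to its own cluster and $b(i)$ the smallest mean distance to another cluster; we look for a low-WSS elbow and a high average silhouette.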
kmax <- 7
nstart <- 25
# Two scaling options:
# m1 <- metrica(apply(db.seguros.data, 2, esc01), kmax, nstart, "kmeans") # min-max, via esc01 above
m1 <- metrica(db.seguros.data.std, kmax, nstart, "kmeans") # z-score standardization
Plots of the clustering indicators:
par(mfrow = c(2, 1))
plot(
2:kmax,
m1$sil[2:kmax],
col = 1,
type = "b",
pch = 19,
frame = FALSE,
xlab = "Number of clusters K",
ylab = "sil"
)
plot(
2:kmax,
m1$wss[2:kmax],
type = "b",
pch = 19,
frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares"
)
fviz_nbclust(db.seguros.data.std, kmeans, method = 'silhouette')
fviz_nbclust(db.seguros.data.std, kmeans, method = 'wss')
gap_stat <- clusGap(
db.seguros.data.std,
FUN = kmeans,
nstart = nstart,
K.max = kmax,
B = 50
)
Clustering k = 1,2,..., K.max (= 7): .. done
Bootstrapping, b = 1,2,..., B (= 50) [one "." per sample]: .................................................. 50
(several bootstrap k-means runs warned "did not converge in 10 iterations")
fviz_gap_stat(gap_stat)
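The gap statistic compares the observed within-cluster dispersion with its expectation under a reference null distribution, estimated here from B = 50 bootstrap samples,

$$\mathrm{Gap}(k) = E^{*}\!\left[\log W_k\right] - \log W_k;$$

a common rule picks the smallest $k$ whose gap is within one standard error of the first local maximum.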
library(NbClust)
res.nbclust <- NbClust(
data = db.seguros.data.std,
distance = "euclidean",
min.nc = 2,
max.nc = kmax,
method = 'kmeans'
)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 11 proposed 2 as the best number of clusters
* 2 proposed 3 as the best number of clusters
* 3 proposed 4 as the best number of clusters
* 3 proposed 5 as the best number of clusters
* 3 proposed 6 as the best number of clusters
* 2 proposed 7 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
factoextra::fviz_nbclust(res.nbclust) +
theme_minimal() +
ggtitle("NbClust's optimal number of clusters")
Warning (repeated 4×): the condition has length > 1 and only the first element will be used
Among all indices:
===================
* 2 proposed 0 as the best number of clusters
* 11 proposed 2 as the best number of clusters
* 2 proposed 3 as the best number of clusters
* 3 proposed 4 as the best number of clusters
* 3 proposed 5 as the best number of clusters
* 3 proposed 6 as the best number of clusters
* 2 proposed 7 as the best number of clusters
Conclusion
=========================
* According to the majority rule, the best number of clusters is 2 .
library(ggbiplot)
plot_ggbiplot <- function (clusters) {
for (i in 1:4) {
print(ggbiplot(
seguros.pca.cor,
choices = i:(i+1),
obs.scale = 1,
var.scale = 1,
alpha = 0.5,
groups = as.factor(clusters)
) +
theme(legend.direction = "horizontal", legend.position = "top")
)
}
}
generate_clusters <- function (ncluster) {
CL <- kmeans(
db.seguros.data.std,
# apply(db.seguros.data, 2, esc01),
ncluster,
nstart = nstart,
iter.max = 100 # iteration limit; kmax is a cluster count, not an iteration cap
)
plot_ggbiplot(CL$cluster)
return(CL)
}
models.kmeans6 <- generate_clusters(6)
models.kmeans5 <- generate_clusters(5)
models.kmeans4 <- generate_clusters(4)
models.kmeans3 <- generate_clusters(3)
models.kmeans2 <- generate_clusters(2)
library(tidyverse)
library(ggiraphExtra)
plot_radar <- function (clusters) {
df <- as.data.frame(db.seguros.data.std) %>% rownames_to_column()
cluster_pos <- as.data.frame(clusters) %>% rownames_to_column()
colnames(cluster_pos) <- c("rowname", "cluster")
final <- inner_join(cluster_pos, df)
ggRadar(
final[-1],
aes(group = cluster),
rescale = FALSE,
legend.position = 'none',
size = 1,
interactive = FALSE,
use.label = TRUE
) +
facet_wrap(~cluster) +
scale_y_discrete(breaks = NULL) +
theme(axis.text.x = element_text(size = 10)) +
scale_fill_manual(values = rep('#1c6193', nrow(final))) +
scale_color_manual(values = rep('#1c6193', nrow(final)))
}
plot_radar(models.kmeans6$cluster)
Joining, by = "rowname"
plot_radar(models.kmeans2$cluster)
Joining, by = "rowname"
mat_dist <- dist(
x = db.seguros.data.std,
method = 'euclidean'
)
# Dendrograms (one per hierarchical linkage method)
hc_complete <- hclust(d = mat_dist, method = "complete")
hc_average <- hclust(d = mat_dist, method = "average")
hc_single <- hclust(d = mat_dist, method = "single")
hc_ward <- hclust(d = mat_dist, method = "ward.D2")
# Cophenetic correlation coefficient: correlation between the original distances
# and those implied by each dendrogram (closer to 1 = structure better preserved)
cor(x = mat_dist, cophenetic(hc_complete))
[1] 0.6185278
cor(x = mat_dist, cophenetic(hc_average))
[1] 0.7228582
cor(x = mat_dist, cophenetic(hc_single))
[1] 0.6075149
cor(x = mat_dist, cophenetic(hc_ward))
[1] 0.6638337
# Dendrogram built from the Ward linkage (average linkage attains the highest
# cophenetic correlation, but Ward tends to give compact, balanced groups)
plot(hc_ward) # hard to read with many observations
rect.hclust(hc_ward, k = 2, border = "red") # 2 groups
rect.hclust(hc_ward, k = 6, border = "blue") # 6 groups
clusters6 <- cutree(hc_ward, k = 6)
plot_ggbiplot(clusters6)
plot_radar(clusters6)
Joining, by = "rowname"
clusters4 <- cutree(hc_ward, k = 4)
plot_ggbiplot(clusters4)
plot_radar(clusters4)
Joining, by = "rowname"
clusters2 <- cutree(hc_ward, k = 2)
plot_ggbiplot(clusters2)
plot_radar(clusters2)
Joining, by = "rowname"