Se pretende generar un modelo discriminante que permita diferenciar entre billetes verdaderos y falsos. Se han registrado múltiples variables para 100 billetes verdaderos y 100 billetes falsos:
Status: si es verdadero (genuine) o falso (counterfeit). Length: longitud (mm) Left: Anchura del borde izquierdo (mm) Right: Anchura del borde derecho (mm) Bottom: Anchura del borde inferior (mm) Top: Anchura del borde superior (mm) Diagonal: longitud diagonal (mm)
# Análisis preliminar
# ---------------------------
library(mclust)
## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.
library(knitr)
data(banknote)
str(banknote)
## 'data.frame': 200 obs. of 7 variables:
## $ Status : Factor w/ 2 levels "counterfeit",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Length : num 215 215 215 215 215 ...
## $ Left : num 131 130 130 130 130 ...
## $ Right : num 131 130 130 130 130 ...
## $ Bottom : num 9 8.1 8.7 7.5 10.4 9 7.9 7.2 8.2 9.2 ...
## $ Top : num 9.7 9.5 9.6 10.4 7.7 10.1 9.6 10.7 11 10 ...
## $ Diagonal: num 141 142 142 142 142 ...
summary(banknote)
## Status Length Left Right
## counterfeit:100 Min. :213.8 Min. :129.0 Min. :129.0
## genuine :100 1st Qu.:214.6 1st Qu.:129.9 1st Qu.:129.7
## Median :214.9 Median :130.2 Median :130.0
## Mean :214.9 Mean :130.1 Mean :130.0
## 3rd Qu.:215.1 3rd Qu.:130.4 3rd Qu.:130.2
## Max. :216.3 Max. :131.0 Max. :131.1
## Bottom Top Diagonal
## Min. : 7.200 Min. : 7.70 Min. :137.8
## 1st Qu.: 8.200 1st Qu.:10.10 1st Qu.:139.5
## Median : 9.100 Median :10.60 Median :140.4
## Mean : 9.418 Mean :10.65 Mean :140.5
## 3rd Qu.:10.600 3rd Qu.:11.20 3rd Qu.:141.5
## Max. :12.700 Max. :12.30 Max. :142.4
# Se recodifican las clases de la variable Status: verdadero = 0, falso = 1
levels(banknote$Status)
## [1] "counterfeit" "genuine"
levels(banknote$Status) <- c("falso", "verdadero")
kable(head(banknote))
| Status | Length | Left | Right | Bottom | Top | Diagonal |
|---|---|---|---|---|---|---|
| verdadero | 214.8 | 131.0 | 131.1 | 9.0 | 9.7 | 141.0 |
| verdadero | 214.6 | 129.7 | 129.7 | 8.1 | 9.5 | 141.7 |
| verdadero | 214.8 | 129.7 | 129.7 | 8.7 | 9.6 | 142.2 |
| verdadero | 214.8 | 129.7 | 129.6 | 7.5 | 10.4 | 142.0 |
| verdadero | 215.0 | 129.6 | 129.7 | 10.4 | 7.7 | 141.8 |
| verdadero | 215.7 | 130.8 | 130.5 | 9.0 | 10.1 | 141.4 |
# Histogramas
library(ggplot2)
library(ggpubr)
p1 <- ggplot(data = banknote, aes(x = Length, fill = Status)) +
geom_histogram(position = "identity", alpha = 0.5)
p2 <- ggplot(data = banknote, aes(x = Left, fill = Status)) +
geom_histogram(position = "identity", alpha = 0.5)
p3 <- ggplot(data = banknote, aes(x = Right, fill = Status)) +
geom_histogram(position = "identity", alpha = 0.5)
p4 <- ggplot(data = banknote, aes(x = Bottom, fill = Status)) +
geom_histogram(position = "identity", alpha = 0.5)
p5 <- ggplot(data = banknote, aes(x = Top, fill = Status)) +
geom_histogram(position = "identity", alpha = 0.5)
ggarrange(p1, p2, p3, p4, p5, ncol = 5, common.legend = TRUE, legend = "bottom")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Se solapan en el length y left. El que mejor discrimina es el
bottom.
# Correlaciones
#------------------------
pairs(x = banknote[, -1], col = c("pink",
"lightblue")[banknote$Status],
pch = 19)
# Representación mediante Histograma de cada variable para cada tipo de billete
par(mfcol = c(2, 6))
for (k in 2:7) {
j0 <- names(banknote)[k]
x0 <- seq(min(banknote[, k]), max(banknote[, k]), le = 50)
for (i in 1:2) {
i0 <- levels(banknote$Status)[i]
x <- banknote[banknote$Status == i0, j0]
hist(x, proba = T, col = "lightblue", main = paste( i0), xlab = j0)
lines(x0, dnorm(x0, mean(x), sd(x)), col = "red", lwd = 2)
}
}
# Representación de cuantiles normales de cada variable para cada tipo de billete
for (k in 2:7) {
j0 <- names(banknote)[k]
x0 <- seq(min(banknote[, k]), max(banknote[, k]), le = 50)
for (i in 1:2) {
i0 <- levels(banknote$Status)[i]
x <- banknote[banknote$Status == i0, j0]
qqnorm(x, main = paste(i0, j0), pch = 19, col = i + 1)
# los colores 2 y 3 son el rojo y verde
qqline(x)
}
}
#Contraste de normalidad Shapiro-Wilk para cada variable en cada tipo de billete
library(reshape2)
library(knitr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
datos_tidy <- melt(banknote, value.name = "valor")
## Using Status as id variables
datos_tidy %>% group_by(Status, variable) %>%
summarise(p_value_Shapiro.test = round(shapiro.test(valor)$p.value,5))
## `summarise()` has grouped output by 'Status'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 3
## # Groups: Status [2]
## Status variable p_value_Shapiro.test
## <fct> <fct> <dbl>
## 1 falso Length 0.0029
## 2 falso Left 0.0237
## 3 falso Right 0.0168
## 4 falso Bottom 0.0112
## 5 falso Top 0.0491
## 6 falso Diagonal 0.00003
## 7 verdadero Length 0.257
## 8 verdadero Left 0.00825
## 9 verdadero Right 0.0117
## 10 verdadero Bottom 0.0403
## 11 verdadero Top 0.0498
## 12 verdadero Diagonal 0.0033
La mayorÃa de las variables no acepta la normalidad y se recomienda el QDA.
detach(package:mclust, unload = TRUE)
library(MVN)
outliers <- mvn(data=banknote[,-1], mvnTest = "hz",
multivariateOutlierMethod = "quan")
royston_test <- mvn(data = banknote[,-1], mvnTest =
"royston", multivariatePlot = "qq")
royston_test$multivariateNormality
## Test H p value MVN
## 1 Royston 67.03927 5.820549e-13 NO
No hay normalidad multivariante y se debe usar QDA.
hz_test <- mvn(data = banknote[,-1], mvnTest = "hz")
hz_test$multivariateNormality
## Test HZ p value MVN
## 1 Henze-Zirkler 1.780591 0 NO
No se acepta la normalidad multivariante; se debe emplear el QDA.
library(biotools)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## ---
## biotools version 4.2
boxM(data = banknote[,-1], grouping = banknote[,1])
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: banknote[, -1]
## Chi-Sq (approx.) = 121.9, df = 21, p-value = 3.198e-16
Las matrices de varianzas y covarianzas son diferentes (en gran parte debido a la falta de normalidad multivariante).
library(MASS)
modelo <- qda(formula = Status ~., data = banknote, prior =
c(0.01, 0.99))
modelo
## Call:
## qda(Status ~ ., data = banknote, prior = c(0.01, 0.99))
##
## Prior probabilities of groups:
## falso verdadero
## 0.01 0.99
##
## Group means:
## Length Left Right Bottom Top Diagonal
## falso 214.823 130.300 130.193 10.530 11.133 139.450
## verdadero 214.969 129.943 129.720 8.305 10.168 141.517
predicciones <- predict(object = modelo, newdata = banknote)
table(banknote$Status, predicciones$class,
dnn = c("Clase real", "Clase perdida"))
## Clase perdida
## Clase real falso verdadero
## falso 99 1
## verdadero 0 100
training_error <- mean(banknote$Status != predicciones$class) * 100
paste("training_error", training_error, "%")
## [1] "training_error 0.5 %"
library(klaR)
par(mcol=c(1,1))
## Warning in par(mcol = c(1, 1)): "mcol" is not a graphical parameter
partimat(formula = Status ~., data = banknote, prior =
c(0.01, 0.99),
method = "qda", prec = 200,
image.colors = c("darkgoldenrod1", "skyblue2"),
col.mean = "firebrick")
library(klaR)
par(mcol=c(1,1))
## Warning in par(mcol = c(1, 1)): "mcol" is not a graphical parameter
partimat(formula = Status ~., data = banknote, prior =
c(0.01, 0.99),
method = "lda", prec = 200,
image.colors = c("darkgoldenrod1", "skyblue2"),
col.mean = "firebrick")