set.seed(45678)
grupoA_x <- seq(from = -3, to = 4, length.out = 100)
grupoA_y <- 6 + 0.15 * grupoA_x - 0.6 * grupoA_x^2 + rnorm(100, sd = 1)
grupoA <- data.frame(variable_z = grupoA_x, variable_w = grupoA_y, grupo = "A")
grupoB_x <- rnorm(n = 100, mean = 1.5, sd = 0.8)
grupoB_y <- rnorm(n = 100, mean = 2.5, sd = 0.8)
grupoB <- data.frame(variable_z = grupoB_x, variable_w = grupoB_y, grupo = "B")
datos <- rbind(grupoA, grupoB)
datos$grupo <- as.factor(datos$grupo)
head(datos)
## variable_z variable_w grupo
## 1 -3.000000 1.5904800 A
## 2 -2.929293 -0.3074243 A
## 3 -2.858586 1.3392827 A
## 4 -2.787879 0.4965088 A
## 5 -2.717172 1.5408306 A
## 6 -2.646465 -0.2658212 A
tail(datos)
## variable_z variable_w grupo
## 195 1.494375 2.270472 B
## 196 1.669732 2.596286 B
## 197 1.714790 1.579820 B
## 198 1.298446 2.835773 B
## 199 2.272506 2.882408 B
## 200 1.987788 1.660372 B
#Plotting
#-------------------------------------
plot(datos[,1:2], col = datos$grupo, pch = 19)
La separación entre los grupos no es de tipo lineal, sino que muestra
cierta curvatura. En este tipo de escenarios, el método QDA es más
adecuado que el LDA.
Exploración de los datos (EDA)
library(ggplot2)
library(ggpubr)
p1 <- ggplot(data = datos, aes(x= variable_z, fill = grupo))+
geom_histogram(position = "identity", alpha = 0.4)
p2 <- ggplot(data = datos, aes(x= variable_w, fill = grupo))+
geom_histogram(position = "identity", alpha = 0.4)
ggarrange(p1, p2, nrow = 2, common.legend = T, legend = "bottom")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Normalidad
par(mfcol = c(2,2))
for (k in 1:2) {
j0 <- names(datos)[k]
x0 <- seq(min(datos[, k]), max(datos[, k]), le = 50)
for (i in 1:2) {
i0 <- levels(datos$grupo)[i]
x <- datos[datos$grupo == i0, j0]
hist(x, proba = T, col = "green", main = paste("grupo", i0),
xlab = j0)
lines(x0, dnorm(x0, mean(x), sd(x)), col = "red", lwd = 2)
}
}
par(mfcol = c(2,2))
for (k in 1:2) {
j0 <- names(datos)[k]
x0 <- seq(min(datos[, k]), max(datos[, k]), le = 50)
for (i in 1:2) {
i0 <- levels(datos$grupo)[i]
x <- datos[datos$grupo == i0, j0]
qqnorm(x, main = paste(i0, j0), pch =19, col = i+1)
qqline(x)
}
}
library(reshape2)
datos_tidy <- melt(datos, value.name = "valor")
## Using grupo as id variables
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
datos_tidy %>%
group_by(grupo, variable) %>%
summarise(p_value_Shapiro.test = round(shapiro.test(valor)$p.value,4))
## `summarise()` has grouped output by 'grupo'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: grupo [2]
## grupo variable p_value_Shapiro.test
## <fct> <fct> <dbl>
## 1 A variable_z 0.0017
## 2 A variable_w 0.0034
## 3 B variable_z 0.167
## 4 B variable_w 0.008
Sólo se acepta la normalidad para B / z. Esto no quiere decir que no haya normalidad multivariante.
library(MVN)
outliers <- mvn(data = datos[, -3], mvnTest = "hz",
multivariateOutlierMethod = "quan")
royston_test <- mvn(data = datos[, -3], mvnTest = "royston",
multivariatePlot ="qq")
Entonces, vemos que no hay normalidad multivariante.
royston_test$multivariateNormality
## Test H p value MVN
## 1 Royston 32.68255 7.855187e-08 NO
No hay normalidad multivariante.
hz_test <- mvn(data = datos[, -3], mvnTest = "hz")
hz_test$multivariateNormality
## Test HZ p value MVN
## 1 Henze-Zirkler 7.821592 1.44329e-15 NO
No hay noramlidad multivariante.
Al no cumplirse, no se usa el LDA, sino el QDA.
Modelo discriminante
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
modelo_qda <- qda(grupo ~ variable_z + variable_w, data = datos)
modelo_qda
## Call:
## qda(grupo ~ variable_z + variable_w, data = datos)
##
## Prior probabilities of groups:
## A B
## 0.5 0.5
##
## Group means:
## variable_z variable_w
## A 0.500000 3.427573
## B 1.559052 2.547156
Matriz de confusión
#Predicciones
#----------------------
predicciones <- predict(object = modelo_qda, newdata = datos)
#Matriz de confusión
#-------------------------
table(datos$grupo, predicciones$class,
dnn = c("Clase real", "Clase predicha"))
## Clase predicha
## Clase real A B
## A 84 16
## B 12 88
Training error
training_error <- mean(datos$grupo != predicciones$class) * 100
paste("training error", training_error, "%")
## [1] "training error 14 %"
# Mapa territorial
#-------------------------------
library(klaR)
partimat(formula = grupo ~variable_z + variable_w, data = datos,
method = "qda", prec = 400,
image.colors = c("darkgoldenrod1", "skyblue2"),
col.mean= "firebrick")
# Mapa territorial
#-------------------------------
library(klaR)
partimat(formula = grupo ~variable_z + variable_w, data = datos,
method = "lda", prec = 400,
image.colors = c("darkgoldenrod1", "skyblue2"),
col.mean= "firebrick")