set.seed(45678)
grupoA_x <- seq(from = -3, to = 4, length.out = 100)
grupoA_y <- 6 + 0.15 * grupoA_x - 0.6 * grupoA_x^2 + rnorm(100, sd = 1)
grupoA <- data.frame(variable_z = grupoA_x, variable_w = grupoA_y, grupo = "A")

grupoB_x <- rnorm(n = 100, mean = 1.5, sd = 0.8)
grupoB_y <- rnorm(n = 100, mean = 2.5, sd = 0.8)
grupoB <- data.frame(variable_z = grupoB_x, variable_w = grupoB_y, grupo = "B")

datos <- rbind(grupoA, grupoB)
datos$grupo <- as.factor(datos$grupo)
head(datos)
##   variable_z variable_w grupo
## 1  -3.000000  1.5904800     A
## 2  -2.929293 -0.3074243     A
## 3  -2.858586  1.3392827     A
## 4  -2.787879  0.4965088     A
## 5  -2.717172  1.5408306     A
## 6  -2.646465 -0.2658212     A
tail(datos)
##     variable_z variable_w grupo
## 195   1.494375   2.270472     B
## 196   1.669732   2.596286     B
## 197   1.714790   1.579820     B
## 198   1.298446   2.835773     B
## 199   2.272506   2.882408     B
## 200   1.987788   1.660372     B
#Plotting
#-------------------------------------
plot(datos[,1:2], col = datos$grupo, pch = 19)

La separación entre los grupos no es de tipo lineal, sino que muestra cierta curvatura. En este tipo de escenarios, el método QDA es más adecuado que el LDA.

Exploración de los datos (EDA)

library(ggplot2)
library(ggpubr)

p1 <- ggplot(data = datos, aes(x= variable_z, fill = grupo))+
  geom_histogram(position = "identity", alpha = 0.4)

p2 <- ggplot(data = datos, aes(x= variable_w, fill = grupo))+
  geom_histogram(position = "identity", alpha = 0.4)

ggarrange(p1, p2, nrow = 2, common.legend = T, legend = "bottom")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Normalidad

par(mfcol = c(2,2))
for (k in 1:2) {
  j0 <- names(datos)[k]
  x0 <- seq(min(datos[, k]), max(datos[, k]), le = 50)
  for (i in 1:2) {
    i0 <- levels(datos$grupo)[i]
    x <- datos[datos$grupo == i0, j0]
    hist(x, proba = T, col = "green", main = paste("grupo", i0),
    xlab = j0)
    lines(x0, dnorm(x0, mean(x), sd(x)), col = "red", lwd = 2)
  }
}

par(mfcol = c(2,2))
for (k in 1:2) {
  j0 <- names(datos)[k]
  x0 <- seq(min(datos[, k]), max(datos[, k]), le = 50)
  for (i in 1:2) {
    i0 <- levels(datos$grupo)[i]
    x <- datos[datos$grupo == i0, j0]
    qqnorm(x, main = paste(i0, j0), pch =19, col = i+1)
    qqline(x)
  }
}

library(reshape2)
datos_tidy <- melt(datos, value.name = "valor")
## Using grupo as id variables
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
datos_tidy %>%
  group_by(grupo, variable) %>%
  summarise(p_value_Shapiro.test = round(shapiro.test(valor)$p.value,4))
## `summarise()` has grouped output by 'grupo'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   grupo [2]
##   grupo variable   p_value_Shapiro.test
##   <fct> <fct>                     <dbl>
## 1 A     variable_z               0.0017
## 2 A     variable_w               0.0034
## 3 B     variable_z               0.167 
## 4 B     variable_w               0.008

Sólo se acepta la normalidad para B / z. Esto no quiere decir que no haya normalidad multivariante.

library(MVN)
outliers <- mvn(data = datos[, -3], mvnTest = "hz",
multivariateOutlierMethod = "quan")

royston_test <- mvn(data = datos[, -3], mvnTest = "royston",
                    multivariatePlot ="qq")

Entonces, vemos que no hay normalidad multivariante.

royston_test$multivariateNormality
##      Test        H      p value MVN
## 1 Royston 32.68255 7.855187e-08  NO

No hay normalidad multivariante.

hz_test <- mvn(data = datos[, -3], mvnTest = "hz")
hz_test$multivariateNormality
##            Test       HZ     p value MVN
## 1 Henze-Zirkler 7.821592 1.44329e-15  NO

No hay noramlidad multivariante.

Al no cumplirse, no se usa el LDA, sino el QDA.

Modelo discriminante

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
modelo_qda <- qda(grupo ~ variable_z + variable_w, data = datos)
modelo_qda
## Call:
## qda(grupo ~ variable_z + variable_w, data = datos)
## 
## Prior probabilities of groups:
##   A   B 
## 0.5 0.5 
## 
## Group means:
##   variable_z variable_w
## A   0.500000   3.427573
## B   1.559052   2.547156

Matriz de confusión

#Predicciones
#----------------------
predicciones <- predict(object = modelo_qda, newdata = datos)
#Matriz de confusión
#-------------------------
table(datos$grupo, predicciones$class,
      dnn = c("Clase real", "Clase predicha"))
##           Clase predicha
## Clase real  A  B
##          A 84 16
##          B 12 88

Training error

training_error <- mean(datos$grupo != predicciones$class) * 100
paste("training error", training_error, "%")
## [1] "training error 14 %"
# Mapa territorial
#-------------------------------
library(klaR)
partimat(formula = grupo ~variable_z + variable_w, data = datos,
         method = "qda", prec = 400,
         image.colors = c("darkgoldenrod1", "skyblue2"),
         col.mean= "firebrick")

# Mapa territorial
#-------------------------------
library(klaR)
partimat(formula = grupo ~variable_z + variable_w, data = datos,
         method = "lda", prec = 400,
         image.colors = c("darkgoldenrod1", "skyblue2"),
         col.mean= "firebrick")