EJEMPLO 1 ANALISIS DE DISCRIMINANTE EN R Y PYTHON

import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
df = iris.frame
print(df.head())

   sepal length (cm)  sepal width (cm)  ...  petal width (cm)  target
0                5.1               3.5  ...               0.2       0
1                4.9               3.0  ...               0.2       0
2                4.7               3.2  ...               0.2       0
3                4.6               3.1  ...               0.2       0
4                5.0               3.6  ...               0.2       0

[5 rows x 5 columns]

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X = df[iris.feature_names]
y = df['target']

lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(X, y)

print("Varianza explicada por cada discriminante:", lda.explained_variance_ratio_)

Varianza explicada por cada discriminante: [0.9912126 0.0087874]

import matplotlib.pyplot as plt
plt.figure(figsize=(8,6))
for i, target_name in enumerate(iris.target_names):
    plt.scatter(X_lda[y==i,0], X_lda[y==i,1], label=target_name)
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.title('Iris - LDA')
plt.legend()
plt.show()

import numpy as np

# Nuevas observaciones
nuevas_flores = np.array([
    [5.1, 3.5, 1.4, 0.2],  # probablemente setosa
    [6.0, 2.2, 4.0, 1.0],  # probablemente versicolor
    [6.9, 3.1, 5.4, 2.1]   # probablemente virginica
])

# Predecir el grupo
predicciones = lda.predict(nuevas_flores)

C:\Users\MINEDU~1\AppData\Local\Programs\Python\PYTHON~1\Lib\site-packages\sklearn\utils\validation.py:2739: UserWarning: X does not have valid feature names, but LinearDiscriminantAnalysis was fitted with feature names
  warnings.warn(

nombres_grupos = iris.target_names[predicciones]
print("Predicciones:", nombres_grupos)

Predicciones: ['setosa' 'versicolor' 'virginica']

# Graficar datos originales
plt.figure(figsize=(8,6))
colores = ['red', 'green', 'blue']
for i, target_name in enumerate(iris.target_names):
    plt.scatter(X_lda[y==i,0], X_lda[y==i,1], alpha=0.5, label=target_name, color=colores[i])

# Graficar nuevas flores
for i, nombre in enumerate(nombres_grupos):
    plt.scatter(nuevas_flores[i,0], nuevas_flores[i,1],
                color=colores[predicciones[i]],
                edgecolor='black',
                marker='X',
                s=200,
                label=f'Nueva flor {i+1}: {nombre}')

plt.xlabel('LD1')
plt.ylabel('LD2')
plt.title('Clasificación de nuevas flores en el espacio LDA')
plt.legend(loc='best', fontsize='small')
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Nuevas observaciones
nuevas_flores = np.array([
    [5.1, 3.5, 1.4, 0.2],  # setosa
    [6.0, 2.2, 4.0, 1.0],  # versicolor
    [6.9, 3.1, 5.4, 2.1]   # virginica
])

# Transformar nuevas flores al espacio LDA
nuevas_flores_lda = lda.transform(nuevas_flores)

C:\Users\MINEDU~1\AppData\Local\Programs\Python\PYTHON~1\Lib\site-packages\sklearn\utils\validation.py:2739: UserWarning: X does not have valid feature names, but LinearDiscriminantAnalysis was fitted with feature names
  warnings.warn(

predicciones = lda.predict(nuevas_flores)

C:\Users\MINEDU~1\AppData\Local\Programs\Python\PYTHON~1\Lib\site-packages\sklearn\utils\validation.py:2739: UserWarning: X does not have valid feature names, but LinearDiscriminantAnalysis was fitted with feature names
  warnings.warn(

nombres_grupos = iris.target_names[predicciones]

# Graficar datos originales
plt.figure(figsize=(8,6))
colores = ['red', 'green', 'blue']
for i, target_name in enumerate(iris.target_names):
    plt.scatter(X_lda[y==i,0], X_lda[y==i,1], alpha=0.5, label=target_name, color=colores[i])

# Graficar nuevas flores como estrellas
for i, nombre in enumerate(nombres_grupos):
    plt.scatter(nuevas_flores_lda[i,0], nuevas_flores_lda[i,1],
                color=colores[predicciones[i]],
                edgecolor='black',
                marker='*',
                s=400,
                label=f'Nueva flor {i+1}: {nombre}')

plt.xlabel('LD1')
plt.ylabel('LD2')
plt.title('Clasificación de nuevas flores en el espacio LDA')
plt.legend(loc='best', fontsize='small')
plt.show()

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predicciones sobre los datos originales
y_pred = lda.predict(X)

# Matriz de confusión
cm = confusion_matrix(y, y_pred)
print("Matriz de confusión:\n", cm)

Matriz de confusión:
 [[50  0  0]
 [ 0 48  2]
 [ 0  1 49]]

# Visualización de la matriz de confusión
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=iris.target_names)
disp.plot(cmap='Blues')

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay object at 0x000001BA184F4CE0>

plt.title('Matriz de confusión del modelo LDA')
plt.show()

from sklearn.metrics import accuracy_score

# Predicciones sobre los datos originales
y_pred = lda.predict(X)

# Calcular precisión
precision = accuracy_score(y, y_pred)
print(f"Precisión del modelo LDA: {precision:.2%}")

Precisión del modelo LDA: 98.00%

# Obtener los eigenvalores del modelo LDA
eigenvalues = lda.explained_variance_ratio_ / (1 - lda.explained_variance_ratio_)

# Calcular Lambda de Wilks
lambda_wilks = 1 / np.prod(1 + eigenvalues)
print(f"Lambda de Wilks: {lambda_wilks:.4f}")

Lambda de Wilks: 0.0087

from scipy.stats import shapiro

for clase in np.unique(y):
    print(f"\nGrupo: {iris.target_names[clase]}")
    for i, var in enumerate(iris.feature_names):
        stat, p = shapiro(df[df['target']==clase][var])
        print(f"{var}: p-valor={p:.3f}")


Grupo: setosa
sepal length (cm): p-valor=0.460
sepal width (cm): p-valor=0.272
petal length (cm): p-valor=0.055
petal width (cm): p-valor=0.000

Grupo: versicolor
sepal length (cm): p-valor=0.465
sepal width (cm): p-valor=0.338
petal length (cm): p-valor=0.158
petal width (cm): p-valor=0.027

Grupo: virginica
sepal length (cm): p-valor=0.258
sepal width (cm): p-valor=0.181
petal length (cm): p-valor=0.110
petal width (cm): p-valor=0.087

import pandas as pd
import numpy as np
from pingouin import multivariate_normality

# Evaluar normalidad multivariante para cada grupo
for clase in np.unique(y):
    grupo = df[df['target'] == clase][iris.feature_names]
    print(f"\nGrupo: {iris.target_names[clase]}")
    mardia = multivariate_normality(grupo, alpha=0.05)
    print(mardia)


Grupo: setosa
HZResults(hz=np.float64(0.9488453160016705), pval=np.float64(0.049953556179215215), normal=False)

Grupo: versicolor
HZResults(hz=np.float64(0.8388008907266101), pval=np.float64(0.22619914870362656), normal=True)

Grupo: virginica
HZResults(hz=np.float64(0.7570095243363526), pval=np.float64(0.4970236922155551), normal=True)

#from bioinfokit.analys import stat

# Preparar datos
#df_boxm = df.copy()
#df_boxm['target'] = df_boxm['target'].astype(str)
#res = stat()
#res.box_m(df=df_boxm, group_col='target')
#print(res.boxm)
import numpy as np
import pandas as pd
from scipy.stats import chi2
from sklearn.datasets import load_iris

# Cargar datos
iris = load_iris(as_frame=True)
df = iris.frame
df['target'] = df['target'].astype(str)

# Selecciona solo las columnas numéricas
X = df[iris.feature_names].values
y = df['target'].values

def box_m_test(X, y):
    groups = np.unique(y)
    n_groups = len(groups)
    n_features = X.shape[1]
    n_total = X.shape[0]
    cov_pooled = np.cov(X, rowvar=False)
    logdet_pooled = np.linalg.slogdet(cov_pooled)[1]
    sum_logdet = 0
    sum_nk = 0
    for g in groups:
        Xg = X[y == g]
        cov_g = np.cov(Xg, rowvar=False)
        logdet_g = np.linalg.slogdet(cov_g)[1]
        sum_logdet += (Xg.shape[0] - 1) * logdet_g
        sum_nk += Xg.shape[0] - 1
    M = sum_nk * logdet_pooled - sum_logdet
    c = ((2 * n_features**2 + 3 * n_features - 1) / (6 * (n_features + 1) * (n_groups - 1))) * \
        (sum([1/(X[y == g].shape[0] - 1) for g in groups]) - 1/(n_total - n_groups))
    chi2_stat = M * (1 - c)
    df_box = (n_groups - 1) * n_features * (n_features + 1) / 2
    p_value = 1 - chi2.cdf(chi2_stat, df_box)
    return chi2_stat, df_box, p_value

chi2_stat, df_box, p_value = box_m_test(X, y)
print(f"Box's M: {chi2_stat:.4f}, df: {df_box:.0f}, p-value: {p_value:.4f}")

Box's M: 663.5329, df: 20, p-value: 0.0000

data(iris)
head(iris)

  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa

summary(iris)

  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
 Median :5.800   Median :3.000   Median :4.350   Median :1.300  
 Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
       Species  
 setosa    :50  
 versicolor:50  
 virginica :50

pairs(iris[1:4], col=iris$Species, pch=19, main="Matriz de dispersión de Iris")

#install.packages("MVN") # Solo si no lo tienes
library(MVN)

Warning: package 'MVN' was built under R version 4.4.2

for (sp in unique(iris$Species)) {
  cat("\nGrupo:", sp, "\n")
  grupo <- subset(iris, Species == sp)[,1:4]
  print(mvn(grupo, mvnTest = "mardia")$multivariateNormality)
}


Grupo: setosa 
             Test        Statistic           p value Result
1 Mardia Skewness 25.6643445196298 0.177185884467652    YES
2 Mardia Kurtosis 1.29499223711605 0.195322907441935    YES
3             MVN             <NA>              <NA>    YES

Grupo: versicolor 
             Test         Statistic           p value Result
1 Mardia Skewness  25.1850115362466 0.194444483140265    YES
2 Mardia Kurtosis -0.57186635893429 0.567412516528727    YES
3             MVN              <NA>              <NA>    YES

Grupo: virginica 
             Test         Statistic           p value Result
1 Mardia Skewness  26.2705981752915 0.157059707690356    YES
2 Mardia Kurtosis 0.152614173978342 0.878702546726567    YES
3             MVN              <NA>              <NA>    YES

#install.packages("biotools") # Solo si no lo tienes
library(biotools)

Warning: package 'biotools' was built under R version 4.4.3

Cargando paquete requerido: MASS

---
biotools version 4.3

boxM(iris[,1:4], iris$Species)


    Box's M-test for Homogeneity of Covariance Matrices

data:  iris[, 1:4]
Chi-Sq (approx.) = 140.94, df = 20, p-value < 2.2e-16

Interpretación:

Si el p-valor es menor a 0.05, se rechaza la igualdad de matrices de covarianza.

library(MASS)
lda_model <- lda(Species ~ ., data=iris)
print(lda_model)

Call:
lda(Species ~ ., data = iris)

Prior probabilities of groups:
    setosa versicolor  virginica 
 0.3333333  0.3333333  0.3333333 

Group means:
           Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa            5.006       3.428        1.462       0.246
versicolor        5.936       2.770        4.260       1.326
virginica         6.588       2.974        5.552       2.026

Coefficients of linear discriminants:
                    LD1         LD2
Sepal.Length  0.8293776 -0.02410215
Sepal.Width   1.5344731 -2.16452123
Petal.Length -2.2012117  0.93192121
Petal.Width  -2.8104603 -2.83918785

Proportion of trace:
   LD1    LD2 
0.9912 0.0088

plot(lda_model, col=as.numeric(iris$Species), pch=19, main="LDA de Iris")

pred <- predict(lda_model)$class
conf_mat <- table(Predicted=pred, Actual=iris$Species)
print(conf_mat)

            Actual
Predicted    setosa versicolor virginica
  setosa         50          0         0
  versicolor      0         48         1
  virginica       0          2        49

# Precisión
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
cat(sprintf("Precisión del modelo: %.2f%%\n", accuracy*100))

Precisión del modelo: 98.00%

# Extraer eigenvalues
eigvals <- lda_model$svd^2
lambda_wilks <- prod(1 / (1 + eigvals))
cat(sprintf("Lambda de Wilks: %.4f\n", lambda_wilks))

Lambda de Wilks: 0.0000

Interpretación:
Un valor bajo (cercano a 0) indica buena separación entre grupos.

nuevas_flores <- data.frame(
  Sepal.Length = c(5.1, 6.0, 6.9),
  Sepal.Width  = c(3.5, 2.2, 3.1),
  Petal.Length = c(1.4, 4.0, 5.4),
  Petal.Width  = c(0.2, 1.0, 2.1)
)

pred_nuevas <- predict(lda_model, nuevas_flores)
print(pred_nuevas$class)

[1] setosa     versicolor virginica 
Levels: setosa versicolor virginica

print(pred_nuevas$posterior) # Probabilidades de pertenencia

        setosa   versicolor    virginica
1 1.000000e+00 3.896358e-22 2.611168e-42
2 2.716128e-18 9.999988e-01 1.220169e-06
3 2.140232e-36 8.290895e-04 9.991709e-01

# Proyectar datos originales y nuevas flores en el espacio LDA
lda_pred <- predict(lda_model)
lda_nuevas <- predict(lda_model, nuevas_flores)

plot(lda_pred$x[,1], lda_pred$x[,2], col=as.numeric(iris$Species), pch=19,
     xlab="LD1", ylab="LD2", main="LDA de Iris con nuevas flores")
legend("topright", legend=levels(iris$Species), col=1:3, pch=19)

# Agregar nuevas flores como estrellas
points(lda_nuevas$x[,1], lda_nuevas$x[,2], pch=8, cex=2, lwd=2, col=as.numeric(pred_nuevas$class))
text(lda_nuevas$x[,1], lda_nuevas$x[,2], labels=paste("Nueva", 1:3), pos=3)

library(rgl)

Warning: package 'rgl' was built under R version 4.4.3

library(MASS)

lda_model <- lda(Species ~ ., data=iris)
lda_pred <- predict(lda_model)
X_lda <- lda_pred$x

# Usar LD1, LD2 y Petal.Length
open3d()

wgl 
  1

plot3d(X_lda[,1], X_lda[,2], iris$Petal.Length,
       col=as.numeric(iris$Species), size=8, type="s",
       xlab="LD1", ylab="LD2", zlab="Petal.Length",
       main="LDA 3D: LD1, LD2 y Petal.Length")

legend3d("topright", legend=levels(iris$Species), col=1:3, pch=16, cex=1.2)

# Instala plotly si no lo tienes
#install.packages("plotly")
library(plotly)

Cargando paquete requerido: ggplot2


Adjuntando el paquete: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:MASS':

    select

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

library(MASS)

# Ajustar LDA
lda_model <- lda(Species ~ ., data=iris)
lda_pred <- predict(lda_model)
X_lda <- lda_pred$x

# Proyectar nuevas flores
nuevas_flores <- data.frame(
  Sepal.Length = c(5.1, 6.0, 6.9),
  Sepal.Width  = c(3.5, 2.2, 3.1),
  Petal.Length = c(1.4, 4.0, 5.4),
  Petal.Width  = c(0.2, 1.0, 2.1)
)
lda_nuevas <- predict(lda_model, nuevas_flores)

# Data frame para plotly
df_lda <- data.frame(
  LD1 = X_lda[,1],
  LD2 = X_lda[,2],
  Petal.Length = iris$Petal.Length,
  Species = iris$Species
)

df_nuevas <- data.frame(
  LD1 = lda_nuevas$x[,1],
  LD2 = lda_nuevas$x[,2],
  Petal.Length = nuevas_flores$Petal.Length,
  Species = lda_nuevas$class,
  label = paste("Nueva", 1:3)
)

# Gráfico 3D interactivo
fig <- plot_ly(df_lda, x = ~LD1, y = ~LD2, z = ~Petal.Length, 
               color = ~Species, colors = c('red', 'green', 'blue'),
               type = 'scatter3d', mode = 'markers',
               marker = list(size = 5),
               text = ~Species, hoverinfo = 'text')

# Agregar nuevas flores como estrellas grandes
fig <- fig %>%
  add_trace(data = df_nuevas, x = ~LD1, y = ~LD2, z = ~Petal.Length,
            type = 'scatter3d', mode = 'markers+text',
            marker = list(symbol = 'star', size = 12, color = 'black'),
            text = ~label, textposition = 'top center',
            hoverinfo = 'text')

fig <- fig %>%
  layout(title = "LDA 3D de Iris (LD1, LD2, Petal.Length)",
         scene = list(
           xaxis = list(title = "LD1"),
           yaxis = list(title = "LD2"),
           zaxis = list(title = "Petal.Length")
         ))

fig