tarea 3 seminario

Analysis Factorial Exploratorio en R: ILPD (Conjunto de datos de pacientes con problemas hepáticos de la India)

Importar y preparar los datos

# Establecer mirror por defecto (evita errores CRAN)
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Cargar librerías necesarias
library(readxl)
library(psych)
Warning: package 'psych' was built under R version 4.3.3
library(GPArotation)
Warning: package 'GPArotation' was built under R version 4.3.3

Attaching package: 'GPArotation'
The following objects are masked from 'package:psych':

    equamax, varimin
library(corrplot)
corrplot 0.92 loaded
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# Importar datos desde Excel
datos<- read_excel("C:/Users/MINEDUCYT/Downloads/ilpd+indian+liver+patient+dataset/Indian Liver Patient Dataset (ILPD) india.xlsx")
View(datos)
# verificar valores perdidos sumando cauntos hay
sum(is.na(datos))
[1] 4
#otra forma es Ver si hay algún valor perdido en toda la base:
any(is.na(datos))
[1] TRUE
# Seleccionar las variables independientes
datos_factoriales <- datos %>% select(-Genero,-borrar,Tp)


# Conversión segura: los factores/character se convierten a números por sus niveles
datos_factoriales <- datos_factoriales %>%
  mutate(across(everything(), ~ as.numeric(as.character(.))))


datos_factoriales. <- na.omit(datos_factoriales)
sum(is.na(datos_factoriales.))  # debe dar 0
[1] 0
# Escalar los datos
datos_esc <- scale(datos_factoriales.)

Verificar supuestos: KMO y test de Bartlett

# Matriz de correlación
cor_matrix <- cor(datos_esc)

# Visualizar matriz de correlación
corrplot(cor_matrix, method = "color", tl.cex = 0.7)

# Prueba de adecuación KMO
KMO(cor_matrix)
Kaiser-Meyer-Olkin factor adequacy
Call: KMO(r = cor_matrix)
Overall MSA =  0.49
MSA for each item = 
        Edad Tuberculosis  Basededatos       Alkfos     Sargento         Sgot 
        0.88         0.59         0.56         0.88         0.53         0.55 
          Tp         Alba     Relacion 
        0.34         0.45         0.38 
# Prueba de esfericidad de Bartlett
cortest.bartlett(cor_matrix, n = nrow(datos_esc))
$chisq
[1] 2953.766

$p.value
[1] 0

$df
[1] 36
#p-value = 0:
#Técnicamente esto significa p < 0.0001 o un valor extremadamente pequeño.
#Esto implica que rechazas la hipótesis nula de que las variables no están correlacionadas.


#La prueba de esfericidad de Bartlett es significativa ->  los datos presentan correlaciones suficientes entre variables como para aplicar un Análisis Factorial Exploratorio (AFE)
# Instala si no la tienes
install.packages("ggcorrplot")
Installing package into 'C:/Users/MINEDUCYT/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
package 'ggcorrplot' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\MINEDUCYT\AppData\Local\Temp\RtmpInheve\downloaded_packages
library(ggcorrplot)
Warning: package 'ggcorrplot' was built under R version 4.3.3
Loading required package: ggplot2

Attaching package: 'ggplot2'
The following objects are masked from 'package:psych':

    %+%, alpha
# Matriz de correlaciones
cor_matrix <- cor(datos_esc)

# Gráfico con ggcorrplot
ggcorrplot(cor_matrix,
           hc.order = TRUE,          # Agrupa por jerarquía
           type = "lower",           # Solo triángulo inferior
           lab = TRUE,               # Etiquetas de correlación
           lab_size = 3,
           colors = c("red", "white", "blue"),
           title = "Matriz de correlaciones (ggcorrplot)",
           ggtheme = ggplot2::theme_minimal())

# Seleccionar las variables independientes y las que el KMO es pequeño
datos_factoriales2 <- datos %>% select(-Genero,-borrar,-Tp,-Relacion)


# Conversión segura: los factores/character se convierten a números por sus niveles
datos_factoriales2 <- datos_factoriales2 %>%
  mutate(across(everything(), ~ as.numeric(as.character(.))))


datos_factoriales2. <- na.omit(datos_factoriales2)
sum(is.na(datos_factoriales2.))  # debe dar 0
[1] 0
# Escalar los datos
datos_esc2 <- scale(datos_factoriales2.)

# Matriz de correlación
cor_matrix2 <- cor(datos_esc2)

# Prueba de adecuación KMO
KMO(cor_matrix2)
Kaiser-Meyer-Olkin factor adequacy
Call: KMO(r = cor_matrix2)
Overall MSA =  0.58
MSA for each item = 
        Edad Tuberculosis  Basededatos       Alkfos     Sargento         Sgot 
        0.50         0.57         0.57         0.86         0.55         0.56 
        Alba 
        0.71 
# Prueba de esfericidad de Bartlett
cortest.bartlett(cor_matrix2, n = nrow(datos_esc2))
$chisq
[1] 1587.782

$p.value
[1] 4.940656e-324

$df
[1] 21

Aplicar análisis factorial

# Análisis factorial sin rotación, 3 factores iniciales
factores_sin_rot <- fa(datos_esc2, nfactors = 3, rotate = "none", fm = "ml")
print(factores_sin_rot)
Factor Analysis using method =  ml
Call: fa(r = datos_esc2, nfactors = 3, rotate = "none", fm = "ml")
Standardized loadings (pattern matrix) based upon correlation matrix
               ML1   ML2   ML3   h2    u2 com
Edad          0.01 -0.05  0.51 0.27 0.733 1.0
Tuberculosis  0.89 -0.08  0.01 0.79 0.209 1.0
Basededatos   0.98 -0.09 -0.01 0.97 0.032 1.0
Alkfos        0.25  0.08  0.19 0.10 0.896 2.1
Sargento      0.31  0.80 -0.08 0.75 0.255 1.3
Sgot          0.34  0.86  0.04 0.86 0.143 1.3
Alba         -0.24  0.02 -0.51 0.32 0.681 1.4

                       ML1  ML2  ML3
SS loadings           2.08 1.40 0.57
Proportion Var        0.30 0.20 0.08
Cumulative Var        0.30 0.50 0.58
Proportion Explained  0.51 0.35 0.14
Cumulative Proportion 0.51 0.86 1.00

Mean item complexity =  1.3
Test of the hypothesis that 3 factors are sufficient.

df null model =  21  with the objective function =  2.74 with Chi Square =  1587.78
df of  the model are 3  and the objective function was  0 

The root mean square of the residuals (RMSR) is  0.01 
The df corrected root mean square of the residuals is  0.01 

The harmonic n.obs is  583 with the empirical chi square  0.67  with prob <  0.88 
The total n.obs was  583  with Likelihood Chi Square =  1.27  with prob <  0.74 

Tucker Lewis Index of factoring reliability =  1.008
RMSEA index =  0  and the 90 % confidence intervals are  0 0.049
BIC =  -17.83
Fit based upon off diagonal values = 1
Measures of factor score adequacy             
                                                   ML1  ML2   ML3
Correlation of (regression) scores with factors   0.99 0.94  0.67
Multiple R square of scores with factors          0.97 0.89  0.45
Minimum correlation of possible factor scores     0.94 0.78 -0.10

Determinar número óptimo de factores (Scree Plot y eigenvalores)

# Scree plot y eigenvalores
fa.parallel(datos_esc2, fa = "fa", fm = "ml", n.iter = 100)

Parallel analysis suggests that the number of factors =  3  and the number of components =  NA 
#Este gráfico sirve para decidir cuántos factores conservar,se basa en el criterio de eigenvalores > 1 y el punto de quiebre en la curva.
library(ggplot2)

# Calcular valores propios (eigenvalores)
eigen_vals <- eigen(cor(datos_esc2))$values

# Crear dataframe
df_eigen <- data.frame(
  Factor = 1:length(eigen_vals),
  Eigenvalue = eigen_vals
)

# Gráfico tipo scree plot
ggplot(df_eigen, aes(x = Factor, y = Eigenvalue)) +
  geom_line(color = "steelblue") +
  geom_point(size = 2, color = "red") +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray") +
  ggtitle("Scree Plot de Eigenvalores") +
  xlab("Número de Factor") + ylab("Eigenvalor") +
  theme_minimal()

Interpretar matriz de cargas factoriales

# Extraer factores con la cantidad óptima en este caso seria 3 factores
modelo_factorial <- fa(datos_esc2, nfactors = 3, rotate = "none", fm = "ml")
print(modelo_factorial$loadings)

Loadings:
             ML1    ML2    ML3   
Edad                        0.514
Tuberculosis  0.886              
Basededatos   0.980              
Alkfos        0.248         0.189
Sargento      0.312  0.802       
Sgot          0.342  0.859       
Alba         -0.238        -0.512

                 ML1   ML2   ML3
SS loadings    2.077 1.405 0.569
Proportion Var 0.297 0.201 0.081
Cumulative Var 0.297 0.497 0.579

Rotación de factores (Varimax)

# Aplicar rotación Varimax
modelo_rotado <- fa(datos_esc2, nfactors = 3 , rotate = "varimax", fm = "ml")
print(modelo_rotado$loadings)

Loadings:
             ML1    ML2    ML3   
Edad                        0.512
Tuberculosis  0.874  0.124  0.106
Basededatos   0.970  0.132       
Alkfos        0.201  0.136  0.213
Sargento      0.130  0.852       
Sgot          0.134  0.913       
Alba         -0.178        -0.535

                 ML1   ML2   ML3
SS loadings    1.814 1.615 0.621
Proportion Var 0.259 0.231 0.089
Cumulative Var 0.259 0.490 0.579
# Cargas con más claridad para interpretación
print(modelo_rotado)
Factor Analysis using method =  ml
Call: fa(r = datos_esc2, nfactors = 3, rotate = "varimax", fm = "ml")
Standardized loadings (pattern matrix) based upon correlation matrix
               ML1   ML2   ML3   h2    u2 com
Edad         -0.04 -0.06  0.51 0.27 0.733 1.0
Tuberculosis  0.87  0.12  0.11 0.79 0.209 1.1
Basededatos   0.97  0.13  0.10 0.97 0.032 1.1
Alkfos        0.20  0.14  0.21 0.10 0.896 2.7
Sargento      0.13  0.85 -0.05 0.75 0.255 1.1
Sgot          0.13  0.91  0.07 0.86 0.143 1.1
Alba         -0.18 -0.03 -0.53 0.32 0.681 1.2

                       ML1  ML2  ML3
SS loadings           1.81 1.62 0.62
Proportion Var        0.26 0.23 0.09
Cumulative Var        0.26 0.49 0.58
Proportion Explained  0.45 0.40 0.15
Cumulative Proportion 0.45 0.85 1.00

Mean item complexity =  1.3
Test of the hypothesis that 3 factors are sufficient.

df null model =  21  with the objective function =  2.74 with Chi Square =  1587.78
df of  the model are 3  and the objective function was  0 

The root mean square of the residuals (RMSR) is  0.01 
The df corrected root mean square of the residuals is  0.01 

The harmonic n.obs is  583 with the empirical chi square  0.67  with prob <  0.88 
The total n.obs was  583  with Likelihood Chi Square =  1.27  with prob <  0.74 

Tucker Lewis Index of factoring reliability =  1.008
RMSEA index =  0  and the 90 % confidence intervals are  0 0.049
BIC =  -17.83
Fit based upon off diagonal values = 1
Measures of factor score adequacy             
                                                   ML1  ML2   ML3
Correlation of (regression) scores with factors   0.98 0.94  0.68
Multiple R square of scores with factors          0.96 0.89  0.46
Minimum correlation of possible factor scores     0.92 0.79 -0.09

Gráficos de factores

# Mapa de factores con rotación
fa.diagram(modelo_rotado)

# Otra forma: gráfico de carga factorial
plot(modelo_rotado$loadings, main = "Cargas factoriales (rotadas)",
     xlab = "Factor 1", ylab = "Factor 2")
abline(h = 0, v = 0, col = "gray")

# Cargas factoriales como matriz
load_matrix <- modelo_rotado$loadings[,1:3]  # Ajustado número de factores

# Convertir a data frame
load_df <- as.data.frame(unclass(load_matrix))

# Visualizar con heatmap
heatmap(as.matrix(load_df),
        Colv = NA, Rowv = NA,
        col = colorRampPalette(c("white", "red"))(100),
        scale = "none",
        main = "Mapa de calor de cargas factoriales")

install.packages("plotly")
Installing package into 'C:/Users/MINEDUCYT/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
package 'plotly' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\MINEDUCYT\AppData\Local\Temp\RtmpInheve\downloaded_packages
library(plotly)
Warning: package 'plotly' was built under R version 4.3.3

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
df_load <- as.data.frame(load_matrix)
df_load$Variable <- rownames(df_load)

# Gráfico 3D interactivo
plot_ly(df_load, x = ~ML1, y = ~ML2, z = ~ML3,
        text = ~Variable,
        type = 'scatter3d', mode = 'text+markers',
        marker = list(size = 5),
        textposition = "top center") %>%
  layout(title = 'Gráfico 3D de Cargas Factoriales')