options(repos = c(CRAN = "https://cloud.r-project.org/"))
# Paquetes necesarios
install.packages("readxl") # Para leer archivos Excel
## Installing package into 'C:/Users/john/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'readxl'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problema al copiar
## C:\Users\john\AppData\Local\R\win-library\4.4\00LOCK\readxl\libs\x64\readxl.dll
## a C:\Users\john\AppData\Local\R\win-library\4.4\readxl\libs\x64\readxl.dll:
## Permission denied
## Warning: restored 'readxl'
##
## The downloaded binary packages are in
## C:\Users\john\AppData\Local\Temp\RtmpO8rfrL\downloaded_packages
#install.packages("dplyr") # Para manipulación de datos
#install.packages("FactoMineR") # Para realizar FAMD
#install.packages("tibble") # Para trabajar con tibbles
#install.packages("ggplot2") # Para gráficos
#install.packages("factoextra") # Para visualización de resultados de FAMD
#install.packages("gridExtra") # Para combinar múltiples gráficos
#install.packages("MASS") # Para métodos estadísticos
#install.packages("pcaPP") # Para ACP robusto
install.packages("kernlab") # Para ACP con kernels
## Installing package into 'C:/Users/john/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'kernlab' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\john\AppData\Local\Temp\RtmpO8rfrL\downloaded_packages
install.packages("skewsamp") # Para cálculos de sesgo
## Installing package into 'C:/Users/john/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'skewsamp' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\john\AppData\Local\Temp\RtmpO8rfrL\downloaded_packages
install.packages("mice") # Para imputación múltiple
## Installing package into 'C:/Users/john/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'mice' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\john\AppData\Local\Temp\RtmpO8rfrL\downloaded_packages
# Cargamos las librerías necesarias
library(readxl) # Para leer archivos Excel
library(dplyr) # Para manipulación de datos
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(FactoMineR) # Para realizar ACP
library(tibble) # Para trabajar con tibbles
library(ggplot2) # Para gráficos
library(factoextra) # Para visualización de resultados de ACP
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(gridExtra) # Para combinar múltiples gráficos
##
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(MASS) # Para métodos estadísticos
##
## Adjuntando el paquete: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(pcaPP) # Para ACP robusto
library(kernlab) # Para ACP con kernels
##
## Adjuntando el paquete: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(skewsamp) # Para cálculos de sesgo
library(mice) # Para imputación múltiple
##
## Adjuntando el paquete: 'mice'
## The following object is masked from 'package:kernlab':
##
## convergence
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(readxl) # Asegúrate de que el paquete está cargado
library(dplyr)
wdi = read_xlsx("Agua rio Bog.xlsx") %>%
na.omit() #%>% # Eliminamos filas con valores faltantes
#subset(select = -Fecha) # Eliminamos la columna "Fecha"
# Creación de la matriz estandarizada y ponderada
Z = wdi %>%
dplyr::select(where(is.numeric)) %>% # Seleccionamos solo las columnas numéricas
mutate(across(everything(), # Estandarizamos las variables
function(x) (x - mean(x)) / (sd(x) * sqrt(length(x) - 1)))) %>%
as.matrix() # Convertimos a matriz
# Descomposición en valores singulares (SVD)
dvs = svd(Z) # Calculamos la descomposición SVD
D = diag(dvs$d) # Matriz diagonal con valores singulares
lamb = dvs$d^2 # Valores propios (autovalores)
P = dvs$u # Vectores propios de Z %*% t(Z)
Q = dvs$v # Vectores propios de t(Z) %*% Z
# Componentes principales
F = data.frame(Z %*% Q) # Calculamos las componentes principales
colnames(F) = paste("CP", 1:ncol(F), sep = "") # Nombramos las columnas
head(F) # Mostramos las primeras filas
## CP1 CP2 CP3 CP4 CP5 CP6
## 1 -0.04492023 -0.290077335 -0.006784473 -0.26719070 0.49478029 -0.18689664
## 2 -0.25433621 -0.269206923 0.109988921 -0.05304857 0.18169397 -0.09921270
## 3 1.03467544 -0.406577752 -0.047416482 0.10023863 -0.23590200 -0.30005561
## 4 0.24273039 -0.093091213 0.152164375 0.13896174 0.20272064 0.30416217
## 5 0.29017636 0.051138090 0.064587176 0.01421614 0.02658339 0.01137724
## 6 0.68133738 0.008174033 -0.156407587 0.26153334 -0.02827428 0.12272733
## CP7 CP8 CP9 CP10
## 1 -0.04448099 -0.135684986 -0.03791545 -4.058273e-04
## 2 0.13482910 -0.052038933 0.09637545 4.987719e-04
## 3 -0.13358841 0.004750649 -0.09130271 4.962123e-05
## 4 -0.23143496 0.088386973 0.08389008 9.546353e-05
## 5 0.35051761 0.062686522 -0.01933843 -1.986313e-05
## 6 0.04448455 -0.188123222 0.01634191 1.481159e-05
# Gráfico del primer plano principal (CP1 vs CP2)
ggplot(F, aes(x = CP1, y = CP2, label = rownames(F))) +
geom_point() +
geom_text(vjust = -1, size = 2) +
geom_vline(xintercept = 0, lty = 2) +
geom_hline(yintercept = 0, lty = 2)

# Correlaciones entre componentes y variables
Rcv = cor(Z, F) # Correlaciones entre variables y componentes
cos2v = Rcv^2 # Coseno al cuadrado de las variables
cos2i = t(apply(F, 1, function(x) x^2 / sum(x^2))) # Coseno al cuadrado de los individuos
# Contribuciones de variables e individuos
ctrv = apply(Rcv, 2, function(x) x^2 / sum(x^2)) # Contribuciones de las variables
ctri = apply(F, 2, function(x) x^2 / sum(x^2)) # Contribuciones de los individuos
# Inercia de cada componente
inercia = lamb / sum(lamb) * 100 # Porcentaje de varianza explicada por cada componente
cumsum(inercia) # Inercia acumulada
## [1] 40.95547 62.83471 73.12380 82.50980 88.16548 93.65600 96.72506
## [8] 99.09830 99.99999 100.00000
# Realizamos el ACP usando la función PCA del paquete FactoMineR
acp = PCA(wdi, scale.unit = TRUE, ncp = 5, # Realizamos ACP con 5 componentes principales
quali.sup = 1:2, graph = FALSE) # Las primeras dos columnas son cualitativas
# Valores propios y porcentajes de inercia
colnames(acp$eig) = c("v. propio", "inercia", "iner. acum.")
acp$eig
## v. propio inercia iner. acum.
## comp 1 4.095547e+00 4.095547e+01 40.95547
## comp 2 2.187925e+00 2.187925e+01 62.83471
## comp 3 1.028908e+00 1.028908e+01 73.12380
## comp 4 9.386004e-01 9.386004e+00 82.50980
## comp 5 5.655682e-01 5.655682e+00 88.16548
## comp 6 5.490521e-01 5.490521e+00 93.65600
## comp 7 3.069052e-01 3.069052e+00 96.72506
## comp 8 2.373249e-01 2.373249e+00 99.09830
## comp 9 9.016884e-02 9.016884e-01 99.99999
## comp 10 7.287223e-07 7.287223e-06 100.00000
# Histograma de valores propios (Scree Plot)
fviz_screeplot(acp) +
xlab("Componentes") +
ylab("% de varianza explicada")

# Vectores propios (Q)
rownames(acp$svd$V) = rownames(acp$var$coord)
colnames(acp$svd$V) = paste("CP", 1:ncol(acp$svd$V), sep = "")
acp$svd$V
## CP1 CP2 CP3 CP4 CP5
## D.T. 0.45372283 0.08675132 0.122360379 0.26052805 0.019239790
## D.C. 0.43230389 -0.12197427 0.115259836 0.29175423 -0.086082320
## D.M. 0.29313203 0.45417557 0.081601752 0.08214713 0.209960923
## Cl 0.39631135 -0.29941414 0.021602148 0.19998337 0.005127288
## O.D. 0.04174326 0.58803059 0.001591291 -0.26780820 0.425903167
## DQO -0.35576199 -0.16868420 -0.047719971 0.18473531 0.492413796
## DBO -0.31796590 0.05096443 -0.447945099 0.52600368 -0.100840959
## Temp. -0.19473588 -0.27106383 0.685021400 0.18270048 0.412106470
## pH -0.27948712 0.23007687 0.527053327 -0.05975373 -0.584391732
## Cond. 0.13716179 -0.42240920 -0.121010504 -0.61707770 0.052255092
# Correlaciones entre variables y componentes
acp$var$cor
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## D.T. 0.91821961 0.12831938 0.124116403 0.25240320 0.01446914
## D.C. 0.87487312 -0.18041989 0.116913959 0.28265556 -0.06473758
## D.M. 0.59322467 0.67179992 0.082772839 0.07958528 0.15789958
## Cl 0.80203337 -0.44288247 0.021912166 0.19374668 0.00385594
## O.D. 0.08447775 0.86979339 0.001614128 -0.25945632 0.32029736
## DQO -0.71997178 -0.24951150 -0.048404812 0.17897414 0.37031619
## DBO -0.64348211 0.07538472 -0.454373671 0.50959969 -0.07583671
## Temp. -0.39409589 -0.40094772 0.694852314 0.17700277 0.30992166
## pH -0.56561086 0.34032131 0.534617202 -0.05789024 -0.43948753
## Cond. 0.27758059 -0.62481227 -0.122747156 -0.59783347 0.03929806
# Coseno al cuadrado entre variables y componentes
acp$var$cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## D.T. 0.843127252 0.016465864 1.540488e-02 0.063707377 2.093561e-04
## D.C. 0.765402984 0.032551337 1.366887e-02 0.079894168 4.190954e-03
## D.M. 0.351915511 0.451315139 6.851343e-03 0.006333817 2.493228e-02
## Cl 0.643257532 0.196144884 4.801430e-04 0.037537775 1.486827e-05
## O.D. 0.007136491 0.756540533 2.605410e-06 0.067317581 1.025904e-01
## DQO 0.518359369 0.062255991 2.343026e-03 0.032031742 1.371341e-01
## DBO 0.414069227 0.005682856 2.064554e-01 0.259691847 5.751206e-03
## Temp. 0.155311567 0.160759070 4.828197e-01 0.031329980 9.605143e-02
## pH 0.319915643 0.115818594 2.858156e-01 0.003351280 1.931493e-01
## Cond. 0.077050983 0.390390371 1.506686e-02 0.357404860 1.544337e-03
# Representación gráfica de las variables en componentes 1 y 2
fviz_pca_var(acp, axes = c(1,2), col.var = "cos2",
title = "Variables", repel = TRUE) +
scale_color_gradient2(low="red", mid="blue", high="black", midpoint=.3) #library(factoextra) - fviz_pca_var

# Componentes 1 y 3, 2 y 3
fviz_pca_var(acp, axes = c(1,3), col.var = "cos2",
title = "Variables", repel = TRUE) +
scale_color_gradient2(low="red", mid="blue", high="black", midpoint=.3)

fviz_pca_var(acp, axes = c(2,3), col.var = "cos2",
title = "Variables", repel = TRUE) +
scale_color_gradient2(low="red", mid="blue", high="black", midpoint=.3)

# Contribuciones de las variables
acp$var$contrib
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## D.T. 20.586440 0.7525791 1.497206e+00 6.7874864 0.037016951
## D.C. 18.688665 1.4877723 1.328483e+00 8.5120532 0.741016589
## D.M. 8.592639 20.6275450 6.658846e-01 0.6748151 4.408358907
## Cl 15.706268 8.9648830 4.666528e-02 3.9993350 0.002628909
## O.D. 0.174250 34.5779978 2.532208e-04 7.1721234 18.139350767
## DQO 12.656659 2.8454358 2.277196e-01 3.4127133 24.247134648
## DBO 10.110231 0.2597373 2.006548e+01 27.6679873 1.016889910
## Temp. 3.792206 7.3475598 4.692543e+01 3.3379465 16.983174243
## pH 7.811305 5.2935367 2.777852e+01 0.3570508 34.151369612
## Cond. 1.881336 17.8429533 1.464354e+00 38.0784889 0.273059465
# Gráficos de contribuciones de las variables
c1 = fviz_contrib(acp, choice = "var", axes = 1, title = "CP1") +
ylab("Contribuciones (%)")
c2 = fviz_contrib(acp, choice = "var", axes = 2, title = "CP2") +
ylab("Contribuciones (%)")
c3 = fviz_contrib(acp, choice = "var", axes = 3, title = "CP3") +
ylab("Contribuciones (%)")
grid.arrange(c1, c2, c3, ncol=2, nrow = 2)

# library(gridExtra) - fviz_contrib
# Coordenadas de los individuos (componentes principales)
head(acp$ind$coord, 10) # Primeras filas de coordenadas de los individuos
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## 1 -0.1796809 1.16030934 -0.02713789 -1.06876278 -1.9791212
## 2 -1.0173448 1.07682769 0.43995569 -0.21219429 -0.7267759
## 3 4.1387017 1.62631101 -0.18966593 0.40095453 0.9436080
## 4 0.9709216 0.37236485 0.60865750 0.55584694 -0.8108826
## 5 1.1607054 -0.20455236 0.25834870 0.05686454 -0.1063336
## 6 2.7253495 -0.03269613 -0.62563035 1.04613336 0.1130971
## 7 2.2278396 -0.96151013 0.93232866 0.44910249 0.3481944
## 8 2.1955484 -1.46882681 -0.49186611 -0.30248628 -0.2707238
## 9 -2.0635799 0.44730865 -2.92349116 1.47622201 -0.4025234
## 10 -2.3195227 3.32172249 0.69058844 -0.15674048 0.7203402
# Representación de los individuos en componentes 1 y 2
fviz_pca_ind(acp, axes = c(1,2), geom = c("point","text"),
habillage = 2, addEllipses = TRUE,
title = "Individuos") +
scale_color_brewer(palette = "Set1")

# Componentes 1 y 3, 2 y 3
fviz_pca_ind(acp, axes = c(1,3), geom = c("point","text"),
habillage = 2, addEllipses = TRUE,
title = "Individuos") +
scale_color_brewer(palette = "Set1")

fviz_pca_ind(acp, axes = c(2,3), geom = c("point","text"),
habillage = 2, addEllipses = TRUE,
title = "Individuos") +
scale_color_brewer(palette = "Set1")

# Representación simultánea de variables e individuos
fviz_pca_biplot(acp, axes = c(1,2), geom = c("point","text"),
col.var = "#3c3c3c",
habillage = 2, addEllipses = TRUE,
title = "Variables e individuos") +
scale_color_brewer(palette = "Set1")

# Componentes 1 y 3, 2 y 3
fviz_pca_biplot(acp, axes = c(1,3), geom = c("point","text"),
col.var = "#3c3c3c",
habillage = 2, addEllipses = TRUE,
title = "Variables e individuos") +
scale_color_brewer(palette = "Set1")

fviz_pca_biplot(acp, axes = c(2,3), geom = c("point","text"),
col.var = "#3c3c3c",
habillage = 2, addEllipses = TRUE,
title = "Variables e individuos") +
scale_color_brewer(palette = "Set1")

# Selección del número óptimo de componentes utilizando el criterio de suma de errores cuadráticos
k.opt = function(acp) {
# Obtener los valores propios
s = get_eigenvalue(acp)
# Calcular la varianza acumulada
varianza_acumulada = cumsum(s$eigenvalue) / sum(s$eigenvalue)
# Encontrar el primer número de componentes que explica al menos el 70% de la varianza
k = which(varianza_acumulada >= 0.70)[1]
return(k)
}