options(repos = c(CRAN = "https://cloud.r-project.org/"))
# Paquetes necesarios
install.packages("readxl")       # Para leer archivos Excel
## Installing package into 'C:/Users/john/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'readxl'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problema al copiar
## C:\Users\john\AppData\Local\R\win-library\4.4\00LOCK\readxl\libs\x64\readxl.dll
## a C:\Users\john\AppData\Local\R\win-library\4.4\readxl\libs\x64\readxl.dll:
## Permission denied
## Warning: restored 'readxl'
## 
## The downloaded binary packages are in
##  C:\Users\john\AppData\Local\Temp\RtmpO8rfrL\downloaded_packages
#install.packages("dplyr")        # Para manipulación de datos
#install.packages("FactoMineR")   # Para realizar FAMD
#install.packages("tibble")       # Para trabajar con tibbles
#install.packages("ggplot2")      # Para gráficos
#install.packages("factoextra")   # Para visualización de resultados de FAMD
#install.packages("gridExtra")    # Para combinar múltiples gráficos
#install.packages("MASS")         # Para métodos estadísticos
#install.packages("pcaPP")        # Para ACP robusto
install.packages("kernlab")      # Para ACP con kernels
## Installing package into 'C:/Users/john/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'kernlab' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\john\AppData\Local\Temp\RtmpO8rfrL\downloaded_packages
install.packages("skewsamp")     # Para cálculos de sesgo
## Installing package into 'C:/Users/john/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'skewsamp' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\john\AppData\Local\Temp\RtmpO8rfrL\downloaded_packages
install.packages("mice")         # Para imputación múltiple
## Installing package into 'C:/Users/john/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'mice' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\john\AppData\Local\Temp\RtmpO8rfrL\downloaded_packages
# Cargamos las librerías necesarias
library(readxl)       # Para leer archivos Excel
library(dplyr)        # Para manipulación de datos
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(FactoMineR)   # Para realizar ACP
library(tibble)       # Para trabajar con tibbles
library(ggplot2)      # Para gráficos
library(factoextra)   # Para visualización de resultados de ACP
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(gridExtra)    # Para combinar múltiples gráficos
## 
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(MASS)         # Para métodos estadísticos
## 
## Adjuntando el paquete: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(pcaPP)        # Para ACP robusto
library(kernlab)      # Para ACP con kernels
## 
## Adjuntando el paquete: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(skewsamp)     # Para cálculos de sesgo
library(mice)         # Para imputación múltiple
## 
## Adjuntando el paquete: 'mice'
## The following object is masked from 'package:kernlab':
## 
##     convergence
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(readxl)  # Asegúrate de que el paquete está cargado
library(dplyr) 

wdi = read_xlsx("Agua rio Bog.xlsx") %>% 
  na.omit() #%>%                               # Eliminamos filas con valores faltantes
  #subset(select = -Fecha)                     # Eliminamos la columna "Fecha"
# Creación de la matriz estandarizada y ponderada
Z = wdi %>%
  dplyr::select(where(is.numeric)) %>%               # Seleccionamos solo las columnas numéricas
  mutate(across(everything(),                 # Estandarizamos las variables
                function(x) (x - mean(x)) / (sd(x) * sqrt(length(x) - 1)))) %>%
  as.matrix()                                 # Convertimos a matriz

# Descomposición en valores singulares (SVD)
dvs = svd(Z)                                  # Calculamos la descomposición SVD
D = diag(dvs$d)                              # Matriz diagonal con valores singulares
lamb = dvs$d^2                               # Valores propios (autovalores)
P = dvs$u                                    # Vectores propios de Z %*% t(Z)
Q = dvs$v                                    # Vectores propios de t(Z) %*% Z

# Componentes principales
F = data.frame(Z %*% Q)                      # Calculamos las componentes principales
colnames(F) = paste("CP", 1:ncol(F), sep = "")  # Nombramos las columnas
head(F)                                       # Mostramos las primeras filas
##           CP1          CP2          CP3         CP4         CP5         CP6
## 1 -0.04492023 -0.290077335 -0.006784473 -0.26719070  0.49478029 -0.18689664
## 2 -0.25433621 -0.269206923  0.109988921 -0.05304857  0.18169397 -0.09921270
## 3  1.03467544 -0.406577752 -0.047416482  0.10023863 -0.23590200 -0.30005561
## 4  0.24273039 -0.093091213  0.152164375  0.13896174  0.20272064  0.30416217
## 5  0.29017636  0.051138090  0.064587176  0.01421614  0.02658339  0.01137724
## 6  0.68133738  0.008174033 -0.156407587  0.26153334 -0.02827428  0.12272733
##           CP7          CP8         CP9          CP10
## 1 -0.04448099 -0.135684986 -0.03791545 -4.058273e-04
## 2  0.13482910 -0.052038933  0.09637545  4.987719e-04
## 3 -0.13358841  0.004750649 -0.09130271  4.962123e-05
## 4 -0.23143496  0.088386973  0.08389008  9.546353e-05
## 5  0.35051761  0.062686522 -0.01933843 -1.986313e-05
## 6  0.04448455 -0.188123222  0.01634191  1.481159e-05
# Gráfico del primer plano principal (CP1 vs CP2)
ggplot(F, aes(x = CP1, y = CP2, label = rownames(F))) +
  geom_point() +
  geom_text(vjust = -1, size = 2) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_hline(yintercept = 0, lty = 2)

# Correlaciones entre componentes y variables
Rcv = cor(Z, F)                              # Correlaciones entre variables y componentes
cos2v = Rcv^2                               # Coseno al cuadrado de las variables
cos2i = t(apply(F, 1, function(x) x^2 / sum(x^2))) # Coseno al cuadrado de los individuos
# Contribuciones de variables e individuos
ctrv = apply(Rcv, 2, function(x) x^2 / sum(x^2))  # Contribuciones de las variables
ctri = apply(F, 2, function(x) x^2 / sum(x^2))    # Contribuciones de los individuos
# Inercia de cada componente
inercia = lamb / sum(lamb) * 100               # Porcentaje de varianza explicada por cada componente
cumsum(inercia)                               # Inercia acumulada
##  [1]  40.95547  62.83471  73.12380  82.50980  88.16548  93.65600  96.72506
##  [8]  99.09830  99.99999 100.00000
# Realizamos el ACP usando la función PCA del paquete FactoMineR
acp = PCA(wdi, scale.unit = TRUE, ncp = 5,    # Realizamos ACP con 5 componentes principales
          quali.sup = 1:2, graph = FALSE)    # Las primeras dos columnas son cualitativas

# Valores propios y porcentajes de inercia
colnames(acp$eig) = c("v. propio", "inercia", "iner. acum.")
acp$eig
##            v. propio      inercia iner. acum.
## comp 1  4.095547e+00 4.095547e+01    40.95547
## comp 2  2.187925e+00 2.187925e+01    62.83471
## comp 3  1.028908e+00 1.028908e+01    73.12380
## comp 4  9.386004e-01 9.386004e+00    82.50980
## comp 5  5.655682e-01 5.655682e+00    88.16548
## comp 6  5.490521e-01 5.490521e+00    93.65600
## comp 7  3.069052e-01 3.069052e+00    96.72506
## comp 8  2.373249e-01 2.373249e+00    99.09830
## comp 9  9.016884e-02 9.016884e-01    99.99999
## comp 10 7.287223e-07 7.287223e-06   100.00000
# Histograma de valores propios (Scree Plot)
fviz_screeplot(acp) + 
  xlab("Componentes") + 
  ylab("% de varianza explicada")

# Vectores propios (Q)
rownames(acp$svd$V) = rownames(acp$var$coord)
colnames(acp$svd$V) = paste("CP", 1:ncol(acp$svd$V), sep = "")
acp$svd$V
##               CP1         CP2          CP3         CP4          CP5
## D.T.   0.45372283  0.08675132  0.122360379  0.26052805  0.019239790
## D.C.   0.43230389 -0.12197427  0.115259836  0.29175423 -0.086082320
## D.M.   0.29313203  0.45417557  0.081601752  0.08214713  0.209960923
## Cl     0.39631135 -0.29941414  0.021602148  0.19998337  0.005127288
## O.D.   0.04174326  0.58803059  0.001591291 -0.26780820  0.425903167
## DQO   -0.35576199 -0.16868420 -0.047719971  0.18473531  0.492413796
## DBO   -0.31796590  0.05096443 -0.447945099  0.52600368 -0.100840959
## Temp. -0.19473588 -0.27106383  0.685021400  0.18270048  0.412106470
## pH    -0.27948712  0.23007687  0.527053327 -0.05975373 -0.584391732
## Cond.  0.13716179 -0.42240920 -0.121010504 -0.61707770  0.052255092
# Correlaciones entre variables y componentes
acp$var$cor
##             Dim.1       Dim.2        Dim.3       Dim.4       Dim.5
## D.T.   0.91821961  0.12831938  0.124116403  0.25240320  0.01446914
## D.C.   0.87487312 -0.18041989  0.116913959  0.28265556 -0.06473758
## D.M.   0.59322467  0.67179992  0.082772839  0.07958528  0.15789958
## Cl     0.80203337 -0.44288247  0.021912166  0.19374668  0.00385594
## O.D.   0.08447775  0.86979339  0.001614128 -0.25945632  0.32029736
## DQO   -0.71997178 -0.24951150 -0.048404812  0.17897414  0.37031619
## DBO   -0.64348211  0.07538472 -0.454373671  0.50959969 -0.07583671
## Temp. -0.39409589 -0.40094772  0.694852314  0.17700277  0.30992166
## pH    -0.56561086  0.34032131  0.534617202 -0.05789024 -0.43948753
## Cond.  0.27758059 -0.62481227 -0.122747156 -0.59783347  0.03929806
# Coseno al cuadrado entre variables y componentes
acp$var$cos2
##             Dim.1       Dim.2        Dim.3       Dim.4        Dim.5
## D.T.  0.843127252 0.016465864 1.540488e-02 0.063707377 2.093561e-04
## D.C.  0.765402984 0.032551337 1.366887e-02 0.079894168 4.190954e-03
## D.M.  0.351915511 0.451315139 6.851343e-03 0.006333817 2.493228e-02
## Cl    0.643257532 0.196144884 4.801430e-04 0.037537775 1.486827e-05
## O.D.  0.007136491 0.756540533 2.605410e-06 0.067317581 1.025904e-01
## DQO   0.518359369 0.062255991 2.343026e-03 0.032031742 1.371341e-01
## DBO   0.414069227 0.005682856 2.064554e-01 0.259691847 5.751206e-03
## Temp. 0.155311567 0.160759070 4.828197e-01 0.031329980 9.605143e-02
## pH    0.319915643 0.115818594 2.858156e-01 0.003351280 1.931493e-01
## Cond. 0.077050983 0.390390371 1.506686e-02 0.357404860 1.544337e-03
# Representación gráfica de las variables en componentes 1 y 2
fviz_pca_var(acp, axes = c(1,2), col.var = "cos2", 
             title = "Variables", repel = TRUE) +
  scale_color_gradient2(low="red", mid="blue", high="black", midpoint=.3) #library(factoextra) - fviz_pca_var

# Componentes 1 y 3, 2 y 3
fviz_pca_var(acp, axes = c(1,3), col.var = "cos2", 
             title = "Variables", repel = TRUE) +
  scale_color_gradient2(low="red", mid="blue", high="black", midpoint=.3)

fviz_pca_var(acp, axes = c(2,3), col.var = "cos2", 
             title = "Variables", repel = TRUE) +
  scale_color_gradient2(low="red", mid="blue", high="black", midpoint=.3)

# Contribuciones de las variables
acp$var$contrib
##           Dim.1      Dim.2        Dim.3      Dim.4        Dim.5
## D.T.  20.586440  0.7525791 1.497206e+00  6.7874864  0.037016951
## D.C.  18.688665  1.4877723 1.328483e+00  8.5120532  0.741016589
## D.M.   8.592639 20.6275450 6.658846e-01  0.6748151  4.408358907
## Cl    15.706268  8.9648830 4.666528e-02  3.9993350  0.002628909
## O.D.   0.174250 34.5779978 2.532208e-04  7.1721234 18.139350767
## DQO   12.656659  2.8454358 2.277196e-01  3.4127133 24.247134648
## DBO   10.110231  0.2597373 2.006548e+01 27.6679873  1.016889910
## Temp.  3.792206  7.3475598 4.692543e+01  3.3379465 16.983174243
## pH     7.811305  5.2935367 2.777852e+01  0.3570508 34.151369612
## Cond.  1.881336 17.8429533 1.464354e+00 38.0784889  0.273059465
# Gráficos de contribuciones de las variables
c1 = fviz_contrib(acp, choice = "var", axes = 1, title = "CP1") + 
  ylab("Contribuciones (%)")
c2 = fviz_contrib(acp, choice = "var", axes = 2, title = "CP2") + 
  ylab("Contribuciones (%)")
c3 = fviz_contrib(acp, choice = "var", axes = 3, title = "CP3") + 
  ylab("Contribuciones (%)")
grid.arrange(c1, c2, c3, ncol=2, nrow = 2)

# library(gridExtra) - fviz_contrib
# Coordenadas de los individuos (componentes principales)
head(acp$ind$coord, 10) # Primeras filas de coordenadas de los individuos
##         Dim.1       Dim.2       Dim.3       Dim.4      Dim.5
## 1  -0.1796809  1.16030934 -0.02713789 -1.06876278 -1.9791212
## 2  -1.0173448  1.07682769  0.43995569 -0.21219429 -0.7267759
## 3   4.1387017  1.62631101 -0.18966593  0.40095453  0.9436080
## 4   0.9709216  0.37236485  0.60865750  0.55584694 -0.8108826
## 5   1.1607054 -0.20455236  0.25834870  0.05686454 -0.1063336
## 6   2.7253495 -0.03269613 -0.62563035  1.04613336  0.1130971
## 7   2.2278396 -0.96151013  0.93232866  0.44910249  0.3481944
## 8   2.1955484 -1.46882681 -0.49186611 -0.30248628 -0.2707238
## 9  -2.0635799  0.44730865 -2.92349116  1.47622201 -0.4025234
## 10 -2.3195227  3.32172249  0.69058844 -0.15674048  0.7203402
# Representación de los individuos en componentes 1 y 2
fviz_pca_ind(acp, axes = c(1,2), geom = c("point","text"), 
             habillage = 2, addEllipses = TRUE, 
             title = "Individuos") +
  scale_color_brewer(palette = "Set1")

# Componentes 1 y 3, 2 y 3
fviz_pca_ind(acp, axes = c(1,3), geom = c("point","text"), 
             habillage = 2, addEllipses = TRUE,
             title = "Individuos") +
  scale_color_brewer(palette = "Set1")

fviz_pca_ind(acp, axes = c(2,3), geom = c("point","text"), 
             habillage = 2, addEllipses = TRUE,
             title = "Individuos") +
  scale_color_brewer(palette = "Set1")

# Representación simultánea de variables e individuos
fviz_pca_biplot(acp, axes = c(1,2), geom = c("point","text"),
                col.var = "#3c3c3c",
                habillage = 2, addEllipses = TRUE,
                title = "Variables e individuos") +
  scale_color_brewer(palette = "Set1")

# Componentes 1 y 3, 2 y 3
fviz_pca_biplot(acp, axes = c(1,3), geom = c("point","text"),
                col.var = "#3c3c3c",
                habillage = 2, addEllipses = TRUE,
                title = "Variables e individuos") +
  scale_color_brewer(palette = "Set1")

fviz_pca_biplot(acp, axes = c(2,3), geom = c("point","text"),
                col.var = "#3c3c3c",
                habillage = 2, addEllipses = TRUE,
                title = "Variables e individuos") +
  scale_color_brewer(palette = "Set1")

# Selección del número óptimo de componentes utilizando el criterio de suma de errores cuadráticos
k.opt = function(acp) {
  # Obtener los valores propios
  s = get_eigenvalue(acp)
  
  # Calcular la varianza acumulada
  varianza_acumulada = cumsum(s$eigenvalue) / sum(s$eigenvalue)
  
  # Encontrar el primer número de componentes que explica al menos el 70% de la varianza
  k = which(varianza_acumulada >= 0.70)[1]
  
  return(k)
}