Análisis de Componentes Principales

Autor/a

Daniel Perdomo

Fecha de publicación

8 de abril de 2025

############# Variables y unidad de medida ############
# Ambient temperature (AT) C
# Ambient pressure (AP) mbar
# Ambient humidity (AH) (%) 
# Air filter difference pressure (AFDP) mbar 
# Gas turbine exhaust pressure (GTEP) mbar 
# Turbine inlet temperature (TIT) C 
# Turbine after temperature (TAT) C 
# Compressor discharge pressure (CDP) mbar 
# Turbine energy yield (TEY) MWH 
# Carbon monoxide (CO) mg/m3 
# Nitrogen oxides (NOx) mg/m3 

Aplicación en R

# Librerías

library(tidyverse)
library(psych)
library(corrplot)
library(FactoMineR)
library(factoextra)
library(ggplot2)
library(GGally)
library(ade4)

options(scipen=999) 
options(digits = 2) # Número de decimales

Exploración inicial en R

ruta <- file.choose()
gasturbine <- read.csv(ruta, header=TRUE, sep=",")

# Estructura de la base
str(gasturbine)
'data.frame':   7384 obs. of  11 variables:
 $ AT  : num  1.953 1.219 0.949 1.008 1.286 ...
 $ AP  : num  1020 1020 1022 1022 1022 ...
 $ AH  : num  85 87.5 78.3 76.9 76.7 ...
 $ AFDP: num  2.53 2.39 2.78 2.82 2.84 ...
 $ GTEP: num  20.1 18.6 22.3 23.4 23.5 ...
 $ TIT : num  1049 1046 1069 1075 1076 ...
 $ TAT : num  545 548 550 550 550 ...
 $ TEY : num  116 109 126 132 134 ...
 $ CDP : num  10.8 10.3 11.3 11.7 11.7 ...
 $ CO  : num  7.45 6.47 3.63 3.2 2.38 ...
 $ NOX : num  113.2 112 88.1 87.1 82.5 ...
head(gasturbine)
    AT   AP AH AFDP GTEP  TIT TAT TEY CDP  CO NOX
1 1.95 1020 85  2.5   20 1049 545 116  11 7.4 113
2 1.22 1020 88  2.4   19 1046 548 109  10 6.5 112
3 0.95 1022 78  2.8   22 1069 550 126  11 3.6  88
4 1.01 1022 77  2.8   23 1075 550 132  12 3.2  87
5 1.29 1022 77  2.8   23 1076 550 134  12 2.4  83
6 1.83 1022 76  2.8   23 1076 550 134  12 2.1  81
tail(gasturbine)
      AT   AP AH AFDP GTEP  TIT TAT TEY CDP   CO NOX
7379 3.4 1028 93  3.3   20 1058 550 117  11  5.3  67
7380 3.6 1028 93  3.2   19 1037 542 109  10 11.0  89
7381 4.2 1029 94  3.2   19 1038 542 109  10 11.1  89
7382 5.5 1028 95  3.3   19 1038 543 108  10 11.4  96
7383 5.9 1029 94  4.0   24 1077 550 131  12  3.3  65
7384 6.0 1029 95  3.9   23 1068 548 125  11 12.0 109
# Resumen
summary(gasturbine)
       AT           AP             AH          AFDP          GTEP   
 Min.   :-6   Min.   : 989   Min.   :24   Min.   :2.4   Min.   :18  
 1st Qu.:11   1st Qu.:1010   1st Qu.:59   1st Qu.:3.1   1st Qu.:23  
 Median :17   Median :1014   Median :71   Median :3.5   Median :25  
 Mean   :17   Mean   :1015   Mean   :69   Mean   :3.6   Mean   :26  
 3rd Qu.:24   3rd Qu.:1018   3rd Qu.:80   3rd Qu.:4.2   3rd Qu.:30  
 Max.   :37   Max.   :1037   Max.   :97   Max.   :5.2   Max.   :41  
      TIT            TAT           TEY           CDP             CO    
 Min.   :1016   Min.   :516   Min.   :100   Min.   : 9.9   Min.   : 0  
 1st Qu.:1070   1st Qu.:545   1st Qu.:126   1st Qu.:11.5   1st Qu.: 2  
 Median :1080   Median :550   Median :132   Median :11.9   Median : 3  
 Mean   :1079   Mean   :547   Mean   :134   Mean   :12.1   Mean   : 3  
 3rd Qu.:1100   3rd Qu.:550   3rd Qu.:147   3rd Qu.:13.1   3rd Qu.: 4  
 Max.   :1100   Max.   :551   Max.   :180   Max.   :15.2   Max.   :41  
      NOX     
 Min.   : 26  
 1st Qu.: 52  
 Median : 57  
 Mean   : 60  
 3rd Qu.: 65  
 Max.   :120  

Correlaciones

# Matriz de correlaciones
cor_gt <- cor(gasturbine)
cor_gt
        AT     AP     AH   AFDP   GTEP    TIT    TAT   TEY    CDP     CO    NOX
AT    1.00 -0.493 -0.466  0.469  0.194  0.330  0.208  0.11  0.201 -0.391 -0.594
AP   -0.49  1.000  0.084 -0.094 -0.044 -0.082 -0.290  0.05  0.029  0.201  0.214
AH   -0.47  0.084  1.000 -0.245 -0.298 -0.261  0.026 -0.18 -0.222  0.159  0.065
AFDP  0.47 -0.094 -0.245  1.000  0.844  0.915 -0.520  0.88  0.923 -0.641 -0.584
GTEP  0.19 -0.044 -0.298  0.844  1.000  0.893 -0.621  0.93  0.938 -0.557 -0.367
TIT   0.33 -0.082 -0.261  0.915  0.893  1.000 -0.396  0.95  0.952 -0.738 -0.520
TAT   0.21 -0.290  0.026 -0.520 -0.621 -0.396  1.000 -0.63 -0.657  0.026  0.054
TEY   0.11  0.050 -0.183  0.885  0.932  0.952 -0.634  1.00  0.991 -0.617 -0.403
CDP   0.20  0.029 -0.222  0.923  0.938  0.952 -0.657  0.99  1.000 -0.613 -0.443
CO   -0.39  0.201  0.159 -0.641 -0.557 -0.738  0.026 -0.62 -0.613  1.000  0.678
NOX  -0.59  0.214  0.065 -0.584 -0.367 -0.520  0.054 -0.40 -0.443  0.678  1.000
# Correlation heat map; the palette runs from royalblue4 through white to red3.
corrplot(
  cor_gt,
  method = "color",
  main = "mapa de calor",
  col = colorRampPalette(c("royalblue4", "white", "red3"))(100),
  tl.cex = 0.8
)

# Scatter plots, histograms and pairwise correlations in a single panel.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
pairs.panels(gasturbine, pch = 20, stars = TRUE, main = "Correlaciones")

Autovalores y Autovectores

### Eigen-decomposition of the correlation matrix
eigen_cor <- eigen(cor_gt)
eigen_valores <- eigen_cor[["values"]]
eigen_vectores <- eigen_cor[["vectors"]]

# Eigenvalues
eigen_valores
 [1] 5.97794 2.15542 1.05344 0.72396 0.56477 0.27026 0.13942 0.08056 0.03212
[10] 0.00144 0.00066
# Eigen vectores
eigen_vectores
        [,1]    [,2]    [,3]    [,4]   [,5]   [,6]   [,7]   [,8]   [,9]   [,10]
 [1,] -0.162  0.5444 -0.2074 -0.0597 -0.402 -0.399  0.294  0.279 -0.363 -0.0961
 [2,]  0.039 -0.4689 -0.0406 -0.8250 -0.078 -0.248  0.051  0.156 -0.034 -0.0158
 [3,]  0.124 -0.2001  0.8329  0.2410 -0.157 -0.332  0.105  0.195 -0.095 -0.0095
 [4,] -0.391  0.0157  0.0033  0.0257 -0.187 -0.351  0.209 -0.276  0.753 -0.0204
 [5,] -0.377 -0.1395 -0.1154  0.1692  0.108  0.048 -0.276  0.809  0.225 -0.0035
 [6,] -0.395 -0.0061  0.0381 -0.0046  0.252 -0.272 -0.187 -0.179 -0.261 -0.0417
 [7,]  0.217  0.4525  0.1271 -0.2010  0.541 -0.418 -0.362  0.023  0.153  0.0847
 [8,] -0.387 -0.1880  0.0280  0.0692  0.149 -0.056 -0.144 -0.224 -0.264 -0.6342
 [9,] -0.396 -0.1559 -0.0110  0.0644  0.028 -0.089 -0.035 -0.160 -0.279  0.7607
[10,]  0.301 -0.2296 -0.3088  0.2579 -0.437 -0.380 -0.588 -0.114 -0.010  0.0032
[11,]  0.248 -0.3270 -0.3665  0.3355  0.436 -0.374  0.495  0.083 -0.048 -0.0134
        [,11]
 [1,]  0.0712
 [2,]  0.0125
 [3,]  0.0055
 [4,]  0.0235
 [5,]  0.0091
 [6,] -0.7535
 [7,]  0.2494
 [8,]  0.4911
 [9,]  0.3505
[10,] -0.0060
[11,]  0.0083
# La varianza media en los autovalores de cor(gasturbine) es 1 debido a que se
# trabaja con la matriz de correlaciones


### Eigen-decomposition of the variance-covariance matrix
eigen_cov <- eigen(cov(gasturbine))
eigen_valores2 <- eigen_cov[["values"]]
eigen_vectores2 <- eigen_cov[["vectors"]]

# Eigenvalues
eigen_valores2
 [1] 718.2600 190.7223 131.3550  46.8968  24.1699  14.7371   1.8693   1.2843
 [9]   0.1922   0.0180   0.0022
# Eigen vectores
eigen_vectores2
        [,1]     [,2]    [,3]    [,4]    [,5]   [,6]    [,7]    [,8]   [,9]
 [1,]  0.105  0.37467 -0.3963  0.1544  0.0025  0.784  0.0132  0.0644  0.221
 [2,] -0.018 -0.14258  0.2799 -0.7460  0.4678  0.344 -0.0679  0.0231  0.045
 [3,] -0.160 -0.87055 -0.3754  0.1640  0.0524  0.211 -0.0359  0.0124  0.015
 [4,]  0.021 -0.00017 -0.0058 -0.0011 -0.0122  0.038  0.0196  0.0035 -0.089
 [5,]  0.153 -0.01766  0.0718  0.0238 -0.1683  0.027 -0.9696  0.0078  0.033
 [6,]  0.731 -0.06785 -0.0102  0.2215  0.4024  0.026  0.0338 -0.0594 -0.490
 [7,] -0.097  0.12130 -0.1875  0.2387  0.6970 -0.315 -0.1380 -0.1420  0.501
 [8,]  0.581 -0.23424  0.2351 -0.0380 -0.2662 -0.028  0.1811 -0.0234  0.667
 [9,]  0.041 -0.01162  0.0125 -0.0057 -0.0288  0.023  0.0115  0.0056 -0.046
[10,] -0.061 -0.00579  0.0687 -0.0006 -0.0944  0.138  0.0064 -0.9807 -0.041
[11,] -0.232 -0.08224  0.7232  0.5336  0.1465  0.317  0.0163  0.0950  0.019
         [,10]    [,11]
 [1,] -0.01477 -0.00234
 [2,] -0.00090 -0.00044
 [3,] -0.00410 -0.00021
 [4,]  0.99471 -0.01421
 [5,]  0.01613 -0.00020
 [6,] -0.05631 -0.04085
 [7,]  0.07058  0.06164
 [8,]  0.04264 -0.00823
 [9,]  0.00782  0.99712
[10,] -0.00513 -0.00079
[11,]  0.00048 -0.00040
# Mean-eigenvalue cutoff for cov(gasturbine): the sum of the eigenvalues
# divided by the number of variables.
var_med <- sum(eigen_valores2) / ncol(gasturbine)
var_med
[1] 103
# Se puede considerar retener las componentes cuyos autovalores sean mayores a var_med = 103

Análisis de Componentes Principales (ACP)

# Principal component analysis on the correlation matrix (standardized data):
# `scale = TRUE` standardizes the variables, `scannf = FALSE` disables the
# scree-plot display and `nf = 4` keeps four axes.
acp <- dudi.pca(
  df = gasturbine,
  scale = TRUE,
  scannf = FALSE,
  nf = 4
)

# Summary of the components and their cumulative variance
summary(acp)
Class: pca dudi
Call: dudi.pca(df = gasturbine, scale = TRUE, scannf = FALSE, nf = 4)

Total inertia: 11

Eigenvalues:
    Ax1     Ax2     Ax3     Ax4     Ax5 
 5.9779  2.1554  1.0534  0.7240  0.5648 

Projected inertia (%):
    Ax1     Ax2     Ax3     Ax4     Ax5 
 54.345  19.595   9.577   6.581   5.134 

Cumulative projected inertia (%):
    Ax1   Ax1:2   Ax1:3   Ax1:4   Ax1:5 
  54.34   73.94   83.52   90.10   95.23 

(Only 5 dimensions (out of 11) are shown)
# Autovectores 
acp$c1
        CS1     CS2     CS3     CS4
AT    0.162  0.5444 -0.2074 -0.0597
AP   -0.039 -0.4689 -0.0406 -0.8250
AH   -0.124 -0.2001  0.8329  0.2410
AFDP  0.391  0.0157  0.0033  0.0257
GTEP  0.377 -0.1395 -0.1154  0.1692
TIT   0.395 -0.0061  0.0381 -0.0046
TAT  -0.217  0.4525  0.1271 -0.2010
TEY   0.387 -0.1880  0.0280  0.0692
CDP   0.396 -0.1559 -0.0110  0.0644
CO   -0.301 -0.2296 -0.3088  0.2579
NOX  -0.248 -0.3270 -0.3665  0.3355

Gráficos de sedimentación (scree)

# Base-graphics scree plot of the eigenvalues stored in acp$eig
plot(
  acp[["eig"]],
  type = "b",
  pch = 20,
  col = "royalblue4",
  xlab = "Número de componentes",
  ylab = "Varianza explicada"
)
title(main = "Gráfico de sedimentación")
# Dotted horizontal reference line at 1
abline(h = 1, lty = 3, col = "red3")

# Scree plot drawn from the PCA object itself
screeplot(acp, main = "Valores Propios")

# factoextra version with a label on each bar
fviz_eig(acp, addlabels = TRUE, hjust = -0.3)

Análisis de cargas (loadings)

# acp$co holds the coordinates of the variables on the retained axes

# PC1 x PC2
fviz_pca_var(acp, col.var = "royalblue4", axes = c(1, 2)) +
  theme_minimal() +
  ggtitle("Contribución de las variables en componentes 1 y 2")

# PC1 x PC3
fviz_pca_var(acp, col.var = "firebrick3", axes = c(1, 3)) +
  theme_minimal() +
  ggtitle("Contribución de las variables en componentes 1 y 3")

# PC2 x PC3
fviz_pca_var(acp, col.var = "gold3", axes = c(2, 3)) +
  theme_minimal() +
  ggtitle("Contribución de las variables en componentes 2 y 3")

# Element-wise square of the variable coordinates
contrib <- acp$co^2
contrib
      Comp1    Comp2    Comp3    Comp4
AT   0.1565 0.638813 0.045332 0.002582
AP   0.0092 0.473831 0.001733 0.492760
AH   0.0922 0.086339 0.730883 0.042032
AFDP 0.9155 0.000531 0.000012 0.000477
GTEP 0.8510 0.041944 0.014035 0.020729
TIT  0.9323 0.000079 0.001533 0.000015
TAT  0.2807 0.441261 0.017018 0.029244
TEY  0.8963 0.076176 0.000824 0.003463
CDP  0.9363 0.052355 0.000128 0.003006
CO   0.5415 0.113634 0.100473 0.048170
NOX  0.3665 0.230454 0.141474 0.081484
contrib <- as.matrix(contrib)
corrplot(contrib, is.corr = FALSE, col = colorRampPalette(c("white", "firebrick3"))(200))

Análisis de puntuaciones (scores)

# Puntuaciones de las observaciones
acp$li[1:10,]
   Axis1 Axis2 Axis3  Axis4
1   -4.9  -3.2 -0.96  1.489
2   -5.5  -2.7 -0.50  1.140
3   -2.8  -2.1  0.11 -0.100
4   -2.2  -2.1  0.11 -0.042
5   -1.9  -1.9  0.35 -0.256
6   -1.7  -1.8  0.41 -0.356
7   -2.0  -1.8  0.29 -0.363
8   -2.1  -2.1 -0.14 -0.202
9   -2.0  -2.2 -0.19 -0.251
10  -2.0  -2.2 -0.21 -0.354
# Correlaciones de las componentes principales
cor(acp$li)
                     Axis1                Axis2                Axis3
Axis1  1.00000000000000000 -0.00000000000000070 -0.00000000000000086
Axis2 -0.00000000000000070  1.00000000000000000  0.00000000000000057
Axis3 -0.00000000000000086  0.00000000000000057  1.00000000000000000
Axis4  0.00000000000000090  0.00000000000000328 -0.00000000000000049
                     Axis4
Axis1  0.00000000000000090
Axis2  0.00000000000000328
Axis3 -0.00000000000000049
Axis4  1.00000000000000000
# Biplots of observations (black) and variables (colored) for each pair of
# the first three components.

# PC1 x PC2
fviz_pca_biplot(
  acp,
  repel = FALSE,
  axes = c(1, 2),
  col.var = "royalblue4",
  col.ind = "black"
) +
  theme_minimal() +
  ggtitle("Puntuaciones (scores) en componentes 1 y 2")

# PC1 x PC3
fviz_pca_biplot(
  acp,
  repel = FALSE,
  axes = c(1, 3),
  col.var = "firebrick3",
  col.ind = "black"
) +
  theme_minimal() +
  ggtitle("Puntuaciones (scores) en componentes 1 y 3")

# PC2 x PC3
fviz_pca_biplot(
  acp,
  repel = FALSE,
  axes = c(2, 3),
  col.var = "gold3",
  col.ind = "black"
) +
  theme_minimal() +
  ggtitle("Puntuaciones (scores) en componentes 2 y 3")

Aplicación en Python

# Cargar librerías
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
import seaborn as sns

Exploración inicial en Python

gasturbine=pd.read_csv(r"C:\Users\MINEDUCYT\Documents\Seminario\gas+turbine+co+and+nox+emission+data+set\gt_2015.csv", sep=",")

pd.set_option('display.max_rows', None)  # Mostrar todas las filas
pd.set_option('display.max_columns', None)  # Mostrar todas las columnas

print("Información sobre la base de datos: \n")
Información sobre la base de datos: 
print(gasturbine.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7384 entries, 0 to 7383
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      7384 non-null   float64
 1   AP      7384 non-null   float64
 2   AH      7384 non-null   float64
 3   AFDP    7384 non-null   float64
 4   GTEP    7384 non-null   float64
 5   TIT     7384 non-null   float64
 6   TAT     7384 non-null   float64
 7   TEY     7384 non-null   float64
 8   CDP     7384 non-null   float64
 9   CO      7384 non-null   float64
 10  NOX     7384 non-null   float64
dtypes: float64(11)
memory usage: 634.7 KB
None
print("Visualizar datos: \n")
Visualizar datos: 
print(gasturbine.head())
        AT      AP      AH    AFDP    GTEP     TIT     TAT     TEY     CDP  \
0  1.95320  1020.1  84.985  2.5304  20.116  1048.7  544.92  116.27  10.799   
1  1.21910  1020.1  87.523  2.3937  18.584  1045.5  548.50  109.18  10.347   
2  0.94915  1022.2  78.335  2.7789  22.264  1068.8  549.95  125.88  11.256   
3  1.00750  1021.7  76.942  2.8170  23.358  1075.2  549.63  132.21  11.702   
4  1.28580  1021.6  76.732  2.8377  23.483  1076.2  549.68  133.58  11.737   

       CO      NOX  
0  7.4491  113.250  
1  6.4684  112.020  
2  3.6335   88.147  
3  3.1972   87.078  
4  2.3833   82.515  
print(gasturbine.tail())
          AT      AP      AH    AFDP    GTEP     TIT     TAT     TEY     CDP  \
7379  3.6268  1028.5  93.200  3.1661  19.087  1037.0  541.59  109.08  10.411   
7380  4.1674  1028.6  94.036  3.1923  19.016  1037.6  542.28  108.79  10.344   
7381  5.4820  1028.5  95.219  3.3128  18.857  1038.0  543.48  107.81  10.462   
7382  5.8837  1028.7  94.200  3.9831  23.563  1076.9  550.11  131.41  11.771   
7383  6.0392  1028.8  94.547  3.8752  22.524  1067.9  548.23  125.41  11.462   

           CO      NOX  
7379  10.9930   89.172  
7380  11.1440   88.849  
7381  11.4140   96.147  
7382   3.3134   64.738  
7383  11.9810  109.240  
print("Medias y varianzas: \n")
Medias y varianzas: 
print(gasturbine.mean())
AT        17.225259
AP      1014.509110
AH        68.647464
AFDP       3.598909
GTEP      26.130149
TIT     1078.974689
TAT      546.642484
TEY      133.993380
CDP       12.097025
CO         3.129986
NOX       59.890509
dtype: float64
print(gasturbine.var())
AT       65.541708
AP       47.546957
AH      183.361833
AFDP      0.372375
GTEP     20.014320
TIT     390.554390
TAT      30.129848
TEY     261.766784
CDP       1.291861
CO        4.995057
NOX     123.931746
dtype: float64

Correlaciones

cor_gt= gasturbine.corr()

print("Matriz de correlaciones:\n")
Matriz de correlaciones:
print(cor_gt)
            AT        AP        AH      AFDP      GTEP       TIT       TAT  \
AT    1.000000 -0.493098 -0.466288  0.468976  0.193578  0.330112  0.208277   
AP   -0.493098  1.000000  0.084381 -0.094144 -0.043730 -0.081605 -0.290147   
AH   -0.466288  0.084381  1.000000 -0.245456 -0.297708 -0.260683  0.026251   
AFDP  0.468976 -0.094144 -0.245456  1.000000  0.843958  0.915128 -0.519807   
GTEP  0.193578 -0.043730 -0.297708  0.843958  1.000000  0.892851 -0.620652   
TIT   0.330112 -0.081605 -0.260683  0.915128  0.892851  1.000000 -0.396161   
TAT   0.208277 -0.290147  0.026251 -0.519807 -0.620652 -0.396161  1.000000   
TEY   0.109437  0.050326 -0.182732  0.884954  0.932337  0.951813 -0.633933   
CDP   0.200909  0.029420 -0.221706  0.922991  0.938142  0.951590 -0.656613   
CO   -0.390647  0.200945  0.158999 -0.640789 -0.557177 -0.738092  0.025768   
NOX  -0.593580  0.214236  0.065351 -0.584452 -0.366655 -0.520081  0.054455   

           TEY       CDP        CO       NOX  
AT    0.109437  0.200909 -0.390647 -0.593580  
AP    0.050326  0.029420  0.200945  0.214236  
AH   -0.182732 -0.221706  0.158999  0.065351  
AFDP  0.884954  0.922991 -0.640789 -0.584452  
GTEP  0.932337  0.938142 -0.557177 -0.366655  
TIT   0.951813  0.951590 -0.738092 -0.520081  
TAT  -0.633933 -0.656613  0.025768  0.054455  
TEY   1.000000  0.991207 -0.616791 -0.403278  
CDP   0.991207  1.000000 -0.612653 -0.443093  
CO   -0.616791 -0.612653  1.000000  0.678394  
NOX  -0.403278 -0.443093  0.678394  1.000000  
print("Mapa de calor:\n")
Mapa de calor:
plt.clf()  # Limpia la figura actual antes de graficar
plt.figure(figsize=(10, 8))
sns.heatmap(cor_gt, annot=True, cmap="coolwarm", center=0, linewidths=0.5)
plt.show()

Autovalores y Autovectores

### usando la matriz de correlaciones
eigenvalores, eigenvectores = np.linalg.eig(cor_gt)

print("Autovalores (Matriz de correlaciones):\n")
Autovalores (Matriz de correlaciones):
print(eigenvalores)
[5.97794266e+00 2.15541717e+00 1.05344415e+00 7.23961606e-01
 5.64769827e-01 2.70263701e-01 1.39416544e-01 8.05621247e-02
 3.21171230e-02 1.44249402e-03 6.62604594e-04]
print("Autovectores (Matriz de correlaciones):\n")
Autovectores (Matriz de correlaciones):
print(eigenvectores)
[[-0.16180989 -0.544404    0.20744257  0.0597164  -0.40177671 -0.39883275
   0.29444045  0.27898943  0.36264519 -0.09610488  0.07122729]
 [ 0.03924278  0.46886299  0.04055504  0.8250117  -0.07816598 -0.24820253
   0.05108821  0.15628742  0.03439303 -0.01575343  0.01254007]
 [ 0.12420936  0.20014239 -0.83294863 -0.24095192 -0.1569623  -0.33168388
   0.10467024  0.19469407  0.09497537 -0.00948361  0.00552722]
 [-0.3913334  -0.01569281 -0.00334615 -0.02566472 -0.18680991 -0.35124973
   0.20929655 -0.27628743 -0.75270069 -0.02037484  0.02354711]
 [-0.37729364  0.13949764  0.11542329 -0.16921148  0.10841048  0.0484399
  -0.27649242  0.80940884 -0.22461244 -0.0034743   0.00905849]
 [-0.39491983  0.00605929 -0.03814273  0.00459056  0.25246254 -0.27215755
  -0.18686203 -0.17940666  0.26100566 -0.0417482  -0.75349026]
 [ 0.21671066 -0.45246202 -0.12710261  0.2009845   0.54128314 -0.41757241
  -0.36222363  0.02309572 -0.15269027  0.08468051  0.24937147]
 [-0.38721347  0.1879941  -0.02797079 -0.06916172  0.14868318 -0.0563032
  -0.14366195 -0.2239324   0.26403162 -0.63415876  0.49114864]
 [-0.39575355  0.1558526   0.01100148 -0.06443842  0.02818856 -0.08922904
  -0.03464295 -0.15994885  0.27874622  0.76074877  0.3504817 ]
 [ 0.30095799  0.2296089   0.30882984 -0.25794721 -0.43716531 -0.3799865
  -0.58832795 -0.1140741   0.00999469  0.00315148 -0.00597155]
 [ 0.24759088  0.32698363  0.36646488 -0.33548824  0.43639208 -0.37367483
   0.4953724   0.08302722  0.04822736 -0.01337433  0.00828121]]
### usando la matriz de covarianzas
cov_gt=gasturbine.cov()
eigenvalores2, eigenvectores2 = np.linalg.eig(cov_gt)

print("Autovalores (Matriz de correlaciones):\n")
Autovalores (Matriz de correlaciones):
print(eigenvalores2)
[7.18259979e+02 1.90722347e+02 1.31354959e+02 4.68968296e+01
 2.41698576e+01 1.47370615e+01 1.86927493e+00 1.28427642e+00
 1.92161079e-01 1.79539052e-02 2.17925728e-03]
print("Autovectores (Matriz de correlaciones):\n")
Autovectores (Matriz de correlaciones):
print(eigenvectores2)
[[ 1.05251664e-01  3.74670428e-01 -3.96291931e-01  1.54423292e-01
  -2.45879610e-03 -7.83608689e-01 -1.32245809e-02 -6.43738168e-02
   2.21492909e-01  1.47716856e-02 -2.34491049e-03]
 [-1.81330690e-02 -1.42582490e-01  2.79905774e-01 -7.45954471e-01
  -4.67844235e-01 -3.44259073e-01  6.78863173e-02 -2.31474987e-02
   4.48131462e-02  8.96308835e-04 -4.38923292e-04]
 [-1.59724640e-01 -8.70553240e-01 -3.75401815e-01  1.64047818e-01
  -5.24189286e-02 -2.10610019e-01  3.58641460e-02 -1.23752864e-02
   1.50442784e-02  4.09921306e-03 -2.07842652e-04]
 [ 2.12156234e-02 -1.72755581e-04 -5.78139359e-03 -1.07280272e-03
   1.21984007e-02 -3.77694240e-02 -1.95940115e-02 -3.46304865e-03
  -8.88606162e-02 -9.94709023e-01 -1.42054205e-02]
 [ 1.52929838e-01 -1.76580196e-02  7.17883934e-02  2.37916897e-02
   1.68337346e-01 -2.66916195e-02  9.69591902e-01 -7.81782506e-03
   3.31846362e-02 -1.61339557e-02 -1.95146524e-04]
 [ 7.30595973e-01 -6.78500192e-02 -1.01718436e-02  2.21468622e-01
  -4.02401299e-01 -2.59634532e-02 -3.38147415e-02  5.94195803e-02
  -4.90265042e-01  5.63052480e-02 -4.08529118e-02]
 [-9.67075359e-02  1.21299356e-01 -1.87514050e-01  2.38676172e-01
  -6.97040350e-01  3.15076568e-01  1.38027236e-01  1.41957140e-01
   5.00682379e-01 -7.05838038e-02  6.16402985e-02]
 [ 5.80573738e-01 -2.34240128e-01  2.35121826e-01 -3.80099782e-02
   2.66238084e-01  2.75457840e-02 -1.81118005e-01  2.34026159e-02
   6.66678521e-01 -4.26359572e-02 -8.22735427e-03]
 [ 4.10992004e-02 -1.16176804e-02  1.25040038e-02 -5.72276092e-03
   2.88001514e-02 -2.31321939e-02 -1.15071656e-02 -5.62224519e-03
  -4.62780171e-02 -7.81587240e-03  9.97123599e-01]
 [-6.05822739e-02 -5.79234535e-03  6.87156586e-02 -5.95958703e-04
   9.44104673e-02 -1.38129745e-01 -6.40758271e-03  9.80713882e-01
  -4.13898743e-02  5.13344404e-03 -7.91837593e-04]
 [-2.31723189e-01 -8.22399319e-02  7.23233360e-01  5.33604688e-01
  -1.46506522e-01 -3.17216838e-01 -1.63028381e-02 -9.50390390e-02
   1.87411658e-02 -4.75371721e-04 -3.99424067e-04]]
print("Cota de varianza media autovalores de matriz de covarianzas \n")
Cota de varianza media autovalores de matriz de covarianzas 
var_med=(sum(eigenvalores2)/gasturbine.shape[1])
print("Se pueden considerar tomar valores mayores a var_med =", round(var_med,2))
Se pueden considerar tomar valores mayores a var_med = 102.68

Análisis de Componentes Principales (ACP)

### Modelo PCA
pca_pipe = make_pipeline(StandardScaler(), PCA())
pca_pipe.fit(gasturbine)
Pipeline(steps=[('standardscaler', StandardScaler()), ('pca', PCA())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Se extrae el modelo entrenado del pipeline
acp_gt = pca_pipe.named_steps['pca']

print("      Resumen del modelo: \n")
      Resumen del modelo: 
print("Varianzas explicadas:\n")
Varianzas explicadas:
print(acp_gt.explained_variance_)
[5.97875235e+00 2.15570912e+00 1.05358683e+00 7.24059664e-01
 5.64846323e-01 2.70300307e-01 1.39435427e-01 8.05730366e-02
 3.21214731e-02 1.44268940e-03 6.62694341e-04]
print("\nPorcentaje de varianza explicada:\n")

Porcentaje de varianza explicada:
print(acp_gt.explained_variance_ratio_)
[5.43449332e-01 1.95947016e-01 9.57676498e-02 6.58146915e-02
 5.13427115e-02 2.45694273e-02 1.26742313e-02 7.32382952e-03
 2.91973845e-03 1.31135820e-04 6.02367812e-05]
print("\nVarianza acumulada:\n")

Varianza acumulada:
print(np.cumsum(acp_gt.explained_variance_ratio_))
[0.54344933 0.73939635 0.835164   0.90097869 0.9523214  0.97689083
 0.98956506 0.99688889 0.99980863 0.99993976 1.        ]
print("\nComponentes principales:")

Componentes principales:
print(acp_gt.components_)
[[ 0.16180989 -0.03924278 -0.12420936  0.3913334   0.37729364  0.39491983
  -0.21671066  0.38721347  0.39575355 -0.30095799 -0.24759088]
 [ 0.544404   -0.46886299 -0.20014239  0.01569281 -0.13949764 -0.00605929
   0.45246202 -0.1879941  -0.1558526  -0.2296089  -0.32698363]
 [-0.20744257 -0.04055504  0.83294863  0.00334615 -0.11542329  0.03814273
   0.12710261  0.02797079 -0.01100148 -0.30882984 -0.36646488]
 [ 0.0597164   0.8250117  -0.24095192 -0.02566472 -0.16921148  0.00459056
   0.2009845  -0.06916172 -0.06443842 -0.25794721 -0.33548824]
 [-0.40177671 -0.07816598 -0.1569623  -0.18680991  0.10841048  0.25246254
   0.54128314  0.14868318  0.02818856 -0.43716531  0.43639208]
 [ 0.39883275  0.24820253  0.33168388  0.35124973 -0.0484399   0.27215755
   0.41757241  0.0563032   0.08922904  0.3799865   0.37367483]
 [-0.29444045 -0.05108821 -0.10467024 -0.20929655  0.27649242  0.18686203
   0.36222363  0.14366195  0.03464295  0.58832795 -0.4953724 ]
 [ 0.27898943  0.15628742  0.19469407 -0.27628743  0.80940884 -0.17940666
   0.02309572 -0.2239324  -0.15994885 -0.1140741   0.08302722]
 [-0.36264519 -0.03439303 -0.09497537  0.75270069  0.22461244 -0.26100566
   0.15269027 -0.26403162 -0.27874622 -0.00999469 -0.04822736]
 [-0.09610488 -0.01575343 -0.00948361 -0.02037484 -0.0034743  -0.0417482
   0.08468051 -0.63415876  0.76074877  0.00315148 -0.01337433]
 [-0.07122729 -0.01254007 -0.00552722 -0.02354711 -0.00905849  0.75349026
  -0.24937147 -0.49114864 -0.3504817   0.00597155 -0.00828121]]

Gráfico de sedimentación (scree)

plt.clf()  # Limpia la figura actual antes de graficar

prop_varianza_acum = np.cumsum(acp_gt.explained_variance_ratio_)
componentes = range(1, len(prop_varianza_acum) + 1)

plt.figure(figsize=(8, 5))
plt.plot(componentes, prop_varianza_acum, 'bo-', linewidth=2, markersize=8)
plt.xticks(componentes)
([<matplotlib.axis.XTick object at 0x000001D597B87ED0>, <matplotlib.axis.XTick object at 0x000001D59885A210>, <matplotlib.axis.XTick object at 0x000001D59885A990>, <matplotlib.axis.XTick object at 0x000001D59885B110>, <matplotlib.axis.XTick object at 0x000001D59885B890>, <matplotlib.axis.XTick object at 0x000001D59888C050>, <matplotlib.axis.XTick object at 0x000001D59888C7D0>, <matplotlib.axis.XTick object at 0x000001D59888CF50>, <matplotlib.axis.XTick object at 0x000001D59888D6D0>, <matplotlib.axis.XTick object at 0x000001D59888DE50>, <matplotlib.axis.XTick object at 0x000001D59888E5D0>], [Text(1, 0, '1'), Text(2, 0, '2'), Text(3, 0, '3'), Text(4, 0, '4'), Text(5, 0, '5'), Text(6, 0, '6'), Text(7, 0, '7'), Text(8, 0, '8'), Text(9, 0, '9'), Text(10, 0, '10'), Text(11, 0, '11')])
plt.title('Porcentaje de varianza explicada acumulada')
plt.xlabel('Componente principal')
plt.ylabel('Por. varianza acumulada')
plt.ylim(0, 1.1)
(0.0, 1.1)
plt.grid(True, linestyle='--', alpha=0.7)

# Añadir etiquetas
for i, v in enumerate(prop_varianza_acum):
    plt.text(componentes[i], v + 0.03, f"{v:.0%}", 
             ha='center', va='bottom')

plt.tight_layout()
plt.show()

Análisis de cargas (loadings)

plt.clf()  # Limpia la figura actual antes de graficar
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 2))
ncomponentes = acp_gt.components_
plt.imshow(ncomponentes.T, cmap='viridis', aspect='auto')
plt.yticks(range(len(gasturbine.columns)), gasturbine.columns)
([<matplotlib.axis.YTick object at 0x000001D5988F8410>, <matplotlib.axis.YTick object at 0x000001D5988FA0D0>, <matplotlib.axis.YTick object at 0x000001D5988FA850>, <matplotlib.axis.YTick object at 0x000001D5988FAFD0>, <matplotlib.axis.YTick object at 0x000001D5988FB750>, <matplotlib.axis.YTick object at 0x000001D5988FBED0>, <matplotlib.axis.YTick object at 0x000001D598030690>, <matplotlib.axis.YTick object at 0x000001D598030E10>, <matplotlib.axis.YTick object at 0x000001D598031590>, <matplotlib.axis.YTick object at 0x000001D598031D10>, <matplotlib.axis.YTick object at 0x000001D598032490>], [Text(0, 0, 'AT'), Text(0, 1, 'AP'), Text(0, 2, 'AH'), Text(0, 3, 'AFDP'), Text(0, 4, 'GTEP'), Text(0, 5, 'TIT'), Text(0, 6, 'TAT'), Text(0, 7, 'TEY'), Text(0, 8, 'CDP'), Text(0, 9, 'CO'), Text(0, 10, 'NOX')])
plt.xticks(range(len(gasturbine.columns)), np.arange(acp_gt.n_components_) + 1)
([<matplotlib.axis.XTick object at 0x000001D5988CBC50>, <matplotlib.axis.XTick object at 0x000001D598032D50>, <matplotlib.axis.XTick object at 0x000001D5980334D0>, <matplotlib.axis.XTick object at 0x000001D598033C50>, <matplotlib.axis.XTick object at 0x000001D598050410>, <matplotlib.axis.XTick object at 0x000001D598050B90>, <matplotlib.axis.XTick object at 0x000001D598051310>, <matplotlib.axis.XTick object at 0x000001D598051A90>, <matplotlib.axis.XTick object at 0x000001D598052210>, <matplotlib.axis.XTick object at 0x000001D598052990>, <matplotlib.axis.XTick object at 0x000001D598053110>], [Text(0, 0, '1'), Text(1, 0, '2'), Text(2, 0, '3'), Text(3, 0, '4'), Text(4, 0, '5'), Text(5, 0, '6'), Text(6, 0, '7'), Text(7, 0, '8'), Text(8, 0, '9'), Text(9, 0, '10'), Text(10, 0, '11')])
plt.grid(False)
plt.colorbar();
plt.show()

Análisis de puntuaciones (scores)

plt.clf()  # Clear the current figure before plotting

# Scores (observations) and loadings (variables) for components 1 and 2
scores = acp_gt.transform(StandardScaler().fit_transform(gasturbine))[:, :2]
loadings = acp_gt.components_.T[:, :2]

# Rescale the loadings so the arrows are visible next to the scores
scale_factor = 1.0 / (loadings.max() - loadings.min())
loadings_scaled = loadings * scale_factor * 7  # 7 is an adjustable visual factor

# Create the figure
plt.figure(figsize=(12, 8))

# Plot the scores (observations)
plt.scatter(scores[:, 0], scores[:, 1], alpha=0.5, label='Observaciones')

# Plot the loadings (variables): one arrow and one label per column
for i, (x, y) in enumerate(loadings_scaled):
    plt.arrow(0, 0, x, y, color='red', head_width=0.05, alpha=0.8)
    plt.text(x * 1.15, y * 1.15, gasturbine.columns[i], color='darkred',
             ha='center', va='center', fontsize=12)

# Dashed unit circle centered at the origin
circle = plt.Circle((0, 0), 1, color='gray', fill=False, linestyle='--', alpha=0.5)
plt.gca().add_artist(circle)

# Reference lines through the origin
plt.axhline(0, color='gray', linestyle='--', alpha=0.5)
plt.axvline(0, color='gray', linestyle='--', alpha=0.5)

# Axis labels report the variance ratio of the component drawn on each axis
plt.xlabel(f'Componente 1 ({acp_gt.explained_variance_ratio_[0]*100:.1f}%)', fontsize=12)
plt.ylabel(f'Componente 2 ({acp_gt.explained_variance_ratio_[1]*100:.1f}%)', fontsize=12)
plt.title('Puntuaciones (scores) de las observaciones', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.3)
plt.legend()

# Symmetric limits wide enough for both the scores and the scaled loadings
max_val = max(np.abs(scores).max(), np.abs(loadings_scaled).max()) * 1.2
plt.xlim(-max_val, max_val)
(-10.651644601708835, 10.651644601708835)
plt.ylim(-max_val, max_val)
(-10.651644601708835, 10.651644601708835)
plt.tight_layout()
plt.show()

plt.clf()  # Clear the current figure before plotting

# Scores (observations) and loadings (variables) for components 1 and 3
scores = acp_gt.transform(StandardScaler().fit_transform(gasturbine))[:, [0, 2]]
loadings = acp_gt.components_.T[:, [0, 2]]

# Rescale the loadings so the arrows are visible next to the scores
scale_factor = 1.0 / (loadings.max() - loadings.min())
loadings_scaled = loadings * scale_factor * 7  # 7 is an adjustable visual factor

# Create the figure
plt.figure(figsize=(12, 8))

# Plot the scores (observations)
plt.scatter(scores[:, 0], scores[:, 1], alpha=0.5, label='Observaciones')

# Plot the loadings (variables): one arrow and one label per column
for i, (x, y) in enumerate(loadings_scaled):
    plt.arrow(0, 0, x, y, color='red', head_width=0.05, alpha=0.8)
    plt.text(x * 1.15, y * 1.15, gasturbine.columns[i], color='darkred',
             ha='center', va='center', fontsize=12)

# Dashed unit circle centered at the origin
circle = plt.Circle((0, 0), 1, color='gray', fill=False, linestyle='--', alpha=0.5)
plt.gca().add_artist(circle)

# Reference lines through the origin
plt.axhline(0, color='gray', linestyle='--', alpha=0.5)
plt.axvline(0, color='gray', linestyle='--', alpha=0.5)

# Axis labels report the variance ratio of the component drawn on each axis:
# index 0 is component 1 and index 2 is component 3
plt.xlabel(f'Componente 1 ({acp_gt.explained_variance_ratio_[0]*100:.1f}%)', fontsize=12)
plt.ylabel(f'Componente 3 ({acp_gt.explained_variance_ratio_[2]*100:.1f}%)', fontsize=12)
plt.title('Puntuaciones (scores) de las observaciones', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.3)
plt.legend()

# Symmetric limits wide enough for both the scores and the scaled loadings
max_val = max(np.abs(scores).max(), np.abs(loadings_scaled).max()) * 1.2
plt.xlim(-max_val, max_val)
(-10.651644601708835, 10.651644601708835)
plt.ylim(-max_val, max_val)
(-10.651644601708835, 10.651644601708835)
plt.tight_layout()
plt.show()

plt.clf()  # Clear the current figure before plotting

# Scores (observations) and loadings (variables) for components 2 and 3
scores = acp_gt.transform(StandardScaler().fit_transform(gasturbine))[:, [1, 2]]
loadings = acp_gt.components_.T[:, [1, 2]]

# Rescale the loadings so the arrows are visible next to the scores
scale_factor = 1.0 / (loadings.max() - loadings.min())
loadings_scaled = loadings * scale_factor * 7  # 7 is an adjustable visual factor

# Create the figure
plt.figure(figsize=(12, 8))

# Plot the scores (observations)
plt.scatter(scores[:, 0], scores[:, 1], alpha=0.5, label='Observaciones')

# Plot the loadings (variables): one arrow and one label per column
for i, (x, y) in enumerate(loadings_scaled):
    plt.arrow(0, 0, x, y, color='red', head_width=0.05, alpha=0.8)
    plt.text(x * 1.15, y * 1.15, gasturbine.columns[i], color='darkred',
             ha='center', va='center', fontsize=12)

# Dashed unit circle centered at the origin
circle = plt.Circle((0, 0), 1, color='gray', fill=False, linestyle='--', alpha=0.5)
plt.gca().add_artist(circle)

# Reference lines through the origin
plt.axhline(0, color='gray', linestyle='--', alpha=0.5)
plt.axvline(0, color='gray', linestyle='--', alpha=0.5)

# Axis labels report the variance ratio of the component drawn on each axis:
# index 1 is component 2 and index 2 is component 3
plt.xlabel(f'Componente 2 ({acp_gt.explained_variance_ratio_[1]*100:.1f}%)', fontsize=12)
plt.ylabel(f'Componente 3 ({acp_gt.explained_variance_ratio_[2]*100:.1f}%)', fontsize=12)
plt.title('Puntuaciones (scores) de las observaciones', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.3)
plt.legend()

# Symmetric limits wide enough for both the scores and the scaled loadings
max_val = max(np.abs(scores).max(), np.abs(loadings_scaled).max()) * 1.2
plt.xlim(-max_val, max_val)
(-8.949839286896347, 8.949839286896347)
plt.ylim(-max_val, max_val)
(-8.949839286896347, 8.949839286896347)
plt.tight_layout()
plt.show()