############# Variables y unidad de medida ############
# Ambient temperature (AT) C
# Ambient pressure (AP) mbar
# Ambient humidity (AH) (%)
# Air filter difference pressure (AFDP) mbar
# Gas turbine exhaust pressure (GTEP) mbar
# Turbine inlet temperature (TIT) C
# Turbine after temperature (TAT) C
# Compressor discharge pressure (CDP) mbar
# Turbine energy yield (TEY) MWH
# Carbon monoxide (CO) mg/m3
# Nitrogen oxides (NOx) mg/m3
Análisis de Componentes Principales
Aplicación en R
# Librerias
library(tidyverse)
library(psych)
library(corrplot)
library(FactoMineR)
library(factoextra)
library(ggplot2)
library(GGally)
library(ade4)
options(scipen=999)
options(digits = 2) # Número de decimales
Exploración inicial en R
# Pick the CSV file interactively and load it into a data frame.
ruta <- file.choose()
gasturbine <- read.csv(ruta, header = TRUE, sep = ",")
# Structure of the data set
str(gasturbine)
'data.frame': 7384 obs. of 11 variables:
$ AT : num 1.953 1.219 0.949 1.008 1.286 ...
$ AP : num 1020 1020 1022 1022 1022 ...
$ AH : num 85 87.5 78.3 76.9 76.7 ...
$ AFDP: num 2.53 2.39 2.78 2.82 2.84 ...
$ GTEP: num 20.1 18.6 22.3 23.4 23.5 ...
$ TIT : num 1049 1046 1069 1075 1076 ...
$ TAT : num 545 548 550 550 550 ...
$ TEY : num 116 109 126 132 134 ...
$ CDP : num 10.8 10.3 11.3 11.7 11.7 ...
$ CO : num 7.45 6.47 3.63 3.2 2.38 ...
$ NOX : num 113.2 112 88.1 87.1 82.5 ...
head(gasturbine)
AT AP AH AFDP GTEP TIT TAT TEY CDP CO NOX
1 1.95 1020 85 2.5 20 1049 545 116 11 7.4 113
2 1.22 1020 88 2.4 19 1046 548 109 10 6.5 112
3 0.95 1022 78 2.8 22 1069 550 126 11 3.6 88
4 1.01 1022 77 2.8 23 1075 550 132 12 3.2 87
5 1.29 1022 77 2.8 23 1076 550 134 12 2.4 83
6 1.83 1022 76 2.8 23 1076 550 134 12 2.1 81
tail(gasturbine)
AT AP AH AFDP GTEP TIT TAT TEY CDP CO NOX
7379 3.4 1028 93 3.3 20 1058 550 117 11 5.3 67
7380 3.6 1028 93 3.2 19 1037 542 109 10 11.0 89
7381 4.2 1029 94 3.2 19 1038 542 109 10 11.1 89
7382 5.5 1028 95 3.3 19 1038 543 108 10 11.4 96
7383 5.9 1029 94 4.0 24 1077 550 131 12 3.3 65
7384 6.0 1029 95 3.9 23 1068 548 125 11 12.0 109
# Resumen
summary(gasturbine)
AT AP AH AFDP GTEP
Min. :-6 Min. : 989 Min. :24 Min. :2.4 Min. :18
1st Qu.:11 1st Qu.:1010 1st Qu.:59 1st Qu.:3.1 1st Qu.:23
Median :17 Median :1014 Median :71 Median :3.5 Median :25
Mean :17 Mean :1015 Mean :69 Mean :3.6 Mean :26
3rd Qu.:24 3rd Qu.:1018 3rd Qu.:80 3rd Qu.:4.2 3rd Qu.:30
Max. :37 Max. :1037 Max. :97 Max. :5.2 Max. :41
TIT TAT TEY CDP CO
Min. :1016 Min. :516 Min. :100 Min. : 9.9 Min. : 0
1st Qu.:1070 1st Qu.:545 1st Qu.:126 1st Qu.:11.5 1st Qu.: 2
Median :1080 Median :550 Median :132 Median :11.9 Median : 3
Mean :1079 Mean :547 Mean :134 Mean :12.1 Mean : 3
3rd Qu.:1100 3rd Qu.:550 3rd Qu.:147 3rd Qu.:13.1 3rd Qu.: 4
Max. :1100 Max. :551 Max. :180 Max. :15.2 Max. :41
NOX
Min. : 26
1st Qu.: 52
Median : 57
Mean : 60
3rd Qu.: 65
Max. :120
Correlaciones
# Correlation matrix of every column in gasturbine
cor_gt <- cor(gasturbine)
cor_gt
AT AP AH AFDP GTEP TIT TAT TEY CDP CO NOX
AT 1.00 -0.493 -0.466 0.469 0.194 0.330 0.208 0.11 0.201 -0.391 -0.594
AP -0.49 1.000 0.084 -0.094 -0.044 -0.082 -0.290 0.05 0.029 0.201 0.214
AH -0.47 0.084 1.000 -0.245 -0.298 -0.261 0.026 -0.18 -0.222 0.159 0.065
AFDP 0.47 -0.094 -0.245 1.000 0.844 0.915 -0.520 0.88 0.923 -0.641 -0.584
GTEP 0.19 -0.044 -0.298 0.844 1.000 0.893 -0.621 0.93 0.938 -0.557 -0.367
TIT 0.33 -0.082 -0.261 0.915 0.893 1.000 -0.396 0.95 0.952 -0.738 -0.520
TAT 0.21 -0.290 0.026 -0.520 -0.621 -0.396 1.000 -0.63 -0.657 0.026 0.054
TEY 0.11 0.050 -0.183 0.885 0.932 0.952 -0.634 1.00 0.991 -0.617 -0.403
CDP 0.20 0.029 -0.222 0.923 0.938 0.952 -0.657 0.99 1.000 -0.613 -0.443
CO -0.39 0.201 0.159 -0.641 -0.557 -0.738 0.026 -0.62 -0.613 1.000 0.678
NOX -0.59 0.214 0.065 -0.584 -0.367 -0.520 0.054 -0.40 -0.443 0.678 1.000
# Heat map of the correlation matrix
corrplot(
  cor_gt,
  method = "color",
  main = "mapa de calor",
  col = colorRampPalette(c("royalblue4", "white", "red3"))(100),
  tl.cex = 0.8
)
# Scatter plots, histograms and correlations in a single panel matrix.
# TRUE is spelled out because T is an ordinary, reassignable variable.
pairs.panels(gasturbine, pch = 20, stars = TRUE, main = "Correlaciones")
Autovalores y Autovectores
### Using the correlation matrix
# Eigen-decomposition of cor_gt; values and vectors are kept separately.
eigen_cor <- eigen(cor_gt)
eigen_valores <- eigen_cor$values
eigen_vectores <- eigen_cor$vectors
# Eigenvalues
eigen_valores
[1] 5.97794 2.15542 1.05344 0.72396 0.56477 0.27026 0.13942 0.08056 0.03212
[10] 0.00144 0.00066
# Eigen vectores
eigen_vectores
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] -0.162 0.5444 -0.2074 -0.0597 -0.402 -0.399 0.294 0.279 -0.363 -0.0961
[2,] 0.039 -0.4689 -0.0406 -0.8250 -0.078 -0.248 0.051 0.156 -0.034 -0.0158
[3,] 0.124 -0.2001 0.8329 0.2410 -0.157 -0.332 0.105 0.195 -0.095 -0.0095
[4,] -0.391 0.0157 0.0033 0.0257 -0.187 -0.351 0.209 -0.276 0.753 -0.0204
[5,] -0.377 -0.1395 -0.1154 0.1692 0.108 0.048 -0.276 0.809 0.225 -0.0035
[6,] -0.395 -0.0061 0.0381 -0.0046 0.252 -0.272 -0.187 -0.179 -0.261 -0.0417
[7,] 0.217 0.4525 0.1271 -0.2010 0.541 -0.418 -0.362 0.023 0.153 0.0847
[8,] -0.387 -0.1880 0.0280 0.0692 0.149 -0.056 -0.144 -0.224 -0.264 -0.6342
[9,] -0.396 -0.1559 -0.0110 0.0644 0.028 -0.089 -0.035 -0.160 -0.279 0.7607
[10,] 0.301 -0.2296 -0.3088 0.2579 -0.437 -0.380 -0.588 -0.114 -0.010 0.0032
[11,] 0.248 -0.3270 -0.3665 0.3355 0.436 -0.374 0.495 0.083 -0.048 -0.0134
[,11]
[1,] 0.0712
[2,] 0.0125
[3,] 0.0055
[4,] 0.0235
[5,] 0.0091
[6,] -0.7535
[7,] 0.2494
[8,] 0.4911
[9,] 0.3505
[10,] -0.0060
[11,] 0.0083
# La varianza media en los autovalores de cor(gasturbine) es 1 debido a que se
# trabaja con la matriz de correlaciones
### Using the variance-covariance matrix
# Eigen-decomposition of cov(gasturbine); values and vectors are kept
# separately.
eigen_cov <- eigen(cov(gasturbine))
eigen_valores2 <- eigen_cov$values
eigen_vectores2 <- eigen_cov$vectors
# Eigenvalues
eigen_valores2
[1] 718.2600 190.7223 131.3550 46.8968 24.1699 14.7371 1.8693 1.2843
[9] 0.1922 0.0180 0.0022
# Eigen vectores
eigen_vectores2
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] 0.105 0.37467 -0.3963 0.1544 0.0025 0.784 0.0132 0.0644 0.221
[2,] -0.018 -0.14258 0.2799 -0.7460 0.4678 0.344 -0.0679 0.0231 0.045
[3,] -0.160 -0.87055 -0.3754 0.1640 0.0524 0.211 -0.0359 0.0124 0.015
[4,] 0.021 -0.00017 -0.0058 -0.0011 -0.0122 0.038 0.0196 0.0035 -0.089
[5,] 0.153 -0.01766 0.0718 0.0238 -0.1683 0.027 -0.9696 0.0078 0.033
[6,] 0.731 -0.06785 -0.0102 0.2215 0.4024 0.026 0.0338 -0.0594 -0.490
[7,] -0.097 0.12130 -0.1875 0.2387 0.6970 -0.315 -0.1380 -0.1420 0.501
[8,] 0.581 -0.23424 0.2351 -0.0380 -0.2662 -0.028 0.1811 -0.0234 0.667
[9,] 0.041 -0.01162 0.0125 -0.0057 -0.0288 0.023 0.0115 0.0056 -0.046
[10,] -0.061 -0.00579 0.0687 -0.0006 -0.0944 0.138 0.0064 -0.9807 -0.041
[11,] -0.232 -0.08224 0.7232 0.5336 0.1465 0.317 0.0163 0.0950 0.019
[,10] [,11]
[1,] -0.01477 -0.00234
[2,] -0.00090 -0.00044
[3,] -0.00410 -0.00021
[4,] 0.99471 -0.01421
[5,] 0.01613 -0.00020
[6,] -0.05631 -0.04085
[7,] 0.07058 0.06164
[8,] 0.04264 -0.00823
[9,] 0.00782 0.99712
[10,] -0.00513 -0.00079
[11,] 0.00048 -0.00040
# Mean-variance threshold: average of the eigenvalues of cov(gasturbine)
var_med <- sum(eigen_valores2) / ncol(gasturbine)
var_med
[1] 103
# Se pueden considerar tomar valores mayores a var_med=103
Análisis de Componentes Principales (ACP)
# Principal component analysis on the correlation matrix (standardised
# data); scannf = FALSE skips the interactive prompt and nf = 4 keeps
# four axes.
acp <- dudi.pca(gasturbine, scannf = FALSE, scale = TRUE, nf = 4)
# Summary of the components and cumulative variance
summary(acp)
Class: pca dudi
Call: dudi.pca(df = gasturbine, scale = TRUE, scannf = FALSE, nf = 4)
Total inertia: 11
Eigenvalues:
Ax1 Ax2 Ax3 Ax4 Ax5
5.9779 2.1554 1.0534 0.7240 0.5648
Projected inertia (%):
Ax1 Ax2 Ax3 Ax4 Ax5
54.345 19.595 9.577 6.581 5.134
Cumulative projected inertia (%):
Ax1 Ax1:2 Ax1:3 Ax1:4 Ax1:5
54.34 73.94 83.52 90.10 95.23
(Only 5 dimensions (out of 11) are shown)
# Eigenvectors (variable loadings on the retained axes)
acp$c1
CS1 CS2 CS3 CS4
AT 0.162 0.5444 -0.2074 -0.0597
AP -0.039 -0.4689 -0.0406 -0.8250
AH -0.124 -0.2001 0.8329 0.2410
AFDP 0.391 0.0157 0.0033 0.0257
GTEP 0.377 -0.1395 -0.1154 0.1692
TIT 0.395 -0.0061 0.0381 -0.0046
TAT -0.217 0.4525 0.1271 -0.2010
TEY 0.387 -0.1880 0.0280 0.0692
CDP 0.396 -0.1559 -0.0110 0.0644
CO -0.301 -0.2296 -0.3088 0.2579
NOX -0.248 -0.3270 -0.3665 0.3355
Gráficos de sedimentación (scree)
# Base-graphics scree plot of the eigenvalues stored in acp$eig
plot(
  acp$eig,
  type = "b",
  pch = 20,
  col = "royalblue4",
  xlab = "Número de componentes",
  ylab = "Varianza explicada"
)
title("Gráfico de sedimentación")
# Dotted horizontal reference line at eigenvalue 1
abline(h = 1, lty = 3, col = "red3")
# Bar-style scree plot of the same PCA object
screeplot(acp, main = "Valores Propios")
# factoextra scree plot with labels on each bar
fviz_eig(acp, addlabels = TRUE, hjust = -0.3)
Análisis de cargas (loadings)
# acp$co holds the coordinates of the variables on the retained axes
# PC1*PC2
fviz_pca_var(acp, col.var = "royalblue4", axes = c(1, 2)) +
  theme_minimal() +
  ggtitle("Contribución de las variables en componenetes 1 y 2")
# PC1*PC3
fviz_pca_var(acp, col.var = "firebrick3", axes = c(1, 3)) +
  theme_minimal() +
  ggtitle("Contribución de las variables en componenetes 1 y 3")
# PC2*PC3
fviz_pca_var(acp, col.var = "gold3", axes = c(2, 3)) +
  theme_minimal() +
  ggtitle("Contribución de las variables en componenetes 2 y 3")
# Element-wise square of the variable coordinates, one column per axis
contrib <- acp$co * acp$co
contrib
Comp1 Comp2 Comp3 Comp4
AT 0.1565 0.638813 0.045332 0.002582
AP 0.0092 0.473831 0.001733 0.492760
AH 0.0922 0.086339 0.730883 0.042032
AFDP 0.9155 0.000531 0.000012 0.000477
GTEP 0.8510 0.041944 0.014035 0.020729
TIT 0.9323 0.000079 0.001533 0.000015
TAT 0.2807 0.441261 0.017018 0.029244
TEY 0.8963 0.076176 0.000824 0.003463
CDP 0.9363 0.052355 0.000128 0.003006
CO 0.5415 0.113634 0.100473 0.048170
NOX 0.3665 0.230454 0.141474 0.081484
# corrplot() needs a matrix, so the data frame is converted first;
# is.corr = FALSE because the values are not correlations in [-1, 1].
contrib <- as.matrix(contrib)
corrplot(
  contrib,
  is.corr = FALSE,
  col = colorRampPalette(c("white", "firebrick3"))(200)
)
Análisis de puntuaciones (scores)
# Scores of the observations (first ten rows)
acp$li[1:10, ]
Axis1 Axis2 Axis3 Axis4
1 -4.9 -3.2 -0.96 1.489
2 -5.5 -2.7 -0.50 1.140
3 -2.8 -2.1 0.11 -0.100
4 -2.2 -2.1 0.11 -0.042
5 -1.9 -1.9 0.35 -0.256
6 -1.7 -1.8 0.41 -0.356
7 -2.0 -1.8 0.29 -0.363
8 -2.1 -2.1 -0.14 -0.202
9 -2.0 -2.2 -0.19 -0.251
10 -2.0 -2.2 -0.21 -0.354
# Correlaciones de las componentes principales
cor(acp$li)
Axis1 Axis2 Axis3
Axis1 1.00000000000000000 -0.00000000000000070 -0.00000000000000086
Axis2 -0.00000000000000070 1.00000000000000000 0.00000000000000057
Axis3 -0.00000000000000086 0.00000000000000057 1.00000000000000000
Axis4 0.00000000000000090 0.00000000000000328 -0.00000000000000049
Axis4
Axis1 0.00000000000000090
Axis2 0.00000000000000328
Axis3 -0.00000000000000049
Axis4 1.00000000000000000
# Biplot of observations and variables on PC1 x PC2
fviz_pca_biplot(
  acp,
  axes = c(1, 2),
  repel = FALSE,
  col.var = "royalblue4",
  col.ind = "black"
) +
  theme_minimal() +
  ggtitle("Puntuaciones (scores) en componenetes 1 y 2")
# Biplot of observations and variables on PC1 x PC3
fviz_pca_biplot(
  acp,
  axes = c(1, 3),
  repel = FALSE,
  col.var = "firebrick3",
  col.ind = "black"
) +
  theme_minimal() +
  ggtitle("Puntuaciones (scores) en componenetes 1 y 3")
# Biplot of observations and variables on PC2 x PC3
fviz_pca_biplot(
  acp,
  axes = c(2, 3),
  repel = FALSE,
  col.var = "gold3",
  col.ind = "black"
) +
  theme_minimal() +
  ggtitle("Puntuaciones (scores) en componenetes 2 y 3")
Aplicación en Python
# Cargar librerías
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
import seaborn as sns
Exploración inicial en Python
# Load the 2015 gas-turbine data set.
# NOTE(review): the path is an absolute, machine-specific Windows path;
# it must exist on the machine running this script.
gasturbine = pd.read_csv(r"C:\Users\MINEDUCYT\Documents\Seminario\gas+turbine+co+and+nox+emission+data+set\gt_2015.csv", sep=",")
pd.set_option('display.max_rows', None)  # Show every row
pd.set_option('display.max_columns', None)  # Show every column
print("Información sobre la base de datos: \n")
Información sobre la base de datos:
print(gasturbine.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7384 entries, 0 to 7383
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 AT 7384 non-null float64
1 AP 7384 non-null float64
2 AH 7384 non-null float64
3 AFDP 7384 non-null float64
4 GTEP 7384 non-null float64
5 TIT 7384 non-null float64
6 TAT 7384 non-null float64
7 TEY 7384 non-null float64
8 CDP 7384 non-null float64
9 CO 7384 non-null float64
10 NOX 7384 non-null float64
dtypes: float64(11)
memory usage: 634.7 KB
None
print("Visualizar datos: \n")
Visualizar datos:
print(gasturbine.head())
AT AP AH AFDP GTEP TIT TAT TEY CDP \
0 1.95320 1020.1 84.985 2.5304 20.116 1048.7 544.92 116.27 10.799
1 1.21910 1020.1 87.523 2.3937 18.584 1045.5 548.50 109.18 10.347
2 0.94915 1022.2 78.335 2.7789 22.264 1068.8 549.95 125.88 11.256
3 1.00750 1021.7 76.942 2.8170 23.358 1075.2 549.63 132.21 11.702
4 1.28580 1021.6 76.732 2.8377 23.483 1076.2 549.68 133.58 11.737
CO NOX
0 7.4491 113.250
1 6.4684 112.020
2 3.6335 88.147
3 3.1972 87.078
4 2.3833 82.515
print(gasturbine.tail())
AT AP AH AFDP GTEP TIT TAT TEY CDP \
7379 3.6268 1028.5 93.200 3.1661 19.087 1037.0 541.59 109.08 10.411
7380 4.1674 1028.6 94.036 3.1923 19.016 1037.6 542.28 108.79 10.344
7381 5.4820 1028.5 95.219 3.3128 18.857 1038.0 543.48 107.81 10.462
7382 5.8837 1028.7 94.200 3.9831 23.563 1076.9 550.11 131.41 11.771
7383 6.0392 1028.8 94.547 3.8752 22.524 1067.9 548.23 125.41 11.462
CO NOX
7379 10.9930 89.172
7380 11.1440 88.849
7381 11.4140 96.147
7382 3.3134 64.738
7383 11.9810 109.240
print("Medias y varianzas: \n")
Medias y varianzas:
print(gasturbine.mean())
AT 17.225259
AP 1014.509110
AH 68.647464
AFDP 3.598909
GTEP 26.130149
TIT 1078.974689
TAT 546.642484
TEY 133.993380
CDP 12.097025
CO 3.129986
NOX 59.890509
dtype: float64
print(gasturbine.var())
AT 65.541708
AP 47.546957
AH 183.361833
AFDP 0.372375
GTEP 20.014320
TIT 390.554390
TAT 30.129848
TEY 261.766784
CDP 1.291861
CO 4.995057
NOX 123.931746
dtype: float64
Correlaciones
# Correlation matrix of every column in gasturbine
cor_gt = gasturbine.corr()
print("Matriz de correlaciones:\n")
Matriz de correlaciones:
print(cor_gt)
AT AP AH AFDP GTEP TIT TAT \
AT 1.000000 -0.493098 -0.466288 0.468976 0.193578 0.330112 0.208277
AP -0.493098 1.000000 0.084381 -0.094144 -0.043730 -0.081605 -0.290147
AH -0.466288 0.084381 1.000000 -0.245456 -0.297708 -0.260683 0.026251
AFDP 0.468976 -0.094144 -0.245456 1.000000 0.843958 0.915128 -0.519807
GTEP 0.193578 -0.043730 -0.297708 0.843958 1.000000 0.892851 -0.620652
TIT 0.330112 -0.081605 -0.260683 0.915128 0.892851 1.000000 -0.396161
TAT 0.208277 -0.290147 0.026251 -0.519807 -0.620652 -0.396161 1.000000
TEY 0.109437 0.050326 -0.182732 0.884954 0.932337 0.951813 -0.633933
CDP 0.200909 0.029420 -0.221706 0.922991 0.938142 0.951590 -0.656613
CO -0.390647 0.200945 0.158999 -0.640789 -0.557177 -0.738092 0.025768
NOX -0.593580 0.214236 0.065351 -0.584452 -0.366655 -0.520081 0.054455
TEY CDP CO NOX
AT 0.109437 0.200909 -0.390647 -0.593580
AP 0.050326 0.029420 0.200945 0.214236
AH -0.182732 -0.221706 0.158999 0.065351
AFDP 0.884954 0.922991 -0.640789 -0.584452
GTEP 0.932337 0.938142 -0.557177 -0.366655
TIT 0.951813 0.951590 -0.738092 -0.520081
TAT -0.633933 -0.656613 0.025768 0.054455
TEY 1.000000 0.991207 -0.616791 -0.403278
CDP 0.991207 1.000000 -0.612653 -0.443093
CO -0.616791 -0.612653 1.000000 0.678394
NOX -0.403278 -0.443093 0.678394 1.000000
print("Mapa de calor:\n")
Mapa de calor:
# Clear the current figure before plotting
plt.clf()
plt.figure(figsize=(10, 8))
# Annotated heat map of the correlation matrix, colour scale centred on 0
sns.heatmap(cor_gt, annot=True, cmap="coolwarm", center=0, linewidths=0.5)
plt.show()
Autovalores y Autovectores
### Using the correlation matrix
# NOTE(review): np.linalg.eig does not guarantee any ordering of the
# eigenvalues; np.linalg.eigh is the usual choice for symmetric
# matrices -- confirm before relying on the order.
eigenvalores, eigenvectores = np.linalg.eig(cor_gt)
print("Autovalores (Matriz de correlaciones):\n")
Autovalores (Matriz de correlaciones):
print(eigenvalores)
[5.97794266e+00 2.15541717e+00 1.05344415e+00 7.23961606e-01
5.64769827e-01 2.70263701e-01 1.39416544e-01 8.05621247e-02
3.21171230e-02 1.44249402e-03 6.62604594e-04]
print("Autovectores (Matriz de correlaciones):\n")
Autovectores (Matriz de correlaciones):
print(eigenvectores)
[[-0.16180989 -0.544404 0.20744257 0.0597164 -0.40177671 -0.39883275
0.29444045 0.27898943 0.36264519 -0.09610488 0.07122729]
[ 0.03924278 0.46886299 0.04055504 0.8250117 -0.07816598 -0.24820253
0.05108821 0.15628742 0.03439303 -0.01575343 0.01254007]
[ 0.12420936 0.20014239 -0.83294863 -0.24095192 -0.1569623 -0.33168388
0.10467024 0.19469407 0.09497537 -0.00948361 0.00552722]
[-0.3913334 -0.01569281 -0.00334615 -0.02566472 -0.18680991 -0.35124973
0.20929655 -0.27628743 -0.75270069 -0.02037484 0.02354711]
[-0.37729364 0.13949764 0.11542329 -0.16921148 0.10841048 0.0484399
-0.27649242 0.80940884 -0.22461244 -0.0034743 0.00905849]
[-0.39491983 0.00605929 -0.03814273 0.00459056 0.25246254 -0.27215755
-0.18686203 -0.17940666 0.26100566 -0.0417482 -0.75349026]
[ 0.21671066 -0.45246202 -0.12710261 0.2009845 0.54128314 -0.41757241
-0.36222363 0.02309572 -0.15269027 0.08468051 0.24937147]
[-0.38721347 0.1879941 -0.02797079 -0.06916172 0.14868318 -0.0563032
-0.14366195 -0.2239324 0.26403162 -0.63415876 0.49114864]
[-0.39575355 0.1558526 0.01100148 -0.06443842 0.02818856 -0.08922904
-0.03464295 -0.15994885 0.27874622 0.76074877 0.3504817 ]
[ 0.30095799 0.2296089 0.30882984 -0.25794721 -0.43716531 -0.3799865
-0.58832795 -0.1140741 0.00999469 0.00315148 -0.00597155]
[ 0.24759088 0.32698363 0.36646488 -0.33548824 0.43639208 -0.37367483
0.4953724 0.08302722 0.04822736 -0.01337433 0.00828121]]
### Using the covariance matrix
cov_gt = gasturbine.cov()
# NOTE(review): np.linalg.eig does not guarantee any ordering of the
# eigenvalues -- confirm before relying on the order.
eigenvalores2, eigenvectores2 = np.linalg.eig(cov_gt)
# The label names the covariance matrix, which is what was decomposed
# here (the original text said "correlaciones").
print("Autovalores (Matriz de covarianzas):\n")
Autovalores (Matriz de correlaciones):
print(eigenvalores2)
[7.18259979e+02 1.90722347e+02 1.31354959e+02 4.68968296e+01
2.41698576e+01 1.47370615e+01 1.86927493e+00 1.28427642e+00
1.92161079e-01 1.79539052e-02 2.17925728e-03]
# The eigenvectors printed next come from the covariance matrix, so the
# label names it (the original text said "correlaciones").
print("Autovectores (Matriz de covarianzas):\n")
Autovectores (Matriz de correlaciones):
print(eigenvectores2)
[[ 1.05251664e-01 3.74670428e-01 -3.96291931e-01 1.54423292e-01
-2.45879610e-03 -7.83608689e-01 -1.32245809e-02 -6.43738168e-02
2.21492909e-01 1.47716856e-02 -2.34491049e-03]
[-1.81330690e-02 -1.42582490e-01 2.79905774e-01 -7.45954471e-01
-4.67844235e-01 -3.44259073e-01 6.78863173e-02 -2.31474987e-02
4.48131462e-02 8.96308835e-04 -4.38923292e-04]
[-1.59724640e-01 -8.70553240e-01 -3.75401815e-01 1.64047818e-01
-5.24189286e-02 -2.10610019e-01 3.58641460e-02 -1.23752864e-02
1.50442784e-02 4.09921306e-03 -2.07842652e-04]
[ 2.12156234e-02 -1.72755581e-04 -5.78139359e-03 -1.07280272e-03
1.21984007e-02 -3.77694240e-02 -1.95940115e-02 -3.46304865e-03
-8.88606162e-02 -9.94709023e-01 -1.42054205e-02]
[ 1.52929838e-01 -1.76580196e-02 7.17883934e-02 2.37916897e-02
1.68337346e-01 -2.66916195e-02 9.69591902e-01 -7.81782506e-03
3.31846362e-02 -1.61339557e-02 -1.95146524e-04]
[ 7.30595973e-01 -6.78500192e-02 -1.01718436e-02 2.21468622e-01
-4.02401299e-01 -2.59634532e-02 -3.38147415e-02 5.94195803e-02
-4.90265042e-01 5.63052480e-02 -4.08529118e-02]
[-9.67075359e-02 1.21299356e-01 -1.87514050e-01 2.38676172e-01
-6.97040350e-01 3.15076568e-01 1.38027236e-01 1.41957140e-01
5.00682379e-01 -7.05838038e-02 6.16402985e-02]
[ 5.80573738e-01 -2.34240128e-01 2.35121826e-01 -3.80099782e-02
2.66238084e-01 2.75457840e-02 -1.81118005e-01 2.34026159e-02
6.66678521e-01 -4.26359572e-02 -8.22735427e-03]
[ 4.10992004e-02 -1.16176804e-02 1.25040038e-02 -5.72276092e-03
2.88001514e-02 -2.31321939e-02 -1.15071656e-02 -5.62224519e-03
-4.62780171e-02 -7.81587240e-03 9.97123599e-01]
[-6.05822739e-02 -5.79234535e-03 6.87156586e-02 -5.95958703e-04
9.44104673e-02 -1.38129745e-01 -6.40758271e-03 9.80713882e-01
-4.13898743e-02 5.13344404e-03 -7.91837593e-04]
[-2.31723189e-01 -8.22399319e-02 7.23233360e-01 5.33604688e-01
-1.46506522e-01 -3.17216838e-01 -1.63028381e-02 -9.50390390e-02
1.87411658e-02 -4.75371721e-04 -3.99424067e-04]]
print("Cota de varianza media autovalores de matriz de covarianzas \n")
Cota de varianza media autovalores de matriz de covarianzas
# Mean-variance threshold: average of the covariance-matrix eigenvalues
var_med = (sum(eigenvalores2)/gasturbine.shape[1])
print("Se pueden considerar tomar valores mayores a var_med =", round(var_med,2))
Se pueden considerar tomar valores mayores a var_med = 102.68
Análisis de Componentes Principales (ACP)
### PCA model
# Standardise the columns, then fit a PCA that keeps all components
pca_pipe = make_pipeline(StandardScaler(), PCA())
pca_pipe.fit(gasturbine)
Pipeline(steps=[('standardscaler', StandardScaler()), ('pca', PCA())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('standardscaler', StandardScaler()), ('pca', PCA())])
StandardScaler()
PCA()
# Extract the fitted PCA step from the pipeline
acp_gt = pca_pipe.named_steps['pca']
print(" Resumen del modelo: \n")
Resumen del modelo:
print("Varianzas explicadas:\n")
Varianzas explicadas:
print(acp_gt.explained_variance_)
[5.97875235e+00 2.15570912e+00 1.05358683e+00 7.24059664e-01
5.64846323e-01 2.70300307e-01 1.39435427e-01 8.05730366e-02
3.21214731e-02 1.44268940e-03 6.62694341e-04]
print("\nPorcentaje de varianza explicada:\n")
Porcentaje de varianza explicada:
print(acp_gt.explained_variance_ratio_)
[5.43449332e-01 1.95947016e-01 9.57676498e-02 6.58146915e-02
5.13427115e-02 2.45694273e-02 1.26742313e-02 7.32382952e-03
2.91973845e-03 1.31135820e-04 6.02367812e-05]
print("\nVarianza acumulada:\n")
Varianza acumulada:
print(np.cumsum(acp_gt.explained_variance_ratio_))
[0.54344933 0.73939635 0.835164 0.90097869 0.9523214 0.97689083
0.98956506 0.99688889 0.99980863 0.99993976 1. ]
print("\nComponentes principales:")
Componentes principales:
print(acp_gt.components_)
[[ 0.16180989 -0.03924278 -0.12420936 0.3913334 0.37729364 0.39491983
-0.21671066 0.38721347 0.39575355 -0.30095799 -0.24759088]
[ 0.544404 -0.46886299 -0.20014239 0.01569281 -0.13949764 -0.00605929
0.45246202 -0.1879941 -0.1558526 -0.2296089 -0.32698363]
[-0.20744257 -0.04055504 0.83294863 0.00334615 -0.11542329 0.03814273
0.12710261 0.02797079 -0.01100148 -0.30882984 -0.36646488]
[ 0.0597164 0.8250117 -0.24095192 -0.02566472 -0.16921148 0.00459056
0.2009845 -0.06916172 -0.06443842 -0.25794721 -0.33548824]
[-0.40177671 -0.07816598 -0.1569623 -0.18680991 0.10841048 0.25246254
0.54128314 0.14868318 0.02818856 -0.43716531 0.43639208]
[ 0.39883275 0.24820253 0.33168388 0.35124973 -0.0484399 0.27215755
0.41757241 0.0563032 0.08922904 0.3799865 0.37367483]
[-0.29444045 -0.05108821 -0.10467024 -0.20929655 0.27649242 0.18686203
0.36222363 0.14366195 0.03464295 0.58832795 -0.4953724 ]
[ 0.27898943 0.15628742 0.19469407 -0.27628743 0.80940884 -0.17940666
0.02309572 -0.2239324 -0.15994885 -0.1140741 0.08302722]
[-0.36264519 -0.03439303 -0.09497537 0.75270069 0.22461244 -0.26100566
0.15269027 -0.26403162 -0.27874622 -0.00999469 -0.04822736]
[-0.09610488 -0.01575343 -0.00948361 -0.02037484 -0.0034743 -0.0417482
0.08468051 -0.63415876 0.76074877 0.00315148 -0.01337433]
[-0.07122729 -0.01254007 -0.00552722 -0.02354711 -0.00905849 0.75349026
-0.24937147 -0.49114864 -0.3504817 0.00597155 -0.00828121]]
Gráfico de sedimentación (scree)
# Clear the current figure before plotting
plt.clf()
# Cumulative share of variance and the matching 1-based component numbers
prop_varianza_acum = np.cumsum(acp_gt.explained_variance_ratio_)
componentes = range(1, len(prop_varianza_acum) + 1)

plt.figure(figsize=(8, 5))
plt.plot(componentes, prop_varianza_acum, 'bo-', linewidth=2, markersize=8)
plt.xticks(componentes)
([<matplotlib.axis.XTick object at 0x000001D597B87ED0>, <matplotlib.axis.XTick object at 0x000001D59885A210>, <matplotlib.axis.XTick object at 0x000001D59885A990>, <matplotlib.axis.XTick object at 0x000001D59885B110>, <matplotlib.axis.XTick object at 0x000001D59885B890>, <matplotlib.axis.XTick object at 0x000001D59888C050>, <matplotlib.axis.XTick object at 0x000001D59888C7D0>, <matplotlib.axis.XTick object at 0x000001D59888CF50>, <matplotlib.axis.XTick object at 0x000001D59888D6D0>, <matplotlib.axis.XTick object at 0x000001D59888DE50>, <matplotlib.axis.XTick object at 0x000001D59888E5D0>], [Text(1, 0, '1'), Text(2, 0, '2'), Text(3, 0, '3'), Text(4, 0, '4'), Text(5, 0, '5'), Text(6, 0, '6'), Text(7, 0, '7'), Text(8, 0, '8'), Text(9, 0, '9'), Text(10, 0, '10'), Text(11, 0, '11')])
# Title, axis labels and y-range of the cumulative-variance plot
plt.title('Porcentaje de varianza explicada acumulada')
plt.xlabel('Componente principal')
plt.ylabel('Por. varianza acumulada')
plt.ylim(0, 1.1)
(0.0, 1.1)
plt.grid(True, linestyle='--', alpha=0.7)
# Add a percentage label slightly above each point
for i, v in enumerate(prop_varianza_acum):
    plt.text(componentes[i], v + 0.03, f"{v:.0%}",
             ha='center', va='bottom')
plt.tight_layout()
plt.show()
Análisis de cargas (loadings)
# Clear the current figure before plotting
plt.clf()
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 2))
# Transposed so that variables run down the rows and components across
ncomponentes = acp_gt.components_
plt.imshow(ncomponentes.T, cmap='viridis', aspect='auto')
plt.yticks(range(len(gasturbine.columns)), gasturbine.columns)
([<matplotlib.axis.YTick object at 0x000001D5988F8410>, <matplotlib.axis.YTick object at 0x000001D5988FA0D0>, <matplotlib.axis.YTick object at 0x000001D5988FA850>, <matplotlib.axis.YTick object at 0x000001D5988FAFD0>, <matplotlib.axis.YTick object at 0x000001D5988FB750>, <matplotlib.axis.YTick object at 0x000001D5988FBED0>, <matplotlib.axis.YTick object at 0x000001D598030690>, <matplotlib.axis.YTick object at 0x000001D598030E10>, <matplotlib.axis.YTick object at 0x000001D598031590>, <matplotlib.axis.YTick object at 0x000001D598031D10>, <matplotlib.axis.YTick object at 0x000001D598032490>], [Text(0, 0, 'AT'), Text(0, 1, 'AP'), Text(0, 2, 'AH'), Text(0, 3, 'AFDP'), Text(0, 4, 'GTEP'), Text(0, 5, 'TIT'), Text(0, 6, 'TAT'), Text(0, 7, 'TEY'), Text(0, 8, 'CDP'), Text(0, 9, 'CO'), Text(0, 10, 'NOX')])
# Label the x axis with 1-based component numbers
plt.xticks(range(len(gasturbine.columns)), np.arange(acp_gt.n_components_) + 1)
([<matplotlib.axis.XTick object at 0x000001D5988CBC50>, <matplotlib.axis.XTick object at 0x000001D598032D50>, <matplotlib.axis.XTick object at 0x000001D5980334D0>, <matplotlib.axis.XTick object at 0x000001D598033C50>, <matplotlib.axis.XTick object at 0x000001D598050410>, <matplotlib.axis.XTick object at 0x000001D598050B90>, <matplotlib.axis.XTick object at 0x000001D598051310>, <matplotlib.axis.XTick object at 0x000001D598051A90>, <matplotlib.axis.XTick object at 0x000001D598052210>, <matplotlib.axis.XTick object at 0x000001D598052990>, <matplotlib.axis.XTick object at 0x000001D598053110>], [Text(0, 0, '1'), Text(1, 0, '2'), Text(2, 0, '3'), Text(3, 0, '4'), Text(4, 0, '5'), Text(5, 0, '6'), Text(6, 0, '7'), Text(7, 0, '8'), Text(8, 0, '9'), Text(9, 0, '10'), Text(10, 0, '11')])
plt.grid(False)
plt.colorbar()
plt.show()
Análisis de puntuaciones (scores)
# Clear the current figure before plotting
plt.clf()
# Scores and loadings for components 1 and 2
scores = acp_gt.transform(StandardScaler().fit_transform(gasturbine))[:, :2]
loadings = acp_gt.components_.T[:, :2]
# Rescale the loadings so the arrows are visible next to the scores;
# the factor 7 is an adjustable display scale.
scale_factor = 1.0 / (loadings.max() - loadings.min())
loadings_scaled = loadings * scale_factor * 7
# Create the figure
plt.figure(figsize=(12, 8))
# Plot the scores (observations)
plt.scatter(scores[:, 0], scores[:, 1], alpha=0.5, label='Observaciones')
# Plot the loadings (variables) as arrows from the origin
for i, (x, y) in enumerate(loadings_scaled):
    plt.arrow(0, 0, x, y, color='red', head_width=0.05, alpha=0.8)
    plt.text(x * 1.15, y * 1.15, gasturbine.columns[i], color='darkred',
             ha='center', va='center', fontsize=12)
# Unit circle around the origin.
# NOTE(review): the loadings were rescaled above, so this circle is only
# a visual reference -- confirm that is intended.
circle = plt.Circle((0, 0), 1, color='gray', fill=False, linestyle='--', alpha=0.5)
plt.gca().add_artist(circle)
# Axes through the origin, labels, title, grid and legend
plt.axhline(0, color='gray', linestyle='--', alpha=0.5)
plt.axvline(0, color='gray', linestyle='--', alpha=0.5)
plt.xlabel(f'Componente 1 ({acp_gt.explained_variance_ratio_[0]*100:.1f}%)', fontsize=12)
plt.ylabel(f'Componente 2 ({acp_gt.explained_variance_ratio_[1]*100:.1f}%)', fontsize=12)
# The stray trailing double quote in the original title was removed
plt.title('Puntuaciones (scores) de las observaciones', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.3)
plt.legend()
# Symmetric limits large enough for both scores and scaled loadings
max_val = max(np.abs(scores).max(), np.abs(loadings_scaled).max()) * 1.2
plt.xlim(-max_val, max_val)
(-10.651644601708835, 10.651644601708835)
plt.ylim(-max_val, max_val)
(-10.651644601708835, 10.651644601708835)
plt.tight_layout()
plt.show()
# Clear the current figure before plotting
plt.clf()
# Scores and loadings for components 1 and 3
scores = acp_gt.transform(StandardScaler().fit_transform(gasturbine))[:, [0, 2]]
loadings = acp_gt.components_.T[:, [0, 2]]
# Rescale the loadings so the arrows are visible next to the scores;
# the factor 7 is an adjustable display scale.
scale_factor = 1.0 / (loadings.max() - loadings.min())
loadings_scaled = loadings * scale_factor * 7
# Create the figure
plt.figure(figsize=(12, 8))
# Plot the scores (observations)
plt.scatter(scores[:, 0], scores[:, 1], alpha=0.5, label='Observaciones')
# Plot the loadings (variables) as arrows from the origin
for i, (x, y) in enumerate(loadings_scaled):
    plt.arrow(0, 0, x, y, color='red', head_width=0.05, alpha=0.8)
    plt.text(x * 1.15, y * 1.15, gasturbine.columns[i], color='darkred',
             ha='center', va='center', fontsize=12)
# Unit circle around the origin.
# NOTE(review): the loadings were rescaled above, so this circle is only
# a visual reference -- confirm that is intended.
circle = plt.Circle((0, 0), 1, color='gray', fill=False, linestyle='--', alpha=0.5)
plt.gca().add_artist(circle)
# Axes through the origin, labels, title, grid and legend
plt.axhline(0, color='gray', linestyle='--', alpha=0.5)
plt.axvline(0, color='gray', linestyle='--', alpha=0.5)
plt.xlabel(f'Componente 1 ({acp_gt.explained_variance_ratio_[0]*100:.1f}%)', fontsize=12)
# Component 3 is index 2 of explained_variance_ratio_ (the original used
# index 1, which is component 2's share).
plt.ylabel(f'Componente 3 ({acp_gt.explained_variance_ratio_[2]*100:.1f}%)', fontsize=12)
# The stray trailing double quote in the original title was removed
plt.title('Puntuaciones (scores) de las observaciones', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.3)
plt.legend()
# Symmetric limits large enough for both scores and scaled loadings
max_val = max(np.abs(scores).max(), np.abs(loadings_scaled).max()) * 1.2
plt.xlim(-max_val, max_val)
(-10.651644601708835, 10.651644601708835)
plt.ylim(-max_val, max_val)
(-10.651644601708835, 10.651644601708835)
plt.tight_layout()
plt.show()
# Clear the current figure before plotting
plt.clf()
# Scores and loadings for components 2 and 3
scores = acp_gt.transform(StandardScaler().fit_transform(gasturbine))[:, [1, 2]]
loadings = acp_gt.components_.T[:, [1, 2]]
# Rescale the loadings so the arrows are visible next to the scores;
# the factor 7 is an adjustable display scale.
scale_factor = 1.0 / (loadings.max() - loadings.min())
loadings_scaled = loadings * scale_factor * 7
# Create the figure
plt.figure(figsize=(12, 8))
# Plot the scores (observations)
plt.scatter(scores[:, 0], scores[:, 1], alpha=0.5, label='Observaciones')
# Plot the loadings (variables) as arrows from the origin
for i, (x, y) in enumerate(loadings_scaled):
    plt.arrow(0, 0, x, y, color='red', head_width=0.05, alpha=0.8)
    plt.text(x * 1.15, y * 1.15, gasturbine.columns[i], color='darkred',
             ha='center', va='center', fontsize=12)
# Unit circle around the origin.
# NOTE(review): the loadings were rescaled above, so this circle is only
# a visual reference -- confirm that is intended.
circle = plt.Circle((0, 0), 1, color='gray', fill=False, linestyle='--', alpha=0.5)
plt.gca().add_artist(circle)
# Axes through the origin, labels, title, grid and legend
plt.axhline(0, color='gray', linestyle='--', alpha=0.5)
plt.axvline(0, color='gray', linestyle='--', alpha=0.5)
# Components 2 and 3 are indices 1 and 2 of explained_variance_ratio_
# (the original used indices 0 and 1, i.e. the shares of components 1
# and 2).
plt.xlabel(f'Componente 2 ({acp_gt.explained_variance_ratio_[1]*100:.1f}%)', fontsize=12)
plt.ylabel(f'Componente 3 ({acp_gt.explained_variance_ratio_[2]*100:.1f}%)', fontsize=12)
# The stray trailing double quote in the original title was removed
plt.title('Puntuaciones (scores) de las observaciones', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.3)
plt.legend()
# Symmetric limits large enough for both scores and scaled loadings
max_val = max(np.abs(scores).max(), np.abs(loadings_scaled).max()) * 1.2
plt.xlim(-max_val, max_val)
(-8.949839286896347, 8.949839286896347)
plt.ylim(-max_val, max_val)
(-8.949839286896347, 8.949839286896347)
plt.tight_layout()
plt.show()