# Ejemplo de PCA con R
# Analisis de componentes principales
# Importando datos sin considerar el ID
USArrests <- read.csv("USArrests.csv",head=T,sep=";")
head(USArrests)
## Ciudad Asesinatos Asaltos Poblac_urbanana Violaciones
## 1 Alabama 13.2 236 58 21.2
## 2 Alaska 10.0 263 48 44.5
## 3 Arizona 8.1 294 80 31.0
## 4 Arkansas 8.8 190 50 19.5
## 5 California 9.0 276 91 40.6
## 6 Colorado 7.9 204 78 38.7
str(USArrests)
## 'data.frame': 50 obs. of 5 variables:
## $ Ciudad : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Asesinatos : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Asaltos : int 236 263 294 190 276 204 110 238 335 211 ...
## $ Poblac_urbanana: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Violaciones : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
USArrests <- USArrests[,-1] #eliminamos Ciudad, no es variable
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Asesinatos : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Asaltos : int 236 263 294 190 276 204 110 238 335 211 ...
## $ Poblac_urbanana: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Violaciones : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
#IMPORTA datos con primera variable tipo ID
USArrests <- read.csv("USArrests.csv",head=T,sep=";", row.names= 1)
head(USArrests)
## Asesinatos Asaltos Poblac_urbanana Violaciones
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Asesinatos : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Asaltos : int 236 263 294 190 276 204 110 238 335 211 ...
## $ Poblac_urbanana: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Violaciones : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
par(mfrow=c(1,1)) # manejando ventana de un graficos
#analisis descriptivo
summary(USArrests)
## Asesinatos Asaltos Poblac_urbanana Violaciones
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
#El promedio de as asaltos en muy alto en comparación con el promedio de violaciones y
#asesinatos. La variable población está en base a 100, 000 que no es posible realizar
#comparación con los demás promedios.
# Son diferentes unidades de medida, entonces a todas la ponemos en una misma unidad de medida
###################
# analisis descriptivo
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(lattice)
## Warning: package 'lattice' was built under R version 4.0.5
library(survival)
## Warning: package 'survival' was built under R version 4.0.5
library(Formula)
## Warning: package 'Formula' was built under R version 4.0.3
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.5
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(funModeling)
## Warning: package 'funModeling' was built under R version 4.0.5
## funModeling v.1.9.4 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
profiling_num(USArrests) # analisis descriptivo/ muestra mas estadisticos para analizar
## variable mean std_dev variation_coef p_01 p_05 p_25 p_50
## 1 Asesinatos 7.788 4.355510 0.5592591 1.437 2.145 4.075 7.25
## 2 Asaltos 170.760 83.337661 0.4880397 45.490 50.250 109.000 159.00
## 3 Poblac_urbanana 65.540 14.474763 0.2208539 35.430 44.000 54.500 66.00
## 4 Violaciones 21.232 9.366385 0.4411447 7.545 8.750 15.075 20.10
## p_75 p_95 p_99 skewness kurtosis iqr range_98
## 1 11.250 15.400 16.763 0.3820378 2.135329 7.175 [1.437, 16.763]
## 2 249.000 297.300 336.020 0.2273179 1.930980 140.000 [45.49, 336.02]
## 3 77.750 86.550 90.020 -0.2191719 2.215790 23.250 [35.43, 90.02]
## 4 26.175 39.745 45.265 0.7769613 3.201898 11.100 [7.545, 45.265]
## range_80
## 1 [2.56, 13.32]
## 2 [56.9, 279.6]
## 3 [45, 83.2]
## 4 [10.67, 32.4]
#PROFILING -> COEF DE VARIACION, la data tiene mas homogeneidad ... mejor 0.22
plot_num(USArrests) # histograma
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

require(psych)
## Loading required package: psych
## Warning: package 'psych' was built under R version 4.0.5
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
multi.hist(x = USArrests,dcol = c("blue", "red"),
dlty = c("dotted", "solid"), main = "")

boxplot(USArrests,col=10:16, main="boxplot")

#El promedio de asaltos es superior a las demás variables. Note que la unidad de media de
#las diferentes variables no es la misma, por lo que es necesario estandarizarlas.
# obteniendo solo la media
apply(X = USArrests, MARGIN = 2, FUN = mean)
## Asesinatos Asaltos Poblac_urbanana Violaciones
## 7.788 170.760 65.540 21.232
# obteniendo solo la varianza
apply(X = USArrests, MARGIN = 2, FUN = var)
## Asesinatos Asaltos Poblac_urbanana Violaciones
## 18.97047 6945.16571 209.51878 87.72916
# La varianza es muy distinta entre las variables, en el caso de Assault, la varianza es varios
# órdenes de magnitud superior al resto.
#######################################
# Test de normalidad multivariante.
# Test de mardia de normalidad
library(MVN)
## Warning: package 'MVN' was built under R version 4.0.5
normalidad <- mvn(USArrests, mvnTest = "mardia")
normalidad
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 38.6236551434612 0.0074233268680534 NO
## 2 Mardia Kurtosis 0.962400363137224 0.335848531336779 YES
## 3 MVN <NA> <NA> NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Anderson-Darling Asesinatos 0.6027 0.1114 YES
## 2 Anderson-Darling Asaltos 0.7263 0.0545 YES
## 3 Anderson-Darling Poblac_urbanana 0.3186 0.5255 YES
## 4 Anderson-Darling Violaciones 0.6755 0.0731 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th 75th
## Asesinatos 50 7.788 4.355510 7.25 0.8 17.4 4.075 11.250
## Asaltos 50 170.760 83.337661 159.00 45.0 337.0 109.000 249.000
## Poblac_urbanana 50 65.540 14.474763 66.00 32.0 91.0 54.500 77.750
## Violaciones 50 21.232 9.366385 20.10 7.3 46.0 15.075 26.175
## Skew Kurtosis
## Asesinatos 0.3706342 -0.94923036
## Asaltos 0.2205325 -1.14548686
## Poblac_urbanana -0.2126297 -0.87195504
## Violaciones 0.7537694 0.07510264
# El test de Mardia indica que no existe normalidad multivariante
# Le hago caso a la multivariante...
# Asi tambien, el test de normalidad de Shapiro Wilk
#asaltos y violaciones no siguen una distribución normal.
#si queremos que los datos sean de distribucion normal, transformamos a las variables
# estudio de la correlacion
cor <- cor(USArrests)
cor
## Asesinatos Asaltos Poblac_urbanana Violaciones
## Asesinatos 1.00000000 0.8018733 0.06957262 0.5635788
## Asaltos 0.80187331 1.0000000 0.25887170 0.6652412
## Poblac_urbanana 0.06957262 0.2588717 1.00000000 0.4113412
## Violaciones 0.56357883 0.6652412 0.41134124 1.0000000
# pruebas de significancia de las correlaciones
corr.test(USArrests)
## Call:corr.test(x = USArrests)
## Correlation matrix
## Asesinatos Asaltos Poblac_urbanana Violaciones
## Asesinatos 1.00 0.80 0.07 0.56
## Asaltos 0.80 1.00 0.26 0.67
## Poblac_urbanana 0.07 0.26 1.00 0.41
## Violaciones 0.56 0.67 0.41 1.00
## Sample Size
## [1] 50
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## Asesinatos Asaltos Poblac_urbanana Violaciones
## Asesinatos 0.00 0.00 0.63 0.00
## Asaltos 0.00 0.00 0.14 0.00
## Poblac_urbanana 0.63 0.07 0.00 0.01
## Violaciones 0.00 0.00 0.00 0.00
##
## To see confidence intervals of the correlations, print with the short=FALSE option
pairs(USArrests) # grafico de dispersion bi variable

# X11() crea una nueva ventana
cor.plot(cor(USArrests)) # grafico de calor

#cuanto más azul más correlación positiva.
plot(USArrests, pch='*',col=c("red", "green", "blue","orange"))

library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.0.5
ggcorrplot(cor(USArrests),hc.order = T,type="lower")

#Muestra variables correlacionadas, si hay correl
# Al existir correlaciones, indica que es recomendable continuar, obtengamos la
# determinante
det(cor)
## [1] 0.1518054
# TEORIA: correlacion debe ser pequeña pero no cero
#cercano a 0
#######################################
# PRUEBA DE BARTLETT
# Prueba de Esfericidad de Bartlett
#library(rela)
library(psych)
cortest.bartlett(cor,n=dim(USArrests))
## $chisq
## [1] 88.288147 1.570963
##
## $p.value
## [1] 6.868423e-17 9.546421e-01
##
## $df
## [1] 6
# 6.868..-17 es signif. se esta contradeciendo
#El valor es significativo p.value (0.0000) < al 5% (0.05)
#######################################
# Prueba Kaiser Meyer Olkin y MSA
KMO(USArrests)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = USArrests)
## Overall MSA = 0.65
## MSA for each item =
## Asesinatos Asaltos Poblac_urbanana Violaciones
## 0.62 0.64 0.50 0.78
# MSA = 0.65, (REGLA GENERAL: el valor mínimo aceptable es 0.5).
#Overall MSA (0.65) > 0.5, la clasificación es medianamente buena. La variable peor
#explicada es población urbana con 0.50 (que está al límite), podría interpretarse también
#que puede ser una variable a salir del estudio.
#######################################
# escalando variables - (x-promedio/desv estandar)
USArrestscale <- scale(USArrests)
head(USArrestscale)
## Asesinatos Asaltos Poblac_urbanana Violaciones
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
#analisis descriptivo
summary(USArrestscale)
## Asesinatos Asaltos Poblac_urbanana Violaciones
## Min. :-1.6044 Min. :-1.5090 Min. :-2.31714 Min. :-1.4874
## 1st Qu.:-0.8525 1st Qu.:-0.7411 1st Qu.:-0.76271 1st Qu.:-0.6574
## Median :-0.1235 Median :-0.1411 Median : 0.03178 Median :-0.1209
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.7949 3rd Qu.: 0.9388 3rd Qu.: 0.84354 3rd Qu.: 0.5277
## Max. : 2.2069 Max. : 1.9948 Max. : 1.75892 Max. : 2.6444
library(dplyr)
profiling_num(USArrestscale) #analisis descriptivo
## variable mean std_dev variation_coef p_01 p_05
## 1 var.Asesinatos -7.663087e-17 1 -1.304957e+16 -1.458153 -1.295600
## 2 var.Asaltos 1.112408e-16 1 8.989510e+15 -1.503162 -1.446045
## 3 var.Poblac_urbanana -4.332808e-16 1 -2.307972e+15 -2.080172 -1.488107
## 4 var.Violaciones 8.942391e-17 1 1.118269e+16 -1.461290 -1.332638
## p_25 p_50 p_75 p_95 p_99 skewness kurtosis
## 1 -0.8524835 -0.12352171 0.7948553 1.747671 2.060608 0.3820378 2.135329
## 2 -0.7410815 -0.14111267 0.9388312 1.518401 1.983017 0.2273179 1.930980
## 3 -0.7627068 0.03177945 0.8435371 1.451492 1.691219 -0.2191719 2.215790
## 4 -0.6573508 -0.12085773 0.5277383 1.976536 2.565878 0.7769613 3.201898
## iqr range_98
## 1 1.647339 [-1.45815308513101, 2.06060839852791]
## 2 1.679913 [-1.50316194067986, 1.98301702176701]
## 3 1.606244 [-2.08017217043142, 1.69121935344275]
## 4 1.185089 [-1.461289567454, 2.5658779991687]
## range_80
## 1 [-1.20031874178317, 1.27011539394501]
## 2 [-1.36624905057722, 1.30601217868281]
## 3 [-1.41902146730858, 1.22005448455061]
## 4 [-1.12764962456705, 1.19234908229169]
require(psych)
multi.hist(x=USArrestscale, dcol = c("blue", "red"),
dlty = c("dotted", "solid"), main = "")

boxplot(USArrestscale, col=10:16, main="boxplot")

#test de mardia de normalidad
library(MVN)
normalidad <- mvn(USArrestscale, mvnTest = "mardia")
normalidad
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 38.6236551434611 0.0074233268680534 NO
## 2 Mardia Kurtosis 0.962400363137226 0.335848531336778 YES
## 3 MVN <NA> <NA> NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Anderson-Darling Asesinatos 0.6027 0.1114 YES
## 2 Anderson-Darling Asaltos 0.7263 0.0545 YES
## 3 Anderson-Darling Poblac_urbanana 0.3186 0.5255 YES
## 4 Anderson-Darling Violaciones 0.6755 0.0731 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max
## Asesinatos 50 -7.663087e-17 1 -0.12352171 -1.604405 2.206860
## Asaltos 50 1.112408e-16 1 -0.14111267 -1.509042 1.994776
## Poblac_urbanana 50 -4.332808e-16 1 0.03177945 -2.317136 1.758923
## Violaciones 50 8.942391e-17 1 -0.12085773 -1.487447 2.644350
## 25th 75th Skew Kurtosis
## Asesinatos -0.8524835 0.7948553 0.3706342 -0.94923036
## Asaltos -0.7410815 0.9388312 0.2205325 -1.14548686
## Poblac_urbanana -0.7627068 0.8435371 -0.2126297 -0.87195504
## Violaciones -0.6573508 0.5277383 0.7537694 0.07510264
## EDGAR CARPIO NORMALIDAD , otros metodos de normalidad
#estudio de la correlacion
cor <- cor(USArrestscale)
cor
## Asesinatos Asaltos Poblac_urbanana Violaciones
## Asesinatos 1.00000000 0.8018733 0.06957262 0.5635788
## Asaltos 0.80187331 1.0000000 0.25887170 0.6652412
## Poblac_urbanana 0.06957262 0.2588717 1.00000000 0.4113412
## Violaciones 0.56357883 0.6652412 0.41134124 1.0000000
x11()
cor.plot(cor(USArrestscale)) #grafico de calor

library(ggcorrplot)
ggcorrplot(cor(USArrestscale),hc.order = T, type = "lower")

#Al existir correlaciones, indica que es recomendable continuar
det(cor)
## [1] 0.1518054
#Prueba de esfericidad de Bartlett
library(psych)
cortest.bartlett(cor,n=dim(USArrestscale))
## $chisq
## [1] 88.288147 1.570963
##
## $p.value
## [1] 6.868423e-17 9.546421e-01
##
## $df
## [1] 6
# Indicador Kaiser Meyer Olkin (KMO y MSA)
KMO(USArrestscale)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = USArrestscale)
## Overall MSA = 0.65
## MSA for each item =
## Asesinatos Asaltos Poblac_urbanana Violaciones
## 0.62 0.64 0.50 0.78
# CONCLUSION: data escalado no mejora mucho los datos
#######################################
# ESTANDARIZACION
# -- Estandarizar las variables no cambia todo el análisis descriptivo
#IMPORTANTE: Si no se estandarizan las variables para que tengan media cero y desviación estándar 1
#antes de realizar el estudio PCA, la variable Assault dominará la mayoría de las
#componentes principales.
#### CALCULANDO LAS COMPONENTES PRINCIPALES ######
##### con valores NO escalados ######
library(psych)
facto=principal(r=USArrests,nfactors=2) #numero de grupos = 2
facto$values # cuantos pasaron de 1?
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
# Un solo valor es mayor que 1
# Grafico de sedimentacion:
par(mfrow=c(1,1))
facto$communality # Comunalidades
## Asesinatos Asaltos Poblac_urbanana Violaciones
## 0.8853816 0.8785149 0.9459401 0.7601701
facto # Cargas Factoriales, Componentes
## Principal Components Analysis
## Call: principal(r = USArrests, nfactors = 2)
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## Asesinatos 0.94 -0.06 0.89 0.115 1.0
## Asaltos 0.92 0.18 0.88 0.121 1.1
## Poblac_urbanana 0.07 0.97 0.95 0.054 1.0
## Violaciones 0.73 0.48 0.76 0.240 1.7
##
## RC1 RC2
## SS loadings 2.26 1.21
## Proportion Var 0.57 0.30
## Cumulative Var 0.57 0.87
## Proportion Explained 0.65 0.35
## Cumulative Proportion 0.65 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.08
## with the empirical chi square 3.44 with prob < NA
##
## Fit based upon off diagonal values = 0.98
# Graficamente
fa.diagram(facto) # analisis exploratorio

##### con valores escalados ######
library(psych)
#usamos rotacion pero no lo usaremos, pero lo escalamos
facto=principal(r=USArrests,nfactors=2,rotate="none", scale=TRUE)
#desviación estándar sea uno, hay que usar scale = TRUE.
facto$values
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
#X Grafico de sedimentacion:
par(mfrow=c(1,1))
facto$communality # Comunalidades
## Asesinatos Asaltos Poblac_urbanana Violaciones
## 0.8853816 0.8785149 0.9459401 0.7601701
facto # Cargas Factoriales, Componentes
## Principal Components Analysis
## Call: principal(r = USArrests, nfactors = 2, rotate = "none", scale = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PC1 PC2 h2 u2 com
## Asesinatos 0.84 -0.42 0.89 0.115 1.5
## Asaltos 0.92 -0.19 0.88 0.121 1.1
## Poblac_urbanana 0.44 0.87 0.95 0.054 1.5
## Violaciones 0.86 0.17 0.76 0.240 1.1
##
## PC1 PC2
## SS loadings 2.48 0.99
## Proportion Var 0.62 0.25
## Cumulative Var 0.62 0.87
## Proportion Explained 0.71 0.29
## Cumulative Proportion 0.71 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.08
## with the empirical chi square 3.44 with prob < NA
##
## Fit based upon off diagonal values = 0.98
fa.diagram(facto)

# guardando scores
head(facto$scores)
## PC1 PC2
## Alabama 0.61951483 -1.1277874
## Alaska 1.22583308 -1.0679059
## Arizona 1.10830334 0.7422678
## Arkansas -0.08889509 -1.1142591
## California 1.58654347 1.5353037
## Colorado 0.95203595 0.9826713
puntosFact <-cbind(USArrests,facto$scores)
head(puntosFact)
## Asesinatos Asaltos Poblac_urbanana Violaciones PC1
## Alabama 13.2 236 58 21.2 0.61951483
## Alaska 10.0 263 48 44.5 1.22583308
## Arizona 8.1 294 80 31.0 1.10830334
## Arkansas 8.8 190 50 19.5 -0.08889509
## California 9.0 276 91 40.6 1.58654347
## Colorado 7.9 204 78 38.7 0.95203595
## PC2
## Alabama -1.1277874
## Alaska -1.0679059
## Arizona 0.7422678
## Arkansas -1.1142591
## California 1.5353037
## Colorado 0.9826713
write.csv(puntosFact,"USArrestsScores.csv")
#######################
# UTILIZANDO ROTACION, cuando tengamos problemas de escala, estan cercanos los valores
# se usa para observar las cargas factoriales
facto.rot=principal(r=USArrests,nfactors=2, rotate="varimax") #cuartimax
facto.rot$values
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
facto.rot
## Principal Components Analysis
## Call: principal(r = USArrests, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## Asesinatos 0.94 -0.06 0.89 0.115 1.0
## Asaltos 0.92 0.18 0.88 0.121 1.1
## Poblac_urbanana 0.07 0.97 0.95 0.054 1.0
## Violaciones 0.73 0.48 0.76 0.240 1.7
##
## RC1 RC2
## SS loadings 2.26 1.21
## Proportion Var 0.57 0.30
## Cumulative Var 0.57 0.87
## Proportion Explained 0.65 0.35
## Cumulative Proportion 0.65 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.08
## with the empirical chi square 3.44 with prob < NA
##
## Fit based upon off diagonal values = 0.98
# podemos solicitar otros resultados, por ejemplo,
#las comunalidades
facto.rot$communality
## Asesinatos Asaltos Poblac_urbanana Violaciones
## 0.8853816 0.8785149 0.9459401 0.7601701
fa.diagram(facto.rot)

#######################################
# analisis con otra libreria
# PCA UTILIZANDO LA LIBRERÍA prcomp
#USArrests <- read.csv("USArrests.csv",head=T,sep=";", row.names= 1)
library(ggplot2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
pca <- prcomp(USArrests, scale = TRUE)
pca
## Standard deviations (1, .., p=4):
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Asesinatos -0.5358995 0.4181809 -0.3412327 0.64922780
## Asaltos -0.5831836 0.1879856 -0.2681484 -0.74340748
## Poblac_urbanana -0.2781909 -0.8728062 -0.3780158 0.13387773
## Violaciones -0.5434321 -0.1673186 0.8177779 0.08902432
#######################################
# Importancia de los componentes:
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.5749 0.9949 0.59713 0.41645
## Proportion of Variance 0.6201 0.2474 0.08914 0.04336
## Cumulative Proportion 0.6201 0.8675 0.95664 1.00000
fviz_eig(pca)

# biplot, sirve para muchas variables
par(mfrow=c(1,1))
biplot(x = pca, scale = 0, cex = 0.8, col = c("blue4", "brown3"))

# Representacion dos primeras dimensiones y peso variables
fviz_pca_var(pca,
col.var = "contrib", # el color va en funcion de la contribucion al PC
gradient.cols = c("#800000", "#FFA500", "#008000"),
repel = TRUE # con esto se evita el solapado del texto
)

###############
############### XXXXXXX
pca
## Standard deviations (1, .., p=4):
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Asesinatos -0.5358995 0.4181809 -0.3412327 0.64922780
## Asaltos -0.5831836 0.1879856 -0.2681484 -0.74340748
## Poblac_urbanana -0.2781909 -0.8728062 -0.3780158 0.13387773
## Violaciones -0.5434321 -0.1673186 0.8177779 0.08902432
head(pca$x)
## PC1 PC2 PC3 PC4
## Alabama -0.9756604 1.1220012 -0.43980366 0.154696581
## Alaska -1.9305379 1.0624269 2.01950027 -0.434175454
## Arizona -1.7454429 -0.7384595 0.05423025 -0.826264240
## Arkansas 0.1399989 1.1085423 0.11342217 -0.180973554
## California -2.4986128 -1.5274267 0.59254100 -0.338559240
## Colorado -1.4993407 -0.9776297 1.08400162 0.001450164
dim(pca$x)
## [1] 50 4
par(mfrow=c(1,1))
biplot(x = pca, scale = 0, cex = 0.8, col = c("blue4", "brown3"))

pca$rotation <- -pca$rotation
pca$x <- -pca$x
biplot(x = pca, scale = 0, cex = 0.8, col = c("blue4", "brown3"))

# graficando la proporción de varianza explicada
library(ggplot2)
pca$sdev^2
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
prop_varianza <- pca$sdev^2/sum(pca$sdev^2)
prop_varianza
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
ggplot(data = data.frame(prop_varianza, pc = 1:4),aes(x = pc, y =
prop_varianza)) +
geom_col(width = 0.3) +
scale_y_continuous(limits = c(0, 1)) +
theme_bw() +
labs(x = "Componente principal", y = "Proporcion de varianza explicada")

prop_varianza_acum <- cumsum(prop_varianza)
prop_varianza_acum
## [1] 0.6200604 0.8675017 0.9566425 1.0000000
# graficando la proporción de varianza acumulada
ggplot(data = data.frame(prop_varianza_acum, pc = 1:4),
aes(x = pc, y = prop_varianza_acum, group = 1)) +
geom_point() +
geom_line() +
geom_label(aes(label=round(prop_varianza_acum,2)))+
theme_bw() +
labs(x = "Componente principal", y = "Proporcion de varianza explicada a
cumulada")

#componentes utilizando el metodo de componentes principales
library(psych)
facto=principal(r=USArrests,nfactors=2,rotate="none")
facto$values
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
# Grafico de sedimentacion:
plot(facto$values,type="h") # Grafica de Valores propios

facto$communality # Comunalidade
## Asesinatos Asaltos Poblac_urbanana Violaciones
## 0.8853816 0.8785149 0.9459401 0.7601701
facto$loadings # Cargas Factoriales, Componentes
##
## Loadings:
## PC1 PC2
## Asesinatos 0.844 -0.416
## Asaltos 0.918 -0.187
## Poblac_urbanana 0.438 0.868
## Violaciones 0.856 0.166
##
## PC1 PC2
## SS loadings 2.48 0.990
## Proportion Var 0.62 0.247
## Cumulative Var 0.62 0.868
# guardando scores
head(facto$scores)
## PC1 PC2
## Alabama 0.61951483 -1.1277874
## Alaska 1.22583308 -1.0679059
## Arizona 1.10830334 0.7422678
## Arkansas -0.08889509 -1.1142591
## California 1.58654347 1.5353037
## Colorado 0.95203595 0.9826713
puntosFact <-cbind(USArrests,facto$scores)
head(puntosFact)
## Asesinatos Asaltos Poblac_urbanana Violaciones PC1
## Alabama 13.2 236 58 21.2 0.61951483
## Alaska 10.0 263 48 44.5 1.22583308
## Arizona 8.1 294 80 31.0 1.10830334
## Arkansas 8.8 190 50 19.5 -0.08889509
## California 9.0 276 91 40.6 1.58654347
## Colorado 7.9 204 78 38.7 0.95203595
## PC2
## Alabama -1.1277874
## Alaska -1.0679059
## Arizona 0.7422678
## Arkansas -1.1142591
## California 1.5353037
## Colorado 0.9826713
write.csv(puntosFact,"USArrestsScores.csv")
# utilizando rotacion
facto=principal(r=USArrests,nfactors=2,rotate="varimax")
facto$values
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
facto$communality
## Asesinatos Asaltos Poblac_urbanana Violaciones
## 0.8853816 0.8785149 0.9459401 0.7601701
facto$loadings
##
## Loadings:
## RC1 RC2
## Asesinatos 0.939
## Asaltos 0.920 0.179
## Poblac_urbanana 0.970
## Violaciones 0.727 0.481
##
## RC1 RC2
## SS loadings 2.262 1.208
## Proportion Var 0.565 0.302
## Cumulative Var 0.565 0.868
############### XXXXXXX
###############
# Usando la libreria FactoMineR
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.0.5
result <- PCA(USArrests,scale.unit = TRUE, ncp = 2, graph = TRUE)


result$eig
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.4802416 62.006039 62.00604
## comp 2 0.9897652 24.744129 86.75017
## comp 3 0.3565632 8.914080 95.66425
## comp 4 0.1734301 4.335752 100.00000
library(factoextra)
fviz_screeplot(result, addlabels=T,ylim=c(0,100))

#Contribuciones de las componentes
fviz_contrib(result,choice = "var", axes = 1)

fviz_contrib(result,choice = "var", axes = 2)

fviz_pca_var(result, col.var = "contrib",
gradient.col="npg", repel = T)+
ggtitle("Grafico de variables") + theme_minimal()

#Grafico de individuos
fviz_pca_ind(result, geom.ind = "point",
col.ind = "cos2", gradient.col="jco")+
ggtitle("Grafico de los individuos")

# biplot
fviz_pca_biplot(result, geom.ind = "point",
addEllipses = T,
palette = "uchicago")+ggtitle("variables e individuos")

# faces
# me da algo curioso, se busca parecidos
library(TeachingDemos)
## Warning: package 'TeachingDemos' was built under R version 4.0.5
##
## Attaching package: 'TeachingDemos'
## The following objects are masked from 'package:Hmisc':
##
## cnvrt.coords, subplot
faces(USArrests)

faces2(USArrests,nrows=7)

#######################################
# Utilizando la función PCA
# usando la libreria FactoMineR, función PCA
library(FactoMineR)
# observando los resultados de 2 componentes
result<- PCA(USArrests,scale.unit = TRUE, ncp=2,graph = TRUE)


fa.diagram(facto)
