UNIVERSINDAD NACIONAL DEL ALTIPLANO
INGENERIA ESTADISTICA E INFORMATICA
DOCENTE: CARPIO VARGAS, EDGAR ELOY
TRABAJO DE MULTIVARIANTE ACP
NOMBRE: VALERIA JARA SANCA
mundo <- read.csv("fisico quimicas acp.csv",head=T, sep=";")
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.0.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.0.2
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(mundo)
misdate <- sort(sapply(mundo, function(x){sum(is.na(x)/length(x))}), decreasing = T)
misdate
## NH4 PO4 V Eh NO3 B
## 0.684587814 0.283154122 0.229390681 0.204301075 0.100358423 0.075268817
## T SiO2 F ï..Sample Date pH
## 0.010752688 0.010752688 0.007168459 0.000000000 0.000000000 0.000000000
## Na K Mg Ca Alk. Cl
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## SO4 TDS
## 0.000000000 0.000000000
faltantes <- misdate[misdate > 0.1]
mundo <- read.csv("fisico quimicas acp.csv",head=T, sep=";")
final <- mundo[, colMeans(is.na(mundo)) <= .10]
dim(final)
## [1] 279 15
names(final)
## [1] "ï..Sample" "Date" "T" "pH" "Na" "K"
## [7] "Mg" "Ca" "Alk." "Cl" "SO4" "SiO2"
## [13] "TDS" "F" "B"
colSums(is.na(final))
## ï..Sample Date T pH Na K Mg Ca
## 0 0 3 0 0 0 0 0
## Alk. Cl SO4 SiO2 TDS F B
## 0 0 0 3 0 2 21
hay variables con datos N.A
library(datasets)
library(VIM)
## Warning: package 'VIM' was built under R version 4.0.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.0.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
aggr(final,numbers=T,sortVar=T)
##
## Variables sorted by number of missings:
## Variable Count
## B 0.075268817
## T 0.010752688
## SiO2 0.010752688
## F 0.007168459
## ï..Sample 0.000000000
## Date 0.000000000
## pH 0.000000000
## Na 0.000000000
## K 0.000000000
## Mg 0.000000000
## Ca 0.000000000
## Alk. 0.000000000
## Cl 0.000000000
## SO4 0.000000000
## TDS 0.000000000
COMO SE OBSERVA FALTA DATOS
B 0.075268817
T 0.010752688
PARA ESTO PROCEDEMOS A COMPLETAR CON EL METODO DE LA MEDIA
colSums(is.na(final))
## ï..Sample Date T pH Na K Mg Ca
## 0 0 3 0 0 0 0 0
## Alk. Cl SO4 SiO2 TDS F B
## 0 0 0 3 0 2 21
library(base)
library(mice)
## Warning: package 'mice' was built under R version 4.0.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
tempData <- mice(final,m=5,maxit=2,method="mean",seed=500)
##
## iter imp variable
## 1 1 T SiO2 F B
## 1 2 T SiO2 F B
## 1 3 T SiO2 F B
## 1 4 T SiO2 F B
## 1 5 T SiO2 F B
## 2 1 T SiO2 F B
## 2 2 T SiO2 F B
## 2 3 T SiO2 F B
## 2 4 T SiO2 F B
## 2 5 T SiO2 F B
## Warning: Number of logged events: 33
newdata <- complete(tempData,1)
colSums(is.na(newdata))
## ï..Sample Date T pH Na K Mg Ca
## 0 0 0 0 0 0 0 0
## Alk. Cl SO4 SiO2 TDS F B
## 0 0 0 0 0 0 0
colSums(is.na(final))
## ï..Sample Date T pH Na K Mg Ca
## 0 0 3 0 0 0 0 0
## Alk. Cl SO4 SiO2 TDS F B
## 0 0 0 3 0 2 21
dim(newdata)
## [1] 279 15
YA SE COMPLETARON AHORA USAREMOS LA MISMA GRAFICA PARA PODER COMPROBAR
library(datasets)
library(VIM)
aggr(newdata,numbers=T,sortVar=T)
##
## Variables sorted by number of missings:
## Variable Count
## ï..Sample 0
## Date 0
## T 0
## pH 0
## Na 0
## K 0
## Mg 0
## Ca 0
## Alk. 0
## Cl 0
## SO4 0
## SiO2 0
## TDS 0
## F 0
## B 0
COMO SE OBSERVA EN EL GRAFICO NO HAY DATOS N.A EN NINGUNA DE LAS VARIABLES ESTO NOS INDICA QUE LA DATA ESTA LIMPIA
AHORA GUARDAMOS LA DATA
write.csv(newdata, file="datos imputados.csv")
LOS DATOS LIMPIOS SE GUARDARON
\[ACP\]
CARGAMOS LA DATA LIMPIA
USArrests <- read.csv("datos imputados.csv",head=T,sep=",",row.names = 1)
head(USArrests)
## ï..Sample Date T pH Na K Mg Ca Alk. Cl SO4 SiO2
## 1 Piano 16/01/1997 8.4 6.60 15.6 5.5 13.1 17.8 122.0 18.5 18.5 75
## 2 S. 16/01/1997 15.0 6.52 110.0 18.8 44.5 46.7 519.0 47.1 79.2 62
## 3 Vena 16/01/1997 10.2 7.86 20.5 2.0 5.2 14.2 79.3 13.2 24.9 11
## 4 Fisuari 9/04/1997 10.2 6.62 40.7 7.0 21.7 34.7 256.0 20.5 34.3 31
## 5 Acquafredda 28/04/1997 12.2 7.30 55.9 12.1 29.2 89.6 354.0 62.9 79.2 37
## 6 Cambria 28/04/1997 10.7 6.67 69.7 12.5 41.8 42.5 415.0 32.9 46.0 40
## TDS F B
## 1 286 341 607.1895
## 2 933 202 190.0000
## 3 177 239 607.1895
## 4 449 238 145.0000
## 5 760 192 130.0000
## 6 708 262 230.0000
str(USArrests)
## 'data.frame': 279 obs. of 15 variables:
## $ ï..Sample: chr "Piano" "S." "Vena" "Fisuari" ...
## $ Date : chr "16/01/1997" "16/01/1997" "16/01/1997" "9/04/1997" ...
## $ T : num 8.4 15 10.2 10.2 12.2 10.7 14.1 10.9 14.1 9.2 ...
## $ pH : num 6.6 6.52 7.86 6.62 7.3 6.67 7.5 6.7 7.57 7 ...
## $ Na : num 15.6 110 20.5 40.7 55.9 69.7 52.4 46.9 37.9 19.1 ...
## $ K : num 5.5 18.8 2 7 12.1 12.5 10.2 9 8.4 3.9 ...
## $ Mg : num 13.1 44.5 5.2 21.7 29.2 41.8 22.2 24.4 11.8 8.1 ...
## $ Ca : num 17.8 46.7 14.2 34.7 89.6 42.5 23.2 63.5 21.9 51.1 ...
## $ Alk. : num 122 519 79.3 256 354 415 226 299 177 214 ...
## $ Cl : num 18.5 47.1 13.2 20.5 62.9 32.9 28.8 39.8 24.5 11.8 ...
## $ SO4 : num 18.5 79.2 24.9 34.3 79.2 46 40.4 52.3 23.9 15.1 ...
## $ SiO2 : num 75 62 11 31 37 40 37 35 39 20 ...
## $ TDS : int 286 933 177 449 760 708 452 595 349 344 ...
## $ F : num 341 202 239 238 192 262 458 189 294 142 ...
## $ B : num 607 190 607 145 130 ...
VEMOS SAMPLE Y LE FECHAN SON PEFRESENTACION DE LA VARIABLE EL ID EKIMINANDO ID(x)
USArrests <- USArrests[,-1]
str(USArrests)
## 'data.frame': 279 obs. of 14 variables:
## $ Date: chr "16/01/1997" "16/01/1997" "16/01/1997" "9/04/1997" ...
## $ T : num 8.4 15 10.2 10.2 12.2 10.7 14.1 10.9 14.1 9.2 ...
## $ pH : num 6.6 6.52 7.86 6.62 7.3 6.67 7.5 6.7 7.57 7 ...
## $ Na : num 15.6 110 20.5 40.7 55.9 69.7 52.4 46.9 37.9 19.1 ...
## $ K : num 5.5 18.8 2 7 12.1 12.5 10.2 9 8.4 3.9 ...
## $ Mg : num 13.1 44.5 5.2 21.7 29.2 41.8 22.2 24.4 11.8 8.1 ...
## $ Ca : num 17.8 46.7 14.2 34.7 89.6 42.5 23.2 63.5 21.9 51.1 ...
## $ Alk.: num 122 519 79.3 256 354 415 226 299 177 214 ...
## $ Cl : num 18.5 47.1 13.2 20.5 62.9 32.9 28.8 39.8 24.5 11.8 ...
## $ SO4 : num 18.5 79.2 24.9 34.3 79.2 46 40.4 52.3 23.9 15.1 ...
## $ SiO2: num 75 62 11 31 37 40 37 35 39 20 ...
## $ TDS : int 286 933 177 449 760 708 452 595 349 344 ...
## $ F : num 341 202 239 238 192 262 458 189 294 142 ...
## $ B : num 607 190 607 145 130 ...
USArrests <- USArrests[,-1]
str(USArrests)
## 'data.frame': 279 obs. of 13 variables:
## $ T : num 8.4 15 10.2 10.2 12.2 10.7 14.1 10.9 14.1 9.2 ...
## $ pH : num 6.6 6.52 7.86 6.62 7.3 6.67 7.5 6.7 7.57 7 ...
## $ Na : num 15.6 110 20.5 40.7 55.9 69.7 52.4 46.9 37.9 19.1 ...
## $ K : num 5.5 18.8 2 7 12.1 12.5 10.2 9 8.4 3.9 ...
## $ Mg : num 13.1 44.5 5.2 21.7 29.2 41.8 22.2 24.4 11.8 8.1 ...
## $ Ca : num 17.8 46.7 14.2 34.7 89.6 42.5 23.2 63.5 21.9 51.1 ...
## $ Alk.: num 122 519 79.3 256 354 415 226 299 177 214 ...
## $ Cl : num 18.5 47.1 13.2 20.5 62.9 32.9 28.8 39.8 24.5 11.8 ...
## $ SO4 : num 18.5 79.2 24.9 34.3 79.2 46 40.4 52.3 23.9 15.1 ...
## $ SiO2: num 75 62 11 31 37 40 37 35 39 20 ...
## $ TDS : int 286 933 177 449 760 708 452 595 349 344 ...
## $ F : num 341 202 239 238 192 262 458 189 294 142 ...
## $ B : num 607 190 607 145 130 ...
str(USArrests)
## 'data.frame': 279 obs. of 13 variables:
## $ T : num 8.4 15 10.2 10.2 12.2 10.7 14.1 10.9 14.1 9.2 ...
## $ pH : num 6.6 6.52 7.86 6.62 7.3 6.67 7.5 6.7 7.57 7 ...
## $ Na : num 15.6 110 20.5 40.7 55.9 69.7 52.4 46.9 37.9 19.1 ...
## $ K : num 5.5 18.8 2 7 12.1 12.5 10.2 9 8.4 3.9 ...
## $ Mg : num 13.1 44.5 5.2 21.7 29.2 41.8 22.2 24.4 11.8 8.1 ...
## $ Ca : num 17.8 46.7 14.2 34.7 89.6 42.5 23.2 63.5 21.9 51.1 ...
## $ Alk.: num 122 519 79.3 256 354 415 226 299 177 214 ...
## $ Cl : num 18.5 47.1 13.2 20.5 62.9 32.9 28.8 39.8 24.5 11.8 ...
## $ SO4 : num 18.5 79.2 24.9 34.3 79.2 46 40.4 52.3 23.9 15.1 ...
## $ SiO2: num 75 62 11 31 37 40 37 35 39 20 ...
## $ TDS : int 286 933 177 449 760 708 452 595 349 344 ...
## $ F : num 341 202 239 238 192 262 458 189 294 142 ...
## $ B : num 607 190 607 145 130 ...
COMO SE OBSERVA SOLO QUEDAN 13 VARIABLES
\(Análisis descriptivo\)
boxplot(USArrests,col=10:16)
EL PROMEDIO DE TDS ES SUPERIOR A LOS DEMAS VARAIBLAES
\(HIstograma\)
par(mfrow=c(1,1))
hist(USArrests$pH, freq = TRUE, main = "Histograma de frecuencias",
xlab = "pH", ylab = "Frecuencia", col = "#009ACD")
hist(USArrests$Na, freq = TRUE, main = "Histograma de frecuencias",
xlab = "Na", ylab = "Frecuencia", col = "#009ACD")
hist(USArrests$K, freq = TRUE, main = "Histograma de frecuencias",
xlab = "K", ylab = "Frecuencia", col = "#009ACD")
hist(USArrests$Mg, freq = TRUE, main = "Histograma de frecuencias",
xlab = "Mg", ylab = "Frecuencia", col = "#009ACD")
hist(USArrests$Ca, freq = TRUE, main = "Histograma de frecuencias",
xlab = "Ca", ylab = "Frecuencia", col = "#009ACD")
hist(USArrests$Alk, freq = TRUE, main = "Histograma de frecuencias",
xlab = "Alk", ylab = "Frecuencia", col = "#009ACD")
hist(USArrests$Cl, freq = TRUE, main = "Histograma de frecuencias",
xlab = "Cl", ylab = "Frecuencia", col = "#009ACD")
hist(USArrests$SO4, freq = TRUE, main = "Histograma de frecuencias",
xlab = "SO4", ylab = "Frecuencia", col = "#009ACD")
hist(USArrests$SiO2, freq = TRUE, main = "Histograma de frecuencias",
xlab = "SiO2", ylab = "Frecuencia", col = "#009ACD")
hist(USArrests$TDS, freq = TRUE, main = "Histograma de frecuencias",
xlab = "TDS", ylab = "Frecuencia", col = "#009ACD")
\(Histogramas escaladas\)
require(psych)
## Loading required package: psych
## Warning: package 'psych' was built under R version 4.0.3
multi.hist(x = USArrests,dcol = c("blue", "red"), dlty = c("dotted", "solid"),
main = "")
summary(USArrests)
## T pH Na K
## Min. : 7.50 Min. :5.900 Min. : 7.5 Min. : 1.20
## 1st Qu.:14.00 1st Qu.:6.595 1st Qu.: 58.3 1st Qu.: 10.75
## Median :15.90 Median :7.040 Median : 107.0 Median : 16.40
## Mean :15.54 Mean :7.027 Mean : 223.9 Mean : 19.26
## 3rd Qu.:17.55 3rd Qu.:7.490 3rd Qu.: 147.5 3rd Qu.: 21.70
## Max. :22.60 Max. :8.800 Max. :25210.0 Max. :546.00
## Mg Ca Alk. Cl
## Min. : 1.60 Min. : 3.50 Min. : 48.8 Min. : 7.80
## 1st Qu.: 29.65 1st Qu.: 26.80 1st Qu.: 245.5 1st Qu.: 39.75
## Median : 65.00 Median : 46.70 Median : 451.0 Median : 63.30
## Mean : 71.85 Mean : 59.07 Mean : 567.4 Mean : 270.44
## 3rd Qu.: 97.60 3rd Qu.: 73.10 3rd Qu.: 825.0 3rd Qu.: 98.55
## Max. :299.00 Max. :1555.00 Max. :2190.0 Max. :43613.00
## SO4 SiO2 TDS F
## Min. : 1.00 Min. : 11.00 Min. : 125 Min. : 20.0
## 1st Qu.: 31.30 1st Qu.: 41.00 1st Qu.: 643 1st Qu.:319.5
## Median : 58.60 Median : 59.00 Median : 1057 Median :421.0
## Mean : 84.85 Mean : 57.72 Mean : 1385 Mean :423.6
## 3rd Qu.:107.00 3rd Qu.: 73.00 3rd Qu.: 1443 3rd Qu.:535.0
## Max. :529.00 Max. :105.00 Max. :73023 Max. :995.0
## B
## Min. : 4.5
## 1st Qu.: 190.0
## Median : 478.0
## Mean : 607.2
## 3rd Qu.: 687.5
## Max. :6270.0
library(psych)
describe(USArrests)
## vars n mean sd median trimmed mad min max range
## T 1 279 15.54 2.63 15.90 15.68 2.52 7.5 22.6 15.1
## pH 2 279 7.03 0.59 7.04 7.02 0.67 5.9 8.8 2.9
## Na 3 279 223.87 1515.33 107.00 107.41 66.27 7.5 25210.0 25202.5
## K 4 279 19.26 32.95 16.40 16.53 8.15 1.2 546.0 544.8
## Mg 5 279 71.85 51.62 65.00 66.04 51.30 1.6 299.0 297.4
## Ca 6 279 59.07 97.68 46.70 48.92 33.06 3.5 1555.0 1551.5
## Alk. 7 279 567.40 397.96 451.00 523.48 379.55 48.8 2190.0 2141.2
## Cl 8 279 270.44 2622.23 63.30 69.68 42.85 7.8 43613.0 43605.2
## SO4 9 279 84.85 82.94 58.60 69.38 48.04 1.0 529.0 528.0
## SiO2 10 279 57.72 19.64 59.00 58.04 23.72 11.0 105.0 94.0
## TDS 11 279 1385.00 4373.48 1057.00 1038.20 597.49 125.0 73023.0 72898.0
## F 12 279 423.57 169.28 421.00 420.61 164.57 20.0 995.0 975.0
## B 13 279 607.19 753.90 478.00 476.91 403.27 4.5 6270.0 6265.5
## skew kurtosis se
## T -0.45 -0.08 0.16
## pH 0.18 -0.49 0.04
## Na 16.09 261.98 90.72
## K 14.67 231.23 1.97
## Mg 1.32 2.65 3.09
## Ca 12.97 194.35 5.85
## Alk. 1.19 1.75 23.83
## Cl 16.20 264.55 156.99
## SO4 2.28 6.53 4.97
## SiO2 -0.13 -0.73 1.18
## TDS 15.77 255.05 261.83
## F 0.24 0.18 10.13
## B 4.40 25.51 45.13
summary(USArrests)
## T pH Na K
## Min. : 7.50 Min. :5.900 Min. : 7.5 Min. : 1.20
## 1st Qu.:14.00 1st Qu.:6.595 1st Qu.: 58.3 1st Qu.: 10.75
## Median :15.90 Median :7.040 Median : 107.0 Median : 16.40
## Mean :15.54 Mean :7.027 Mean : 223.9 Mean : 19.26
## 3rd Qu.:17.55 3rd Qu.:7.490 3rd Qu.: 147.5 3rd Qu.: 21.70
## Max. :22.60 Max. :8.800 Max. :25210.0 Max. :546.00
## Mg Ca Alk. Cl
## Min. : 1.60 Min. : 3.50 Min. : 48.8 Min. : 7.80
## 1st Qu.: 29.65 1st Qu.: 26.80 1st Qu.: 245.5 1st Qu.: 39.75
## Median : 65.00 Median : 46.70 Median : 451.0 Median : 63.30
## Mean : 71.85 Mean : 59.07 Mean : 567.4 Mean : 270.44
## 3rd Qu.: 97.60 3rd Qu.: 73.10 3rd Qu.: 825.0 3rd Qu.: 98.55
## Max. :299.00 Max. :1555.00 Max. :2190.0 Max. :43613.00
## SO4 SiO2 TDS F
## Min. : 1.00 Min. : 11.00 Min. : 125 Min. : 20.0
## 1st Qu.: 31.30 1st Qu.: 41.00 1st Qu.: 643 1st Qu.:319.5
## Median : 58.60 Median : 59.00 Median : 1057 Median :421.0
## Mean : 84.85 Mean : 57.72 Mean : 1385 Mean :423.6
## 3rd Qu.:107.00 3rd Qu.: 73.00 3rd Qu.: 1443 3rd Qu.:535.0
## Max. :529.00 Max. :105.00 Max. :73023 Max. :995.0
## B
## Min. : 4.5
## 1st Qu.: 190.0
## Median : 478.0
## Mean : 607.2
## 3rd Qu.: 687.5
## Max. :6270.0
library(psych)
describe(USArrests)
## vars n mean sd median trimmed mad min max range
## T 1 279 15.54 2.63 15.90 15.68 2.52 7.5 22.6 15.1
## pH 2 279 7.03 0.59 7.04 7.02 0.67 5.9 8.8 2.9
## Na 3 279 223.87 1515.33 107.00 107.41 66.27 7.5 25210.0 25202.5
## K 4 279 19.26 32.95 16.40 16.53 8.15 1.2 546.0 544.8
## Mg 5 279 71.85 51.62 65.00 66.04 51.30 1.6 299.0 297.4
## Ca 6 279 59.07 97.68 46.70 48.92 33.06 3.5 1555.0 1551.5
## Alk. 7 279 567.40 397.96 451.00 523.48 379.55 48.8 2190.0 2141.2
## Cl 8 279 270.44 2622.23 63.30 69.68 42.85 7.8 43613.0 43605.2
## SO4 9 279 84.85 82.94 58.60 69.38 48.04 1.0 529.0 528.0
## SiO2 10 279 57.72 19.64 59.00 58.04 23.72 11.0 105.0 94.0
## TDS 11 279 1385.00 4373.48 1057.00 1038.20 597.49 125.0 73023.0 72898.0
## F 12 279 423.57 169.28 421.00 420.61 164.57 20.0 995.0 975.0
## B 13 279 607.19 753.90 478.00 476.91 403.27 4.5 6270.0 6265.5
## skew kurtosis se
## T -0.45 -0.08 0.16
## pH 0.18 -0.49 0.04
## Na 16.09 261.98 90.72
## K 14.67 231.23 1.97
## Mg 1.32 2.65 3.09
## Ca 12.97 194.35 5.85
## Alk. 1.19 1.75 23.83
## Cl 16.20 264.55 156.99
## SO4 2.28 6.53 4.97
## SiO2 -0.13 -0.73 1.18
## TDS 15.77 255.05 261.83
## F 0.24 0.18 10.13
## B 4.40 25.51 45.13
Observamos el resumen de estadísticas descriptivas.
Obteniendo solo promedios
#obteniendo solo las medias
apply(X = USArrests, MARGIN = 2, FUN = mean)
## T pH Na K Mg Ca
## 15.542029 7.027204 223.866667 19.258781 71.849104 59.074910
## Alk. Cl SO4 SiO2 TDS F
## 567.404659 270.436559 84.846953 57.717391 1385.003584 423.570397
## B
## 607.189535
el promedio de los datos muestra que hay una gran diferencia de B,ALK y F frente a los demas
\(Obteniendo varianzas\)
#obteniendo solo las varianzas
apply(X = USArrests, MARGIN = 2, FUN = var)
## T pH Na K Mg Ca
## 6.898030e+00 3.425720e-01 2.296218e+06 1.085472e+03 2.665107e+03 9.541488e+03
## Alk. Cl SO4 SiO2 TDS F
## 1.583715e+05 6.876066e+06 6.879545e+03 3.859207e+02 1.912735e+07 2.865524e+04
## B
## 5.683618e+05
la varianza no es distinta entre las varibles,
\(Mostrando las correlaciones\)
corr.test(USArrests)
## Call:corr.test(x = USArrests)
## Correlation matrix
## T pH Na K Mg Ca Alk. Cl SO4 SiO2 TDS F
## T 1.00 -0.20 0.10 0.20 0.44 0.24 0.36 0.09 0.34 0.36 0.15 0.25
## pH -0.20 1.00 -0.10 -0.15 -0.33 -0.22 -0.38 -0.10 0.11 -0.51 -0.14 0.21
## Na 0.10 -0.10 1.00 0.97 0.30 0.92 0.22 1.00 -0.03 0.01 1.00 -0.11
## K 0.20 -0.15 0.97 1.00 0.41 0.91 0.30 0.97 0.05 0.10 0.98 -0.09
## Mg 0.44 -0.33 0.30 0.41 1.00 0.42 0.90 0.29 0.06 0.42 0.39 -0.04
## Ca 0.24 -0.22 0.92 0.91 0.42 1.00 0.35 0.92 0.11 0.06 0.94 -0.16
## Alk. 0.36 -0.38 0.22 0.30 0.90 0.35 1.00 0.20 -0.21 0.44 0.30 -0.09
## Cl 0.09 -0.10 1.00 0.97 0.29 0.92 0.20 1.00 -0.03 0.00 0.99 -0.11
## SO4 0.34 0.11 -0.03 0.05 0.06 0.11 -0.21 -0.03 1.00 -0.10 -0.01 0.18
## SiO2 0.36 -0.51 0.01 0.10 0.42 0.06 0.44 0.00 -0.10 1.00 0.05 0.06
## TDS 0.15 -0.14 1.00 0.98 0.39 0.94 0.30 0.99 -0.01 0.05 1.00 -0.11
## F 0.25 0.21 -0.11 -0.09 -0.04 -0.16 -0.09 -0.11 0.18 0.06 -0.11 1.00
## B 0.32 -0.22 -0.01 0.05 0.66 0.10 0.65 -0.02 -0.18 0.28 0.05 -0.14
## B
## T 0.32
## pH -0.22
## Na -0.01
## K 0.05
## Mg 0.66
## Ca 0.10
## Alk. 0.65
## Cl -0.02
## SO4 -0.18
## SiO2 0.28
## TDS 0.05
## F -0.14
## B 1.00
## Sample Size
## [1] 279
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## T pH Na K Mg Ca Alk. Cl SO4 SiO2 TDS F B
## T 0.00 0.04 1.00 0.04 0.00 0.00 0.00 1.00 0.00 0.00 0.46 0.00 0.00
## pH 0.00 0.00 1.00 0.37 0.00 0.01 0.00 1.00 1.00 0.00 0.71 0.01 0.01
## Na 0.09 0.10 0.00 0.00 0.00 0.00 0.01 0.00 1.00 1.00 0.00 1.00 1.00
## K 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00 1.00 1.00
## Mg 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 0.00 0.00 1.00 0.00
## Ca 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00 0.20 1.00
## Alk. 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.03 0.02 0.00 0.00 1.00 0.00
## Cl 0.13 0.11 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00 1.00 1.00
## SO4 0.00 0.06 0.65 0.44 0.31 0.07 0.00 0.65 0.00 1.00 1.00 0.09 0.11
## SiO2 0.00 0.00 0.93 0.10 0.00 0.35 0.00 0.97 0.09 0.00 1.00 1.00 0.00
## TDS 0.01 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.82 0.41 0.00 1.00 1.00
## F 0.00 0.00 0.07 0.15 0.54 0.01 0.12 0.06 0.00 0.32 0.06 0.00 0.66
## B 0.00 0.00 0.83 0.39 0.00 0.09 0.00 0.74 0.00 0.00 0.41 0.02 0.00
##
## To see confidence intervals of the correlations, print with the short=FALSE option
se observar que si existen variables correlacionadas no muy altas y significativas
grafico de dispersion para obervar las correlaciones
pairs(USArrests)
cor.plot(cor(USArrests)) # grafico de calor
existen correlaciones negativas y positivas cuando mas azul mas correlacion
\(Prueba de Bartlett\)
library(psych)
cortest.bartlett(cor(USArrests),n=dim(USArrests))
## $chisq
## [1] 8148.1431 204.0769
##
## $p.value
## [1] 0.000000e+00 3.149713e-13
##
## $df
## [1] 78
los valores son significaivos podemos decir que el modelo es bueno
\(Prueba KMO.\)
# Indicador Kaiser-Meyer-Olkinn KMO y MSA
KMO(USArrests)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = USArrests)
## Overall MSA = 0.56
## MSA for each item =
## T pH Na K Mg Ca Alk. Cl SO4 SiO2 TDS F B
## 0.69 0.78 0.55 0.80 0.45 0.54 0.49 0.67 0.12 0.34 0.56 0.61 0.88
Overall MSA = 0.56 > 0.5, el modelo es bueno
\(Estandarización:\)
pca <- prcomp(USArrests, scale = TRUE)
names(pca)
## [1] "sdev" "rotation" "center" "scale" "x"
head(pca$rotation)
## PC1 PC2 PC3 PC4 PC5 PC6
## T 0.1361927 -0.2731805 0.51457595 0.055394888 -0.08433851 -0.706986865
## pH -0.1321625 0.2865636 0.18253852 -0.640005003 0.17570254 -0.098016682
## Na 0.3967272 0.2234817 -0.02362820 0.009351879 0.08265150 -0.037195812
## K 0.4075000 0.1573201 0.03703918 0.028813153 0.04620004 0.006749292
## Mg 0.2666984 -0.3924425 0.06902412 -0.257587853 -0.06307970 0.386096014
## Ca 0.4040827 0.1251113 0.03667790 0.018220776 -0.11786022 -0.022152576
## PC7 PC8 PC9 PC10 PC11 PC12
## T 0.05725529 0.342512173 -0.1003657 0.030629847 -0.02030488 0.0025562365
## pH -0.62956947 0.088158141 0.1070242 0.018965253 0.03262828 0.0009676038
## Na -0.01803842 -0.055037736 -0.1269257 0.008954831 -0.37831504 -0.3878341885
## K -0.06566401 -0.069652446 -0.3193736 -0.506114433 0.66132205 -0.0032468934
## Mg -0.04849942 0.153514682 -0.3940159 0.573850679 0.19634377 -0.0189271696
## Ca 0.09218847 0.006661993 0.7612744 0.311169923 0.34139699 -0.0209886140
## PC13
## T -0.0007082226
## pH 0.0002187098
## Na 0.6860907073
## K 0.0118352532
## Mg 0.0362573708
## Ca 0.0557167989
dim(pca$rotation)
## [1] 13 13
Hay un total de 13 componentes principales distintas, ya que en general pueden haber min(n−1,P) componentes en un set de datos n×p. En este caso min(13, 13) = 13.
#Obteniendo las medias originales y las estandarizadas
pca$center
## T pH Na K Mg Ca
## 15.542029 7.027204 223.866667 19.258781 71.849104 59.074910
## Alk. Cl SO4 SiO2 TDS F
## 567.404659 270.436559 84.846953 57.717391 1385.003584 423.570397
## B
## 607.189535
pca$scale
## T pH Na K Mg Ca
## 2.6264102 0.5852965 1515.3277634 32.9464966 51.6246747 97.6805387
## Alk. Cl SO4 SiO2 TDS F
## 397.9591165 2622.2253484 82.9430253 19.6448646 4373.4821766 169.2785992
## B
## 753.8977197
Observamos que los promedios han cambiado
Importancia de los componentes:
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.317 1.6799 1.2486 1.01729 0.93899 0.67735 0.64232
## Proportion of Variance 0.413 0.2171 0.1199 0.07961 0.06782 0.03529 0.03174
## Cumulative Proportion 0.413 0.6301 0.7500 0.82959 0.89741 0.93270 0.96444
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.55320 0.30574 0.18911 0.16420 0.007534 0.004095
## Proportion of Variance 0.02354 0.00719 0.00275 0.00207 0.000000 0.000000
## Cumulative Proportion 0.98798 0.99517 0.99792 0.99999 1.000000 1.000000
Observando las desviaciones estándar pasan 4 valors 1 (2.317),(1.6799),(1.2486)y(1.01729) lo cual indicaría que no se puede formar grupos. Tomemos la decisión de formas 9 grupos.
Observando los componentes formados entre las variables
pca
## Standard deviations (1, .., p=13):
## [1] 2.317042302 1.679901026 1.248595492 1.017285400 0.938989147 0.677351457
## [7] 0.642321342 0.553200120 0.305740352 0.189110331 0.164198415 0.007533977
## [13] 0.004095098
##
## Rotation (n x k) = (13 x 13):
## PC1 PC2 PC3 PC4 PC5 PC6
## T 0.136192712 -0.27318050 0.51457595 0.055394888 -0.08433851 -0.706986865
## pH -0.132162509 0.28656357 0.18253852 -0.640005003 0.17570254 -0.098016682
## Na 0.396727156 0.22348173 -0.02362820 0.009351879 0.08265150 -0.037195812
## K 0.407500039 0.15732008 0.03703918 0.028813153 0.04620004 0.006749292
## Mg 0.266698399 -0.39244251 0.06902412 -0.257587853 -0.06307970 0.386096014
## Ca 0.404082698 0.12511134 0.03667790 0.018220776 -0.11786022 -0.022152576
## Alk. 0.233217729 -0.42627858 -0.09957174 -0.232771078 0.07845122 0.277739482
## Cl 0.394402810 0.23157844 -0.02743122 0.015258389 0.07709383 -0.048187232
## SO4 -0.001820351 0.06887796 0.61200831 0.037806788 -0.60050763 0.368988352
## SiO2 0.101690231 -0.37861307 0.03258883 0.528321514 0.23329368 0.031560853
## TDS 0.410477198 0.17674325 -0.01405008 -0.006819769 0.06320757 -0.004464665
## F -0.062312932 -0.00595508 0.54202179 -0.016012019 0.70670312 0.226904514
## B 0.116027158 -0.42917176 -0.11654719 -0.429479670 -0.06798481 -0.267314496
## PC7 PC8 PC9 PC10 PC11
## T 0.05725529 0.342512173 -0.10036568 0.030629847 -0.02030488
## pH -0.62956947 0.088158141 0.10702417 0.018965253 0.03262828
## Na -0.01803842 -0.055037736 -0.12692565 0.008954831 -0.37831504
## K -0.06566401 -0.069652446 -0.31937361 -0.506114433 0.66132205
## Mg -0.04849942 0.153514682 -0.39401594 0.573850679 0.19634377
## Ca 0.09218847 0.006661993 0.76127438 0.311169923 0.34139699
## Alk. 0.02844242 0.457250106 0.28580750 -0.511740209 -0.23749744
## Cl -0.01344234 -0.077613828 -0.11640437 0.094923402 -0.29041269
## SO4 -0.09257456 -0.222096150 0.04131944 -0.185153223 -0.16726647
## SiO2 -0.67145100 -0.211096250 0.11805386 0.022022059 -0.02915912
## TDS -0.01640925 -0.030776922 -0.07062980 0.032132990 -0.30177404
## F 0.33430569 -0.181865710 0.07813000 0.003631076 0.01301761
## B 0.11153625 -0.707396318 0.06002420 -0.098172838 -0.04627410
## PC12 PC13
## T 0.0025562365 -0.0007082226
## pH 0.0009676038 0.0002187098
## Na -0.3878341885 0.6860907073
## K -0.0032468934 0.0118352532
## Mg -0.0189271696 0.0362573708
## Ca -0.0209886140 0.0557167989
## Alk. 0.0790422003 0.0200509685
## Cl 0.8163809669 -0.0445493765
## SO4 0.0251463438 0.0033175253
## SiO2 0.0023307855 0.0042563038
## TDS -0.4188104546 -0.7227059594
## F 0.0005309019 0.0001083877
## B -0.0014377951 0.0004838945
PC1: T, SO4 y F
PC2: pH, sio2
pc3: so4, F
pc4: pH, sion2
pc5: B
pc6: Ca
pc7: K, mg, alk
pc8: mg,
pc9: CI
pc10: Na, TDS
head(pca$x)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 1 -1.2535876 0.8561843 -2.2464536 1.2917491 0.4218158 0.8831636 -0.35618118
## 2 -0.2062781 0.1530381 -0.9679597 1.0751717 -0.9116472 -0.1647192 -0.08995261
## 3 -1.8724095 2.6085487 -1.8912115 -1.6958557 -0.4907340 -0.1121410 0.31497998
## 4 -1.1924992 1.6369774 -2.1172449 0.2878570 -0.6538190 0.5741960 0.88955252
## 5 -0.7717266 1.6449251 -1.3068634 -0.2989835 -1.0007858 0.1866089 -0.10649719
## 6 -0.7962054 1.1254591 -1.8439328 0.2532316 -0.5293883 0.7592181 0.57346895
## PC8 PC9 PC10 PC11 PC12 PC13
## 1 -1.55252599 0.21408926 0.0497285265 -0.08913391 -0.002572394 0.0022071666
## 2 0.32863120 -0.07586143 -0.2419917043 -0.05372308 0.003543838 -0.0016897110
## 3 -0.41249080 -0.01570493 0.0317565036 -0.03907811 -0.001642762 -0.0021730706
## 4 -0.16453316 -0.04897481 0.0002472432 -0.02386110 -0.002387001 -0.0001692588
## 5 0.20137389 0.41832450 0.0108713589 0.13085190 -0.001163117 0.0007241825
## 6 -0.09725678 -0.02008797 -0.0570347172 0.02746829 -0.003660400 0.0008757448
\(vemos componentes para casos:\)
dim(pca$x)
## [1] 279 13
Mediante la función biplot() se puede obtener una representación bidimensional de las dos primeras componentes. Es recomendable indicar el argumento scale = 0 para que las flechas estén en la misma escala que las componentes.
par(mfrow=c(1,1))
biplot(x = pca, scale = 0, cex = 0.8, col = c("blue4", "brown3"))
La imagen especular, cuya interpretación es equivalente, se puede obtener invirtiendo el signo de los loadings y de los principal component scores.
pca$rotation <- -pca$rotation
pca$x <- -pca$x
biplot(x = pca, scale = 0, cex = 0.8, col = c("blue4", "brown3"))
Una vez calculadas las componentes principales, se puede conocer la varianza explicada por cada una de ellas, la proporción respecto al total y la proporción de varianza acumulada.
# graficando la proporción de varianza explicada
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
pca$sdev^2
## [1] 5.368685e+00 2.822067e+00 1.558991e+00 1.034870e+00 8.817006e-01
## [6] 4.588050e-01 4.125767e-01 3.060304e-01 9.347716e-02 3.576272e-02
## [11] 2.696112e-02 5.676080e-05 1.676983e-05
prop_varianza <- pca$sdev^2/sum(pca$sdev^2)
prop_varianza
## [1] 4.129758e-01 2.170821e-01 1.199224e-01 7.960535e-02 6.782312e-02
## [6] 3.529269e-02 3.173667e-02 2.354080e-02 7.190551e-03 2.750978e-03
## [11] 2.073932e-03 4.366216e-06 1.289987e-06
ggplot(data = data.frame(prop_varianza, pc = 1:13),aes(x = pc, y =
prop_varianza)) +
geom_col(width = 0.3) +
scale_y_continuous(limits = c(0, 1)) +
theme_bw() +
labs(x = "Componente principal", y = "Proporcion de varianza explicada")
prop_varianza_acum <- cumsum(prop_varianza)
prop_varianza_acum
## [1] 0.4129758 0.6300579 0.7499802 0.8295856 0.8974087 0.9327014 0.9644381
## [8] 0.9879789 0.9951694 0.9979204 0.9999943 0.9999987 1.0000000
ggplot(data = data.frame(prop_varianza_acum, pc = 1:13),
aes(x = pc, y = prop_varianza_acum, group = 1)) +
geom_point() +
geom_line() +
geom_label(aes(label=round(prop_varianza_acum,2)))+
theme_bw() +
labs(x = "Componente principal", y = "Proporcion de varianza explicada a cumulada")
En este caso, la primera componente explica el 41% de la varianza observada en los datos y la segunda el 22.4%. Las dos últimas componentes no superan por separado el 1% de varianza explicada. Si se empleasen únicamente las dos primeras componentes se conseguiría explicar el 63% de la varianza observada.
library(psych)
facto=principal(r=USArrests,nfactors=4,rotate="none")
facto$values
## [1] 5.368685e+00 2.822067e+00 1.558991e+00 1.034870e+00 8.817006e-01
## [6] 4.588050e-01 4.125767e-01 3.060304e-01 9.347716e-02 3.576272e-02
## [11] 2.696112e-02 5.676080e-05 1.676983e-05
# Grafico de sedimentacion:
plot(facto$values,type="h") # Grafica de Valores propios
facto$communality # Comunalidades
## T pH Na K Mg Ca Alk. Cl
## 0.7261632 0.8013542 0.9868969 0.9643469 0.8925865 0.9232283 0.8763414 0.9878756
## SO4 SiO2 TDS F B
## 0.5988118 0.7505666 0.9930901 0.4792237 0.8041277
facto$loadings # Cargas Factoriales, Componentes
##
## Loadings:
## PC1 PC2 PC3 PC4
## T 0.316 0.459 0.642
## pH -0.306 -0.481 0.228 0.651
## Na 0.919 -0.375
## K 0.944 -0.264
## Mg 0.618 0.659 0.262
## Ca 0.936 -0.210
## Alk. 0.540 0.716 -0.124 0.237
## Cl 0.914 -0.389
## SO4 -0.116 0.764
## SiO2 0.236 0.636 -0.537
## TDS 0.951 -0.297
## F -0.144 0.677
## B 0.269 0.721 -0.146 0.437
##
## PC1 PC2 PC3 PC4
## SS loadings 5.369 2.822 1.559 1.035
## Proportion Var 0.413 0.217 0.120 0.080
## Cumulative Var 0.413 0.630 0.750 0.830
# guardando scores
head(facto$scores)
## PC1 PC2 PC3 PC4
## 1 -0.54102924 -0.50966355 -1.7991845 -1.2698001
## 2 -0.08902648 -0.09109946 -0.7752388 -1.0569027
## 3 -0.80810330 -1.55279904 -1.5146711 1.6670403
## 4 -0.51466439 -0.97444873 -1.6957013 -0.2829658
## 5 -0.33306538 -0.97917976 -1.0466668 0.2939033
## 6 -0.34363007 -0.66995562 -1.4768056 -0.2489288
puntosFact <-cbind(USArrests,facto$scores)
head(puntosFact)
## T pH Na K Mg Ca Alk. Cl SO4 SiO2 TDS F B
## 1 8.4 6.60 15.6 5.5 13.1 17.8 122.0 18.5 18.5 75 286 341 607.1895
## 2 15.0 6.52 110.0 18.8 44.5 46.7 519.0 47.1 79.2 62 933 202 190.0000
## 3 10.2 7.86 20.5 2.0 5.2 14.2 79.3 13.2 24.9 11 177 239 607.1895
## 4 10.2 6.62 40.7 7.0 21.7 34.7 256.0 20.5 34.3 31 449 238 145.0000
## 5 12.2 7.30 55.9 12.1 29.2 89.6 354.0 62.9 79.2 37 760 192 130.0000
## 6 10.7 6.67 69.7 12.5 41.8 42.5 415.0 32.9 46.0 40 708 262 230.0000
## PC1 PC2 PC3 PC4
## 1 -0.54102924 -0.50966355 -1.7991845 -1.2698001
## 2 -0.08902648 -0.09109946 -0.7752388 -1.0569027
## 3 -0.80810330 -1.55279904 -1.5146711 1.6670403
## 4 -0.51466439 -0.97444873 -1.6957013 -0.2829658
## 5 -0.33306538 -0.97917976 -1.0466668 0.2939033
## 6 -0.34363007 -0.66995562 -1.4768056 -0.2489288
write.csv(puntosFact,"fiscoquimicosScores.csv")
# utilizando rotacion
facto=principal(r=USArrests,nfactors=8,rotate="varimax")
facto$values
## [1] 5.368685e+00 2.822067e+00 1.558991e+00 1.034870e+00 8.817006e-01
## [6] 4.588050e-01 4.125767e-01 3.060304e-01 9.347716e-02 3.576272e-02
## [11] 2.696112e-02 5.676080e-05 1.676983e-05
facto$communality
## T pH Na K Mg Ca Alk. Cl
## 0.9990137 0.9988877 0.9946160 0.9695133 0.9726716 0.9392211 0.9814777 0.9960994
## SO4 SiO2 TDS F B
## 0.9978600 0.9986570 0.9970228 0.9994243 0.9992608
facto$loadings
##
## Loadings:
## RC1 RC2 RC3 RC5 RC4 RC7 RC6 RC8
## T 0.235 0.209 0.154 0.155 0.909 0.108
## pH -0.185 0.117 0.938 -0.238
## Na 0.995
## K 0.970 0.137
## Mg 0.262 0.888 0.120 0.164 0.127 0.217
## Ca 0.926 0.180 0.105 -0.121 0.112
## Alk. 0.166 0.922 -0.169 -0.145 0.140 0.133 0.123
## Cl 0.996
## SO4 0.976 0.168
## SiO2 0.254 -0.258 0.915 0.150
## TDS 0.988 0.139
## F 0.977 0.102 0.124
## B 0.536 -0.109 0.142 0.814
##
## RC1 RC2 RC3 RC5 RC4 RC7 RC6 RC8
## SS loadings 4.879 2.159 1.078 1.020 1.013 0.979 0.966 0.750
## Proportion Var 0.375 0.166 0.083 0.078 0.078 0.075 0.074 0.058
## Cumulative Var 0.375 0.541 0.624 0.703 0.781 0.856 0.930 0.988
\(utlizando la funcion PCA\)
library(FactoMineR)
# observando los resultados de 2 componentes
result<- PCA(USArrests,scale.unit = TRUE, ncp=4,graph = TRUE)
Existe bastante relación positiva entre sio2,b,t,aik,mg y forman el componente 1. tambien exisiste telacion entre so4,ca,tds,k,ci,Na forman el componente 2
PH no está relacionado con las demás variables
fa.diagram(facto)