UNIVERSINDAD NACIONAL DEL ALTIPLANO

INGENERIA ESTADISTICA E INFORMATICA

DOCENTE: CARPIO VARGAS, EDGAR ELOY

TRABAJO DE MULTIVARIANTE ACP

NOMBRE: VALERIA JARA SANCA

mundo <- read.csv("fisico quimicas acp.csv",head=T, sep=";")
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.0.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.0.2
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(mundo)

misdate <- sort(sapply(mundo, function(x){sum(is.na(x)/length(x))}), decreasing = T) 
misdate
##         NH4         PO4           V          Eh         NO3           B 
## 0.684587814 0.283154122 0.229390681 0.204301075 0.100358423 0.075268817 
##           T        SiO2           F   ï..Sample        Date          pH 
## 0.010752688 0.010752688 0.007168459 0.000000000 0.000000000 0.000000000 
##          Na           K          Mg          Ca        Alk.          Cl 
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 
##         SO4         TDS 
## 0.000000000 0.000000000
faltantes <- misdate[misdate > 0.1]
mundo <- read.csv("fisico quimicas acp.csv",head=T, sep=";")
final <- mundo[, colMeans(is.na(mundo)) <= .10]
dim(final)
## [1] 279  15
names(final)
##  [1] "ï..Sample" "Date"      "T"         "pH"        "Na"        "K"        
##  [7] "Mg"        "Ca"        "Alk."      "Cl"        "SO4"       "SiO2"     
## [13] "TDS"       "F"         "B"
colSums(is.na(final))
## ï..Sample      Date         T        pH        Na         K        Mg        Ca 
##         0         0         3         0         0         0         0         0 
##      Alk.        Cl       SO4      SiO2       TDS         F         B 
##         0         0         0         3         0         2        21

hay variables con datos N.A

library(datasets)
library(VIM)
## Warning: package 'VIM' was built under R version 4.0.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.0.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
aggr(final,numbers=T,sortVar=T)

## 
##  Variables sorted by number of missings: 
##   Variable       Count
##          B 0.075268817
##          T 0.010752688
##       SiO2 0.010752688
##          F 0.007168459
##  ï..Sample 0.000000000
##       Date 0.000000000
##         pH 0.000000000
##         Na 0.000000000
##          K 0.000000000
##         Mg 0.000000000
##         Ca 0.000000000
##       Alk. 0.000000000
##         Cl 0.000000000
##        SO4 0.000000000
##        TDS 0.000000000

COMO SE OBSERVA FALTA DATOS

B 0.075268817

T 0.010752688

PARA ESTO PROCEDEMOS A COMPLETAR CON EL METODO DE LA MEDIA

colSums(is.na(final))
## ï..Sample      Date         T        pH        Na         K        Mg        Ca 
##         0         0         3         0         0         0         0         0 
##      Alk.        Cl       SO4      SiO2       TDS         F         B 
##         0         0         0         3         0         2        21
library(base)
library(mice)
## Warning: package 'mice' was built under R version 4.0.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
tempData <- mice(final,m=5,maxit=2,method="mean",seed=500)
## 
##  iter imp variable
##   1   1  T  SiO2  F  B
##   1   2  T  SiO2  F  B
##   1   3  T  SiO2  F  B
##   1   4  T  SiO2  F  B
##   1   5  T  SiO2  F  B
##   2   1  T  SiO2  F  B
##   2   2  T  SiO2  F  B
##   2   3  T  SiO2  F  B
##   2   4  T  SiO2  F  B
##   2   5  T  SiO2  F  B
## Warning: Number of logged events: 33
newdata <- complete(tempData,1)
colSums(is.na(newdata))
## ï..Sample      Date         T        pH        Na         K        Mg        Ca 
##         0         0         0         0         0         0         0         0 
##      Alk.        Cl       SO4      SiO2       TDS         F         B 
##         0         0         0         0         0         0         0
colSums(is.na(final))
## ï..Sample      Date         T        pH        Na         K        Mg        Ca 
##         0         0         3         0         0         0         0         0 
##      Alk.        Cl       SO4      SiO2       TDS         F         B 
##         0         0         0         3         0         2        21
dim(newdata)
## [1] 279  15

YA SE COMPLETARON AHORA USAREMOS LA MISMA GRAFICA PARA PODER COMPROBAR

library(datasets)
library(VIM)
aggr(newdata,numbers=T,sortVar=T)

## 
##  Variables sorted by number of missings: 
##   Variable Count
##  ï..Sample     0
##       Date     0
##          T     0
##         pH     0
##         Na     0
##          K     0
##         Mg     0
##         Ca     0
##       Alk.     0
##         Cl     0
##        SO4     0
##       SiO2     0
##        TDS     0
##          F     0
##          B     0

COMO SE OBSERVA EN EL GRAFICO NO HAY DATOS N.A EN NINGUNA DE LAS VARIABLES ESTO NOS INDICA QUE LA DATA ESTA LIMPIA

AHORA GUARDAMOS LA DATA

write.csv(newdata, file="datos imputados.csv")

LOS DATOS LIMPIOS SE GUARDARON

\[ACP\]

CARGAMOS LA DATA LIMPIA

USArrests <- read.csv("datos imputados.csv",head=T,sep=",",row.names = 1) 
head(USArrests)
##     ï..Sample       Date    T   pH    Na    K   Mg   Ca  Alk.   Cl  SO4 SiO2
## 1       Piano 16/01/1997  8.4 6.60  15.6  5.5 13.1 17.8 122.0 18.5 18.5   75
## 2          S. 16/01/1997 15.0 6.52 110.0 18.8 44.5 46.7 519.0 47.1 79.2   62
## 3        Vena 16/01/1997 10.2 7.86  20.5  2.0  5.2 14.2  79.3 13.2 24.9   11
## 4     Fisuari  9/04/1997 10.2 6.62  40.7  7.0 21.7 34.7 256.0 20.5 34.3   31
## 5 Acquafredda 28/04/1997 12.2 7.30  55.9 12.1 29.2 89.6 354.0 62.9 79.2   37
## 6     Cambria 28/04/1997 10.7 6.67  69.7 12.5 41.8 42.5 415.0 32.9 46.0   40
##   TDS   F        B
## 1 286 341 607.1895
## 2 933 202 190.0000
## 3 177 239 607.1895
## 4 449 238 145.0000
## 5 760 192 130.0000
## 6 708 262 230.0000
str(USArrests)
## 'data.frame':    279 obs. of  15 variables:
##  $ ï..Sample: chr  "Piano" "S." "Vena" "Fisuari" ...
##  $ Date     : chr  "16/01/1997" "16/01/1997" "16/01/1997" "9/04/1997" ...
##  $ T        : num  8.4 15 10.2 10.2 12.2 10.7 14.1 10.9 14.1 9.2 ...
##  $ pH       : num  6.6 6.52 7.86 6.62 7.3 6.67 7.5 6.7 7.57 7 ...
##  $ Na       : num  15.6 110 20.5 40.7 55.9 69.7 52.4 46.9 37.9 19.1 ...
##  $ K        : num  5.5 18.8 2 7 12.1 12.5 10.2 9 8.4 3.9 ...
##  $ Mg       : num  13.1 44.5 5.2 21.7 29.2 41.8 22.2 24.4 11.8 8.1 ...
##  $ Ca       : num  17.8 46.7 14.2 34.7 89.6 42.5 23.2 63.5 21.9 51.1 ...
##  $ Alk.     : num  122 519 79.3 256 354 415 226 299 177 214 ...
##  $ Cl       : num  18.5 47.1 13.2 20.5 62.9 32.9 28.8 39.8 24.5 11.8 ...
##  $ SO4      : num  18.5 79.2 24.9 34.3 79.2 46 40.4 52.3 23.9 15.1 ...
##  $ SiO2     : num  75 62 11 31 37 40 37 35 39 20 ...
##  $ TDS      : int  286 933 177 449 760 708 452 595 349 344 ...
##  $ F        : num  341 202 239 238 192 262 458 189 294 142 ...
##  $ B        : num  607 190 607 145 130 ...

VEMOS SAMPLE Y LE FECHAN SON PEFRESENTACION DE LA VARIABLE EL ID EKIMINANDO ID(x)

USArrests <- USArrests[,-1] 
str(USArrests)
## 'data.frame':    279 obs. of  14 variables:
##  $ Date: chr  "16/01/1997" "16/01/1997" "16/01/1997" "9/04/1997" ...
##  $ T   : num  8.4 15 10.2 10.2 12.2 10.7 14.1 10.9 14.1 9.2 ...
##  $ pH  : num  6.6 6.52 7.86 6.62 7.3 6.67 7.5 6.7 7.57 7 ...
##  $ Na  : num  15.6 110 20.5 40.7 55.9 69.7 52.4 46.9 37.9 19.1 ...
##  $ K   : num  5.5 18.8 2 7 12.1 12.5 10.2 9 8.4 3.9 ...
##  $ Mg  : num  13.1 44.5 5.2 21.7 29.2 41.8 22.2 24.4 11.8 8.1 ...
##  $ Ca  : num  17.8 46.7 14.2 34.7 89.6 42.5 23.2 63.5 21.9 51.1 ...
##  $ Alk.: num  122 519 79.3 256 354 415 226 299 177 214 ...
##  $ Cl  : num  18.5 47.1 13.2 20.5 62.9 32.9 28.8 39.8 24.5 11.8 ...
##  $ SO4 : num  18.5 79.2 24.9 34.3 79.2 46 40.4 52.3 23.9 15.1 ...
##  $ SiO2: num  75 62 11 31 37 40 37 35 39 20 ...
##  $ TDS : int  286 933 177 449 760 708 452 595 349 344 ...
##  $ F   : num  341 202 239 238 192 262 458 189 294 142 ...
##  $ B   : num  607 190 607 145 130 ...
USArrests <- USArrests[,-1] 
str(USArrests)
## 'data.frame':    279 obs. of  13 variables:
##  $ T   : num  8.4 15 10.2 10.2 12.2 10.7 14.1 10.9 14.1 9.2 ...
##  $ pH  : num  6.6 6.52 7.86 6.62 7.3 6.67 7.5 6.7 7.57 7 ...
##  $ Na  : num  15.6 110 20.5 40.7 55.9 69.7 52.4 46.9 37.9 19.1 ...
##  $ K   : num  5.5 18.8 2 7 12.1 12.5 10.2 9 8.4 3.9 ...
##  $ Mg  : num  13.1 44.5 5.2 21.7 29.2 41.8 22.2 24.4 11.8 8.1 ...
##  $ Ca  : num  17.8 46.7 14.2 34.7 89.6 42.5 23.2 63.5 21.9 51.1 ...
##  $ Alk.: num  122 519 79.3 256 354 415 226 299 177 214 ...
##  $ Cl  : num  18.5 47.1 13.2 20.5 62.9 32.9 28.8 39.8 24.5 11.8 ...
##  $ SO4 : num  18.5 79.2 24.9 34.3 79.2 46 40.4 52.3 23.9 15.1 ...
##  $ SiO2: num  75 62 11 31 37 40 37 35 39 20 ...
##  $ TDS : int  286 933 177 449 760 708 452 595 349 344 ...
##  $ F   : num  341 202 239 238 192 262 458 189 294 142 ...
##  $ B   : num  607 190 607 145 130 ...
str(USArrests)
## 'data.frame':    279 obs. of  13 variables:
##  $ T   : num  8.4 15 10.2 10.2 12.2 10.7 14.1 10.9 14.1 9.2 ...
##  $ pH  : num  6.6 6.52 7.86 6.62 7.3 6.67 7.5 6.7 7.57 7 ...
##  $ Na  : num  15.6 110 20.5 40.7 55.9 69.7 52.4 46.9 37.9 19.1 ...
##  $ K   : num  5.5 18.8 2 7 12.1 12.5 10.2 9 8.4 3.9 ...
##  $ Mg  : num  13.1 44.5 5.2 21.7 29.2 41.8 22.2 24.4 11.8 8.1 ...
##  $ Ca  : num  17.8 46.7 14.2 34.7 89.6 42.5 23.2 63.5 21.9 51.1 ...
##  $ Alk.: num  122 519 79.3 256 354 415 226 299 177 214 ...
##  $ Cl  : num  18.5 47.1 13.2 20.5 62.9 32.9 28.8 39.8 24.5 11.8 ...
##  $ SO4 : num  18.5 79.2 24.9 34.3 79.2 46 40.4 52.3 23.9 15.1 ...
##  $ SiO2: num  75 62 11 31 37 40 37 35 39 20 ...
##  $ TDS : int  286 933 177 449 760 708 452 595 349 344 ...
##  $ F   : num  341 202 239 238 192 262 458 189 294 142 ...
##  $ B   : num  607 190 607 145 130 ...

COMO SE OBSERVA SOLO QUEDAN 13 VARIABLES

\(Análisis descriptivo\)

boxplot(USArrests,col=10:16)

EL PROMEDIO DE TDS ES SUPERIOR A LOS DEMAS VARAIBLAES

\(HIstograma\)

par(mfrow=c(1,1))
hist(USArrests$pH, freq = TRUE, main = "Histograma de frecuencias",
     xlab = "pH", ylab = "Frecuencia", col = "#009ACD") 

hist(USArrests$Na, freq = TRUE, main = "Histograma de frecuencias",
     xlab = "Na", ylab = "Frecuencia", col = "#009ACD") 

hist(USArrests$K, freq = TRUE, main = "Histograma de frecuencias",
     xlab = "K", ylab = "Frecuencia", col = "#009ACD")

hist(USArrests$Mg, freq = TRUE, main = "Histograma de frecuencias",
     xlab = "Mg", ylab = "Frecuencia", col = "#009ACD")

hist(USArrests$Ca, freq = TRUE, main = "Histograma de frecuencias",
     xlab = "Ca", ylab = "Frecuencia", col = "#009ACD")

hist(USArrests$Alk, freq = TRUE, main = "Histograma de frecuencias",
     xlab = "Alk", ylab = "Frecuencia", col = "#009ACD")

hist(USArrests$Cl, freq = TRUE, main = "Histograma de frecuencias",
     xlab = "Cl", ylab = "Frecuencia", col = "#009ACD")

hist(USArrests$SO4, freq = TRUE, main = "Histograma de frecuencias",
     xlab = "SO4", ylab = "Frecuencia", col = "#009ACD")

hist(USArrests$SiO2, freq = TRUE, main = "Histograma de frecuencias",
     xlab = "SiO2", ylab = "Frecuencia", col = "#009ACD")

hist(USArrests$TDS, freq = TRUE, main = "Histograma de frecuencias",
     xlab = "TDS", ylab = "Frecuencia", col = "#009ACD")

\(Histogramas escaladas\)

require(psych)
## Loading required package: psych
## Warning: package 'psych' was built under R version 4.0.3
multi.hist(x = USArrests,dcol = c("blue", "red"), dlty = c("dotted", "solid"),
           main = "")

summary(USArrests)
##        T               pH              Na                K         
##  Min.   : 7.50   Min.   :5.900   Min.   :    7.5   Min.   :  1.20  
##  1st Qu.:14.00   1st Qu.:6.595   1st Qu.:   58.3   1st Qu.: 10.75  
##  Median :15.90   Median :7.040   Median :  107.0   Median : 16.40  
##  Mean   :15.54   Mean   :7.027   Mean   :  223.9   Mean   : 19.26  
##  3rd Qu.:17.55   3rd Qu.:7.490   3rd Qu.:  147.5   3rd Qu.: 21.70  
##  Max.   :22.60   Max.   :8.800   Max.   :25210.0   Max.   :546.00  
##        Mg               Ca               Alk.              Cl          
##  Min.   :  1.60   Min.   :   3.50   Min.   :  48.8   Min.   :    7.80  
##  1st Qu.: 29.65   1st Qu.:  26.80   1st Qu.: 245.5   1st Qu.:   39.75  
##  Median : 65.00   Median :  46.70   Median : 451.0   Median :   63.30  
##  Mean   : 71.85   Mean   :  59.07   Mean   : 567.4   Mean   :  270.44  
##  3rd Qu.: 97.60   3rd Qu.:  73.10   3rd Qu.: 825.0   3rd Qu.:   98.55  
##  Max.   :299.00   Max.   :1555.00   Max.   :2190.0   Max.   :43613.00  
##       SO4              SiO2             TDS              F        
##  Min.   :  1.00   Min.   : 11.00   Min.   :  125   Min.   : 20.0  
##  1st Qu.: 31.30   1st Qu.: 41.00   1st Qu.:  643   1st Qu.:319.5  
##  Median : 58.60   Median : 59.00   Median : 1057   Median :421.0  
##  Mean   : 84.85   Mean   : 57.72   Mean   : 1385   Mean   :423.6  
##  3rd Qu.:107.00   3rd Qu.: 73.00   3rd Qu.: 1443   3rd Qu.:535.0  
##  Max.   :529.00   Max.   :105.00   Max.   :73023   Max.   :995.0  
##        B         
##  Min.   :   4.5  
##  1st Qu.: 190.0  
##  Median : 478.0  
##  Mean   : 607.2  
##  3rd Qu.: 687.5  
##  Max.   :6270.0
library(psych) 
describe(USArrests)
##      vars   n    mean      sd  median trimmed    mad   min     max   range
## T       1 279   15.54    2.63   15.90   15.68   2.52   7.5    22.6    15.1
## pH      2 279    7.03    0.59    7.04    7.02   0.67   5.9     8.8     2.9
## Na      3 279  223.87 1515.33  107.00  107.41  66.27   7.5 25210.0 25202.5
## K       4 279   19.26   32.95   16.40   16.53   8.15   1.2   546.0   544.8
## Mg      5 279   71.85   51.62   65.00   66.04  51.30   1.6   299.0   297.4
## Ca      6 279   59.07   97.68   46.70   48.92  33.06   3.5  1555.0  1551.5
## Alk.    7 279  567.40  397.96  451.00  523.48 379.55  48.8  2190.0  2141.2
## Cl      8 279  270.44 2622.23   63.30   69.68  42.85   7.8 43613.0 43605.2
## SO4     9 279   84.85   82.94   58.60   69.38  48.04   1.0   529.0   528.0
## SiO2   10 279   57.72   19.64   59.00   58.04  23.72  11.0   105.0    94.0
## TDS    11 279 1385.00 4373.48 1057.00 1038.20 597.49 125.0 73023.0 72898.0
## F      12 279  423.57  169.28  421.00  420.61 164.57  20.0   995.0   975.0
## B      13 279  607.19  753.90  478.00  476.91 403.27   4.5  6270.0  6265.5
##       skew kurtosis     se
## T    -0.45    -0.08   0.16
## pH    0.18    -0.49   0.04
## Na   16.09   261.98  90.72
## K    14.67   231.23   1.97
## Mg    1.32     2.65   3.09
## Ca   12.97   194.35   5.85
## Alk.  1.19     1.75  23.83
## Cl   16.20   264.55 156.99
## SO4   2.28     6.53   4.97
## SiO2 -0.13    -0.73   1.18
## TDS  15.77   255.05 261.83
## F     0.24     0.18  10.13
## B     4.40    25.51  45.13
summary(USArrests)
##        T               pH              Na                K         
##  Min.   : 7.50   Min.   :5.900   Min.   :    7.5   Min.   :  1.20  
##  1st Qu.:14.00   1st Qu.:6.595   1st Qu.:   58.3   1st Qu.: 10.75  
##  Median :15.90   Median :7.040   Median :  107.0   Median : 16.40  
##  Mean   :15.54   Mean   :7.027   Mean   :  223.9   Mean   : 19.26  
##  3rd Qu.:17.55   3rd Qu.:7.490   3rd Qu.:  147.5   3rd Qu.: 21.70  
##  Max.   :22.60   Max.   :8.800   Max.   :25210.0   Max.   :546.00  
##        Mg               Ca               Alk.              Cl          
##  Min.   :  1.60   Min.   :   3.50   Min.   :  48.8   Min.   :    7.80  
##  1st Qu.: 29.65   1st Qu.:  26.80   1st Qu.: 245.5   1st Qu.:   39.75  
##  Median : 65.00   Median :  46.70   Median : 451.0   Median :   63.30  
##  Mean   : 71.85   Mean   :  59.07   Mean   : 567.4   Mean   :  270.44  
##  3rd Qu.: 97.60   3rd Qu.:  73.10   3rd Qu.: 825.0   3rd Qu.:   98.55  
##  Max.   :299.00   Max.   :1555.00   Max.   :2190.0   Max.   :43613.00  
##       SO4              SiO2             TDS              F        
##  Min.   :  1.00   Min.   : 11.00   Min.   :  125   Min.   : 20.0  
##  1st Qu.: 31.30   1st Qu.: 41.00   1st Qu.:  643   1st Qu.:319.5  
##  Median : 58.60   Median : 59.00   Median : 1057   Median :421.0  
##  Mean   : 84.85   Mean   : 57.72   Mean   : 1385   Mean   :423.6  
##  3rd Qu.:107.00   3rd Qu.: 73.00   3rd Qu.: 1443   3rd Qu.:535.0  
##  Max.   :529.00   Max.   :105.00   Max.   :73023   Max.   :995.0  
##        B         
##  Min.   :   4.5  
##  1st Qu.: 190.0  
##  Median : 478.0  
##  Mean   : 607.2  
##  3rd Qu.: 687.5  
##  Max.   :6270.0
library(psych) 
describe(USArrests)
##      vars   n    mean      sd  median trimmed    mad   min     max   range
## T       1 279   15.54    2.63   15.90   15.68   2.52   7.5    22.6    15.1
## pH      2 279    7.03    0.59    7.04    7.02   0.67   5.9     8.8     2.9
## Na      3 279  223.87 1515.33  107.00  107.41  66.27   7.5 25210.0 25202.5
## K       4 279   19.26   32.95   16.40   16.53   8.15   1.2   546.0   544.8
## Mg      5 279   71.85   51.62   65.00   66.04  51.30   1.6   299.0   297.4
## Ca      6 279   59.07   97.68   46.70   48.92  33.06   3.5  1555.0  1551.5
## Alk.    7 279  567.40  397.96  451.00  523.48 379.55  48.8  2190.0  2141.2
## Cl      8 279  270.44 2622.23   63.30   69.68  42.85   7.8 43613.0 43605.2
## SO4     9 279   84.85   82.94   58.60   69.38  48.04   1.0   529.0   528.0
## SiO2   10 279   57.72   19.64   59.00   58.04  23.72  11.0   105.0    94.0
## TDS    11 279 1385.00 4373.48 1057.00 1038.20 597.49 125.0 73023.0 72898.0
## F      12 279  423.57  169.28  421.00  420.61 164.57  20.0   995.0   975.0
## B      13 279  607.19  753.90  478.00  476.91 403.27   4.5  6270.0  6265.5
##       skew kurtosis     se
## T    -0.45    -0.08   0.16
## pH    0.18    -0.49   0.04
## Na   16.09   261.98  90.72
## K    14.67   231.23   1.97
## Mg    1.32     2.65   3.09
## Ca   12.97   194.35   5.85
## Alk.  1.19     1.75  23.83
## Cl   16.20   264.55 156.99
## SO4   2.28     6.53   4.97
## SiO2 -0.13    -0.73   1.18
## TDS  15.77   255.05 261.83
## F     0.24     0.18  10.13
## B     4.40    25.51  45.13

Observamos el resumen de estadísticas descriptivas.

Obteniendo solo promedios

#obteniendo solo las medias
apply(X = USArrests, MARGIN = 2, FUN = mean)
##           T          pH          Na           K          Mg          Ca 
##   15.542029    7.027204  223.866667   19.258781   71.849104   59.074910 
##        Alk.          Cl         SO4        SiO2         TDS           F 
##  567.404659  270.436559   84.846953   57.717391 1385.003584  423.570397 
##           B 
##  607.189535

el promedio de los datos muestra que hay una gran diferencia de B,ALK y F frente a los demas

\(Obteniendo varianzas\)

#obteniendo solo las varianzas
apply(X = USArrests, MARGIN = 2, FUN = var)
##            T           pH           Na            K           Mg           Ca 
## 6.898030e+00 3.425720e-01 2.296218e+06 1.085472e+03 2.665107e+03 9.541488e+03 
##         Alk.           Cl          SO4         SiO2          TDS            F 
## 1.583715e+05 6.876066e+06 6.879545e+03 3.859207e+02 1.912735e+07 2.865524e+04 
##            B 
## 5.683618e+05

la varianza no es distinta entre las varibles,

\(Mostrando las correlaciones\)

corr.test(USArrests) 
## Call:corr.test(x = USArrests)
## Correlation matrix 
##          T    pH    Na     K    Mg    Ca  Alk.    Cl   SO4  SiO2   TDS     F
## T     1.00 -0.20  0.10  0.20  0.44  0.24  0.36  0.09  0.34  0.36  0.15  0.25
## pH   -0.20  1.00 -0.10 -0.15 -0.33 -0.22 -0.38 -0.10  0.11 -0.51 -0.14  0.21
## Na    0.10 -0.10  1.00  0.97  0.30  0.92  0.22  1.00 -0.03  0.01  1.00 -0.11
## K     0.20 -0.15  0.97  1.00  0.41  0.91  0.30  0.97  0.05  0.10  0.98 -0.09
## Mg    0.44 -0.33  0.30  0.41  1.00  0.42  0.90  0.29  0.06  0.42  0.39 -0.04
## Ca    0.24 -0.22  0.92  0.91  0.42  1.00  0.35  0.92  0.11  0.06  0.94 -0.16
## Alk.  0.36 -0.38  0.22  0.30  0.90  0.35  1.00  0.20 -0.21  0.44  0.30 -0.09
## Cl    0.09 -0.10  1.00  0.97  0.29  0.92  0.20  1.00 -0.03  0.00  0.99 -0.11
## SO4   0.34  0.11 -0.03  0.05  0.06  0.11 -0.21 -0.03  1.00 -0.10 -0.01  0.18
## SiO2  0.36 -0.51  0.01  0.10  0.42  0.06  0.44  0.00 -0.10  1.00  0.05  0.06
## TDS   0.15 -0.14  1.00  0.98  0.39  0.94  0.30  0.99 -0.01  0.05  1.00 -0.11
## F     0.25  0.21 -0.11 -0.09 -0.04 -0.16 -0.09 -0.11  0.18  0.06 -0.11  1.00
## B     0.32 -0.22 -0.01  0.05  0.66  0.10  0.65 -0.02 -0.18  0.28  0.05 -0.14
##          B
## T     0.32
## pH   -0.22
## Na   -0.01
## K     0.05
## Mg    0.66
## Ca    0.10
## Alk.  0.65
## Cl   -0.02
## SO4  -0.18
## SiO2  0.28
## TDS   0.05
## F    -0.14
## B     1.00
## Sample Size 
## [1] 279
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##         T   pH   Na    K   Mg   Ca Alk.   Cl  SO4 SiO2  TDS    F    B
## T    0.00 0.04 1.00 0.04 0.00 0.00 0.00 1.00 0.00 0.00 0.46 0.00 0.00
## pH   0.00 0.00 1.00 0.37 0.00 0.01 0.00 1.00 1.00 0.00 0.71 0.01 0.01
## Na   0.09 0.10 0.00 0.00 0.00 0.00 0.01 0.00 1.00 1.00 0.00 1.00 1.00
## K    0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00 1.00 1.00
## Mg   0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 0.00 0.00 1.00 0.00
## Ca   0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00 0.20 1.00
## Alk. 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.03 0.02 0.00 0.00 1.00 0.00
## Cl   0.13 0.11 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00 1.00 1.00
## SO4  0.00 0.06 0.65 0.44 0.31 0.07 0.00 0.65 0.00 1.00 1.00 0.09 0.11
## SiO2 0.00 0.00 0.93 0.10 0.00 0.35 0.00 0.97 0.09 0.00 1.00 1.00 0.00
## TDS  0.01 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.82 0.41 0.00 1.00 1.00
## F    0.00 0.00 0.07 0.15 0.54 0.01 0.12 0.06 0.00 0.32 0.06 0.00 0.66
## B    0.00 0.00 0.83 0.39 0.00 0.09 0.00 0.74 0.00 0.00 0.41 0.02 0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

se observar que si existen variables correlacionadas no muy altas y significativas

grafico de dispersion para obervar las correlaciones

pairs(USArrests)

cor.plot(cor(USArrests)) # grafico de calor

existen correlaciones negativas y positivas cuando mas azul mas correlacion

\(Prueba de Bartlett\)

library(psych) 
cortest.bartlett(cor(USArrests),n=dim(USArrests))
## $chisq
## [1] 8148.1431  204.0769
## 
## $p.value
## [1] 0.000000e+00 3.149713e-13
## 
## $df
## [1] 78

los valores son significaivos podemos decir que el modelo es bueno

\(Prueba KMO.\)

# Indicador Kaiser-Meyer-Olkinn KMO y MSA
KMO(USArrests)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = USArrests)
## Overall MSA =  0.56
## MSA for each item = 
##    T   pH   Na    K   Mg   Ca Alk.   Cl  SO4 SiO2  TDS    F    B 
## 0.69 0.78 0.55 0.80 0.45 0.54 0.49 0.67 0.12 0.34 0.56 0.61 0.88

Overall MSA = 0.56 > 0.5, el modelo es bueno

\(Estandarización:\)

pca <- prcomp(USArrests, scale = TRUE) 
names(pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
head(pca$rotation)
##           PC1        PC2         PC3          PC4         PC5          PC6
## T   0.1361927 -0.2731805  0.51457595  0.055394888 -0.08433851 -0.706986865
## pH -0.1321625  0.2865636  0.18253852 -0.640005003  0.17570254 -0.098016682
## Na  0.3967272  0.2234817 -0.02362820  0.009351879  0.08265150 -0.037195812
## K   0.4075000  0.1573201  0.03703918  0.028813153  0.04620004  0.006749292
## Mg  0.2666984 -0.3924425  0.06902412 -0.257587853 -0.06307970  0.386096014
## Ca  0.4040827  0.1251113  0.03667790  0.018220776 -0.11786022 -0.022152576
##            PC7          PC8        PC9         PC10        PC11          PC12
## T   0.05725529  0.342512173 -0.1003657  0.030629847 -0.02030488  0.0025562365
## pH -0.62956947  0.088158141  0.1070242  0.018965253  0.03262828  0.0009676038
## Na -0.01803842 -0.055037736 -0.1269257  0.008954831 -0.37831504 -0.3878341885
## K  -0.06566401 -0.069652446 -0.3193736 -0.506114433  0.66132205 -0.0032468934
## Mg -0.04849942  0.153514682 -0.3940159  0.573850679  0.19634377 -0.0189271696
## Ca  0.09218847  0.006661993  0.7612744  0.311169923  0.34139699 -0.0209886140
##             PC13
## T  -0.0007082226
## pH  0.0002187098
## Na  0.6860907073
## K   0.0118352532
## Mg  0.0362573708
## Ca  0.0557167989
dim(pca$rotation)
## [1] 13 13

Hay un total de 13 componentes principales distintas, ya que en general pueden haber min(n−1,P) componentes en un set de datos n×p. En este caso min(13, 13) = 13.

#Obteniendo las medias originales y las estandarizadas
pca$center
##           T          pH          Na           K          Mg          Ca 
##   15.542029    7.027204  223.866667   19.258781   71.849104   59.074910 
##        Alk.          Cl         SO4        SiO2         TDS           F 
##  567.404659  270.436559   84.846953   57.717391 1385.003584  423.570397 
##           B 
##  607.189535
pca$scale
##            T           pH           Na            K           Mg           Ca 
##    2.6264102    0.5852965 1515.3277634   32.9464966   51.6246747   97.6805387 
##         Alk.           Cl          SO4         SiO2          TDS            F 
##  397.9591165 2622.2253484   82.9430253   19.6448646 4373.4821766  169.2785992 
##            B 
##  753.8977197

Observamos que los promedios han cambiado

Importancia de los componentes:

summary(pca)
## Importance of components:
##                          PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.317 1.6799 1.2486 1.01729 0.93899 0.67735 0.64232
## Proportion of Variance 0.413 0.2171 0.1199 0.07961 0.06782 0.03529 0.03174
## Cumulative Proportion  0.413 0.6301 0.7500 0.82959 0.89741 0.93270 0.96444
##                            PC8     PC9    PC10    PC11     PC12     PC13
## Standard deviation     0.55320 0.30574 0.18911 0.16420 0.007534 0.004095
## Proportion of Variance 0.02354 0.00719 0.00275 0.00207 0.000000 0.000000
## Cumulative Proportion  0.98798 0.99517 0.99792 0.99999 1.000000 1.000000

Observando las desviaciones estándar pasan 4 valors 1 (2.317),(1.6799),(1.2486)y(1.01729) lo cual indicaría que no se puede formar grupos. Tomemos la decisión de formas 9 grupos.

Observando los componentes formados entre las variables

pca
## Standard deviations (1, .., p=13):
##  [1] 2.317042302 1.679901026 1.248595492 1.017285400 0.938989147 0.677351457
##  [7] 0.642321342 0.553200120 0.305740352 0.189110331 0.164198415 0.007533977
## [13] 0.004095098
## 
## Rotation (n x k) = (13 x 13):
##               PC1         PC2         PC3          PC4         PC5          PC6
## T     0.136192712 -0.27318050  0.51457595  0.055394888 -0.08433851 -0.706986865
## pH   -0.132162509  0.28656357  0.18253852 -0.640005003  0.17570254 -0.098016682
## Na    0.396727156  0.22348173 -0.02362820  0.009351879  0.08265150 -0.037195812
## K     0.407500039  0.15732008  0.03703918  0.028813153  0.04620004  0.006749292
## Mg    0.266698399 -0.39244251  0.06902412 -0.257587853 -0.06307970  0.386096014
## Ca    0.404082698  0.12511134  0.03667790  0.018220776 -0.11786022 -0.022152576
## Alk.  0.233217729 -0.42627858 -0.09957174 -0.232771078  0.07845122  0.277739482
## Cl    0.394402810  0.23157844 -0.02743122  0.015258389  0.07709383 -0.048187232
## SO4  -0.001820351  0.06887796  0.61200831  0.037806788 -0.60050763  0.368988352
## SiO2  0.101690231 -0.37861307  0.03258883  0.528321514  0.23329368  0.031560853
## TDS   0.410477198  0.17674325 -0.01405008 -0.006819769  0.06320757 -0.004464665
## F    -0.062312932 -0.00595508  0.54202179 -0.016012019  0.70670312  0.226904514
## B     0.116027158 -0.42917176 -0.11654719 -0.429479670 -0.06798481 -0.267314496
##              PC7          PC8         PC9         PC10        PC11
## T     0.05725529  0.342512173 -0.10036568  0.030629847 -0.02030488
## pH   -0.62956947  0.088158141  0.10702417  0.018965253  0.03262828
## Na   -0.01803842 -0.055037736 -0.12692565  0.008954831 -0.37831504
## K    -0.06566401 -0.069652446 -0.31937361 -0.506114433  0.66132205
## Mg   -0.04849942  0.153514682 -0.39401594  0.573850679  0.19634377
## Ca    0.09218847  0.006661993  0.76127438  0.311169923  0.34139699
## Alk.  0.02844242  0.457250106  0.28580750 -0.511740209 -0.23749744
## Cl   -0.01344234 -0.077613828 -0.11640437  0.094923402 -0.29041269
## SO4  -0.09257456 -0.222096150  0.04131944 -0.185153223 -0.16726647
## SiO2 -0.67145100 -0.211096250  0.11805386  0.022022059 -0.02915912
## TDS  -0.01640925 -0.030776922 -0.07062980  0.032132990 -0.30177404
## F     0.33430569 -0.181865710  0.07813000  0.003631076  0.01301761
## B     0.11153625 -0.707396318  0.06002420 -0.098172838 -0.04627410
##               PC12          PC13
## T     0.0025562365 -0.0007082226
## pH    0.0009676038  0.0002187098
## Na   -0.3878341885  0.6860907073
## K    -0.0032468934  0.0118352532
## Mg   -0.0189271696  0.0362573708
## Ca   -0.0209886140  0.0557167989
## Alk.  0.0790422003  0.0200509685
## Cl    0.8163809669 -0.0445493765
## SO4   0.0251463438  0.0033175253
## SiO2  0.0023307855  0.0042563038
## TDS  -0.4188104546 -0.7227059594
## F     0.0005309019  0.0001083877
## B    -0.0014377951  0.0004838945

PC1: T, SO4 y F

PC2: pH, sio2

pc3: so4, F

pc4: pH, sion2

pc5: B

pc6: Ca

pc7: K, mg, alk

pc8: mg,

pc9: CI

pc10: Na, TDS

head(pca$x)
##          PC1       PC2        PC3        PC4        PC5        PC6         PC7
## 1 -1.2535876 0.8561843 -2.2464536  1.2917491  0.4218158  0.8831636 -0.35618118
## 2 -0.2062781 0.1530381 -0.9679597  1.0751717 -0.9116472 -0.1647192 -0.08995261
## 3 -1.8724095 2.6085487 -1.8912115 -1.6958557 -0.4907340 -0.1121410  0.31497998
## 4 -1.1924992 1.6369774 -2.1172449  0.2878570 -0.6538190  0.5741960  0.88955252
## 5 -0.7717266 1.6449251 -1.3068634 -0.2989835 -1.0007858  0.1866089 -0.10649719
## 6 -0.7962054 1.1254591 -1.8439328  0.2532316 -0.5293883  0.7592181  0.57346895
##           PC8         PC9          PC10        PC11         PC12          PC13
## 1 -1.55252599  0.21408926  0.0497285265 -0.08913391 -0.002572394  0.0022071666
## 2  0.32863120 -0.07586143 -0.2419917043 -0.05372308  0.003543838 -0.0016897110
## 3 -0.41249080 -0.01570493  0.0317565036 -0.03907811 -0.001642762 -0.0021730706
## 4 -0.16453316 -0.04897481  0.0002472432 -0.02386110 -0.002387001 -0.0001692588
## 5  0.20137389  0.41832450  0.0108713589  0.13085190 -0.001163117  0.0007241825
## 6 -0.09725678 -0.02008797 -0.0570347172  0.02746829 -0.003660400  0.0008757448

\(vemos componentes para casos:\)

dim(pca$x)
## [1] 279  13

Mediante la función biplot() se puede obtener una representación bidimensional de las dos primeras componentes. Es recomendable indicar el argumento scale = 0 para que las flechas estén en la misma escala que las componentes.

par(mfrow=c(1,1))
biplot(x = pca, scale = 0, cex = 0.8, col = c("blue4", "brown3"))

La imagen especular, cuya interpretación es equivalente, se puede obtener invirtiendo el signo de los loadings y de los principal component scores.

pca$rotation <- -pca$rotation
pca$x <- -pca$x
biplot(x = pca, scale = 0, cex = 0.8, col = c("blue4", "brown3"))

Una vez calculadas las componentes principales, se puede conocer la varianza explicada por cada una de ellas, la proporción respecto al total y la proporción de varianza acumulada.

# graficando la proporción de varianza explicada
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
pca$sdev^2
##  [1] 5.368685e+00 2.822067e+00 1.558991e+00 1.034870e+00 8.817006e-01
##  [6] 4.588050e-01 4.125767e-01 3.060304e-01 9.347716e-02 3.576272e-02
## [11] 2.696112e-02 5.676080e-05 1.676983e-05
prop_varianza <- pca$sdev^2/sum(pca$sdev^2)
prop_varianza
##  [1] 4.129758e-01 2.170821e-01 1.199224e-01 7.960535e-02 6.782312e-02
##  [6] 3.529269e-02 3.173667e-02 2.354080e-02 7.190551e-03 2.750978e-03
## [11] 2.073932e-03 4.366216e-06 1.289987e-06
ggplot(data = data.frame(prop_varianza, pc = 1:13),aes(x = pc, y = 
                                                         prop_varianza)) +
  geom_col(width = 0.3) + 
  scale_y_continuous(limits = c(0, 1)) + 
  theme_bw() +
  labs(x = "Componente principal", y = "Proporcion de varianza explicada") 

prop_varianza_acum <- cumsum(prop_varianza) 
prop_varianza_acum
##  [1] 0.4129758 0.6300579 0.7499802 0.8295856 0.8974087 0.9327014 0.9644381
##  [8] 0.9879789 0.9951694 0.9979204 0.9999943 0.9999987 1.0000000
ggplot(data = data.frame(prop_varianza_acum, pc = 1:13),
       aes(x = pc, y = prop_varianza_acum, group = 1)) +
  geom_point() + 
  geom_line() +
  geom_label(aes(label=round(prop_varianza_acum,2)))+ 
  theme_bw() +
  labs(x = "Componente principal", y = "Proporcion de varianza explicada a cumulada")

En este caso, la primera componente explica el 41% de la varianza observada en los datos y la segunda el 22.4%. Las dos últimas componentes no superan por separado el 1% de varianza explicada. Si se empleasen únicamente las dos primeras componentes se conseguiría explicar el 63% de la varianza observada.

library(psych) 
facto=principal(r=USArrests,nfactors=4,rotate="none")
facto$values
##  [1] 5.368685e+00 2.822067e+00 1.558991e+00 1.034870e+00 8.817006e-01
##  [6] 4.588050e-01 4.125767e-01 3.060304e-01 9.347716e-02 3.576272e-02
## [11] 2.696112e-02 5.676080e-05 1.676983e-05
# Grafico de sedimentacion:
plot(facto$values,type="h") # Grafica de Valores propios

facto$communality   # Comunalidades
##         T        pH        Na         K        Mg        Ca      Alk.        Cl 
## 0.7261632 0.8013542 0.9868969 0.9643469 0.8925865 0.9232283 0.8763414 0.9878756 
##       SO4      SiO2       TDS         F         B 
## 0.5988118 0.7505666 0.9930901 0.4792237 0.8041277
facto$loadings  # Cargas Factoriales, Componentes
## 
## Loadings:
##      PC1    PC2    PC3    PC4   
## T     0.316  0.459  0.642       
## pH   -0.306 -0.481  0.228  0.651
## Na    0.919 -0.375              
## K     0.944 -0.264              
## Mg    0.618  0.659         0.262
## Ca    0.936 -0.210              
## Alk.  0.540  0.716 -0.124  0.237
## Cl    0.914 -0.389              
## SO4         -0.116  0.764       
## SiO2  0.236  0.636        -0.537
## TDS   0.951 -0.297              
## F    -0.144         0.677       
## B     0.269  0.721 -0.146  0.437
## 
##                  PC1   PC2   PC3   PC4
## SS loadings    5.369 2.822 1.559 1.035
## Proportion Var 0.413 0.217 0.120 0.080
## Cumulative Var 0.413 0.630 0.750 0.830
# guardando scores
head(facto$scores)
##           PC1         PC2        PC3        PC4
## 1 -0.54102924 -0.50966355 -1.7991845 -1.2698001
## 2 -0.08902648 -0.09109946 -0.7752388 -1.0569027
## 3 -0.80810330 -1.55279904 -1.5146711  1.6670403
## 4 -0.51466439 -0.97444873 -1.6957013 -0.2829658
## 5 -0.33306538 -0.97917976 -1.0466668  0.2939033
## 6 -0.34363007 -0.66995562 -1.4768056 -0.2489288
puntosFact <-cbind(USArrests,facto$scores) 
head(puntosFact)
##      T   pH    Na    K   Mg   Ca  Alk.   Cl  SO4 SiO2 TDS   F        B
## 1  8.4 6.60  15.6  5.5 13.1 17.8 122.0 18.5 18.5   75 286 341 607.1895
## 2 15.0 6.52 110.0 18.8 44.5 46.7 519.0 47.1 79.2   62 933 202 190.0000
## 3 10.2 7.86  20.5  2.0  5.2 14.2  79.3 13.2 24.9   11 177 239 607.1895
## 4 10.2 6.62  40.7  7.0 21.7 34.7 256.0 20.5 34.3   31 449 238 145.0000
## 5 12.2 7.30  55.9 12.1 29.2 89.6 354.0 62.9 79.2   37 760 192 130.0000
## 6 10.7 6.67  69.7 12.5 41.8 42.5 415.0 32.9 46.0   40 708 262 230.0000
##           PC1         PC2        PC3        PC4
## 1 -0.54102924 -0.50966355 -1.7991845 -1.2698001
## 2 -0.08902648 -0.09109946 -0.7752388 -1.0569027
## 3 -0.80810330 -1.55279904 -1.5146711  1.6670403
## 4 -0.51466439 -0.97444873 -1.6957013 -0.2829658
## 5 -0.33306538 -0.97917976 -1.0466668  0.2939033
## 6 -0.34363007 -0.66995562 -1.4768056 -0.2489288
write.csv(puntosFact,"fiscoquimicosScores.csv")
# utilizando rotacion
facto=principal(r=USArrests,nfactors=8,rotate="varimax")
facto$values
##  [1] 5.368685e+00 2.822067e+00 1.558991e+00 1.034870e+00 8.817006e-01
##  [6] 4.588050e-01 4.125767e-01 3.060304e-01 9.347716e-02 3.576272e-02
## [11] 2.696112e-02 5.676080e-05 1.676983e-05
facto$communality
##         T        pH        Na         K        Mg        Ca      Alk.        Cl 
## 0.9990137 0.9988877 0.9946160 0.9695133 0.9726716 0.9392211 0.9814777 0.9960994 
##       SO4      SiO2       TDS         F         B 
## 0.9978600 0.9986570 0.9970228 0.9994243 0.9992608
facto$loadings
## 
## Loadings:
##      RC1    RC2    RC3    RC5    RC4    RC7    RC6    RC8   
## T            0.235  0.209  0.154         0.155  0.909  0.108
## pH          -0.185         0.117  0.938 -0.238              
## Na    0.995                                                 
## K     0.970  0.137                                          
## Mg    0.262  0.888  0.120                0.164  0.127  0.217
## Ca    0.926  0.180  0.105        -0.121         0.112       
## Alk.  0.166  0.922 -0.169        -0.145  0.140  0.133  0.123
## Cl    0.996                                                 
## SO4                 0.976                       0.168       
## SiO2         0.254               -0.258  0.915  0.150       
## TDS   0.988  0.139                                          
## F                          0.977  0.102         0.124       
## B            0.536 -0.109                       0.142  0.814
## 
##                  RC1   RC2   RC3   RC5   RC4   RC7   RC6   RC8
## SS loadings    4.879 2.159 1.078 1.020 1.013 0.979 0.966 0.750
## Proportion Var 0.375 0.166 0.083 0.078 0.078 0.075 0.074 0.058
## Cumulative Var 0.375 0.541 0.624 0.703 0.781 0.856 0.930 0.988

\(utlizando la funcion PCA\)

library(FactoMineR)
# observando los resultados de 2 componentes
result<- PCA(USArrests,scale.unit = TRUE, ncp=4,graph = TRUE)

Existe bastante relación positiva entre sio2,b,t,aik,mg y forman el componente 1. tambien exisiste telacion entre so4,ca,tds,k,ci,Na forman el componente 2

PH no está relacionado con las demás variables

fa.diagram(facto)