Análisis descriptivo

el libro titulado “Técnicas de análisis multivariante de datos- aplicaciones con SPSS-césar pérez” sugiere los siguientes pasos para el abordaje de un conjunto de datos

Análisis exploratorio y gráfico de los datos

usaré la base USairpollution

library(HSAUR2)
datos <- USairpollution
head(datos)
##             SO2 temp manu popul wind precip predays
## Albany       46 47.6   44   116  8.8  33.36     135
## Albuquerque  11 56.8   46   244  8.9   7.77      58
## Atlanta      24 61.5  368   497  9.1  48.34     115
## Baltimore    47 55.0  625   905  9.6  41.31     111
## Buffalo      11 47.1  391   463 12.4  36.11     166
## Charleston   31 55.2   35    71  6.5  40.75     148
str(datos)
## 'data.frame':    41 obs. of  7 variables:
##  $ SO2    : int  46 11 24 47 11 31 110 23 65 26 ...
##  $ temp   : num  47.6 56.8 61.5 55 47.1 55.2 50.6 54 49.7 51.5 ...
##  $ manu   : int  44 46 368 625 391 35 3344 462 1007 266 ...
##  $ popul  : int  116 244 497 905 463 71 3369 453 751 540 ...
##  $ wind   : num  8.8 8.9 9.1 9.6 12.4 6.5 10.4 7.1 10.9 8.6 ...
##  $ precip : num  33.36 7.77 48.34 41.31 36.11 ...
##  $ predays: int  135 58 115 111 166 148 122 132 155 134 ...
summary(datos)
##       SO2              temp            manu            popul       
##  Min.   :  8.00   Min.   :43.50   Min.   :  35.0   Min.   :  71.0  
##  1st Qu.: 13.00   1st Qu.:50.60   1st Qu.: 181.0   1st Qu.: 299.0  
##  Median : 26.00   Median :54.60   Median : 347.0   Median : 515.0  
##  Mean   : 30.05   Mean   :55.76   Mean   : 463.1   Mean   : 608.6  
##  3rd Qu.: 35.00   3rd Qu.:59.30   3rd Qu.: 462.0   3rd Qu.: 717.0  
##  Max.   :110.00   Max.   :75.50   Max.   :3344.0   Max.   :3369.0  
##       wind            precip         predays     
##  Min.   : 6.000   Min.   : 7.05   Min.   : 36.0  
##  1st Qu.: 8.700   1st Qu.:30.96   1st Qu.:103.0  
##  Median : 9.300   Median :38.74   Median :115.0  
##  Mean   : 9.444   Mean   :36.77   Mean   :113.9  
##  3rd Qu.:10.600   3rd Qu.:43.11   3rd Qu.:128.0  
##  Max.   :12.700   Max.   :59.80   Max.   :166.0
#devtools::install_github('hadley/ggplot2')

todas las variables son cuantitativas, ahora haremos un gráfico de dispersion, histogramas y correlación

## correlación, histograma, scatterplot ###
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if (missing(cex.cor)) 
    cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex =1.5,font=4)
}

panel.hist <- function(x, ...) {
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  nB <- length(breaks)
  y <- h$counts
  y <- y/max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col = "magenta3", ...)
  
}

# run pairs plot
pairs(datos, upper.panel = panel.smooth, diag.panel = panel.hist, 
      lower.panel = panel.cor,main="Análisis Descriptivo")
require(PerformanceAnalytics)

chart.Correlation(datos, histogram = TRUE, method = "pearson")

histogramas

require(ggplot2);require(gridExtra)
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

p1.1 <- ggplot(datos, aes(SO2)) +
      geom_histogram(aes(y=..density..), colour="black", fill="white") +
      geom_density(alpha=.2, fill="#FF6666")
p2.1 <- ggplot(datos, aes(manu)) +
      geom_histogram(aes(y=..density..), colour="black", fill="white") +
      geom_density(alpha=.2, fill="#FF6666")
multiplot(p1.1,p2.1, cols = 2)

gráficos de dispersión

mlab <- 'popul'
plab <- 'manu'
plot(manu~popul, data = datos, xlab=mlab, ylab = plab)
rug(datos$popul,side=1, col="red")
rug(datos$manu,side=2 ,col="blue")

## con ggplot2
qplot(popul, manu, data = datos, main = "Diagrama de dispersión", xlab = "Popul", ylab = 'manu') +
  geom_rug(sides = "b", color = "red") +
  geom_rug(sides = "l", color = "blue")

gráfico de contorno

commonTheme = list(labs(color="Density",fill="Density"
                        ),
                   theme_bw(),
                   theme(legend.position=c(0,1), legend.direction ="horizontal",
                         legend.justification=c(0,1)))
p8 <-ggplot(data=datos,aes(x = popul, y = manu)) + 
  stat_density2d(aes(fill=..level..,alpha=..level..),geom='polygon',colour='black') + 
  scale_fill_continuous(low="green",high="red") +
  geom_smooth(method=lm,linetype=2,colour="red",se=F) + 
  guides(alpha="none") +
  geom_point() + commonTheme #+ xlab("PROMLECTURACRITICA")
p8

gráfico burbuja

ggplot(datos, aes(temp,wind))+
  geom_point(aes(size=SO2,colour=precip), alpha=.5)+
  scale_size()+
  theme_bw()

marginales

library(ggplot2);library(gridExtra)

pMain <- ggplot(datos, aes(x = popul, y = manu)) +
  geom_point()+
  xlab("popul")+
  ylab("manu")
pTop <- ggplot(datos, aes(x = popul)) +
  geom_histogram(bins =5 ) #+ ylab("Tiempo de Reacción (segundos)")+
  ggtitle("Histograma del popul")
## $title
## [1] "Histograma del popul"
## 
## $subtitle
## NULL
## 
## attr(,"class")
## [1] "labels"
pRight <- ggplot(datos, aes(x = popul, y = manu)) +
  geom_boxplot()+ 
  xlab("popul")+
  ylab("manu")+
  ggtitle("Boxplot")
pEmpty <- ggplot(datos, aes(x = popul, y = manu)) +
  geom_blank() +
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        line = element_blank(),
        panel.background = element_blank())
grid.arrange(pTop, pEmpty, pMain, pRight,
             ncol = 2, nrow = 2, widths = c(3, 1), heights = c(1, 3))

gráfico estrella

stars(datos,cex = .5)

gráfico dispersión en 3D

library(scatterplot3d)
with(datos,scatterplot3d(temp,wind,SO2,type = 'h',angle = 55))

gráfico stalac

require(MVA)
stalac(datos)

Comprobación de los supuestos del análisis multivariante

Shapiro test

require(mvShapiroTest)
mvShapiro.Test(as.matrix(datos))
## 
##  Generalized Shapiro-Wilk test for Multivariate Normality by
##  Villasenor-Alva and Gonzalez-Estrada
## 
## data:  as.matrix(datos)
## MVW = 0.93207, p-value = 6.351e-07

Mardia test

require(MVN)
mardiaTest(data=datos, cov = TRUE, qqplot = TRUE)

##    Mardia's Multivariate Normality Test 
## --------------------------------------- 
##    data : datos 
## 
##    g1p            : 33.16284 
##    chi.skew       : 226.6127 
##    p.value.skew   : 4.824913e-15 
## 
##    g2p            : 76.94565 
##    z.kurtosis     : 3.977547 
##    p.value.kurt   : 6.962989e-05 
## 
##    chi.small.skew : 247.6159 
##    p.value.small  : 4.801002e-18 
## 
##    Result          : Data are not multivariate normal. 
## ---------------------------------------

Royston test

roystonTest(data=datos, qqplot = TRUE)

##   Royston's Multivariate Normality Test 
## --------------------------------------------- 
##   data : datos 
## 
##   H       : 84.49088 
##   p-value : 3.751309e-16 
## 
##   Result  : Data are not multivariate normal. 
## ---------------------------------------------

Henze-Zirkler test

hzTest(data=datos, qqplot = TRUE)

##   Henze-Zirkler's Multivariate Normality Test 
## --------------------------------------------- 
##   data : datos 
## 
##   HZ      : 1.197216 
##   p-value : 2.819996e-10 
## 
##   Result  : Data are not multivariate normal. 
## ---------------------------------------------

Normal qqplot univariado

qqplot.data <- function (vec) # argument: vector of numbers
{
  # following four lines from base R's qqline()
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]
  
  d <- data.frame(resids = vec)
  
  ggplot(d, aes(sample = resids)) + stat_qq() + 
    geom_abline(slope = slope, intercept = int, col = "blue") +
    ggtitle("Normal Q-Q Plot")
  
}
qqplot.data(datos$temp)