Análisis descriptivo

el libro titulado “Técnicas de análisis multivariante de datos- aplicaciones con SPSS-césar pérez” sugiere los siguientes pasos para el abordaje de un conjunto de datos

Análisis exploratorio y gráfico de los datos
- Histogramas de frecuencias
- Diagrama de tallo y hojas
- Gráfico de caja y bigotes
- Gráfico multiple de caja y bigotes
- Gráfico de simetría
- Gráfico de dispersión
Análisis de los datos ausentes
- Supresión de casos según lista
- Supresión de datos según pareja
- suprimir los casos(filas) o variables (columnas)
- Imputación de la información faltante
  - Método de imputación por sustitución del caso
  - Método de sustitución por la media
  - Método de sustitución por la mediana
  - Método de sustitución por interpolación
  - Método de sustitución por un valor constante
  - Método de imputación por regresión
  - Método de imputación multiple
Análisis y detección de valores atípicos
- Detección univariante de casos atípicos
- Detección bivariante de casos atípicos
Comprobación de los supuestos del análisis multivariante
- Normalidad
- Heteroscedasticidad
- Multicolinealidad
- Autocorrelación
- Linealidad
- Análisis de los residuos

Análisis exploratorio y gráfico de los datos

usaré la base USairpollution

library(HSAUR2)
datos <- USairpollution
head(datos)

##             SO2 temp manu popul wind precip predays
## Albany       46 47.6   44   116  8.8  33.36     135
## Albuquerque  11 56.8   46   244  8.9   7.77      58
## Atlanta      24 61.5  368   497  9.1  48.34     115
## Baltimore    47 55.0  625   905  9.6  41.31     111
## Buffalo      11 47.1  391   463 12.4  36.11     166
## Charleston   31 55.2   35    71  6.5  40.75     148

str(datos)

## 'data.frame':    41 obs. of  7 variables:
##  $ SO2    : int  46 11 24 47 11 31 110 23 65 26 ...
##  $ temp   : num  47.6 56.8 61.5 55 47.1 55.2 50.6 54 49.7 51.5 ...
##  $ manu   : int  44 46 368 625 391 35 3344 462 1007 266 ...
##  $ popul  : int  116 244 497 905 463 71 3369 453 751 540 ...
##  $ wind   : num  8.8 8.9 9.1 9.6 12.4 6.5 10.4 7.1 10.9 8.6 ...
##  $ precip : num  33.36 7.77 48.34 41.31 36.11 ...
##  $ predays: int  135 58 115 111 166 148 122 132 155 134 ...

summary(datos)

##       SO2              temp            manu            popul       
##  Min.   :  8.00   Min.   :43.50   Min.   :  35.0   Min.   :  71.0  
##  1st Qu.: 13.00   1st Qu.:50.60   1st Qu.: 181.0   1st Qu.: 299.0  
##  Median : 26.00   Median :54.60   Median : 347.0   Median : 515.0  
##  Mean   : 30.05   Mean   :55.76   Mean   : 463.1   Mean   : 608.6  
##  3rd Qu.: 35.00   3rd Qu.:59.30   3rd Qu.: 462.0   3rd Qu.: 717.0  
##  Max.   :110.00   Max.   :75.50   Max.   :3344.0   Max.   :3369.0  
##       wind            precip         predays     
##  Min.   : 6.000   Min.   : 7.05   Min.   : 36.0  
##  1st Qu.: 8.700   1st Qu.:30.96   1st Qu.:103.0  
##  Median : 9.300   Median :38.74   Median :115.0  
##  Mean   : 9.444   Mean   :36.77   Mean   :113.9  
##  3rd Qu.:10.600   3rd Qu.:43.11   3rd Qu.:128.0  
##  Max.   :12.700   Max.   :59.80   Max.   :166.0

#devtools::install_github('hadley/ggplot2')

todas las variables son cuantitativas, ahora haremos un gráfico de dispersion, histogramas y correlación

## correlación, histograma, scatterplot ###
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if (missing(cex.cor)) 
    cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex =1.5,font=4)
}

panel.hist <- function(x, ...) {
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  nB <- length(breaks)
  y <- h$counts
  y <- y/max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col = "magenta3", ...)
  
}

# run pairs plot
pairs(datos, upper.panel = panel.smooth, diag.panel = panel.hist, 
      lower.panel = panel.cor,main="Análisis Descriptivo")
require(PerformanceAnalytics)

chart.Correlation(datos, histogram = TRUE, method = "pearson")

histogramas

require(ggplot2);require(gridExtra)
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

p1.1 <- ggplot(datos, aes(SO2)) +
      geom_histogram(aes(y=..density..), colour="black", fill="white") +
      geom_density(alpha=.2, fill="#FF6666")
p2.1 <- ggplot(datos, aes(manu)) +
      geom_histogram(aes(y=..density..), colour="black", fill="white") +
      geom_density(alpha=.2, fill="#FF6666")
multiplot(p1.1,p2.1, cols = 2)

gráficos de dispersión

mlab <- 'popul'
plab <- 'manu'
plot(manu~popul, data = datos, xlab=mlab, ylab = plab)
rug(datos$popul,side=1, col="red")
rug(datos$manu,side=2 ,col="blue")

## con ggplot2
qplot(popul, manu, data = datos, main = "Diagrama de dispersión", xlab = "Popul", ylab = 'manu') +
  geom_rug(sides = "b", color = "red") +
  geom_rug(sides = "l", color = "blue")

gráfico de contorno

commonTheme = list(labs(color="Density",fill="Density"
                        ),
                   theme_bw(),
                   theme(legend.position=c(0,1), legend.direction ="horizontal",
                         legend.justification=c(0,1)))
p8 <-ggplot(data=datos,aes(x = popul, y = manu)) + 
  stat_density2d(aes(fill=..level..,alpha=..level..),geom='polygon',colour='black') + 
  scale_fill_continuous(low="green",high="red") +
  geom_smooth(method=lm,linetype=2,colour="red",se=F) + 
  guides(alpha="none") +
  geom_point() + commonTheme #+ xlab("PROMLECTURACRITICA")
p8

gráfico burbuja

ggplot(datos, aes(temp,wind))+
  geom_point(aes(size=SO2,colour=precip), alpha=.5)+
  scale_size()+
  theme_bw()

marginales

library(ggplot2);library(gridExtra)

pMain <- ggplot(datos, aes(x = popul, y = manu)) +
  geom_point()+
  xlab("popul")+
  ylab("manu")
pTop <- ggplot(datos, aes(x = popul)) +
  geom_histogram(bins =5 ) #+ ylab("Tiempo de Reacción (segundos)")+
  ggtitle("Histograma del popul")

## $title
## [1] "Histograma del popul"
## 
## $subtitle
## NULL
## 
## attr(,"class")
## [1] "labels"

pRight <- ggplot(datos, aes(x = popul, y = manu)) +
  geom_boxplot()+ 
  xlab("popul")+
  ylab("manu")+
  ggtitle("Boxplot")
pEmpty <- ggplot(datos, aes(x = popul, y = manu)) +
  geom_blank() +
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        line = element_blank(),
        panel.background = element_blank())
grid.arrange(pTop, pEmpty, pMain, pRight,
             ncol = 2, nrow = 2, widths = c(3, 1), heights = c(1, 3))

gráfico estrella

stars(datos,cex = .5)

gráfico dispersión en 3D

library(scatterplot3d)
with(datos,scatterplot3d(temp,wind,SO2,type = 'h',angle = 55))

gráfico stalac

require(MVA)
stalac(datos)

Comprobación de los supuestos del análisis multivariante

Shapiro test

require(mvShapiroTest)
mvShapiro.Test(as.matrix(datos))

## 
##  Generalized Shapiro-Wilk test for Multivariate Normality by
##  Villasenor-Alva and Gonzalez-Estrada
## 
## data:  as.matrix(datos)
## MVW = 0.93207, p-value = 6.351e-07

Mardia test

require(MVN)
mardiaTest(data=datos, cov = TRUE, qqplot = TRUE)

##    Mardia's Multivariate Normality Test 
## --------------------------------------- 
##    data : datos 
## 
##    g1p            : 33.16284 
##    chi.skew       : 226.6127 
##    p.value.skew   : 4.824913e-15 
## 
##    g2p            : 76.94565 
##    z.kurtosis     : 3.977547 
##    p.value.kurt   : 6.962989e-05 
## 
##    chi.small.skew : 247.6159 
##    p.value.small  : 4.801002e-18 
## 
##    Result          : Data are not multivariate normal. 
## ---------------------------------------

Royston test

roystonTest(data=datos, qqplot = TRUE)

##   Royston's Multivariate Normality Test 
## --------------------------------------------- 
##   data : datos 
## 
##   H       : 84.49088 
##   p-value : 3.751309e-16 
## 
##   Result  : Data are not multivariate normal. 
## ---------------------------------------------

Henze-Zirkler test

hzTest(data=datos, qqplot = TRUE)

##   Henze-Zirkler's Multivariate Normality Test 
## --------------------------------------------- 
##   data : datos 
## 
##   HZ      : 1.197216 
##   p-value : 2.819996e-10 
## 
##   Result  : Data are not multivariate normal. 
## ---------------------------------------------

Normal qqplot univariado

qqplot.data <- function (vec) # argument: vector of numbers
{
  # following four lines from base R's qqline()
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]
  
  d <- data.frame(resids = vec)
  
  ggplot(d, aes(sample = resids)) + stat_qq() + 
    geom_abline(slope = slope, intercept = int, col = "blue") +
    ggtitle("Normal Q-Q Plot")
  
}
qqplot.data(datos$temp)

Análisis descriptivo

Kevin Hidalgo

18 de agosto de 2017

Análisis descriptivo

Análisis exploratorio y gráfico de los datos

histogramas

gráficos de dispersión

gráfico de contorno

gráfico burbuja

marginales

gráfico estrella

gráfico dispersión en 3D

gráfico stalac

Comprobación de los supuestos del análisis multivariante

Shapiro test

Mardia test

Royston test

Henze-Zirkler test

Normal qqplot univariado