el libro titulado “Técnicas de análisis multivariante de datos- aplicaciones con SPSS-césar pérez” sugiere los siguientes pasos para el abordaje de un conjunto de datos
usaré la base USairpollution
library(HSAUR2)
datos <- USairpollution
head(datos)
## SO2 temp manu popul wind precip predays
## Albany 46 47.6 44 116 8.8 33.36 135
## Albuquerque 11 56.8 46 244 8.9 7.77 58
## Atlanta 24 61.5 368 497 9.1 48.34 115
## Baltimore 47 55.0 625 905 9.6 41.31 111
## Buffalo 11 47.1 391 463 12.4 36.11 166
## Charleston 31 55.2 35 71 6.5 40.75 148
str(datos)
## 'data.frame': 41 obs. of 7 variables:
## $ SO2 : int 46 11 24 47 11 31 110 23 65 26 ...
## $ temp : num 47.6 56.8 61.5 55 47.1 55.2 50.6 54 49.7 51.5 ...
## $ manu : int 44 46 368 625 391 35 3344 462 1007 266 ...
## $ popul : int 116 244 497 905 463 71 3369 453 751 540 ...
## $ wind : num 8.8 8.9 9.1 9.6 12.4 6.5 10.4 7.1 10.9 8.6 ...
## $ precip : num 33.36 7.77 48.34 41.31 36.11 ...
## $ predays: int 135 58 115 111 166 148 122 132 155 134 ...
summary(datos)
## SO2 temp manu popul
## Min. : 8.00 Min. :43.50 Min. : 35.0 Min. : 71.0
## 1st Qu.: 13.00 1st Qu.:50.60 1st Qu.: 181.0 1st Qu.: 299.0
## Median : 26.00 Median :54.60 Median : 347.0 Median : 515.0
## Mean : 30.05 Mean :55.76 Mean : 463.1 Mean : 608.6
## 3rd Qu.: 35.00 3rd Qu.:59.30 3rd Qu.: 462.0 3rd Qu.: 717.0
## Max. :110.00 Max. :75.50 Max. :3344.0 Max. :3369.0
## wind precip predays
## Min. : 6.000 Min. : 7.05 Min. : 36.0
## 1st Qu.: 8.700 1st Qu.:30.96 1st Qu.:103.0
## Median : 9.300 Median :38.74 Median :115.0
## Mean : 9.444 Mean :36.77 Mean :113.9
## 3rd Qu.:10.600 3rd Qu.:43.11 3rd Qu.:128.0
## Max. :12.700 Max. :59.80 Max. :166.0
#devtools::install_github('hadley/ggplot2')
todas las variables son cuantitativas, ahora haremos un gráfico de dispersion, histogramas y correlación
## correlación, histograma, scatterplot ###
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if (missing(cex.cor))
cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex =1.5,font=4)
}
panel.hist <- function(x, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5))
h <- hist(x, plot = FALSE)
breaks <- h$breaks
nB <- length(breaks)
y <- h$counts
y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "magenta3", ...)
}
# run pairs plot
pairs(datos, upper.panel = panel.smooth, diag.panel = panel.hist,
lower.panel = panel.cor,main="Análisis Descriptivo")
require(PerformanceAnalytics)
chart.Correlation(datos, histogram = TRUE, method = "pearson")
require(ggplot2);require(gridExtra)
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
p1.1 <- ggplot(datos, aes(SO2)) +
geom_histogram(aes(y=..density..), colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666")
p2.1 <- ggplot(datos, aes(manu)) +
geom_histogram(aes(y=..density..), colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666")
multiplot(p1.1,p2.1, cols = 2)
mlab <- 'popul'
plab <- 'manu'
plot(manu~popul, data = datos, xlab=mlab, ylab = plab)
rug(datos$popul,side=1, col="red")
rug(datos$manu,side=2 ,col="blue")
## con ggplot2
qplot(popul, manu, data = datos, main = "Diagrama de dispersión", xlab = "Popul", ylab = 'manu') +
geom_rug(sides = "b", color = "red") +
geom_rug(sides = "l", color = "blue")
commonTheme = list(labs(color="Density",fill="Density"
),
theme_bw(),
theme(legend.position=c(0,1), legend.direction ="horizontal",
legend.justification=c(0,1)))
p8 <-ggplot(data=datos,aes(x = popul, y = manu)) +
stat_density2d(aes(fill=..level..,alpha=..level..),geom='polygon',colour='black') +
scale_fill_continuous(low="green",high="red") +
geom_smooth(method=lm,linetype=2,colour="red",se=F) +
guides(alpha="none") +
geom_point() + commonTheme #+ xlab("PROMLECTURACRITICA")
p8
ggplot(datos, aes(temp,wind))+
geom_point(aes(size=SO2,colour=precip), alpha=.5)+
scale_size()+
theme_bw()
library(ggplot2);library(gridExtra)
pMain <- ggplot(datos, aes(x = popul, y = manu)) +
geom_point()+
xlab("popul")+
ylab("manu")
pTop <- ggplot(datos, aes(x = popul)) +
geom_histogram(bins =5 ) #+ ylab("Tiempo de Reacción (segundos)")+
ggtitle("Histograma del popul")
## $title
## [1] "Histograma del popul"
##
## $subtitle
## NULL
##
## attr(,"class")
## [1] "labels"
pRight <- ggplot(datos, aes(x = popul, y = manu)) +
geom_boxplot()+
xlab("popul")+
ylab("manu")+
ggtitle("Boxplot")
pEmpty <- ggplot(datos, aes(x = popul, y = manu)) +
geom_blank() +
theme(axis.text = element_blank(),
axis.title = element_blank(),
line = element_blank(),
panel.background = element_blank())
grid.arrange(pTop, pEmpty, pMain, pRight,
ncol = 2, nrow = 2, widths = c(3, 1), heights = c(1, 3))
stars(datos,cex = .5)
library(scatterplot3d)
with(datos,scatterplot3d(temp,wind,SO2,type = 'h',angle = 55))
require(MVA)
stalac(datos)
require(mvShapiroTest)
mvShapiro.Test(as.matrix(datos))
##
## Generalized Shapiro-Wilk test for Multivariate Normality by
## Villasenor-Alva and Gonzalez-Estrada
##
## data: as.matrix(datos)
## MVW = 0.93207, p-value = 6.351e-07
require(MVN)
mardiaTest(data=datos, cov = TRUE, qqplot = TRUE)
## Mardia's Multivariate Normality Test
## ---------------------------------------
## data : datos
##
## g1p : 33.16284
## chi.skew : 226.6127
## p.value.skew : 4.824913e-15
##
## g2p : 76.94565
## z.kurtosis : 3.977547
## p.value.kurt : 6.962989e-05
##
## chi.small.skew : 247.6159
## p.value.small : 4.801002e-18
##
## Result : Data are not multivariate normal.
## ---------------------------------------
roystonTest(data=datos, qqplot = TRUE)
## Royston's Multivariate Normality Test
## ---------------------------------------------
## data : datos
##
## H : 84.49088
## p-value : 3.751309e-16
##
## Result : Data are not multivariate normal.
## ---------------------------------------------
hzTest(data=datos, qqplot = TRUE)
## Henze-Zirkler's Multivariate Normality Test
## ---------------------------------------------
## data : datos
##
## HZ : 1.197216
## p-value : 2.819996e-10
##
## Result : Data are not multivariate normal.
## ---------------------------------------------
qqplot.data <- function (vec) # argument: vector of numbers
{
# following four lines from base R's qqline()
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
d <- data.frame(resids = vec)
ggplot(d, aes(sample = resids)) + stat_qq() +
geom_abline(slope = slope, intercept = int, col = "blue") +
ggtitle("Normal Q-Q Plot")
}
qqplot.data(datos$temp)