Esta práctica inicial de laboratorio tiene como objetivo realizar una primera aproximación al Lenguaje R, utilizando el enfoque de análisis exploratorio de datos sobre un dataset, a efectos de repasar conceptos fundamentales de estadística descriptiva.
A partir del dataset MPI_subnational.csv (Multidimensional Poverty Measures), se solicita trabajar sobre las siguientes consignas:
Explore y explique en que consiste el dataset utilizando herramientas de exploración de datos.
Para comenzar cargamos las librerías necesarias para esta práctica:
library(pacman)
pacman::p_load(tidyverse, modeest, WVPlots, DT, plotly, GGally)
options(warn=-1)Luego cargamos el dataset a analizar:
read_csv <- function(name) {
read.csv(
paste('https://raw.githubusercontent.com/adrianmarino/dm/master/datasets/', name, '.csv', sep=''),
header = TRUE,
sep = ','
)
}
dataset = read_csv('MPI_subnational')Renombramos las columnas del dataset a nombres mas cortos y reemplazamos valores nulos por ceros:
dataset = dataset %>%
rename(
country_code = ISO.country.code,
country = Country,
sub_nat_region = Sub.national.region,
world_region = World.region,
mpi_national = MPI.National,
mpi_regional = MPI.Regional,
hc_regional = Headcount.Ratio.Regional,
iod_regional = Intensity.of.deprivation.Regional
) %>%
replace_na(list(
iod_regional = 0,
mpi_national = 0,
hc_regional = 0,
mpi_regional = 0
))Creamos una nueva tabla con las columnas numéricas normalizadas, que usaremos para comparar distribuciones:
numeric_columns = c("mpi_national", "mpi_regional", "hc_regional", "iod_regional")
normalize <- function(x) { (x - min(x)) / ( max(x) - min(x) ) }
dataset.n <- dataset %>% mutate_at(numeric_columns, normalize)
summary(dataset.n)## country_code country sub_nat_region world_region
## Length:984 Length:984 Length:984 Length:984
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## mpi_national mpi_regional hc_regional iod_regional
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1002 1st Qu.:0.07124 1st Qu.:0.1260 1st Qu.:0.5455
## Median :0.2805 Median :0.20833 Median :0.3429 Median :0.6008
## Mean :0.3307 Mean :0.28405 Mean :0.4059 Mean :0.6210
## 3rd Qu.:0.4958 3rd Qu.:0.45901 3rd Qu.:0.6740 3rd Qu.:0.6838
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
Defininos las funciones que vamos a luego utilizar para graficar:
pie_plot <- function(data, title) {
values = table(data)
labels = paste(names(values), " (", values, ")", sep="")
pie(values, labels = labels, main=title)
}
ggpie_plot <- function(df, seg_label="", sum_label="") {
p <- ggplot(df, aes(x="", y=Frequency, fill=Value)) +
geom_bar(stat="identity", width=1, color="white") +
coord_polar("y", start=0) +
geom_text(
aes(label = Frequency),
position = position_stack(vjust = 0.5),
color = "white"
)+
labs(
x = NULL,
y = NULL,
fill = seg_label,
title = paste(sum_label, "por", seg_label, sep=" ")
) +
theme_void()
p
}
gplot_hist <- function(
values,
ylab = "Frecuencia",
name = "",
line_size=1.05,
truncated_mean_value=0.05,
binwidth=1,
linetype="solid"
) {
df = as.data.frame(values)
p <- ggplot(df, aes(x=values)) +
geom_histogram(aes(y=..density..), color="darkblue", fill="lightblue", binwidth=binwidth) +
geom_density(alpha=0.2, size=line_size) +
# Plot measures of central tendency...
geom_vline(aes(xintercept = mean(values), color='Media'), linetype = linetype, size=line_size) +
geom_vline(aes(xintercept = mean(values, truncated_mean_value), color='Media Truncada'), linetype=linetype, size=line_size) +
geom_vline(aes(xintercept = median(values), color='Mediana'), linetype=linetype, size=line_size) +
geom_vline(aes(xintercept = max(values), color='Máximo'), linetype=linetype, size=line_size) +
geom_vline(aes(xintercept = min(values), color='Mínimo'), linetype=linetype, size=line_size)
for(var in mfv(df$values)) {
p <- p + geom_vline(aes(xintercept = var, color='Moda'), linetype=linetype, size=line_size)
}
p <- p + scale_color_manual(
name = "Medidas de tendencia central",
values = c(
'Media' = "black",
'Media Truncada' = 'wheat3',
'Mediana' = 'red',
'Máximo' = 'darkolivegreen4',
'Mínimo' = 'darkgoldenrod1',
'Moda' = 'blue'
)
)
p <- p + labs(x=name, y = ylab, title = paste("Histograma", name, sep=" - "))
p
}
plot_hist <- function(
values,
name,
xlab,
ylab = "Frecuencia",
lwd = 3,
legend_ = TRUE,
missing_value = 0,
truncated_mean_value=0.05
) {
# Fill missing values...
values[is.na(values)] <- missing_value
# Plot histogram..
hist(
values,
col = "deepskyblue",
main = sprintf("Distribución - %s", name),
xlab = name,
ylab = ylab,
freq = FALSE
)
# Plot measures of central tendency...
lines(density(values), col = "chocolate3", lwd = lwd)
abline(v = mean(values), col = "black", lwd = lwd)
abline(v = mean(values, truncated_mean_value), col = "wheat3", lwd = lwd)
abline(v = median(values), col = "red", lwd = lwd)
abline(v = mfv(values), col = "blue", lwd = lwd)
abline(v = max(values), col = "darkolivegreen4", lwd = lwd)
abline(v = min(values), col = "darkgoldenrod1", lwd = lwd)
# Plot legend...
if (isTRUE(legend_)) {
legend(
x = "topright",
c(
"Densidad", "Media","Media Truncada",
"Mediana", "Moda", "Máximo", "Mínimo"
),
col = c(
"chocolate3", "black", "wheat3", "red",
"blue", "darkolivegreen4", "darkgoldenrod1"
),
lwd = c(lwd, lwd, lwd, lwd, lwd, lwd, lwd),
cex = 1
)
}
}
box_plot <- function(data, horizontal = TRUE, xlab="", ylab="") {
boxplot(
data,
xlab=xlab,
ylab=ylab,
horizontal = horizontal,
las=1,
cex.lab=0.8,
cex.axis=0.6,
pars=list(boxlwd = 2, boxwex=.8),
col=colors()
)
}
plot_heatmap <- function(data) {
plot_ly(
z = data,
y = colnames(data),
x = rownames(data),
colors = colorRamp(c("white", "red")),
type = "heatmap"
)
}
show_table <- function(table, page_size = 6, filter = 'top') {
datatable(
table,
rownames = FALSE,
filter=filter,
options = list(page_size = page_size, scrollX=T)
)
}A continuación se carga el dataset en una tabla para tener una primera vista de sus datos:
show_table(dataset[,2:8])Veamos con que variables contamos y cual es su tipo. Para esto mostramos un resumen de la estructura del dataset:
str(dataset)## 'data.frame': 984 obs. of 8 variables:
## $ country_code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ sub_nat_region: chr "Badakhshan" "Badghis" "Baghlan" "Balkh" ...
## $ world_region : chr "South Asia" "South Asia" "South Asia" "South Asia" ...
## $ mpi_national : num 0.295 0.295 0.295 0.295 0.295 0.295 0.295 0.295 0.295 0.295 ...
## $ mpi_regional : num 0.387 0.466 0.3 0.301 0.325 0.313 0.319 0.25 0.245 0.384 ...
## $ hc_regional : num 67.5 79.3 59.7 55.7 61 65.1 61.4 49.4 47.4 74.6 ...
## $ iod_regional : num 57.3 58.8 50.3 54.1 53.3 48.1 52 50.6 51.6 51.5 ...
cities.count.by.region <- dataset %>%
group_by(world_region) %>%
summarise(count = n_distinct(sub_nat_region)) %>%
arrange(world_region)
cities.count.by.region.country <- dataset %>%
group_by(world_region, country) %>%
summarise(count = n_distinct(sub_nat_region), .groups = "keep") %>%
arrange(world_region, country)
parent_regions <- cities.count.by.region$world_region
regions <- cities.count.by.region.country$world_region
countries <- cities.count.by.region.country$country
cities.count <- sum(cities.count.by.region.country$count)
country.cities.count <- cities.count.by.region.country$count
region.cities.count <- cities.count.by.region$count
labels <- c(c(parent_regions), c(countries))
parents <- c(rep(" ", length(parent_regions)), regions)
values <- c(c(region.cities.count), c(country.cities.count))
# print(paste(length(labels), length(parents), length(values), sep=" - "))
plot_ly(labels = labels, parents = parents, values = values, type = 'sunburst', maxdepth=3)Calcule las medidas de posición para los atributos numéricos y agrupe los cálculos de acuerdo a la Región.
show_table(
dataset %>%
group_by(world_region) %>%
summarise(
Media = mean (mpi_regional),
Mediana = median(mpi_regional),
Moda = paste ("[", paste(mfv(mpi_regional), collapse = ', '), "]"),
Min = min (mpi_regional),
Max = max (mpi_regional)
) %>%
arrange(desc(Media))
)Observaciones
show_table(
dataset %>%
group_by(world_region) %>%
summarise(
Media = mean (mpi_national),
Mediana = median(mpi_national),
Moda = paste ("[", paste(mfv(mpi_national), collapse = ', '), "]"),
Min = min (mpi_national),
Max = max (mpi_national)
) %>%
arrange(desc(Media))
)Observaciones
show_table(
dataset %>%
group_by(world_region) %>%
summarise(
Media = mean (hc_regional),
Mediana = median(hc_regional),
Moda = paste ("[", paste(mfv(hc_regional), collapse = ', '), "]"),
Min = min (hc_regional),
Max = max (hc_regional)
) %>%
arrange(desc(Media))
)Observaciones
show_table(
dataset %>%
group_by(world_region) %>%
summarise(
Media = mean (iod_regional),
Mediana = median(iod_regional),
Moda = paste ("[", paste(mfv(iod_regional), collapse = ', '), "]"),
Min = min (iod_regional),
Max = max (iod_regional)
) %>%
arrange(desc(Media))
)Observaciones
p <- gplot_hist(dataset$mpi_national, name="Indice de pobreza nacional", binwidth=0.06)
ggplotly(p)Observaciones
p <- gplot_hist(dataset$mpi_regional, name="Indice de pobreza regional", binwidth=0.06)
ggplotly(p)Observaciones
p <- gplot_hist(dataset$hc_regional, name="% regional de población pobre", binwidth=4)
ggplotly(p)Observaciones
p <- gplot_hist(dataset$iod_regional, name="Intensidad de pobreza regional", binwidth=4)
ggplotly(p)Observaciones
show_table(
dataset %>%
group_by(world_region) %>%
summarise(
Varianza=var(mpi_regional),
Dispersion=sd(mpi_regional),
Rango=max(mpi_regional) - min(mpi_regional)
) %>%
arrange(desc(Varianza))
)
p <- dataset.n %>% select(mpi_national, mpi_regional, hc_regional, iod_regional) %>%
pivot_longer(., cols = c(mpi_national, mpi_regional, hc_regional, iod_regional), names_to = "Variables", values_to = "Frecuencia") %>%
ggplot(aes(x = Variables, y = Frecuencia, fill = Variables)) +
geom_boxplot()
ggplotly(p)ggplot(data = dataset.n, aes(x = mpi_national, y=world_region, fill=world_region)) +
geom_boxplot(alpha=0.4) +
ggtitle("MPI National por World Region")ggplot(data = dataset.n, aes(x = mpi_regional, y=world_region, fill=world_region)) +
geom_boxplot(alpha=0.4) +
ggtitle("MPI Regional por World Region")ggplot(data = dataset.n, aes(x = hc_regional, y=world_region, fill=world_region)) +
geom_boxplot(alpha=0.4) +
ggtitle("Head Count Regional por World Region")ggplot(data = dataset.n, aes(x=iod_regional, y=world_region, fill=world_region)) +
geom_boxplot(alpha=0.4) +
ggtitle("IOD regional por World Region")plot <- PairPlot(
dataset.n[,4:8],
colnames(dataset.n)[5:8],
" ",
group_var = "world_region",
palette=NULL
) +
ggplot2::scale_color_manual(values=unique(as.factor(dataset.n$world_region)))
ggplotly(plot)A continuación tomaremos las gráficas donde se encuentre una relación medianamente clara entre variables:
plot3 <- ggplot(dataset.n, aes(x = hc_regional, y = mpi_regional)) +
geom_point(aes(shape = world_region, color = world_region)) +
geom_smooth(method = 'loess', formula = 'y ~ x')
ggplotly(plot3)Observaciones
plot1 <- ggplot(dataset.n, aes(x = iod_regional, y = mpi_regional)) +
geom_point(aes(shape = world_region, color = world_region)) +
geom_smooth(method = 'loess', formula = 'y ~ x')
ggplotly(plot1)Observaciones
plot2 <- ggplot(dataset.n, aes(x = iod_regional, y = hc_regional)) +
geom_point(aes(shape = world_region, color = world_region)) +
geom_smooth(method = 'loess', formula = 'y ~ x')
ggplotly(plot2)Observaciones
La relaciones mas débiles que tiene mayor dispersión son las siguientes:
Calcular el coeficiente de correlación de todas las variables y explique el resultado. ¿Qué tipo de gráficos describen mejor esta relación entre las variables?
cor_matrix = cor(dataset[5:8])
cor_matrix[upper.tri(cor_matrix)] <- NA
cor_matrix## mpi_national mpi_regional hc_regional iod_regional
## mpi_national 1.0000000 NA NA NA
## mpi_regional 0.8591325 1.0000000 NA NA
## hc_regional 0.8555896 0.9839779 1.0000000 NA
## iod_regional 0.8052553 0.9347065 0.8946479 1
plot_heatmap(cor_matrix)Nota
Observaciones
Realizado por Adrian Marino
adrianmarino@gmail.com