Tamaño muestral
library(tidyverse)
library(treemap)
# Sample-size treemap: one rectangle per (pais, provincia_departamento) pair,
# nested inside its country, with area proportional to the number of rows of
# `train` that fall in that pair.
# count() tallies the combinations in a single step and returns an ungrouped
# tibble, so no leftover grouping is handed to treemap().
train %>%
  count(pais, provincia_departamento, name = "total") %>%
  treemap(
    index = c("pais", "provincia_departamento"),
    vSize = "total",
    type = "index",
    palette = c("#1C8356", "#C4451C"),
    title = "Tamaño muestral: País - Departamento",
    fontsize.title = 12
  )

# Sample-size treemap: one rectangle per (pais, rooms) pair, nested inside its
# country, with area proportional to the number of rows of `train` in that pair.
# count() tallies the combinations in a single step and returns an ungrouped
# tibble, so no leftover grouping is handed to treemap().
train %>%
  count(pais, rooms, name = "total") %>%
  treemap(
    index = c("pais", "rooms"),
    vSize = "total",
    type = "index",
    palette = c("#1C8356", "#C4451C"),
    title = "Tamaño muestral: País - # de Salas",
    fontsize.title = 12
  )

# Sample-size treemap: one rectangle per (pais, bedrooms) pair, nested inside
# its country, with area proportional to the number of rows of `train` in that
# pair.
# count() tallies the combinations in a single step and returns an ungrouped
# tibble, so no leftover grouping is handed to treemap().
train %>%
  count(pais, bedrooms, name = "total") %>%
  treemap(
    index = c("pais", "bedrooms"),
    vSize = "total",
    type = "index",
    palette = c("#1C8356", "#C4451C"),
    title = "Tamaño muestral: País - # de Dormitorios",
    fontsize.title = 12
  )

# Sample-size treemap: one rectangle per (pais, bathrooms) pair, nested inside
# its country, with area proportional to the number of rows of `train` in that
# pair.
# count() tallies the combinations in a single step and returns an ungrouped
# tibble, so no leftover grouping is handed to treemap().
train %>%
  count(pais, bathrooms, name = "total") %>%
  treemap(
    index = c("pais", "bathrooms"),
    vSize = "total",
    type = "index",
    palette = c("#1C8356", "#C4451C"),
    title = "Tamaño muestral: País - # de Baños",
    fontsize.title = 12
  )

Distribuciones
- Baños, Dormitorios y Salas:
library(ggthemes)
# Lollipop charts of how many rows of `train` take each value of rooms,
# bedrooms and bathrooms, one facet per variable with independent axes.
train %>%
  select(rooms, bedrooms, bathrooms) %>%
  # Long format: `key` holds the source column name, `value` its content.
  # pivot_longer() replaces the superseded gather().
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  # One row per (variable, value) combination with its frequency.
  count(key, value, name = "Total") %>%
  ggplot(aes(x = value, y = Total)) +
  facet_wrap(~key, scales = "free") +
  geom_point(size = 2, color = "#C4451C") +
  # Stem of each lollipop: vertical segment from the x axis up to the count.
  geom_segment(aes(y = 0, xend = value, yend = Total), color = "#1C8356") +
  scale_x_continuous(n.breaks = 10) +
  theme_fivethirtyeight()

- Precio y área en escala original y logarítmica:

Comparativos
- Distribución de precios y área por número de habitaciones:
# Box plots of price and total surface, on the original and natural-log
# scales, for each number of rooms. One facet per measure with free axes; a
# triangle marks the mean of each box.
train %>%
  select(rooms, price, surface_total) %>%
  mutate(
    rooms = factor(rooms), # discrete x axis: one box per distinct value
    priceLog = log(price),
    surfaceLog = log(surface_total)
  ) %>%
  # Long format: every measure column is stacked into `valor`, labelled by
  # `key`. pivot_longer() replaces the superseded gather().
  pivot_longer(cols = -rooms, names_to = "key", values_to = "valor") %>%
  ggplot(aes(x = rooms, y = valor)) +
  facet_wrap(~key, scales = "free") +
  geom_boxplot(
    outlier.alpha = 0.01, fill = "#1C8356", alpha = 0.5,
    color = "#C4451C", size = 0.1
  ) +
  # `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
  stat_summary(
    fun = mean, geom = "point", color = "#C4451C", size = 2,
    shape = 17
  ) +
  theme_fivethirtyeight() +
  labs(caption = "Triángulo = promedio", subtitle = "Habitaciones")

- Distribución de precios y área por número de dormitorios:
# Box plots of price and total surface, on the original and natural-log
# scales, for each number of bedrooms. One facet per measure with free axes; a
# triangle marks the mean of each box.
train %>%
  select(bedrooms, price, surface_total) %>%
  mutate(
    bedrooms = factor(bedrooms), # discrete x axis: one box per distinct value
    priceLog = log(price),
    surfaceLog = log(surface_total)
  ) %>%
  # Long format: every measure column is stacked into `valor`, labelled by
  # `key`. pivot_longer() replaces the superseded gather().
  pivot_longer(cols = -bedrooms, names_to = "key", values_to = "valor") %>%
  ggplot(aes(x = bedrooms, y = valor)) +
  facet_wrap(~key, scales = "free") +
  geom_boxplot(
    outlier.alpha = 0.01, fill = "#1C8356", alpha = 0.5,
    color = "#C4451C", size = 0.1
  ) +
  # `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
  stat_summary(
    fun = mean, geom = "point", color = "#C4451C", size = 2,
    shape = 17
  ) +
  theme_fivethirtyeight() +
  labs(caption = "Triángulo = promedio", subtitle = "Dormitorios")

- Distribución de precios y área por número de baños:
# Box plots of price and total surface, on the original and natural-log
# scales, for each number of bathrooms. One facet per measure with free axes;
# a triangle marks the mean of each box.
train %>%
  select(bathrooms, price, surface_total) %>%
  mutate(
    bathrooms = factor(bathrooms), # discrete x axis: one box per distinct value
    priceLog = log(price),
    surfaceLog = log(surface_total)
  ) %>%
  # Long format: every measure column is stacked into `valor`, labelled by
  # `key`. pivot_longer() replaces the superseded gather().
  pivot_longer(cols = -bathrooms, names_to = "key", values_to = "valor") %>%
  ggplot(aes(x = bathrooms, y = valor)) +
  facet_wrap(~key, scales = "free") +
  geom_boxplot(
    outlier.alpha = 0.01, fill = "#1C8356", alpha = 0.5,
    color = "#C4451C", size = 0.1
  ) +
  # `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
  stat_summary(
    fun = mean, geom = "point", color = "#C4451C", size = 2,
    shape = 17
  ) +
  theme_fivethirtyeight() +
  labs(caption = "Triángulo = promedio", subtitle = "Baños")

Dispersiones
- Relación general de área vs. precio: como son más de 25 mil observaciones, es preferible utilizar `geom_bin2d()` en lugar de `geom_point()`.
# Price against total surface as a 2D-binned heat map (fill encodes the number
# of observations per bin), overlaid with a straight least-squares fit line
# drawn without its confidence band.
# NOTE(review): scale_fill_gradient2() centres the `mid` colour on its default
# midpoint of 0, so with non-negative bin counts the `low` colour is presumably
# never reached - confirm that this is the intended colour ramp.
ggplot(train, aes(x = surface_total, y = price)) +
  geom_bin2d(colour = "white", alpha = 0.8) +
  geom_smooth(method = "lm", colour = "#C4451C", size = 2, se = FALSE) +
  scale_fill_gradient2(low = "white", mid = "#1C8356", high = "#C4451C") +
  theme_fivethirtyeight() +
  # Override the theme's legend placement: vertical legend on the right side.
  theme(legend.position = "right", legend.direction = "vertical")
