Variables

Datos Iniciales

Train

library(data.table)
# The pipe and the dplyr verbs used below come from the tidyverse; attach it
# here so this chunk runs when the file is executed top to bottom.
library(tidyverse)

# Read the training set, drop the columns that are not used afterwards and
# store the room/bedroom/bathroom counts as factors.
train <- fread("../data/train.csv", encoding = "UTF-8") %>%
  select(-c(Id, property_type, operation_type, currency)) %>%
  mutate(
    rooms = factor(rooms),
    bedrooms = factor(bedrooms),
    bathrooms = factor(bathrooms)
  )
head(train)

Test

# Read the test set with the same treatment as the training set: drop the
# unused columns and store the three count columns as factors.
test <- "../data/test.csv" %>%
  fread(encoding = "UTF-8") %>%
  select(-Id, -property_type, -operation_type, -currency) %>%
  mutate(
    rooms = factor(rooms),
    bedrooms = factor(bedrooms),
    bathrooms = factor(bathrooms)
  )
head(test)

Sample Submission

# Read the sample submission file; its Id column is reused later when the
# submission is written.
sampleSub <- fread(
  input = "../data/sampleSub.csv",
  encoding = "UTF-8"
)
head(sampleSub, n = 6L)

Exploratorio Train

Tamaño muestral

library(tidyverse)
library(treemap)
# Treemap of sample size: one tile per (pais, provincia_departamento) pair,
# sized by the number of training rows in that pair.
train %>%
  group_by(pais, provincia_departamento) %>%
  summarise(total = n(), .groups = "keep") %>%
  treemap(
    index = c("pais", "provincia_departamento"),
    vSize = "total",
    type = "index",
    palette = c("#1C8356", "#C4451C"),
    title = "Tamaño muestral: País - Departamento",
    fontsize.title = 12
  )

NA
NA
# Treemap of sample size: one tile per (pais, rooms) pair, sized by the number
# of training rows in that pair.
train %>%
  group_by(pais, rooms) %>%
  summarise(total = n(), .groups = "keep") %>%
  treemap(
    index = c("pais", "rooms"),
    vSize = "total",
    type = "index",
    palette = c("#1C8356", "#C4451C"),
    title = "Tamaño muestral: País - # de Salas",
    fontsize.title = 12
  )

# Treemap of sample size: one tile per (pais, bedrooms) pair, sized by the
# number of training rows in that pair.
train %>%
  group_by(pais, bedrooms) %>%
  summarise(total = n(), .groups = "keep") %>%
  treemap(
    index = c("pais", "bedrooms"),
    vSize = "total",
    type = "index",
    palette = c("#1C8356", "#C4451C"),
    title = "Tamaño muestral: País - # de Dormitorios",
    fontsize.title = 12
  )

# Treemap of sample size: one tile per (pais, bathrooms) pair, sized by the
# number of training rows in that pair.
train %>%
  group_by(pais, bathrooms) %>%
  summarise(total = n(), .groups = "keep") %>%
  treemap(
    index = c("pais", "bathrooms"),
    vSize = "total",
    type = "index",
    palette = c("#1C8356", "#C4451C"),
    title = "Tamaño muestral: País - # de Baños",
    fontsize.title = 12
  )

Distribuciones

  • Baños, Dormitorios y Salas:
library(ggthemes)
# Lollipop chart of how many training rows have each value of rooms, bedrooms
# and bathrooms, one facet per variable.
train %>%
  select(rooms, bedrooms, bathrooms) %>%
  # These columns are stored as factors; stacking them as-is yields a discrete
  # x variable, which scale_x_continuous() rejects. Convert through character
  # so the original values are recovered rather than the factor level codes.
  mutate(
    rooms = as.numeric(as.character(rooms)),
    bedrooms = as.numeric(as.character(bedrooms)),
    bathrooms = as.numeric(as.character(bathrooms))
  ) %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  count(key, value, name = "Total") %>%
  ggplot(aes(x = value, y = Total)) +
  facet_wrap(~key, scales = "free") +
  geom_point(size = 2, color = "#C4451C") +
  geom_segment(aes(y = 0, xend = value, yend = Total), color = "#1C8356") +
  scale_x_continuous(n.breaks = 10) +
  theme_fivethirtyeight()

  • Precio y área en escala original y logarítmica:

Comparativos

  • Distribución de precios y área por número de habitaciones:
# Boxplots of price and total surface (raw and natural-log scale) by number of
# rooms, one facet per measure. The triangle marks the group mean.
train %>%
  select(rooms, price, surface_total) %>%
  mutate(
    rooms = factor(rooms),
    priceLog = log(price),
    surfaceLog = log(surface_total)
  ) %>%
  pivot_longer(cols = -rooms, names_to = "key", values_to = "valor") %>%
  ggplot(aes(x = rooms, y = valor)) +
  facet_wrap(~key, scales = "free") +
  geom_boxplot(outlier.alpha = 0.01, fill = "#1C8356", alpha = 0.5,
               color = "#C4451C", size = 0.1) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(fun = mean, geom = "point", color = "#C4451C", size = 2,
               shape = 17) +
  theme_fivethirtyeight() +
  labs(caption = "Triángulo = promedio", subtitle = "Habitaciones")

  • Distribución de precios y área por número de dormitorios:
# Boxplots of price and total surface (raw and natural-log scale) by number of
# bedrooms, one facet per measure. The triangle marks the group mean.
train %>%
  select(bedrooms, price, surface_total) %>%
  mutate(
    bedrooms = factor(bedrooms),
    priceLog = log(price),
    surfaceLog = log(surface_total)
  ) %>%
  pivot_longer(cols = -bedrooms, names_to = "key", values_to = "valor") %>%
  ggplot(aes(x = bedrooms, y = valor)) +
  facet_wrap(~key, scales = "free") +
  geom_boxplot(outlier.alpha = 0.01, fill = "#1C8356", alpha = 0.5,
               color = "#C4451C", size = 0.1) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(fun = mean, geom = "point", color = "#C4451C", size = 2,
               shape = 17) +
  theme_fivethirtyeight() +
  labs(caption = "Triángulo = promedio", subtitle = "Dormitorios")

  • Distribución de precios y área por número de baños:
# Boxplots of price and total surface (raw and natural-log scale) by number of
# bathrooms, one facet per measure. The triangle marks the group mean.
train %>%
  select(bathrooms, price, surface_total) %>%
  mutate(
    bathrooms = factor(bathrooms),
    priceLog = log(price),
    surfaceLog = log(surface_total)
  ) %>%
  pivot_longer(cols = -bathrooms, names_to = "key", values_to = "valor") %>%
  ggplot(aes(x = bathrooms, y = valor)) +
  facet_wrap(~key, scales = "free") +
  geom_boxplot(outlier.alpha = 0.01, fill = "#1C8356", alpha = 0.5,
               color = "#C4451C", size = 0.1) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(fun = mean, geom = "point", color = "#C4451C", size = 2,
               shape = 17) +
  theme_fivethirtyeight() +
  labs(caption = "Triángulo = promedio", subtitle = "Baños")

Dispersiones

  • Relación general de área vs. precio: como hay más de 25 mil observaciones, es preferible utilizar geom_bin2d() en lugar de geom_point().
# 2D-binned scatter of total surface against price, with a linear-model trend
# line drawn on top of the bins.
ggplot(data = train, mapping = aes(x = surface_total, y = price)) +
  geom_bin2d(color = "white", alpha = 0.8) +
  scale_fill_gradient2(
    low = "white",
    mid = "#1C8356",
    high = "#C4451C"
  ) +
  geom_smooth(
    method = "lm",
    color = "#C4451C",
    size = 2,
    se = FALSE
  ) +
  theme_fivethirtyeight() +
  theme(
    legend.position = "right",
    legend.direction = "vertical"
  )

NA

GLMNET

Train - Test

library(tidymodels)
# Fix the RNG seed so the train/test partition below is reproducible.
set.seed(123)

# Modelling data set. `train` has already lost the Id / property_type /
# operation_type / currency columns when it was loaded, so naming them in a
# plain select(-c(...)) fails; any_of() drops whichever of them are still
# present and ignores the rest.
datosTrain <- train %>%
  select(-any_of(c("Id", "property_type", "operation_type", "currency"))) %>%
  mutate(
    rooms = factor(rooms),
    bedrooms = factor(bedrooms),
    bathrooms = factor(bathrooms)
  )

# 80/20 split, stratified on the outcome (price).
split_inicial <- initial_split(
  data   = datosTrain,
  prop   = 0.8,
  strata = price
)
datos_train <- training(split_inicial)
datos_test  <- testing(split_inicial)

Modelo GLM - Tuning

# Model specification: glmnet linear regression with both the penalty and the
# mixture left as tuning parameters.
mod_glm <- set_engine(
  linear_reg(mode = "regression", penalty = tune(), mixture = tune()),
  engine = "glmnet"
)

# Preprocessing recipe: centre and scale the numeric predictors, then
# dummy-encode the nominal predictors. The outcome is excluded from all steps.
receta <- recipe(formula = price ~ ., data = datos_train) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes())

# Resampling scheme: 10-fold cross-validation stratified on price, seeded so
# the folds are reproducible.
set.seed(1992)
crossVal <- vfold_cv(data = datos_train, v = 10, strata = price)

# Workflow bundling the recipe and the model specification.
flujo_modelo <- workflow() %>%
  add_recipe(receta) %>%
  add_model(mod_glm)

# Regular grid of hyperparameters: 10 levels for each of penalty and mixture,
# both on the untransformed [0, 1] range.
hiperpar_grid <- grid_regular(
  penalty(range = c(0, 1), trans = NULL),
  mixture(range = c(0, 1), trans = NULL),
  levels = 10
)

# EJECUCIÓN DE LA OPTIMIZACIÓN DE HIPERPARÁMETROS
# =============================================================================
# Hyperparameter search over the grid, evaluated by RMSE on the CV folds.
# doParallel is never attached in this file, so its functions are called
# through their namespace. The worker count is floored at 1 because
# detectCores() - 1 is 0 on a single-core machine and NA when the core count
# cannot be determined.
doParallel::registerDoParallel(
  cores = max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
)
myGrid <- tune_grid(
  object = flujo_modelo,
  resamples = crossVal,
  metrics = metric_set(rmse),
  # control_grid() is the control constructor documented for tune_grid().
  control = control_grid(save_pred = TRUE),
  grid = hiperpar_grid
)
doParallel::stopImplicitCluster()
  • Mejores 10 modelos:
# Show the 10 hyperparameter combinations with the best cross-validated RMSE.
myGrid %>%
  show_best(metric = "rmse", n = 10)

Modelo GLM Final

# Pick the hyperparameter combination with the best cross-validated RMSE.
mejorGrid <- myGrid %>%
  select_best(metric = "rmse")

# Plug the selected hyperparameters into the workflow.
flujo_final <- flujo_modelo %>%
  finalize_workflow(parameters = mejorGrid)

# Fit the finalized workflow on the whole `train` table (not only on
# datos_train).
glm_final <- fit(flujo_final, data = train)

Predichos GLM

# Hold-out predictions. glm_final is fitted on the full `train` table, which
# contains the datos_test rows, so scoring it on datos_test would leak the
# hold-out data into the estimate. For the test error, fit the finalized
# workflow on datos_train only and predict the untouched datos_test.
predicciones <- flujo_final %>%
  fit(data = datos_train) %>%
  predict(new_data = datos_test,
          type = "numeric")
# Missing predictions are set to 0, the same treatment the submission gets.
predicciones[is.na(predicciones)] <- 0
  • Error de test:
# Attach the observed price to the predictions.
predicciones <- bind_cols(predicciones, select(datos_test, price))

# RMSE on the hold-out split, labelled with the model name.
error_test_glm <- predicciones %>%
  rmse(truth = price, estimate = .pred, na_rm = TRUE) %>%
  mutate(modelo = "GLM")
error_test_glm

Predichos - Nuevos

# Predict the competition test set with the final model, then replace missing
# predictions with 0 and clamp negative predictions to 0.
prediccionesGLM_Subm1 <- glm_final %>%
  predict(new_data = test, type = "numeric") %>%
  mutate(.pred = pmax(coalesce(.pred, 0), 0))

# Quick look at the distribution of the predicted prices.
hist(prediccionesGLM_Subm1$.pred)

  • Submission 1:
# Build the submission table (Id from the sample submission, predicted price)
# and write it to disk without row names.
subm1_glmnet <- data.frame(
  Id = sampleSub$Id,
  price = prediccionesGLM_Subm1$.pred
)
write.csv(x = subm1_glmnet, file = "Subm1.csv", row.names = FALSE)
  • Score: 2.72416885190957 - Posición 32.
