**Etapas de un problema de machine learning

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(skimr)
## Warning: package 'skimr' was built under R version 4.1.2
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.1.2
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.1.2
library(univariateML)
## Warning: package 'univariateML' was built under R version 4.1.2
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(mosaicData)
## Warning: package 'mosaicData' was built under R version 4.1.2
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.1.2
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## -- Attaching packages -------------------------------------- tidymodels 0.1.4 --
## v broom        0.7.9      v rsample      0.1.1 
## v dials        0.0.10     v tune         0.1.6 
## v infer        1.0.0      v workflows    0.2.4 
## v modeldata    0.1.1      v workflowsets 0.1.0 
## v parsnip      0.1.7      v yardstick    0.0.9 
## v recipes      0.1.17
## Warning: package 'dials' was built under R version 4.1.2
## Warning: package 'infer' was built under R version 4.1.2
## Warning: package 'modeldata' was built under R version 4.1.2
## Warning: package 'parsnip' was built under R version 4.1.2
## Warning: package 'rsample' was built under R version 4.1.2
## Warning: package 'tune' was built under R version 4.1.2
## Warning: package 'workflows' was built under R version 4.1.2
## Warning: package 'workflowsets' was built under R version 4.1.2
## Warning: package 'yardstick' was built under R version 4.1.2
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
data("SaratogaHouses", package = "mosaicData")
datos <- SaratogaHouses
str(datos)
## 'data.frame':    1728 obs. of  16 variables:
##  $ price          : int  132500 181115 109000 155000 86060 120000 153000 170000 90000 122900 ...
##  $ lotSize        : num  0.09 0.92 0.19 0.41 0.11 0.68 0.4 1.21 0.83 1.94 ...
##  $ age            : int  42 0 133 13 0 31 33 23 36 4 ...
##  $ landValue      : int  50000 22300 7300 18700 15000 14000 23300 14600 22200 21200 ...
##  $ livingArea     : int  906 1953 1944 1944 840 1152 2752 1662 1632 1416 ...
##  $ pctCollege     : int  35 51 51 51 51 22 51 35 51 44 ...
##  $ bedrooms       : int  2 3 4 3 2 4 4 4 3 3 ...
##  $ fireplaces     : int  1 0 1 1 0 1 1 1 0 0 ...
##  $ bathrooms      : num  1 2.5 1 1.5 1 1 1.5 1.5 1.5 1.5 ...
##  $ rooms          : int  5 6 8 5 3 8 8 9 8 6 ...
##  $ heating        : Factor w/ 3 levels "hot air","hot water/steam",..: 3 2 2 1 1 1 2 1 3 1 ...
##  $ fuel           : Factor w/ 3 levels "gas","electric",..: 2 1 1 1 1 1 3 3 2 1 ...
##  $ sewer          : Factor w/ 3 levels "septic","public/commercial",..: 1 1 2 1 2 1 1 1 1 3 ...
##  $ waterfront     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 2 2 ...
##  $ newConstruction: Factor w/ 2 levels "Yes","No": 2 2 2 2 1 2 2 2 2 2 ...
##  $ centralAir     : Factor w/ 2 levels "Yes","No": 2 2 2 2 1 2 2 2 2 2 ...
# Se renombran las columnas para que sean más descriptivas
colnames(datos) <- c("precio", "metros_totales", "antiguedad", "precio_terreno",
                     "metros_habitables", "universitarios",
                     "dormitorios", "chimenea", "banyos", "habitaciones",
                     "calefaccion", "consumo_calefacion", "desague",
                     "vistas_lago","nueva_construccion", "aire_acondicionado")

Análisis exploratorio

skim(datos)
Data summary
Name datos
Number of rows 1728
Number of columns 16
_______________________
Column type frequency:
factor 6
numeric 10
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
calefaccion 0 1 FALSE 3 hot: 1121, ele: 305, hot: 302
consumo_calefacion 0 1 FALSE 3 gas: 1197, ele: 315, oil: 216
desague 0 1 FALSE 3 pub: 1213, sep: 503, non: 12
vistas_lago 0 1 FALSE 2 No: 1713, Yes: 15
nueva_construccion 0 1 FALSE 2 No: 1647, Yes: 81
aire_acondicionado 0 1 FALSE 2 No: 1093, Yes: 635

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
precio 0 1 211966.71 98441.39 5000 1.45e+05 189900.00 259000.00 775000.0 ▅▇▂▁▁
metros_totales 0 1 0.50 0.70 0 1.70e-01 0.37 0.54 12.2 ▇▁▁▁▁
antiguedad 0 1 27.92 29.21 0 1.30e+01 19.00 34.00 225.0 ▇▁▁▁▁
precio_terreno 0 1 34557.19 35021.17 200 1.51e+04 25000.00 40200.00 412600.0 ▇▁▁▁▁
metros_habitables 0 1 1754.98 619.94 616 1.30e+03 1634.50 2137.75 5228.0 ▇▇▂▁▁
universitarios 0 1 55.57 10.33 20 5.20e+01 57.00 64.00 82.0 ▁▃▆▇▁
dormitorios 0 1 3.15 0.82 1 3.00e+00 3.00 4.00 7.0 ▃▇▅▁▁
chimenea 0 1 0.60 0.56 0 0.00e+00 1.00 1.00 4.0 ▆▇▁▁▁
banyos 0 1 1.90 0.66 0 1.50e+00 2.00 2.50 4.5 ▁▇▇▁▁
habitaciones 0 1 7.04 2.32 2 5.00e+00 7.00 8.25 12.0 ▃▇▇▅▂
head(datos, 3)
##   precio metros_totales antiguedad precio_terreno metros_habitables
## 1 132500           0.09         42          50000               906
## 2 181115           0.92          0          22300              1953
## 3 109000           0.19        133           7300              1944
##   universitarios dormitorios chimenea banyos habitaciones     calefaccion
## 1             35           2        1    1.0            5        electric
## 2             51           3        0    2.5            6 hot water/steam
## 3             51           4        1    1.0            8 hot water/steam
##   consumo_calefacion           desague vistas_lago nueva_construccion
## 1           electric            septic          No                 No
## 2                gas            septic          No                 No
## 3                gas public/commercial          No                 No
##   aire_acondicionado
## 1                 No
## 2                 No
## 3                 No

Número de observaciones y valores ausentes

# Número de datos ausentes por variable
datos %>% map_dbl(.f = function(x){sum(is.na(x))})
##             precio     metros_totales         antiguedad     precio_terreno 
##                  0                  0                  0                  0 
##  metros_habitables     universitarios        dormitorios           chimenea 
##                  0                  0                  0                  0 
##             banyos       habitaciones        calefaccion consumo_calefacion 
##                  0                  0                  0                  0 
##            desague        vistas_lago nueva_construccion aire_acondicionado 
##                  0                  0                  0                  0

Variable respuesta

p1 <- ggplot(data = datos, aes(x = precio)) +
      geom_density(fill = "steelblue", alpha = 0.8) +
      geom_rug(alpha = 0.1) +
      scale_x_continuous(labels = scales::comma) +
      labs(title = "Distribución original") +
      theme_bw() 

p2 <- ggplot(data = datos, aes(x = sqrt(precio))) +
      geom_density(fill = "steelblue", alpha = 0.8) +
      geom_rug(alpha = 0.1) +
      scale_x_continuous(labels = scales::comma) +
      labs(title = "Transformación raíz cuadrada") +
      theme_bw() 

p3 <- ggplot(data = datos, aes(x = log(precio))) +
      geom_density(fill = "steelblue", alpha = 0.8) +
      geom_rug(alpha = 0.1) +
      scale_x_continuous(labels = scales::comma) +
      labs(title = "Transformación logarítmica") +
      theme_bw() 

ggarrange(p1, p2, p3, ncol = 1, align = "v")

plot_bar(
  datos,
  ncol    = 3,
  title   = "Número de observaciones por grupo",
  ggtheme = theme_bw(),
  theme_config = list(
                   plot.title = element_text(size = 16, face = "bold"),
                   strip.text = element_text(colour = "black", size = 10, face = 2),
                   legend.position = "none"
                  )
)

# Se comparan únicamente las distribuciones con un dominio [0, +inf)
# Cuanto menor el valor AIC mejor el ajuste
comparacion_aic <- AIC(
                    mlbetapr(datos$precio),
                    mlexp(datos$precio),
                    mlinvgamma(datos$precio),
                    mlgamma(datos$precio),
                    mllnorm(datos$precio),
                    mlrayleigh(datos$precio),
                    mlinvgauss(datos$precio),
                    mlweibull(datos$precio),
                    mlinvweibull(datos$precio),
                    mllgamma(datos$precio)
                   )
comparacion_aic %>% rownames_to_column(var = "distribucion") %>% arrange(AIC)
##                  distribucion df      AIC
## 1       mlgamma(datos$precio)  2 44196.85
## 2       mllnorm(datos$precio)  2 44214.14
## 3    mlinvgauss(datos$precio)  2 44409.24
## 4     mlweibull(datos$precio)  2 44417.64
## 5    mlrayleigh(datos$precio)  1 44461.57
## 6      mlbetapr(datos$precio)  2 44662.15
## 7    mlinvgamma(datos$precio)  2 44662.19
## 8  mlinvweibull(datos$precio)  2 45410.38
## 9         mlexp(datos$precio)  1 45843.02
## 10     mllgamma(datos$precio)  2      Inf

Las dos distribuciones con mejor ajuste acorde al valor AIC son la normal y la gamma, es el menor de todos.

# Valores observados de chimenea.
table(datos$chimenea)
## 
##   0   1   2   3   4 
## 740 942  42   2   2
# Se convierte la variable chimenea a factor.
datos <- datos %>%
         mutate(chimenea = as.factor(chimenea))

#datos$chimenea <- as.factor(datos$chimenea)
#datos$calefaccion <- as.factor(datos$calefaccion)
#datos$desague <- as.factor(datos$desague)
#datos$vistas_lago <- as.factor(datos$vistas_lago)
#datos$nueva_construccion <- as.factor(datos$nueva_construccion)
#datos$consumo_calefaccion <- as.factor(datos$consumo_calefaccion)
#datos$aire_acondicionado <- as.factor(datos$aire_acondicionado)

****Como el objetivo del estudio es predecir el precio de las viviendas, el análisis de cada variable se hace también en relación a la variable respuesta precio. Analizando los datos de esta forma, se pueden empezar a extraer ideas sobre qué variables están más relacionadas con el precio y de qué forma. Las variables antiguedad, metros_totales y precio_terreno se convierten a escala logarítmica para mejorar su representación gráfica.****

datos <- datos %>%
         mutate(
           # Se le añade +0.1 a la antigüedad para que cuando la antigüedad es
           # 0 no de -Inf
           log_antiguedad     = log10(antiguedad + 0.1),
           log_metros_totales = log10(metros_totales),
           log_precio_terreno = log10(precio_terreno)
         )
custom_corr_plot <- function(variable1, variable2, df, alpha=0.3){
  p <- df %>%
       mutate(
         # Truco para que se ponga el título estilo facet
        title = paste(toupper(variable2), "vs", toupper(variable1))
       ) %>%
       ggplot(aes(x = !!sym(variable1), y = !!sym(variable2))) + 
       geom_point(alpha = alpha) +
       # Tendencia no lineal
       geom_smooth(se = FALSE, method = "gam", formula =  y ~ splines::bs(x, 3)) +
       # Tendencia lineal
       geom_smooth(se = FALSE, method = "lm", color = "firebrick") +
       facet_grid(. ~ title) +
       theme_bw() +
       theme(strip.text = element_text(colour = "black", size = 8, face = 2),
             axis.title = element_blank())
  return(p)
}
variables_continuas <- c("metros_habitables", "universitarios", "dormitorios",
                         "banyos", "habitaciones", "log_antiguedad", "log_metros_totales",
                         "log_precio_terreno")

plots <- map(
            .x = variables_continuas,
            .f = custom_corr_plot,
            variable2 = "precio",
            df = datos
         )

ggarrange(plotlist = plots, ncol = 3, nrow = 3) %>%
  annotate_figure(
    top = text_grob("Correlación con precio", face = "bold", size = 16,
                    x = 0.20)
  )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'

datos <- datos %>%
         select(-c(log_antiguedad, log_metros_totales, log_precio_terreno))
plot_correlation(
  data = datos,
  type = "continuous",
  title = "Matriz de correlación variables continuas",
  theme_config = list(legend.position = "none",
                      plot.title = element_text(size = 12, face = "bold"),
                      axis.title = element_blank(),
                      axis.text.x = element_text(angle = -45, hjust = +0.1)
                     )
)

variables cuantitativas

plot_histogram(datos)

variables cualitativas

plot_bar(datos)

plot_missing(datos)

plot_density(datos)

plot_correlation(datos, type = 'continuous','Review.Date')

Los colores más intensos muestran una mayor correlación entre las variables

plot_bar(datos)

variables cualitativas

plot_bar(
  datos,
  ncol    = 3,
  title   = "Número de observaciones por grupo",
  ggtheme = theme_bw(),
  theme_config = list(
                   plot.title = element_text(size = 16, face = "bold"),
                   strip.text = element_text(colour = "black", size = 10, face = 2),
                   legend.position = "none"
                  )
)

GGally::ggscatmat(
  data = datos %>% select_if(is.numeric),
  alpha = 0.1) +
theme_bw() +
labs(title = "Correlación por pares") +
theme(
  plot.title = element_text(size = 16, face = "bold"),
  axis.text = element_blank(),
  strip.text = element_text(colour = "black", size = 5, face = 1)
)