Setup

Carga de librerías

# Cargamos las librerías necesarias para el análisis y visualización
library(dplyr)
#> 
#> Adjuntando el paquete: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(readxl)
library(leaflet)
library(shiny)
library(jsonlite)
#> 
#> Adjuntando el paquete: 'jsonlite'
#> The following object is masked from 'package:shiny':
#> 
#>     validate
library(stringr)
library(knitr)
library(FactoMineR)
library(factoextra)
#> Cargando paquete requerido: ggplot2
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(mice)
#> 
#> Adjuntando el paquete: 'mice'
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> The following objects are masked from 'package:base':
#> 
#>     cbind, rbind
library(ggplot2)

Carga de los datos

#> # A tibble: 6 × 45
#>   propertyCode thumbnail      externalReference numPhotos floor  price priceInfo
#>          <dbl> <chr>          <chr>                 <dbl> <chr>  <dbl> <chr>    
#> 1    106620603 https://img4.… W-02WWIS                 65 5     8.64e5 {'price'…
#> 2    107552311 https://img4.… LCGV-1107                38 6     1.35e6 {'price'…
#> 3    107048311 https://img4.… W-02YAIU                 49 2     1.05e6 {'price'…
#> 4    106614759 https://img4.… W-02XPUT                 69 6     1.35e6 {'price'…
#> 5    107124247 https://img4.… LCGV-1047                31 2     4.50e5 {'price'…
#> 6    107311950 https://img4.… LCGV-1074                42 2     1.05e6 {'price'…
#> # ℹ 38 more variables: propertyType <chr>, operation <chr>, size <dbl>,
#> #   exterior <dbl>, rooms <dbl>, bathrooms <dbl>, address <chr>,
#> #   province <chr>, municipality <chr>, district <chr>, country <chr>,
#> #   neighborhood <chr>, latitude <dbl>, longitude <dbl>, showAddress <lgl>,
#> #   url <chr>, distance <dbl>, description <chr>, hasVideo <lgl>, status <chr>,
#> #   newDevelopment <lgl>, hasLift <dbl>, priceByArea <dbl>, change <chr>,
#> #   detailedType <chr>, suggestedTexts <chr>, hasPlan <lgl>, has3DTour <lgl>, …
# Ejemplo de consulta de URL de una propiedad concreta
datos %>% filter(propertyCode == 106230584) %>% dplyr::select(url)
#> # A tibble: 1 × 1
#>   url                                          
#>   <chr>                                        
#> 1 https://www.idealista.com/inmueble/106230584/

Limpieza y exploración

Eliminar duplicados

Antes de analizar los datos, eliminamos duplicados generados por el scraping en dos zonas geográficas.

nrow(datos)
#> [1] 3913
duplicados <- datos %>% group_by(propertyCode) %>% filter(n() > 1) %>% ungroup()
nrow(duplicados)
#> [1] 1487
# Nos quedamos con un duplicado por propertyCode (el primero)
duplicados_buenos = duplicados %>% group_by(propertyCode) %>%  slice_head(n = 1) %>% ungroup()
nrow(duplicados_buenos)
#> [1] 722
# Eliminamos duplicados y los reemplazamos por los seleccionados
datos <- anti_join(datos, duplicados_buenos, by = c("propertyCode", "price"))
datos <- rbind(datos, duplicados_buenos)
nrow(datos)
#> [1] 3149

Arreglo de las variables en formato JSON

Algunas columnas contienen información en formato JSON. Las limpiamos y extraemos variables relevantes.

# Procesamos la columna priceInfo (precio, moneda, descuentos)
head(datos$priceInfo)
#> [1] "{'price': {'amount': 864000.0, 'currencySuffix': '€', 'priceDropInfo': {'formerPrice': 890000.0, 'priceDropValue': 26000, 'priceDropPercentage': 3}}}"
#> [2] "{'price': {'amount': 1350000.0, 'currencySuffix': '€'}}"                                                                                              
#> [3] "{'price': {'amount': 1050000.0, 'currencySuffix': '€'}}"                                                                                              
#> [4] "{'price': {'amount': 1350000.0, 'currencySuffix': '€'}}"                                                                                              
#> [5] "{'price': {'amount': 450000.0, 'currencySuffix': '€'}}"                                                                                               
#> [6] "{'price': {'amount': 1050000.0, 'currencySuffix': '€'}}"
datos <- datos %>% 
    mutate(
        priceInfo_clean = str_replace_all(priceInfo, "'", "\""),
        parsed_json = lapply(priceInfo_clean, function(x) tryCatch(fromJSON(x), error = function(e) NULL))
    ) %>%
    rowwise() %>% 
    mutate(
        amount = parsed_json$price$amount %||% NA,
        currency = parsed_json$price$currencySuffix %||% NA, 
        former_price = parsed_json$price$priceDropInfo$formerPrice %||% NA, 
        price_drop_value = parsed_json$price$priceDropInfo$priceDropValue %||% NA, 
        price_drop_percentage = parsed_json$price$priceDropInfo$priceDropPercentage %||% NA
    ) %>% 
    dplyr::select(-parsed_json, -priceInfo_clean, -priceInfo)

# Procesamos la columna detailedType (tipología)
head(datos$detailedType)
#> [1] "{'typology': 'flat'}" "{'typology': 'flat'}" "{'typology': 'flat'}"
#> [4] "{'typology': 'flat'}" "{'typology': 'flat'}" "{'typology': 'flat'}"
datos <- datos %>% 
  mutate(
    detailedType_clean = str_replace_all(detailedType, "'", "\""), 
    parsed_json = lapply(detailedType_clean, function(x) tryCatch(fromJSON(x), error = function(e) NULL))
  ) %>%
  rowwise() %>% 
  mutate(
    typology = parsed_json$typology %||% NA,
    subTypology = parsed_json$subTypology %||% NA
  ) %>% 
  dplyr::select(-parsed_json, -detailedType_clean, -detailedType)
# Procesamos la variable highlight para convertirla en categórica
head(table(datos$suggestedTexts)) # No se procesa para análisis exploratorio
#> 
#>               {'subtitle': 'Aiora, València', 'title': 'Ático en calle Maestro Valls, 21'} 
#>                                                                                          1 
#>     {'subtitle': 'Aiora, València', 'title': 'Casa independiente en calle de la Conserva'} 
#>                                                                                          1 
#>                                 {'subtitle': 'Aiora, València', 'title': 'Chalet adosado'} 
#>                                                                                          1 
#> {'subtitle': 'Aiora, València', 'title': 'Chalet adosado en calle de Francesc de Bellvís'} 
#>                                                                                          1 
#>        {'subtitle': 'Aiora, València', 'title': 'Chalet adosado en calle del Marí Albesa'} 
#>                                                                                          1 
#>                                        {'subtitle': 'Aiora, València', 'title': 'Estudio'} 
#>                                                                                          1
table(datos$highlight, useNA = "ifany")
#> 
#> {'groupDescription': 'Destacado'}       {'groupDescription': 'Top'} 
#>                               696                               403 
#>      {'groupDescription': 'Top+'}                             FALSE 
#>                                40                               356 
#>                              TRUE                              <NA> 
#>                                 9                              1645
datos <- datos %>%
    mutate(highlight = case_when(
        highlight == "{'groupDescription': 'Destacado'}" ~ "Destacado",
        highlight == "{'groupDescription': 'Top'}" ~ "Top",
        highlight == "{'groupDescription': 'Top+'}" ~ "Top+",
        highlight == "TRUE" ~ "True",
        highlight == "FALSE" ~ "False",
        TRUE ~ NA_character_
    ))
table(datos$highlight, useNA = "ifany")
#> 
#> Destacado     False       Top      Top+      True      <NA> 
#>       696       356       403        40         9      1645
# Extraemos información de parkingSpace (JSON anidado)
extract_parking_info <- function(parking_info) {
  if (is.na(parking_info)) {
    return(data.frame(hasParkingSpace = NA, isParkingSpaceIncludedInPrice = NA, parkingSpacePrice = NA))
  } else {
    parking_info <- gsub("'", "\"", parking_info)  
    parking_info <- gsub("\\bTrue\\b", "true", parking_info)  
    parking_info <- gsub("\\bFalse\\b", "false", parking_info)  
    tryCatch({
      parking_info <- fromJSON(parking_info)
      hasParkingSpace <- parking_info$hasParkingSpace
      isParkingSpaceIncludedInPrice <- parking_info$isParkingSpaceIncludedInPrice
      parkingSpacePrice <- ifelse(isParkingSpaceIncludedInPrice, NA, parking_info$parkingSpacePrice)
      return(data.frame(hasParkingSpace, isParkingSpaceIncludedInPrice, parkingSpacePrice))
    }, error = function(e) {
      return(data.frame(hasParkingSpace = NA, isParkingSpaceIncludedInPrice = NA, parkingSpacePrice = NA))
    })
  }
}
parking_info <- do.call(rbind, lapply(datos$parkingSpace, extract_parking_info))
datos <- cbind(datos, parking_info)
datos <- datos %>% dplyr::select(-parkingSpace)
names(datos)
#>  [1] "propertyCode"                  "thumbnail"                    
#>  [3] "externalReference"             "numPhotos"                    
#>  [5] "floor"                         "price"                        
#>  [7] "propertyType"                  "operation"                    
#>  [9] "size"                          "exterior"                     
#> [11] "rooms"                         "bathrooms"                    
#> [13] "address"                       "province"                     
#> [15] "municipality"                  "district"                     
#> [17] "country"                       "neighborhood"                 
#> [19] "latitude"                      "longitude"                    
#> [21] "showAddress"                   "url"                          
#> [23] "distance"                      "description"                  
#> [25] "hasVideo"                      "status"                       
#> [27] "newDevelopment"                "hasLift"                      
#> [29] "priceByArea"                   "change"                       
#> [31] "suggestedTexts"                "hasPlan"                      
#> [33] "has3DTour"                     "has360"                       
#> [35] "hasStaging"                    "highlight"                    
#> [37] "savedAd"                       "notes"                        
#> [39] "topNewDevelopment"             "topPlus"                      
#> [41] "newDevelopmentFinished"        "distancia_min_estacion_m"     
#> [43] "amount"                        "currency"                     
#> [45] "former_price"                  "price_drop_value"             
#> [47] "price_drop_percentage"         "typology"                     
#> [49] "subTypology"                   "hasParkingSpace"              
#> [51] "isParkingSpaceIncludedInPrice" "parkingSpacePrice"

Tabla auxiliar de tipos de variables

Creamos una tabla con el tipo de cada variable para facilitar análisis posteriores.

descProp = data.frame("variable" = colnames(datos),
                      "tipo" = c("id", "url", "text", rep("numerical", 3), rep("categorical", 2), "numerical", "binary", rep("numerical", 2), "text", rep("categorical", 5), rep("numerical", 2), "binary", "url", "numerical", "text", "binary", "categorical", rep("binary", 2), rep("numerical", 2), "text", rep("binary", 4), rep("categorical", 3), rep("binary", 2), "binary", rep("numerical", 2), "categorical", rep("numerical", 3), rep("categorical", 2), rep("binary", 2), "numerical"), stringsAsFactors = FALSE)
rownames(descProp) = descProp$variable
descProp
#>                                                    variable        tipo
#> propertyCode                                   propertyCode          id
#> thumbnail                                         thumbnail         url
#> externalReference                         externalReference        text
#> numPhotos                                         numPhotos   numerical
#> floor                                                 floor   numerical
#> price                                                 price   numerical
#> propertyType                                   propertyType categorical
#> operation                                         operation categorical
#> size                                                   size   numerical
#> exterior                                           exterior      binary
#> rooms                                                 rooms   numerical
#> bathrooms                                         bathrooms   numerical
#> address                                             address        text
#> province                                           province categorical
#> municipality                                   municipality categorical
#> district                                           district categorical
#> country                                             country categorical
#> neighborhood                                   neighborhood categorical
#> latitude                                           latitude   numerical
#> longitude                                         longitude   numerical
#> showAddress                                     showAddress      binary
#> url                                                     url         url
#> distance                                           distance   numerical
#> description                                     description        text
#> hasVideo                                           hasVideo      binary
#> status                                               status categorical
#> newDevelopment                               newDevelopment      binary
#> hasLift                                             hasLift      binary
#> priceByArea                                     priceByArea   numerical
#> change                                               change   numerical
#> suggestedTexts                               suggestedTexts        text
#> hasPlan                                             hasPlan      binary
#> has3DTour                                         has3DTour      binary
#> has360                                               has360      binary
#> hasStaging                                       hasStaging      binary
#> highlight                                         highlight categorical
#> savedAd                                             savedAd categorical
#> notes                                                 notes categorical
#> topNewDevelopment                         topNewDevelopment      binary
#> topPlus                                             topPlus      binary
#> newDevelopmentFinished               newDevelopmentFinished      binary
#> distancia_min_estacion_m           distancia_min_estacion_m   numerical
#> amount                                               amount   numerical
#> currency                                           currency categorical
#> former_price                                   former_price   numerical
#> price_drop_value                           price_drop_value   numerical
#> price_drop_percentage                 price_drop_percentage   numerical
#> typology                                           typology categorical
#> subTypology                                     subTypology categorical
#> hasParkingSpace                             hasParkingSpace      binary
#> isParkingSpaceIncludedInPrice isParkingSpaceIncludedInPrice      binary
#> parkingSpacePrice                         parkingSpacePrice   numerical

Variables que no se pueden incluir en el análisis

Eliminamos columnas de tipo texto y ajustamos la tabla de tipos.

datos = datos[,setdiff(colnames(datos), c("thumbnail", "externalReference", "address", "url", "description", "suggestedTexts"))]
descProp = descProp[colnames(datos),]
table(descProp$tipo)
#> 
#>      binary categorical          id   numerical 
#>          14          14           1          17

Variables constantes

Buscamos variables constantes o casi constantes para eliminarlas.

summary(datos[,descProp$variable[descProp$tipo == "numerical"]])
#>    numPhotos         floor               price              size       
#>  Min.   :  0.00   Length:3149        Min.   :  26800   Min.   :  12.0  
#>  1st Qu.: 15.00   Class :character   1st Qu.: 217000   1st Qu.:  76.0  
#>  Median : 24.00   Mode  :character   Median : 330000   Median : 104.0  
#>  Mean   : 26.47                      Mean   : 447143   Mean   : 126.4  
#>  3rd Qu.: 36.00                      3rd Qu.: 550000   3rd Qu.: 149.0  
#>  Max.   :125.00                      Max.   :4700000   Max.   :3446.0  
#>                                                                        
#>      rooms          bathrooms         latitude       longitude      
#>  Min.   : 0.000   Min.   : 0.000   Min.   :39.44   Min.   :-0.4172  
#>  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.:39.46   1st Qu.:-0.3852  
#>  Median : 3.000   Median : 2.000   Median :39.47   Median :-0.3740  
#>  Mean   : 2.945   Mean   : 1.785   Mean   :39.47   Mean   :-0.3696  
#>  3rd Qu.: 4.000   3rd Qu.: 2.000   3rd Qu.:39.48   3rd Qu.:-0.3544  
#>  Max.   :16.000   Max.   :10.000   Max.   :39.50   Max.   :-0.3236  
#>                                                                     
#>     distance     priceByArea       change          distancia_min_estacion_m
#>  Min.   :   1   Min.   :   55   Length:3149        Min.   :   6.492        
#>  1st Qu.:1077   1st Qu.: 2532   Class :character   1st Qu.: 239.489        
#>  Median :1923   Median : 3292   Mode  :character   Median : 384.427        
#>  Mean   :1912   Mean   : 3527                      Mean   : 442.141        
#>  3rd Qu.:2672   3rd Qu.: 4250                      3rd Qu.: 568.523        
#>  Max.   :3996   Max.   :26533                      Max.   :2132.112        
#>                 NA's   :365                                                
#>      amount         former_price     price_drop_value price_drop_percentage
#>  Min.   :  26800   Min.   :  43000   Min.   :  1000   Min.   : 1.000       
#>  1st Qu.: 217000   1st Qu.: 220000   1st Qu.:  9000   1st Qu.: 3.000       
#>  Median : 330000   Median : 349000   Median : 15000   Median : 4.000       
#>  Mean   : 447143   Mean   : 447221   Mean   : 24076   Mean   : 5.124       
#>  3rd Qu.: 550000   3rd Qu.: 575000   3rd Qu.: 25000   3rd Qu.: 6.000       
#>  Max.   :4700000   Max.   :2150000   Max.   :350000   Max.   :30.000       
#>                    NA's   :2874      NA's   :2874     NA's   :2874         
#>  parkingSpacePrice
#>  Min.   :     0   
#>  1st Qu.: 24750   
#>  Median : 35000   
#>  Mean   : 35100   
#>  3rd Qu.: 50000   
#>  Max.   :100000   
#>  NA's   :3001

Corrección de la variable distancia

Recalculamos la distancia al centro de Valencia usando la fórmula de Haversine.

datos %>%
  group_by(district) %>%
  summarise(mean_distance = mean(distance, na.rm = TRUE)) %>%
  arrange(desc(mean_distance))
#> # A tibble: 18 × 2
#>    district         mean_distance
#>    <chr>                    <dbl>
#>  1 Benicalap                2906.
#>  2 Rascanya                 2753.
#>  3 La Patacona              2747.
#>  4 Algirós                  2680.
#>  5 Alboraya Centro          2618 
#>  6 Camins al Grau           2581.
#>  7 Campanar                 2546.
#>  8 Poblats Marítims         2353.
#>  9 L'Olivereta              2348.
#> 10 Patraix                  2347.
#> 11 Benimaclet               2292.
#> 12 Quatre Carreres          2203.
#> 13 Jesús                    2130.
#> 14 La Saïdia                1903.
#> 15 El Pla del Real          1625.
#> 16 Extramurs                 927.
#> 17 L'Eixample                902.
#> 18 Ciutat Vella              565.

haversine <- function(lat1, lon1, lat2, lon2) {
  R <- 6371000  # Radio de la Tierra en metros
  phi1 <- lat1 * pi / 180
  phi2 <- lat2 * pi / 180
  dphi <- (lat2 - lat1) * pi / 180
  dlambda <- (lon2 - lon1) * pi / 180
  a <- sin(dphi / 2)^2 + cos(phi1) * cos(phi2) * sin(dlambda / 2)^2
  return(2 * R * asin(sqrt(a)))
}
calcular_distancia_centro <- function(propiedades, centro_lat, centro_lon) {
  distancias <- sapply(1:nrow(propiedades), function(i) {
    haversine(
      lat1 = propiedades$latitude[i],
      lon1 = propiedades$longitude[i],
      lat2 = centro_lat,
      lon2 = centro_lon
    )
  })
  return(distancias)
}
centro_lat = 39.4699
centro_lon = -0.3763
distancia_buena = calcular_distancia_centro(datos, centro_lat, centro_lon)
datos$distance = distancia_buena
datos %>%
  group_by(district) %>%
  summarise(mean_distance = mean(distance, na.rm = TRUE)) %>%
  arrange(desc(mean_distance))
#> # A tibble: 18 × 2
#>    district         mean_distance
#>    <chr>                    <dbl>
#>  1 La Patacona              4752.
#>  2 Alboraya Centro          4155.
#>  3 Poblats Marítims         4067.
#>  4 Benicalap                2907.
#>  5 Rascanya                 2754.
#>  6 Algirós                  2654.
#>  7 Camins al Grau           2617.
#>  8 Campanar                 2547.
#>  9 L'Olivereta              2349.
#> 10 Patraix                  2347.
#> 11 Benimaclet               2293.
#> 12 Quatre Carreres          2292.
#> 13 Jesús                    2131.
#> 14 La Saïdia                1904.
#> 15 El Pla del Real          1578.
#> 16 Extramurs                 928.
#> 17 L'Eixample                903.
#> 18 Ciutat Vella              566.

Arreglo de la variable floor

Normalizamos valores de la variable floor y realizamos imputación de valores faltantes.

table(datos$floor)
#> 
#>  -2   1  10  11  12  13  14  15  16  17  18  19   2  22  27   3  30   4   5   6 
#>   1 483  25  21  18  11  18   6   2   1   2   5 397   1   1 339   1 322 234 165 
#>   7   8   9  bj  en  ss  st 
#> 167  98  33 457  21   1   1
datos <- datos %>% filter(is.na(floor) | floor != "st")
datos[!is.na(datos$floor) & datos$floor == "ss", "floor"] <- "-0.5"
datos[!is.na(datos$floor) & datos$floor == "bj", "floor"] <- "0"
datos[!is.na(datos$floor) & datos$floor == "en", "floor"] <- "0.5"
datos[!is.na(datos$floor) & datos$floor == "-2", "floor"] <- "2"
datos$floor <- as.numeric(datos$floor)
floor_mice = datos[, c("floor", "price")]
imputed_data <- suppressMessages(
  suppressWarnings(
    suppressPackageStartupMessages(
      withCallingHandlers(
        mice(floor_mice, m = 5, method = "pmm", maxit = 50, seed = 33),
        message = function(m) invokeRestart("muffleMessage")
      )
    )
  )
)
#> 
#>  iter imp variable
#>   1   1  floor
#>   1   2  floor
#>   1   3  floor
#>   1   4  floor
#>   1   5  floor
#>   2   1  floor
#>   2   2  floor
#>   2   3  floor
#>   2   4  floor
#>   2   5  floor
#>   3   1  floor
#>   3   2  floor
#>   3   3  floor
#>   3   4  floor
#>   3   5  floor
#>   4   1  floor
#>   4   2  floor
#>   4   3  floor
#>   4   4  floor
#>   4   5  floor
#>   5   1  floor
#>   5   2  floor
#>   5   3  floor
#>   5   4  floor
#>   5   5  floor
#>   6   1  floor
#>   6   2  floor
#>   6   3  floor
#>   6   4  floor
#>   6   5  floor
#>   7   1  floor
#>   7   2  floor
#>   7   3  floor
#>   7   4  floor
#>   7   5  floor
#>   8   1  floor
#>   8   2  floor
#>   8   3  floor
#>   8   4  floor
#>   8   5  floor
#>   9   1  floor
#>   9   2  floor
#>   9   3  floor
#>   9   4  floor
#>   9   5  floor
#>   10   1  floor
#>   10   2  floor
#>   10   3  floor
#>   10   4  floor
#>   10   5  floor
#>   11   1  floor
#>   11   2  floor
#>   11   3  floor
#>   11   4  floor
#>   11   5  floor
#>   12   1  floor
#>   12   2  floor
#>   12   3  floor
#>   12   4  floor
#>   12   5  floor
#>   13   1  floor
#>   13   2  floor
#>   13   3  floor
#>   13   4  floor
#>   13   5  floor
#>   14   1  floor
#>   14   2  floor
#>   14   3  floor
#>   14   4  floor
#>   14   5  floor
#>   15   1  floor
#>   15   2  floor
#>   15   3  floor
#>   15   4  floor
#>   15   5  floor
#>   16   1  floor
#>   16   2  floor
#>   16   3  floor
#>   16   4  floor
#>   16   5  floor
#>   17   1  floor
#>   17   2  floor
#>   17   3  floor
#>   17   4  floor
#>   17   5  floor
#>   18   1  floor
#>   18   2  floor
#>   18   3  floor
#>   18   4  floor
#>   18   5  floor
#>   19   1  floor
#>   19   2  floor
#>   19   3  floor
#>   19   4  floor
#>   19   5  floor
#>   20   1  floor
#>   20   2  floor
#>   20   3  floor
#>   20   4  floor
#>   20   5  floor
#>   21   1  floor
#>   21   2  floor
#>   21   3  floor
#>   21   4  floor
#>   21   5  floor
#>   22   1  floor
#>   22   2  floor
#>   22   3  floor
#>   22   4  floor
#>   22   5  floor
#>   23   1  floor
#>   23   2  floor
#>   23   3  floor
#>   23   4  floor
#>   23   5  floor
#>   24   1  floor
#>   24   2  floor
#>   24   3  floor
#>   24   4  floor
#>   24   5  floor
#>   25   1  floor
#>   25   2  floor
#>   25   3  floor
#>   25   4  floor
#>   25   5  floor
#>   26   1  floor
#>   26   2  floor
#>   26   3  floor
#>   26   4  floor
#>   26   5  floor
#>   27   1  floor
#>   27   2  floor
#>   27   3  floor
#>   27   4  floor
#>   27   5  floor
#>   28   1  floor
#>   28   2  floor
#>   28   3  floor
#>   28   4  floor
#>   28   5  floor
#>   29   1  floor
#>   29   2  floor
#>   29   3  floor
#>   29   4  floor
#>   29   5  floor
#>   30   1  floor
#>   30   2  floor
#>   30   3  floor
#>   30   4  floor
#>   30   5  floor
#>   31   1  floor
#>   31   2  floor
#>   31   3  floor
#>   31   4  floor
#>   31   5  floor
#>   32   1  floor
#>   32   2  floor
#>   32   3  floor
#>   32   4  floor
#>   32   5  floor
#>   33   1  floor
#>   33   2  floor
#>   33   3  floor
#>   33   4  floor
#>   33   5  floor
#>   34   1  floor
#>   34   2  floor
#>   34   3  floor
#>   34   4  floor
#>   34   5  floor
#>   35   1  floor
#>   35   2  floor
#>   35   3  floor
#>   35   4  floor
#>   35   5  floor
#>   36   1  floor
#>   36   2  floor
#>   36   3  floor
#>   36   4  floor
#>   36   5  floor
#>   37   1  floor
#>   37   2  floor
#>   37   3  floor
#>   37   4  floor
#>   37   5  floor
#>   38   1  floor
#>   38   2  floor
#>   38   3  floor
#>   38   4  floor
#>   38   5  floor
#>   39   1  floor
#>   39   2  floor
#>   39   3  floor
#>   39   4  floor
#>   39   5  floor
#>   40   1  floor
#>   40   2  floor
#>   40   3  floor
#>   40   4  floor
#>   40   5  floor
#>   41   1  floor
#>   41   2  floor
#>   41   3  floor
#>   41   4  floor
#>   41   5  floor
#>   42   1  floor
#>   42   2  floor
#>   42   3  floor
#>   42   4  floor
#>   42   5  floor
#>   43   1  floor
#>   43   2  floor
#>   43   3  floor
#>   43   4  floor
#>   43   5  floor
#>   44   1  floor
#>   44   2  floor
#>   44   3  floor
#>   44   4  floor
#>   44   5  floor
#>   45   1  floor
#>   45   2  floor
#>   45   3  floor
#>   45   4  floor
#>   45   5  floor
#>   46   1  floor
#>   46   2  floor
#>   46   3  floor
#>   46   4  floor
#>   46   5  floor
#>   47   1  floor
#>   47   2  floor
#>   47   3  floor
#>   47   4  floor
#>   47   5  floor
#>   48   1  floor
#>   48   2  floor
#>   48   3  floor
#>   48   4  floor
#>   48   5  floor
#>   49   1  floor
#>   49   2  floor
#>   49   3  floor
#>   49   4  floor
#>   49   5  floor
#>   50   1  floor
#>   50   2  floor
#>   50   3  floor
#>   50   4  floor
#>   50   5  floor

datos$floor <- complete(imputed_data)$floor
summary(datos$floor)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  -0.500   1.000   3.000   3.375   5.000  30.000

Arreglo de la variable change

Convertimos la variable change a numérica y tratamos valores vacíos.

datos[datos$change == "{}", "change"] <- NA
datos$change <- as.numeric(datos$change)
summary(datos$change)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#>     455    2329    3000    3233    3832   10385    2783

Coeficientes de variación de variables numéricas

Calculamos el coeficiente de variación para identificar variables poco informativas o muy dispersas.

mySD = apply(datos[,descProp$variable[descProp$tipo == "numerical"]], 2, sd, na.rm = TRUE)
myMU = colMeans(datos[,descProp$variable[descProp$tipo == "numerical"]], na.rm = TRUE)
myCV = mySD/myMU
sort(myCV)
#>                longitude                 latitude              priceByArea 
#>            -0.0616735003             0.0003282662             0.4514389140 
#>        parkingSpacePrice                    rooms                   change 
#>             0.4683234831             0.4687391919             0.4795358918 
#>                bathrooms                 distance                numPhotos 
#>             0.5220271471             0.5572306039             0.6323648760 
#> distancia_min_estacion_m             former_price    price_drop_percentage 
#>             0.6608994517             0.7509810570             0.7679294571 
#>                     size                    price                   amount 
#>             0.8064627304             0.8459890212             0.8459890212 
#>                    floor         price_drop_value 
#>             0.9311653938             1.4365141663

Distribución de variables binarias

Analizamos la distribución de las variables binarias.

apply(datos[,descProp$variable[descProp$tipo == "binary"]], 2, table, useNA = "i")
#> $exterior
#> 
#>    0    1 <NA> 
#>  173 2663  312 
#> 
#> $showAddress
#> 
#>    0    1 
#> 2691  457 
#> 
#> $hasVideo
#> 
#>    0    1 
#> 2107 1041 
#> 
#> $newDevelopment
#> 
#>    0    1 
#> 3024  124 
#> 
#> $hasLift
#> 
#>    0    1 <NA> 
#>  782 2185  181 
#> 
#> $hasPlan
#> 
#>    0    1 <NA> 
#> 1552 1231  365 
#> 
#> $has3DTour
#> 
#>    0    1 
#> 2856  292 
#> 
#> $has360
#> 
#>    0    1 
#> 2742  406 
#> 
#> $hasStaging
#> 
#>    0    1 
#> 3016  132 
#> 
#> $topNewDevelopment
#> 
#>    0 <NA> 
#> 2783  365 
#> 
#> $topPlus
#> 
#>    0    1 
#> 3108   40 
#> 
#> $newDevelopmentFinished
#> 
#>    0    1 <NA> 
#>  122    2 3024 
#> 
#> $hasParkingSpace
#> 
#>    1 <NA> 
#>  780 2368 
#> 
#> $isParkingSpaceIncludedInPrice
#> 
#>    0    1 <NA> 
#>  148  632 2368
apply(datos[,descProp$variable[descProp$tipo == "binary"]], 2,
      function (x) round(100*table(x)/sum(table(x)), 2))
#> $exterior
#> x
#>    0    1 
#>  6.1 93.9 
#> 
#> $showAddress
#> x
#>     0     1 
#> 85.48 14.52 
#> 
#> $hasVideo
#> x
#>     0     1 
#> 66.93 33.07 
#> 
#> $newDevelopment
#> x
#>     0     1 
#> 96.06  3.94 
#> 
#> $hasLift
#> x
#>     0     1 
#> 26.36 73.64 
#> 
#> $hasPlan
#> x
#>     0     1 
#> 55.77 44.23 
#> 
#> $has3DTour
#> x
#>     0     1 
#> 90.72  9.28 
#> 
#> $has360
#> x
#>    0    1 
#> 87.1 12.9 
#> 
#> $hasStaging
#> x
#>     0     1 
#> 95.81  4.19 
#> 
#> $topNewDevelopment
#> x
#>   0 
#> 100 
#> 
#> $topPlus
#> x
#>     0     1 
#> 98.73  1.27 
#> 
#> $newDevelopmentFinished
#> x
#>     0     1 
#> 98.39  1.61 
#> 
#> $hasParkingSpace
#> x
#>   1 
#> 100 
#> 
#> $isParkingSpaceIncludedInPrice
#> x
#>     0     1 
#> 18.97 81.03

Distribución de variables categóricas

Analizamos la distribución de las variables categóricas.

apply(datos[,descProp$variable[descProp$tipo == "categorical"]], 2, table, useNA = "i")
#> $propertyType
#> 
#>       chalet countryHouse       duplex         flat    penthouse       studio 
#>          135            6          126         2620          184           77 
#> 
#> $operation
#> 
#> sale 
#> 3148 
#> 
#> $province
#> 
#> València 
#>     3148 
#> 
#> $municipality
#> 
#> Alboraya València 
#>       67     3081 
#> 
#> $district
#> 
#>  Alboraya Centro          Algirós        Benicalap       Benimaclet 
#>                2               79              110               35 
#>   Camins al Grau         Campanar     Ciutat Vella  El Pla del Real 
#>              246              150              356               83 
#>        Extramurs            Jesús       L'Eixample      L'Olivereta 
#>              226              161              344              156 
#>      La Patacona        La Saïdia          Patraix Poblats Marítims 
#>               65              120              156              399 
#>  Quatre Carreres         Rascanya 
#>              287              173 
#> 
#> $country
#> 
#>   es 
#> 3148 
#> 
#> $neighborhood
#> 
#>                                Aiora                               Albors 
#>                                   88                                   16 
#>                          Arrancapins                     Barrio de Favara 
#>                                   92                                   12 
#>                            Benicalap                            Beniferri 
#>                                   76                                    5 
#>                           Benimaclet                               Beteró 
#>                                   31                                   27 
#>                         Camí de Vera                           Camí Fondo 
#>                                    4                                   16 
#>                           Camí Reial                             Campanar 
#>                                    9                                   46 
#> Ciutat de les Arts i de les Ciencies                       Ciutat Fallera 
#>                                   53                                   20 
#>                         Ciutat Jardí                 Ciutat Universitària 
#>                                   24                                   11 
#>                           El Botànic            El Cabanyal-El Canyamelar 
#>                                   45                                  139 
#>                           El Calvari                             El Carme 
#>                                   16                                   61 
#>                              El Grau                            El Mercat 
#>                                   56                                   55 
#>                             El Pilar                     El Pla del Remei 
#>                                   39                                  102 
#>                          Els Orriols                             En Corts 
#>                                   44                                   57 
#>                            Exposició                Fonteta de Sant Lluìs 
#>                                   30                                   28 
#>                             Gran Vía                           Jaume Roig 
#>                                   78                                   17 
#>                            L'Amistat                    L'Hort de Senabre 
#>                                   19                                   44 
#>                       L'Illa Perduda                          La Carrasca 
#>                                   15                                   17 
#>                      La Creu Coberta                     La Creu del Grau 
#>                                   11                                   36 
#>                         La Fontsanta                              La Llum 
#>                                   22                                   15 
#>                           La Petxina                             La Punta 
#>                                   44                                   25 
#>                            La Raiosa                           La Roqueta 
#>                                   75                                   45 
#>                               La Seu                        La Vega Baixa 
#>                                   36                                    4 
#>                             La Xerea                         Les Tendetes 
#>                                   35                                   12 
#>                              Malilla                           Marxalenes 
#>                                   54                                   22 
#>                             Mestalla                          Mont-Olivet 
#>                                   25                                   54 
#>                             Morvedre                           Na Rovella 
#>                                   35                                   16 
#>                             Natzaret                        Nou Benicalap 
#>                                   46                                   14 
#>                         Nou Campanar                            Nou Moles 
#>                                   18                                   65 
#>                              Patraix                           Penya-Roja 
#>                                   58                                   90 
#>               Playa de la Malvarrosa                              Russafa 
#>                                  131                                  164 
#>                             Safranar                          Sant Antoni 
#>                                   23                                   20 
#>                        Sant Francesc                          Sant Isidre 
#>                                  130                                   11 
#>                         Sant Llorenç                        Sant Marcellí 
#>                                   13                                   22 
#>                             Sant Pau                             Soternes 
#>                                   53                                   19 
#>                               Tormos                            Torrefiel 
#>                                   32                                  116 
#>                         Tres Forques                             Trinitat 
#>                                   35                                   11 
#>                        Vara de Quart                                 <NA> 
#>                                   52                                   67 
#> 
#> $status
#> 
#>           good newdevelopment          renew           <NA> 
#>           2640             95            384             29 
#> 
#> $highlight
#> 
#> Destacado     False       Top      Top+      True      <NA> 
#>       696       356       403        40         9      1644 
#> 
#> $savedAd
#> 
#> {'groupDescription': 'Destacado'}       {'groupDescription': 'Top'} 
#>                                89                                40 
#>                                {}                              <NA> 
#>                              2783                               236 
#> 
#> $notes
#> 
#>   []   {} 
#> 2783  365 
#> 
#> $currency
#> 
#>    € 
#> 3148 
#> 
#> $typology
#> 
#>       chalet countryHouse         flat         <NA> 
#>           82            6         2695          365 
#> 
#> $subTypology
#> 
#>      casaDePueblo      countryHouse            duplex  independantHouse 
#>                 4                 1               110                36 
#>             masia         penthouse semidetachedHouse            studio 
#>                 1               174                 8                75 
#>     terracedHouse              <NA> 
#>                35              2704
apply(datos[,descProp$variable[descProp$tipo == "categorical"]], 2,
      function (x) round(100*table(x)/sum(table(x)), 2))
#> $propertyType
#> x
#>       chalet countryHouse       duplex         flat    penthouse       studio 
#>         4.29         0.19         4.00        83.23         5.84         2.45 
#> 
#> $operation
#> x
#> sale 
#>  100 
#> 
#> $province
#> x
#> València 
#>      100 
#> 
#> $municipality
#> x
#> Alboraya València 
#>     2.13    97.87 
#> 
#> $district
#> x
#>  Alboraya Centro          Algirós        Benicalap       Benimaclet 
#>             0.06             2.51             3.49             1.11 
#>   Camins al Grau         Campanar     Ciutat Vella  El Pla del Real 
#>             7.81             4.76            11.31             2.64 
#>        Extramurs            Jesús       L'Eixample      L'Olivereta 
#>             7.18             5.11            10.93             4.96 
#>      La Patacona        La Saïdia          Patraix Poblats Marítims 
#>             2.06             3.81             4.96            12.67 
#>  Quatre Carreres         Rascanya 
#>             9.12             5.50 
#> 
#> $country
#> x
#>  es 
#> 100 
#> 
#> $neighborhood
#> x
#>                                Aiora                               Albors 
#>                                 2.86                                 0.52 
#>                          Arrancapins                     Barrio de Favara 
#>                                 2.99                                 0.39 
#>                            Benicalap                            Beniferri 
#>                                 2.47                                 0.16 
#>                           Benimaclet                               Beteró 
#>                                 1.01                                 0.88 
#>                         Camí de Vera                           Camí Fondo 
#>                                 0.13                                 0.52 
#>                           Camí Reial                             Campanar 
#>                                 0.29                                 1.49 
#> Ciutat de les Arts i de les Ciencies                       Ciutat Fallera 
#>                                 1.72                                 0.65 
#>                         Ciutat Jardí                 Ciutat Universitària 
#>                                 0.78                                 0.36 
#>                           El Botànic            El Cabanyal-El Canyamelar 
#>                                 1.46                                 4.51 
#>                           El Calvari                             El Carme 
#>                                 0.52                                 1.98 
#>                              El Grau                            El Mercat 
#>                                 1.82                                 1.79 
#>                             El Pilar                     El Pla del Remei 
#>                                 1.27                                 3.31 
#>                          Els Orriols                             En Corts 
#>                                 1.43                                 1.85 
#>                            Exposició                Fonteta de Sant Lluìs 
#>                                 0.97                                 0.91 
#>                             Gran Vía                           Jaume Roig 
#>                                 2.53                                 0.55 
#>                            L'Amistat                    L'Hort de Senabre 
#>                                 0.62                                 1.43 
#>                       L'Illa Perduda                          La Carrasca 
#>                                 0.49                                 0.55 
#>                      La Creu Coberta                     La Creu del Grau 
#>                                 0.36                                 1.17 
#>                         La Fontsanta                              La Llum 
#>                                 0.71                                 0.49 
#>                           La Petxina                             La Punta 
#>                                 1.43                                 0.81 
#>                            La Raiosa                           La Roqueta 
#>                                 2.43                                 1.46 
#>                               La Seu                        La Vega Baixa 
#>                                 1.17                                 0.13 
#>                             La Xerea                         Les Tendetes 
#>                                 1.14                                 0.39 
#>                              Malilla                           Marxalenes 
#>                                 1.75                                 0.71 
#>                             Mestalla                          Mont-Olivet 
#>                                 0.81                                 1.75 
#>                             Morvedre                           Na Rovella 
#>                                 1.14                                 0.52 
#>                             Natzaret                        Nou Benicalap 
#>                                 1.49                                 0.45 
#>                         Nou Campanar                            Nou Moles 
#>                                 0.58                                 2.11 
#>                              Patraix                           Penya-Roja 
#>                                 1.88                                 2.92 
#>               Playa de la Malvarrosa                              Russafa 
#>                                 4.25                                 5.32 
#>                             Safranar                          Sant Antoni 
#>                                 0.75                                 0.65 
#>                        Sant Francesc                          Sant Isidre 
#>                                 4.22                                 0.36 
#>                         Sant Llorenç                        Sant Marcellí 
#>                                 0.42                                 0.71 
#>                             Sant Pau                             Soternes 
#>                                 1.72                                 0.62 
#>                               Tormos                            Torrefiel 
#>                                 1.04                                 3.77 
#>                         Tres Forques                             Trinitat 
#>                                 1.14                                 0.36 
#>                        Vara de Quart 
#>                                 1.69 
#> 
#> $status
#> x
#>           good newdevelopment          renew 
#>          84.64           3.05          12.31 
#> 
#> $highlight
#> x
#> Destacado     False       Top      Top+      True 
#>     46.28     23.67     26.80      2.66      0.60 
#> 
#> $savedAd
#> x
#> {'groupDescription': 'Destacado'}       {'groupDescription': 'Top'} 
#>                              3.06                              1.37 
#>                                {} 
#>                             95.57 
#> 
#> $notes
#> x
#>    []    {} 
#> 88.41 11.59 
#> 
#> $currency
#> x
#>   € 
#> 100 
#> 
#> $typology
#> x
#>       chalet countryHouse         flat 
#>         2.95         0.22        96.84 
#> 
#> $subTypology
#> x
#>      casaDePueblo      countryHouse            duplex  independantHouse 
#>              0.90              0.23             24.77              8.11 
#>             masia         penthouse semidetachedHouse            studio 
#>              0.23             39.19              1.80             16.89 
#>     terracedHouse 
#>              7.88

Eliminación de variables irrelevantes

Eliminamos variables que no aportan información relevante al análisis.

datos = datos[,setdiff(colnames(datos), c("operation", "province", "municipality", "country", "hasVideo", "highlight", "savedAd", "notes", "change", "has3DTour", "has360", "hasStaging", "topNewDevelopment", "topPlus", "newDevelopmentFinished", "amount", "currency", "hasParkingSpace", "isParkingSpaceIncludedInPrice", "parkingSpacePrice"))]
descProp = descProp[colnames(datos),]

Tablas de frecuencias de variables seleccionadas

table(datos$exterior)
#> 
#>    0    1 
#>  173 2663
table(datos$showAddress)
#> 
#> FALSE  TRUE 
#>  2691   457
table(datos$newDevelopment)
#> 
#> FALSE  TRUE 
#>  3024   124
table(datos$hasLift)
#> 
#>    0    1 
#>  782 2185
table(datos$hasPlan)
#> 
#> FALSE  TRUE 
#>  1552  1231
nrow(datos)
#> [1] 3148

Análisis de valores inconsistentes o anómalos

Se identifican y tratan valores inconsistentes y anómalos en las variables numéricas y categóricas.

Boxplot de la variable rooms

ggplot(datos, aes(x = as.factor(1), y = rooms)) +
  geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
  coord_flip() +
  labs(title = "Boxplot del Número de Habitaciones",
       x = "",
       y = "Número de Habitaciones") +
  theme_minimal()

muchas_habitaciones <- datos %>% 
  filter(rooms >= 7) %>% 
  arrange(desc(rooms)) %>% 
  slice_head(n = 6)
muchas_habitaciones
#>   propertyCode numPhotos floor   price propertyType size exterior rooms
#> 1    107160987        17     1  985000         flat  375        1    16
#> 2    106981181        29     1  985000         flat  380        1    16
#> 3    107167848        49     9 1250000         flat  303        1    15
#> 4    105151055        37     2 2100000       chalet 1000       NA    14
#> 5    102114500        36     6 1200000       chalet  734       NA    10
#> 6    100885871        26     1  850000         flat  321        1    10
#>   bathrooms         district neighborhood latitude  longitude showAddress
#> 1         9        La Saïdia     Morvedre 39.48414 -0.3752460       FALSE
#> 2         9        La Saïdia     Morvedre 39.48003 -0.3742156       FALSE
#> 3         5 Poblats Marítims       Beteró 39.47341 -0.3341399       FALSE
#> 4         8     Ciutat Vella     La Xerea 39.47514 -0.3692382       FALSE
#> 5         0        Extramurs  Arrancapins 39.46271 -0.3817061       FALSE
#> 6         2  El Pla del Real    Exposició 39.47702 -0.3620727       FALSE
#>    distance status newDevelopment hasLift priceByArea hasPlan
#> 1 1585.5427   good          FALSE       1        2627   FALSE
#> 2 1140.0629   good          FALSE       1        2592    TRUE
#> 3 3639.8176   good          FALSE       0        4125    TRUE
#> 4  840.7786   good          FALSE      NA        2100    TRUE
#> 5  924.6782  renew          FALSE      NA        1635    TRUE
#> 6 1455.6163   good          FALSE       1        2648    TRUE
#>   distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1                 261.6994           NA               NA                    NA
#> 2                 264.9308           NA               NA                    NA
#> 3                 367.2378           NA               NA                    NA
#> 4                 357.0123           NA               NA                    NA
#> 5                 230.8475           NA               NA                    NA
#> 6                 121.1812           NA               NA                    NA
#>   typology      subTypology
#> 1     flat             <NA>
#> 2     flat             <NA>
#> 3     flat             <NA>
#> 4   chalet independantHouse
#> 5   chalet    terracedHouse
#> 6     flat             <NA>
datos <- datos %>% filter(rooms < 15)

Boxplot de la variable bathrooms

ggplot(datos, aes(x = as.factor(1), y = bathrooms)) +
  geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
  coord_flip() +
  labs(title = "Boxplot del Número de Baños",
       x = "",
       y = "Número de Baños") +
  theme_minimal()
#> Notch went outside hinges
#> ℹ Do you want `notch = FALSE`?

muchos_bathrooms <- datos %>% 
  filter(bathrooms > 5) %>% 
  arrange(desc(bathrooms)) %>% 
  slice_head(n = 6)
muchos_bathrooms
#>   propertyCode numPhotos floor   price propertyType size exterior rooms
#> 1    107233131        29     3  980000         flat  265        1     9
#> 2    107264893        22     0  700000         flat  195        1     5
#> 3    106983291        34     1  985000         flat  375        1     9
#> 4    103259850       122     5 4700000    penthouse  590        1     6
#> 5    106880590        34     0  950000         flat  360        1     8
#> 6    105151055        37     2 2100000       chalet 1000       NA    14
#>   bathrooms     district  neighborhood latitude  longitude showAddress
#> 1        10 Ciutat Vella Sant Francesc 39.47012 -0.3738187        TRUE
#> 2        10        Jesús     La Raiosa 39.45770 -0.3877716       FALSE
#> 3         9    La Saïdia      Morvedre 39.48052 -0.3752743       FALSE
#> 4         8 Ciutat Vella      La Xerea 39.47662 -0.3725033       FALSE
#> 5         8    Benicalap     Benicalap 39.49108 -0.3928557       FALSE
#> 6         8 Ciutat Vella      La Xerea 39.47514 -0.3692382       FALSE
#>    distance status newDevelopment hasLift priceByArea hasPlan
#> 1  214.3743  renew          FALSE       1        3698    TRUE
#> 2 1675.8956   good          FALSE       0        3590    TRUE
#> 3 1184.0674   good          FALSE       1        2627    TRUE
#> 4  814.7225   good          FALSE       1        7966    TRUE
#> 5 2750.6246   good          FALSE       1        2639    TRUE
#> 6  840.7786   good          FALSE      NA        2100    TRUE
#>   distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1                 255.9972           NA               NA                    NA
#> 2                 258.3893           NA               NA                    NA
#> 3                 274.8149           NA               NA                    NA
#> 4                 619.1010           NA               NA                    NA
#> 5                 162.0399           NA               NA                    NA
#> 6                 357.0123           NA               NA                    NA
#>   typology      subTypology
#> 1     flat             <NA>
#> 2     flat             <NA>
#> 3     flat             <NA>
#> 4     flat        penthouse
#> 5     flat             <NA>
#> 6   chalet independantHouse
datos <- datos %>% filter(bathrooms < 9)

Boxplot de la variable size

ggplot(datos, aes(x = as.factor(1), y = size)) +
  geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
  coord_flip() +
  labs(title = "Boxplot de size",
       x = "",
       y = "Size") +
  theme_minimal()

grandes <- datos %>% 
  filter(size > 500) %>% 
  arrange(desc(size)) %>% 
  slice_head(n = 5)
grandes_property_code_1 <- grandes$propertyCode[1]
grandes_property_code_2 <- grandes$propertyCode[2]
datos <- datos %>% filter(propertyCode != grandes_property_code_1)
datos <- datos %>% filter(propertyCode != grandes_property_code_2)

Boxplot de la distancia mínima al metro

ggplot(datos, aes(x = as.factor(1), y = distancia_min_estacion_m)) +
  geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
  coord_flip() +
  labs(title = "Boxplot de la Distancia al Metro",
       x = "",
       y = "Distancia al Metro") +
  theme_minimal()

lejos <- datos %>% 
  filter(distancia_min_estacion_m > 1000) %>% 
  arrange(desc(size)) %>% 
  slice_head(n = 5)
lejos
#>   propertyCode numPhotos floor   price propertyType size exterior rooms
#> 1    105867486        61   0.5 2000000         flat  372        1     3
#> 2    106628414        38   8.0 1100000    penthouse  353        1     3
#> 3    107600722        39   5.0  390000    penthouse  300        1     2
#> 4    103209543        53   0.0 2000000         flat  300        1     3
#> 5    106965933        47   5.0 1100000       chalet  267       NA     3
#>   bathrooms    district  neighborhood latitude  longitude showAddress distance
#> 1         3 La Patacona          <NA> 39.49192 -0.3247711       FALSE 5054.777
#> 2         2 La Patacona          <NA> 39.48796 -0.3305762       FALSE 4408.508
#> 3         3     Patraix Vara de Quart 39.46229 -0.4141896        TRUE 3360.682
#> 4         3 La Patacona          <NA> 39.49114 -0.3240358       FALSE 5069.203
#> 5         4 La Patacona          <NA> 39.48843 -0.3276807       FALSE 4653.836
#>   status newDevelopment hasLift priceByArea hasPlan distancia_min_estacion_m
#> 1   good          FALSE       1          NA      NA                 1881.744
#> 2   good          FALSE       1          NA      NA                 1292.388
#> 3   good          FALSE       1        1300    TRUE                 1131.892
#> 4   good          FALSE       1          NA      NA                 1822.884
#> 5   good          FALSE      NA          NA      NA                 1424.760
#>   former_price price_drop_value price_drop_percentage typology subTypology
#> 1           NA               NA                    NA     <NA>        <NA>
#> 2           NA               NA                    NA     <NA>        <NA>
#> 3           NA               NA                    NA     flat   penthouse
#> 4           NA               NA                    NA     <NA>        <NA>
#> 5           NA               NA                    NA     <NA>        <NA>

Boxplot del número de fotos

ggplot(datos, aes(x = as.factor(1), y = numPhotos)) +
  geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
  coord_flip() +
  labs(title = "Boxplot del Número de Fotos",
       x = "",
       y = "Número de Fotos") +
  theme_minimal()

fotos <- datos %>% 
  filter(numPhotos > 100) %>% 
  arrange(desc(size)) %>% 
  slice_head(n = 10)
fotos
#>   propertyCode numPhotos floor   price propertyType size exterior rooms
#> 1    103259850       122     5 4700000    penthouse  590        1     6
#> 2    106644557       113     7  490000         flat  177        1     4
#> 3    107137950       110     3  999000         flat  165        1     3
#> 4    106256113       112     8  630000         flat  135        1     4
#> 5    104983712       125     3  750000         flat  130        1     3
#> 6    107431261       109     1  480000         flat  127        1     2
#>   bathrooms       district neighborhood latitude  longitude showAddress
#> 1         8   Ciutat Vella     La Xerea 39.47662 -0.3725033       FALSE
#> 2         3       Campanar     Sant Pau 39.48894 -0.4013619        TRUE
#> 3         3       Campanar     Sant Pau 39.49566 -0.4021869       FALSE
#> 4         2 Camins al Grau   Penya-Roja 39.46005 -0.3523510        TRUE
#> 5         2     L'Eixample      Russafa 39.45919 -0.3727697       FALSE
#> 6         2   Ciutat Vella       La Seu 39.47691 -0.3802234       FALSE
#>    distance status newDevelopment hasLift priceByArea hasPlan
#> 1  814.7225   good          FALSE       1        7966    TRUE
#> 2 3017.8600   good          FALSE       1        2768    TRUE
#> 3 3625.1553   good          FALSE       1        6055    TRUE
#> 4 2329.6240   good          FALSE       1        4667   FALSE
#> 5 1228.4116   good          FALSE       1        5769    TRUE
#> 6  849.1217   good          FALSE       1        3780    TRUE
#>   distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1                 619.1010           NA               NA                    NA
#> 2                 273.6567           NA               NA                    NA
#> 3                 252.3325           NA               NA                    NA
#> 4                 817.9295           NA               NA                    NA
#> 5                 609.5870           NA               NA                    NA
#> 6                 786.4972           NA               NA                    NA
#>   typology subTypology
#> 1     flat   penthouse
#> 2     flat        <NA>
#> 3     flat        <NA>
#> 4     flat        <NA>
#> 5     flat        <NA>
#> 6     flat        <NA>

Boxplot del número de piso

ggplot(datos, aes(x = as.factor(1), y = floor)) +
  geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
  coord_flip() +
  labs(title = "Boxplot del Número de Piso",
       x = "",
       y = "Número de Piso") +
  theme_minimal()

altos <- datos %>% 
  filter(floor > 20) %>% 
  arrange(desc(size)) %>% 
  slice_head(n = 10)
altos
#>   propertyCode numPhotos floor  price propertyType size exterior rooms
#> 1    106323518        28    30 650000         flat  131        1     4
#> 2    107563294        35    27 612000         flat  105        1     2
#> 3     99710198        40    22 555000         flat   82        1     2
#>   bathrooms       district neighborhood latitude  longitude showAddress
#> 1         2 Camins al Grau   Penya-Roja 39.46005 -0.3523510        TRUE
#> 2         2       Campanar     Sant Pau 39.49582 -0.4032308       FALSE
#> 3         2       Campanar     Sant Pau 39.48907 -0.4004194       FALSE
#>   distance status newDevelopment hasLift priceByArea hasPlan
#> 1 2329.624   good          FALSE       1          NA      NA
#> 2 3694.741   good          FALSE       1        5829   FALSE
#> 3 2971.371   good          FALSE       1        6768   FALSE
#>   distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1                 817.9295           NA               NA                    NA
#> 2                 313.5163           NA               NA                    NA
#> 3                 218.8787           NA               NA                    NA
#>   typology subTypology
#> 1     <NA>        <NA>
#> 2     flat        <NA>
#> 3     flat        <NA>

Boxplot del precio

ggplot(datos, aes(x = as.factor(1), y = price)) +
  geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
  coord_flip() +
  labs(title = "Boxplot del Precio",
       x = "",
       y = "Precio") +
  theme_minimal()

caros <- datos %>% 
  filter(price > 2500000) %>% 
  arrange(desc(size)) %>% 
  slice_head(n = 10)
caros
#>    propertyCode numPhotos floor   price propertyType size exterior rooms
#> 1     103259850       122     5 4700000    penthouse  590        1     6
#> 2     106413311        51     3 2599000    penthouse  457        1     5
#> 3     106261302        40     3 2600000       duplex  457        1     5
#> 4     104867597        68     3 2600000    penthouse  398        1     5
#> 5     107260483         1     4 2800000         flat  377        1     7
#> 6     103488287         0     1 2650000    penthouse  365        1     3
#> 7     105204408        85     5 3500000         flat  352        1     4
#> 8     107211693        50     6 3500000         flat  350        1     4
#> 9     105973487        47     7 3500000         flat  350        1     4
#> 10    105367713        69     5 3500000         flat  342        1     4
#>    bathrooms     district     neighborhood latitude  longitude showAddress
#> 1          8 Ciutat Vella         La Xerea 39.47662 -0.3725033       FALSE
#> 2          6   L'Eixample El Pla del Remei 39.46382 -0.3724053       FALSE
#> 3          2   L'Eixample El Pla del Remei 39.46621 -0.3742678       FALSE
#> 4          6   L'Eixample El Pla del Remei 39.46429 -0.3742040       FALSE
#> 5          4   L'Eixample El Pla del Remei 39.46648 -0.3683944       FALSE
#> 6          2   L'Eixample El Pla del Remei 39.46786 -0.3695561       FALSE
#> 7          5   L'Eixample El Pla del Remei 39.47025 -0.3696776       FALSE
#> 8          5   L'Eixample El Pla del Remei 39.46938 -0.3698621       FALSE
#> 9          6   L'Eixample El Pla del Remei 39.46881 -0.3690677       FALSE
#> 10         5   L'Eixample El Pla del Remei 39.47116 -0.3681202       FALSE
#>    distance status newDevelopment hasLift priceByArea hasPlan
#> 1  814.7225   good          FALSE       1        7966    TRUE
#> 2  754.1344   good          FALSE       1        5687    TRUE
#> 3  445.9856   good          FALSE       1        5689   FALSE
#> 4  649.6709   good          FALSE       1        6533    TRUE
#> 5  777.7054   good          FALSE       1        7427   FALSE
#> 6  621.6540   good          FALSE       1        7260   FALSE
#> 7  569.7504   good          FALSE       1        9943    TRUE
#> 8  555.6693   good          FALSE       1       10000   FALSE
#> 9  632.5821   good          FALSE       1       10000   FALSE
#> 10 716.0182   good          FALSE       1       10234   FALSE
#>    distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1                 619.10097           NA               NA                    NA
#> 2                 247.64045           NA               NA                    NA
#> 3                 262.84052           NA               NA                    NA
#> 4                 282.91430           NA               NA                    NA
#> 5                 280.16763           NA               NA                    NA
#> 6                 280.75340           NA               NA                    NA
#> 7                  99.79849           NA               NA                    NA
#> 8                 122.84038           NA               NA                    NA
#> 9                 215.75330           NA               NA                    NA
#> 10                257.30206           NA               NA                    NA
#>    typology subTypology
#> 1      flat   penthouse
#> 2      flat   penthouse
#> 3      flat      duplex
#> 4      flat   penthouse
#> 5      flat        <NA>
#> 6      flat   penthouse
#> 7      flat        <NA>
#> 8      flat        <NA>
#> 9      flat        <NA>
#> 10     flat        <NA>

Boxplot del precio por m2

datos$priceByArea[is.na(datos$priceByArea)] <- datos$price[is.na(datos$priceByArea)] / datos$size[is.na(datos$priceByArea)]
ggplot(datos, aes(x = as.factor(1), y = priceByArea)) +
  geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
  coord_flip() +
  labs(title = "Boxplot del Precio por m2",
       x = "",
       y = "Precio m2") +
  theme_minimal()

caros_m2 <- datos %>% 
  filter(priceByArea > 10000) %>% 
  arrange(desc(size)) %>% 
  slice_head(n = 10)
caros_m2
#>    propertyCode numPhotos floor   price propertyType size exterior rooms
#> 1     105367713        69     5 3500000         flat  342        1     4
#> 2     107049698        63     5 3500000       duplex  342        1     4
#> 3     107537312        60     6 3500000         flat  342        1     4
#> 4     101942142        50     3 2150000         flat  208        1     3
#> 5     107458992        10     2 2000000         flat  185        1     3
#> 6     104639503        76     6 1800000    penthouse  148        1     3
#> 7      93697954        19     2 1900000         flat  140        1     2
#> 8     104652530        22     7 1300000    penthouse  121        1     3
#> 9     106749590        72     7 1100000         flat  109        1     1
#> 10    103340081        37     9 1190000    penthouse  102        1     2
#>    bathrooms       district     neighborhood latitude  longitude showAddress
#> 1          5     L'Eixample El Pla del Remei 39.47116 -0.3681202       FALSE
#> 2          5     L'Eixample El Pla del Remei 39.46971 -0.3700135       FALSE
#> 3          5     L'Eixample El Pla del Remei 39.46928 -0.3693383       FALSE
#> 4          3     L'Eixample El Pla del Remei 39.47053 -0.3651211       FALSE
#> 5          4   Ciutat Vella         La Xerea 39.47150 -0.3763945       FALSE
#> 6          2     L'Eixample          Russafa 39.46321 -0.3676446       FALSE
#> 7          2     L'Eixample El Pla del Remei 39.46878 -0.3682352       FALSE
#> 8          2 Camins al Grau       Penya-Roja 39.46327 -0.3532254        TRUE
#> 9          2 Camins al Grau       Penya-Roja 39.45817 -0.3478155       FALSE
#> 10         2   Ciutat Vella    Sant Francesc 39.47236 -0.3763066       FALSE
#>     distance         status newDevelopment hasLift priceByArea hasPlan
#> 1   716.0182           good          FALSE       1       10234   FALSE
#> 2   540.0503           good          FALSE       1       10234    TRUE
#> 3   601.6053           good          FALSE       1       10234   FALSE
#> 4   962.1120           good          FALSE       1       10337    TRUE
#> 5   178.2078           good          FALSE       1       10811    TRUE
#> 6  1051.3122           good          FALSE       1       12162    TRUE
#> 7   703.4529           good          FALSE       1       13571   FALSE
#> 8  2113.3477 newdevelopment           TRUE       1       10744    TRUE
#> 9  2771.5469           good          FALSE       1       10092    TRUE
#> 10  273.4178           good          FALSE       1       11667    TRUE
#>    distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1                 257.30206           NA               NA                    NA
#> 2                  88.54874           NA               NA                    NA
#> 3                 163.71948           NA               NA                    NA
#> 4                 296.96193           NA               NA                    NA
#> 5                 498.91799           NA               NA                    NA
#> 6                 186.69132           NA               NA                    NA
#> 7                 272.83861           NA               NA                    NA
#> 8                 819.22002           NA               NA                    NA
#> 9                 692.02661           NA               NA                    NA
#> 10                528.00360           NA               NA                    NA
#>    typology subTypology
#> 1      flat        <NA>
#> 2      flat      duplex
#> 3      flat        <NA>
#> 4      flat        <NA>
#> 5      flat        <NA>
#> 6      flat   penthouse
#> 7      flat        <NA>
#> 8      flat   penthouse
#> 9      flat        <NA>
#> 10     flat   penthouse

Conversión de variables binarias a 0/1

binary_vars <- descProp$variable[descProp$tipo == "binary"]
datos <- datos %>%
  mutate(across(all_of(binary_vars), ~ ifelse(. == TRUE, 1, ifelse(. == FALSE, 0, NA))))

PCA (Análisis de Componentes Principales)

datos_pca = dplyr::select(datos, - which(descProp$tipo == "binary"), - which(colnames(datos) %in% c("former_price", "price_drop_value", "price_drop_percentage", "propertyCode", "latitude", "longitude", "subTypology")))
res.pca <- PCA(datos_pca, scale.unit = TRUE, graph = FALSE, ncp = 10,  quali.sup = c(4, 8, 9, 11, 14))
#> Warning in PCA(datos_pca, scale.unit = TRUE, graph = FALSE, ncp = 10, quali.sup
#> = c(4, : Missing values are imputed by the mean of the variable: you should use
#> the imputePCA function of the missMDA package
eig.val <- get_eigenvalue(res.pca)
VPmedio = 100 * (1/nrow(eig.val))
fviz_eig(res.pca, addlabels = TRUE) +
  geom_hline(yintercept=VPmedio, linetype=2, color="red")

K = 3
res.pca <- PCA(datos_pca, scale.unit = TRUE, graph = FALSE, ncp = K,  quali.sup = c(4, 8, 9, 11, 14))
#> Warning in PCA(datos_pca, scale.unit = TRUE, graph = FALSE, ncp = K, quali.sup
#> = c(4, : Missing values are imputed by the mean of the variable: you should use
#> the imputePCA function of the missMDA package
misScores = res.pca$ind$coord[,1:K]
miT2 = colSums(t(misScores**2)/eig.val[1:K,1])
I = nrow(datos_pca)
F95 = K*(I**2 - 1)/(I*(I - K)) * qf(0.95, K, I-K)
F99 = K*(I**2 - 1)/(I*(I - K)) * qf(0.99, K, I-K)

plot(1:length(miT2), miT2, type = "p", xlab = "Pisos", ylab = "T2")
abline(h = F95, col = "orange", lty = 2, lwd = 2)
abline(h = F99, col = "red3", lty = 2, lwd = 2)

anomalas = which(miT2 > F95)
anomalas
#>    7   23   37   57   71  144  145  149  152  154  157  161  168  190  195  207 
#>    7   23   37   57   71  144  145  149  152  154  157  161  168  190  195  207 
#>  213  243  249  252  275  279  286  288  289  321  358  366  375  423  432  441 
#>  213  243  249  252  275  279  286  288  289  321  358  366  375  423  432  441 
#>  449  452  465  509  510  551  571  597  613  653  657  663  698  740  767  771 
#>  449  452  465  509  510  551  571  597  613  653  657  663  698  740  767  771 
#>  806  861  873  890  966  975 1012 1022 1023 1035 1049 1056 1059 1068 1111 1112 
#>  806  861  873  890  966  975 1012 1022 1023 1035 1049 1056 1059 1068 1111 1112 
#> 1152 1153 1159 1160 1164 1205 1225 1228 1254 1284 1288 1307 1310 1333 1363 1372 
#> 1152 1153 1159 1160 1164 1205 1225 1228 1254 1284 1288 1307 1310 1333 1363 1372 
#> 1377 1385 1392 1402 1420 1425 1451 1470 1506 1537 1571 1589 1598 1659 1826 2052 
#> 1377 1385 1392 1402 1420 1425 1451 1470 1506 1537 1571 1589 1598 1659 1826 2052 
#> 2058 2067 2072 2073 2074 2077 2079 2084 2085 2087 2088 2089 2092 2093 2094 2095 
#> 2058 2067 2072 2073 2074 2077 2079 2084 2085 2087 2088 2089 2092 2093 2094 2095 
#> 2096 2098 2099 2100 2101 2103 2107 2111 2122 2123 2125 2130 2131 2136 2144 2145 
#> 2096 2098 2099 2100 2101 2103 2107 2111 2122 2123 2125 2130 2131 2136 2144 2145 
#> 2146 2154 2159 2160 2163 2167 2168 2169 2176 2179 2180 2182 2194 2196 2197 2198 
#> 2146 2154 2159 2160 2163 2167 2168 2169 2176 2179 2180 2182 2194 2196 2197 2198 
#> 2208 2216 2229 2230 2248 2254 2257 2259 2274 2280 2297 2298 2304 2313 2317 2331 
#> 2208 2216 2229 2230 2248 2254 2257 2259 2274 2280 2297 2298 2304 2313 2317 2331 
#> 2342 2345 2354 2358 2363 2368 2370 2381 2386 2389 2397 2412 2416 2431 2432 2434 
#> 2342 2345 2354 2358 2363 2368 2370 2381 2386 2389 2397 2412 2416 2431 2432 2434 
#> 2436 2442 2454 2468 2469 2472 2500 2531 2554 2585 2623 2643 2651 2720 2726 2730 
#> 2436 2442 2454 2468 2469 2472 2500 2531 2554 2585 2623 2643 2651 2720 2726 2730 
#> 2738 2744 2761 2763 2805 2819 2872 2888 2896 2905 2919 2955 3007 3113 
#> 2738 2744 2761 2763 2805 2819 2872 2888 2896 2905 2919 2955 3007 3113
contribT2 = function (X, scores, loadings, eigenval, observ, cutoff = 2) {
  # X is data matrix and must be centered (or centered and scaled if data were scaled)
  misScoresNorm = t(t(scores**2) / eigenval)
  misContrib = NULL
  for (oo in observ) {
    print(rownames(scores)[oo])
    print(scores[oo,])
    misPCs = which(as.numeric(misScoresNorm[oo,]) > cutoff)
    lacontri = sapply(misPCs, function (cc) (scores[oo,cc]/eigenval[cc])*loadings[,cc]*X[oo,])
    lacontri = rowSums((1*(sign(lacontri) == 1))*lacontri)
    misContrib = cbind(misContrib, lacontri)
  }
  colnames(misContrib) = rownames(misScoresNorm[observ,])
  return(misContrib)
}
# Recuperamos los datos utilizados en el modelo PCA, centrados y escalados
pisosCE = datos_pca[, c(1, 2, 3, 5, 6, 7, 10, 12, 13)]
pisosCE = pisosCE[,setdiff(colnames(pisosCE), c("rating", "weight", "cups"))]
pisosCE = scale(pisosCE, center = TRUE, scale = TRUE)
X = as.matrix(pisosCE)
# Calculamos los loadings a partir de las coordenadas de las variables
misLoadings = sweep(res.pca$var$coord, 2, sqrt(res.pca$eig[1:K,1]), FUN="/")
# Calculamos las contribuciones
mycontrisT2 = contribT2(X = X, scores = misScores, loadings = misLoadings, 
                        eigenval = eig.val[1:K,1], observ = which.max(miT2),
                        cutoff = 2)
#> [1] "2761"
#>      Dim.1      Dim.2      Dim.3 
#>   5.908302 -10.994043   3.877539
par(mar = c(10,2.3,3,1))
barplot(mycontrisT2[,1],las=2,
        main= paste0("Observación: ", rownames(datos_pca)[which.max(miT2)]))

Se trata de un lote de 6 pisos, por tanto eliminamos el anuncio.

datos <- datos[-which.max(miT2), ]
datos_pca <- datos_pca[-which.max(miT2), ]
par(mar = c(10,2.3,3,1))
barplot(mycontrisT2[,1],las=2, 
main= paste0("Observación: ", rownames(datos_pca)[which.max(miT2)]))

myE = X - misScores %*% t(misLoadings) 
mySCR = rowSums(myE^2)  
plot(1:length(mySCR), mySCR, type = "l", main = "Distancia al modelo", 
     ylab = "SCR", xlab = "Propiedades", ylim = c(0,11))
g = var(mySCR)/(2*mean(mySCR))
h = (2*mean(mySCR)^2)/var(mySCR)
chi2lim = g*qchisq(0.95, df = h)
chi2lim99 = g*qchisq(0.99, df = h)
abline(h = chi2lim, col = "orange", lty = 2, lwd = 2)
abline(h = chi2lim99, col = "red3", lty = 2, lwd = 2)

anomalosSCR = which(mySCR > chi2lim99)
anomalosSCR
#>  [1]  168  184  213  236  321  432  548  597  613 1049 1056 1225 1228 1263 1571
#> [16] 1826 1977 2058 2067 2088 2163 2229 2313 2368 2447 2495 2687 2761 2826

Menos de un 1% de las propiedades, asumible.

## Función para calcular las contribuciones a la SCR
ContriSCR = function(E, SCR) {
  # E es la matriz de residuos del modelo 
  # SCR es la suma de cuadrados residual
  contribucion = NULL
  for (j in 1:length(SCR)){
    eind<-E[j,]
    signo<-sign(eind)
    contri<-(signo*(eind^2)/SCR[j])*100
    contribucion<-rbind(contribucion,contri)
  }
  rownames(contribucion) = rownames(E)
  return(contribucion)
}
## Calculamos las contribuciones de todas las observaciones
mycontris = ContriSCR(E = myE, SCR = mySCR)
## Gráfico para Special_K
barplot(mycontris[which.max(mySCR),],las=2, cex.names = 0.7,
        main=paste0('Contribuciones a SCR para ', which.max(mySCR)))

Error no tiene baños

datos[2311,]$bathrooms = 3
datos_pca[2311,]$bathrooms = 3
for (i in order(mySCR, decreasing = TRUE)[1:10]) {
  barplot(mycontris[i,], las = 2, cex.names = 0.7,
          main = paste0('Contribuciones a SCR para ', i))
}