# Cargamos las librerías necesarias para el análisis y visualización
library(dplyr)
#>
#> Adjuntando el paquete: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(readxl)
library(leaflet)
library(shiny)
library(jsonlite)
#>
#> Adjuntando el paquete: 'jsonlite'
#> The following object is masked from 'package:shiny':
#>
#> validate
library(stringr)
library(knitr)
library(FactoMineR)
library(factoextra)
#> Cargando paquete requerido: ggplot2
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(mice)
#>
#> Adjuntando el paquete: 'mice'
#> The following object is masked from 'package:stats':
#>
#> filter
#> The following objects are masked from 'package:base':
#>
#> cbind, rbind
library(ggplot2)
#> # A tibble: 6 × 45
#> propertyCode thumbnail externalReference numPhotos floor price priceInfo
#> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
#> 1 106620603 https://img4.… W-02WWIS 65 5 8.64e5 {'price'…
#> 2 107552311 https://img4.… LCGV-1107 38 6 1.35e6 {'price'…
#> 3 107048311 https://img4.… W-02YAIU 49 2 1.05e6 {'price'…
#> 4 106614759 https://img4.… W-02XPUT 69 6 1.35e6 {'price'…
#> 5 107124247 https://img4.… LCGV-1047 31 2 4.50e5 {'price'…
#> 6 107311950 https://img4.… LCGV-1074 42 2 1.05e6 {'price'…
#> # ℹ 38 more variables: propertyType <chr>, operation <chr>, size <dbl>,
#> # exterior <dbl>, rooms <dbl>, bathrooms <dbl>, address <chr>,
#> # province <chr>, municipality <chr>, district <chr>, country <chr>,
#> # neighborhood <chr>, latitude <dbl>, longitude <dbl>, showAddress <lgl>,
#> # url <chr>, distance <dbl>, description <chr>, hasVideo <lgl>, status <chr>,
#> # newDevelopment <lgl>, hasLift <dbl>, priceByArea <dbl>, change <chr>,
#> # detailedType <chr>, suggestedTexts <chr>, hasPlan <lgl>, has3DTour <lgl>, …
# Ejemplo de consulta de URL de una propiedad concreta
datos %>% filter(propertyCode == 106230584) %>% dplyr::select(url)
#> # A tibble: 1 × 1
#> url
#> <chr>
#> 1 https://www.idealista.com/inmueble/106230584/
Antes de analizar los datos, eliminamos duplicados generados por el scraping en dos zonas geográficas.
nrow(datos)
#> [1] 3913
duplicados <- datos %>% group_by(propertyCode) %>% filter(n() > 1) %>% ungroup()
nrow(duplicados)
#> [1] 1487
# Nos quedamos con un duplicado por propertyCode (el primero)
duplicados_buenos = duplicados %>% group_by(propertyCode) %>% slice_head(n = 1) %>% ungroup()
nrow(duplicados_buenos)
#> [1] 722
# Eliminamos duplicados y los reemplazamos por los seleccionados
datos <- anti_join(datos, duplicados_buenos, by = c("propertyCode", "price"))
datos <- rbind(datos, duplicados_buenos)
nrow(datos)
#> [1] 3149
Algunas columnas contienen información en formato JSON. Las limpiamos y extraemos variables relevantes.
# Procesamos la columna priceInfo (precio, moneda, descuentos)
head(datos$priceInfo)
#> [1] "{'price': {'amount': 864000.0, 'currencySuffix': '€', 'priceDropInfo': {'formerPrice': 890000.0, 'priceDropValue': 26000, 'priceDropPercentage': 3}}}"
#> [2] "{'price': {'amount': 1350000.0, 'currencySuffix': '€'}}"
#> [3] "{'price': {'amount': 1050000.0, 'currencySuffix': '€'}}"
#> [4] "{'price': {'amount': 1350000.0, 'currencySuffix': '€'}}"
#> [5] "{'price': {'amount': 450000.0, 'currencySuffix': '€'}}"
#> [6] "{'price': {'amount': 1050000.0, 'currencySuffix': '€'}}"
datos <- datos %>%
mutate(
priceInfo_clean = str_replace_all(priceInfo, "'", "\""),
parsed_json = lapply(priceInfo_clean, function(x) tryCatch(fromJSON(x), error = function(e) NULL))
) %>%
rowwise() %>%
mutate(
amount = parsed_json$price$amount %||% NA,
currency = parsed_json$price$currencySuffix %||% NA,
former_price = parsed_json$price$priceDropInfo$formerPrice %||% NA,
price_drop_value = parsed_json$price$priceDropInfo$priceDropValue %||% NA,
price_drop_percentage = parsed_json$price$priceDropInfo$priceDropPercentage %||% NA
) %>%
dplyr::select(-parsed_json, -priceInfo_clean, -priceInfo)
# Procesamos la columna detailedType (tipología)
head(datos$detailedType)
#> [1] "{'typology': 'flat'}" "{'typology': 'flat'}" "{'typology': 'flat'}"
#> [4] "{'typology': 'flat'}" "{'typology': 'flat'}" "{'typology': 'flat'}"
datos <- datos %>%
mutate(
detailedType_clean = str_replace_all(detailedType, "'", "\""),
parsed_json = lapply(detailedType_clean, function(x) tryCatch(fromJSON(x), error = function(e) NULL))
) %>%
rowwise() %>%
mutate(
typology = parsed_json$typology %||% NA,
subTypology = parsed_json$subTypology %||% NA
) %>%
dplyr::select(-parsed_json, -detailedType_clean, -detailedType)
# Procesamos la variable highlight para convertirla en categórica
head(table(datos$suggestedTexts)) # No se procesa para análisis exploratorio
#>
#> {'subtitle': 'Aiora, València', 'title': 'Ático en calle Maestro Valls, 21'}
#> 1
#> {'subtitle': 'Aiora, València', 'title': 'Casa independiente en calle de la Conserva'}
#> 1
#> {'subtitle': 'Aiora, València', 'title': 'Chalet adosado'}
#> 1
#> {'subtitle': 'Aiora, València', 'title': 'Chalet adosado en calle de Francesc de Bellvís'}
#> 1
#> {'subtitle': 'Aiora, València', 'title': 'Chalet adosado en calle del Marí Albesa'}
#> 1
#> {'subtitle': 'Aiora, València', 'title': 'Estudio'}
#> 1
table(datos$highlight, useNA = "ifany")
#>
#> {'groupDescription': 'Destacado'} {'groupDescription': 'Top'}
#> 696 403
#> {'groupDescription': 'Top+'} FALSE
#> 40 356
#> TRUE <NA>
#> 9 1645
datos <- datos %>%
mutate(highlight = case_when(
highlight == "{'groupDescription': 'Destacado'}" ~ "Destacado",
highlight == "{'groupDescription': 'Top'}" ~ "Top",
highlight == "{'groupDescription': 'Top+'}" ~ "Top+",
highlight == "TRUE" ~ "True",
highlight == "FALSE" ~ "False",
TRUE ~ NA_character_
))
table(datos$highlight, useNA = "ifany")
#>
#> Destacado False Top Top+ True <NA>
#> 696 356 403 40 9 1645
# Extraemos información de parkingSpace (JSON anidado)
extract_parking_info <- function(parking_info) {
if (is.na(parking_info)) {
return(data.frame(hasParkingSpace = NA, isParkingSpaceIncludedInPrice = NA, parkingSpacePrice = NA))
} else {
parking_info <- gsub("'", "\"", parking_info)
parking_info <- gsub("\\bTrue\\b", "true", parking_info)
parking_info <- gsub("\\bFalse\\b", "false", parking_info)
tryCatch({
parking_info <- fromJSON(parking_info)
hasParkingSpace <- parking_info$hasParkingSpace
isParkingSpaceIncludedInPrice <- parking_info$isParkingSpaceIncludedInPrice
parkingSpacePrice <- ifelse(isParkingSpaceIncludedInPrice, NA, parking_info$parkingSpacePrice)
return(data.frame(hasParkingSpace, isParkingSpaceIncludedInPrice, parkingSpacePrice))
}, error = function(e) {
return(data.frame(hasParkingSpace = NA, isParkingSpaceIncludedInPrice = NA, parkingSpacePrice = NA))
})
}
}
parking_info <- do.call(rbind, lapply(datos$parkingSpace, extract_parking_info))
datos <- cbind(datos, parking_info)
datos <- datos %>% dplyr::select(-parkingSpace)
names(datos)
#> [1] "propertyCode" "thumbnail"
#> [3] "externalReference" "numPhotos"
#> [5] "floor" "price"
#> [7] "propertyType" "operation"
#> [9] "size" "exterior"
#> [11] "rooms" "bathrooms"
#> [13] "address" "province"
#> [15] "municipality" "district"
#> [17] "country" "neighborhood"
#> [19] "latitude" "longitude"
#> [21] "showAddress" "url"
#> [23] "distance" "description"
#> [25] "hasVideo" "status"
#> [27] "newDevelopment" "hasLift"
#> [29] "priceByArea" "change"
#> [31] "suggestedTexts" "hasPlan"
#> [33] "has3DTour" "has360"
#> [35] "hasStaging" "highlight"
#> [37] "savedAd" "notes"
#> [39] "topNewDevelopment" "topPlus"
#> [41] "newDevelopmentFinished" "distancia_min_estacion_m"
#> [43] "amount" "currency"
#> [45] "former_price" "price_drop_value"
#> [47] "price_drop_percentage" "typology"
#> [49] "subTypology" "hasParkingSpace"
#> [51] "isParkingSpaceIncludedInPrice" "parkingSpacePrice"
Creamos una tabla con el tipo de cada variable para facilitar análisis posteriores.
descProp = data.frame("variable" = colnames(datos),
"tipo" = c("id", "url", "text", rep("numerical", 3), rep("categorical", 2), "numerical", "binary", rep("numerical", 2), "text", rep("categorical", 5), rep("numerical", 2), "binary", "url", "numerical", "text", "binary", "categorical", rep("binary", 2), rep("numerical", 2), "text", rep("binary", 4), rep("categorical", 3), rep("binary", 2), "binary", rep("numerical", 2), "categorical", rep("numerical", 3), rep("categorical", 2), rep("binary", 2), "numerical"), stringsAsFactors = FALSE)
rownames(descProp) = descProp$variable
descProp
#> variable tipo
#> propertyCode propertyCode id
#> thumbnail thumbnail url
#> externalReference externalReference text
#> numPhotos numPhotos numerical
#> floor floor numerical
#> price price numerical
#> propertyType propertyType categorical
#> operation operation categorical
#> size size numerical
#> exterior exterior binary
#> rooms rooms numerical
#> bathrooms bathrooms numerical
#> address address text
#> province province categorical
#> municipality municipality categorical
#> district district categorical
#> country country categorical
#> neighborhood neighborhood categorical
#> latitude latitude numerical
#> longitude longitude numerical
#> showAddress showAddress binary
#> url url url
#> distance distance numerical
#> description description text
#> hasVideo hasVideo binary
#> status status categorical
#> newDevelopment newDevelopment binary
#> hasLift hasLift binary
#> priceByArea priceByArea numerical
#> change change numerical
#> suggestedTexts suggestedTexts text
#> hasPlan hasPlan binary
#> has3DTour has3DTour binary
#> has360 has360 binary
#> hasStaging hasStaging binary
#> highlight highlight categorical
#> savedAd savedAd categorical
#> notes notes categorical
#> topNewDevelopment topNewDevelopment binary
#> topPlus topPlus binary
#> newDevelopmentFinished newDevelopmentFinished binary
#> distancia_min_estacion_m distancia_min_estacion_m numerical
#> amount amount numerical
#> currency currency categorical
#> former_price former_price numerical
#> price_drop_value price_drop_value numerical
#> price_drop_percentage price_drop_percentage numerical
#> typology typology categorical
#> subTypology subTypology categorical
#> hasParkingSpace hasParkingSpace binary
#> isParkingSpaceIncludedInPrice isParkingSpaceIncludedInPrice binary
#> parkingSpacePrice parkingSpacePrice numerical
Eliminamos columnas de tipo texto y ajustamos la tabla de tipos.
datos = datos[,setdiff(colnames(datos), c("thumbnail", "externalReference", "address", "url", "description", "suggestedTexts"))]
descProp = descProp[colnames(datos),]
table(descProp$tipo)
#>
#> binary categorical id numerical
#> 14 14 1 17
Buscamos variables constantes o casi constantes para eliminarlas.
summary(datos[,descProp$variable[descProp$tipo == "numerical"]])
#> numPhotos floor price size
#> Min. : 0.00 Length:3149 Min. : 26800 Min. : 12.0
#> 1st Qu.: 15.00 Class :character 1st Qu.: 217000 1st Qu.: 76.0
#> Median : 24.00 Mode :character Median : 330000 Median : 104.0
#> Mean : 26.47 Mean : 447143 Mean : 126.4
#> 3rd Qu.: 36.00 3rd Qu.: 550000 3rd Qu.: 149.0
#> Max. :125.00 Max. :4700000 Max. :3446.0
#>
#> rooms bathrooms latitude longitude
#> Min. : 0.000 Min. : 0.000 Min. :39.44 Min. :-0.4172
#> 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.:39.46 1st Qu.:-0.3852
#> Median : 3.000 Median : 2.000 Median :39.47 Median :-0.3740
#> Mean : 2.945 Mean : 1.785 Mean :39.47 Mean :-0.3696
#> 3rd Qu.: 4.000 3rd Qu.: 2.000 3rd Qu.:39.48 3rd Qu.:-0.3544
#> Max. :16.000 Max. :10.000 Max. :39.50 Max. :-0.3236
#>
#> distance priceByArea change distancia_min_estacion_m
#> Min. : 1 Min. : 55 Length:3149 Min. : 6.492
#> 1st Qu.:1077 1st Qu.: 2532 Class :character 1st Qu.: 239.489
#> Median :1923 Median : 3292 Mode :character Median : 384.427
#> Mean :1912 Mean : 3527 Mean : 442.141
#> 3rd Qu.:2672 3rd Qu.: 4250 3rd Qu.: 568.523
#> Max. :3996 Max. :26533 Max. :2132.112
#> NA's :365
#> amount former_price price_drop_value price_drop_percentage
#> Min. : 26800 Min. : 43000 Min. : 1000 Min. : 1.000
#> 1st Qu.: 217000 1st Qu.: 220000 1st Qu.: 9000 1st Qu.: 3.000
#> Median : 330000 Median : 349000 Median : 15000 Median : 4.000
#> Mean : 447143 Mean : 447221 Mean : 24076 Mean : 5.124
#> 3rd Qu.: 550000 3rd Qu.: 575000 3rd Qu.: 25000 3rd Qu.: 6.000
#> Max. :4700000 Max. :2150000 Max. :350000 Max. :30.000
#> NA's :2874 NA's :2874 NA's :2874
#> parkingSpacePrice
#> Min. : 0
#> 1st Qu.: 24750
#> Median : 35000
#> Mean : 35100
#> 3rd Qu.: 50000
#> Max. :100000
#> NA's :3001
Recalculamos la distancia al centro de Valencia usando la fórmula de Haversine.
datos %>%
group_by(district) %>%
summarise(mean_distance = mean(distance, na.rm = TRUE)) %>%
arrange(desc(mean_distance))
#> # A tibble: 18 × 2
#> district mean_distance
#> <chr> <dbl>
#> 1 Benicalap 2906.
#> 2 Rascanya 2753.
#> 3 La Patacona 2747.
#> 4 Algirós 2680.
#> 5 Alboraya Centro 2618
#> 6 Camins al Grau 2581.
#> 7 Campanar 2546.
#> 8 Poblats Marítims 2353.
#> 9 L'Olivereta 2348.
#> 10 Patraix 2347.
#> 11 Benimaclet 2292.
#> 12 Quatre Carreres 2203.
#> 13 Jesús 2130.
#> 14 La Saïdia 1903.
#> 15 El Pla del Real 1625.
#> 16 Extramurs 927.
#> 17 L'Eixample 902.
#> 18 Ciutat Vella 565.
haversine <- function(lat1, lon1, lat2, lon2) {
R <- 6371000 # Radio de la Tierra en metros
phi1 <- lat1 * pi / 180
phi2 <- lat2 * pi / 180
dphi <- (lat2 - lat1) * pi / 180
dlambda <- (lon2 - lon1) * pi / 180
a <- sin(dphi / 2)^2 + cos(phi1) * cos(phi2) * sin(dlambda / 2)^2
return(2 * R * asin(sqrt(a)))
}
calcular_distancia_centro <- function(propiedades, centro_lat, centro_lon) {
distancias <- sapply(1:nrow(propiedades), function(i) {
haversine(
lat1 = propiedades$latitude[i],
lon1 = propiedades$longitude[i],
lat2 = centro_lat,
lon2 = centro_lon
)
})
return(distancias)
}
centro_lat = 39.4699
centro_lon = -0.3763
distancia_buena = calcular_distancia_centro(datos, centro_lat, centro_lon)
datos$distance = distancia_buena
datos %>%
group_by(district) %>%
summarise(mean_distance = mean(distance, na.rm = TRUE)) %>%
arrange(desc(mean_distance))
#> # A tibble: 18 × 2
#> district mean_distance
#> <chr> <dbl>
#> 1 La Patacona 4752.
#> 2 Alboraya Centro 4155.
#> 3 Poblats Marítims 4067.
#> 4 Benicalap 2907.
#> 5 Rascanya 2754.
#> 6 Algirós 2654.
#> 7 Camins al Grau 2617.
#> 8 Campanar 2547.
#> 9 L'Olivereta 2349.
#> 10 Patraix 2347.
#> 11 Benimaclet 2293.
#> 12 Quatre Carreres 2292.
#> 13 Jesús 2131.
#> 14 La Saïdia 1904.
#> 15 El Pla del Real 1578.
#> 16 Extramurs 928.
#> 17 L'Eixample 903.
#> 18 Ciutat Vella 566.
Normalizamos valores de la variable floor y realizamos imputación de valores faltantes.
table(datos$floor)
#>
#> -2 1 10 11 12 13 14 15 16 17 18 19 2 22 27 3 30 4 5 6
#> 1 483 25 21 18 11 18 6 2 1 2 5 397 1 1 339 1 322 234 165
#> 7 8 9 bj en ss st
#> 167 98 33 457 21 1 1
datos <- datos %>% filter(is.na(floor) | floor != "st")
datos[!is.na(datos$floor) & datos$floor == "ss", "floor"] <- "-0.5"
datos[!is.na(datos$floor) & datos$floor == "bj", "floor"] <- "0"
datos[!is.na(datos$floor) & datos$floor == "en", "floor"] <- "0.5"
datos[!is.na(datos$floor) & datos$floor == "-2", "floor"] <- "2"
datos$floor <- as.numeric(datos$floor)
floor_mice = datos[, c("floor", "price")]
imputed_data <- suppressMessages(
suppressWarnings(
suppressPackageStartupMessages(
withCallingHandlers(
mice(floor_mice, m = 5, method = "pmm", maxit = 50, seed = 33),
message = function(m) invokeRestart("muffleMessage")
)
)
)
)
#>
#> iter imp variable
#> 1 1 floor
#> 1 2 floor
#> 1 3 floor
#> 1 4 floor
#> 1 5 floor
#> 2 1 floor
#> 2 2 floor
#> 2 3 floor
#> 2 4 floor
#> 2 5 floor
#> 3 1 floor
#> 3 2 floor
#> 3 3 floor
#> 3 4 floor
#> 3 5 floor
#> 4 1 floor
#> 4 2 floor
#> 4 3 floor
#> 4 4 floor
#> 4 5 floor
#> 5 1 floor
#> 5 2 floor
#> 5 3 floor
#> 5 4 floor
#> 5 5 floor
#> 6 1 floor
#> 6 2 floor
#> 6 3 floor
#> 6 4 floor
#> 6 5 floor
#> 7 1 floor
#> 7 2 floor
#> 7 3 floor
#> 7 4 floor
#> 7 5 floor
#> 8 1 floor
#> 8 2 floor
#> 8 3 floor
#> 8 4 floor
#> 8 5 floor
#> 9 1 floor
#> 9 2 floor
#> 9 3 floor
#> 9 4 floor
#> 9 5 floor
#> 10 1 floor
#> 10 2 floor
#> 10 3 floor
#> 10 4 floor
#> 10 5 floor
#> 11 1 floor
#> 11 2 floor
#> 11 3 floor
#> 11 4 floor
#> 11 5 floor
#> 12 1 floor
#> 12 2 floor
#> 12 3 floor
#> 12 4 floor
#> 12 5 floor
#> 13 1 floor
#> 13 2 floor
#> 13 3 floor
#> 13 4 floor
#> 13 5 floor
#> 14 1 floor
#> 14 2 floor
#> 14 3 floor
#> 14 4 floor
#> 14 5 floor
#> 15 1 floor
#> 15 2 floor
#> 15 3 floor
#> 15 4 floor
#> 15 5 floor
#> 16 1 floor
#> 16 2 floor
#> 16 3 floor
#> 16 4 floor
#> 16 5 floor
#> 17 1 floor
#> 17 2 floor
#> 17 3 floor
#> 17 4 floor
#> 17 5 floor
#> 18 1 floor
#> 18 2 floor
#> 18 3 floor
#> 18 4 floor
#> 18 5 floor
#> 19 1 floor
#> 19 2 floor
#> 19 3 floor
#> 19 4 floor
#> 19 5 floor
#> 20 1 floor
#> 20 2 floor
#> 20 3 floor
#> 20 4 floor
#> 20 5 floor
#> 21 1 floor
#> 21 2 floor
#> 21 3 floor
#> 21 4 floor
#> 21 5 floor
#> 22 1 floor
#> 22 2 floor
#> 22 3 floor
#> 22 4 floor
#> 22 5 floor
#> 23 1 floor
#> 23 2 floor
#> 23 3 floor
#> 23 4 floor
#> 23 5 floor
#> 24 1 floor
#> 24 2 floor
#> 24 3 floor
#> 24 4 floor
#> 24 5 floor
#> 25 1 floor
#> 25 2 floor
#> 25 3 floor
#> 25 4 floor
#> 25 5 floor
#> 26 1 floor
#> 26 2 floor
#> 26 3 floor
#> 26 4 floor
#> 26 5 floor
#> 27 1 floor
#> 27 2 floor
#> 27 3 floor
#> 27 4 floor
#> 27 5 floor
#> 28 1 floor
#> 28 2 floor
#> 28 3 floor
#> 28 4 floor
#> 28 5 floor
#> 29 1 floor
#> 29 2 floor
#> 29 3 floor
#> 29 4 floor
#> 29 5 floor
#> 30 1 floor
#> 30 2 floor
#> 30 3 floor
#> 30 4 floor
#> 30 5 floor
#> 31 1 floor
#> 31 2 floor
#> 31 3 floor
#> 31 4 floor
#> 31 5 floor
#> 32 1 floor
#> 32 2 floor
#> 32 3 floor
#> 32 4 floor
#> 32 5 floor
#> 33 1 floor
#> 33 2 floor
#> 33 3 floor
#> 33 4 floor
#> 33 5 floor
#> 34 1 floor
#> 34 2 floor
#> 34 3 floor
#> 34 4 floor
#> 34 5 floor
#> 35 1 floor
#> 35 2 floor
#> 35 3 floor
#> 35 4 floor
#> 35 5 floor
#> 36 1 floor
#> 36 2 floor
#> 36 3 floor
#> 36 4 floor
#> 36 5 floor
#> 37 1 floor
#> 37 2 floor
#> 37 3 floor
#> 37 4 floor
#> 37 5 floor
#> 38 1 floor
#> 38 2 floor
#> 38 3 floor
#> 38 4 floor
#> 38 5 floor
#> 39 1 floor
#> 39 2 floor
#> 39 3 floor
#> 39 4 floor
#> 39 5 floor
#> 40 1 floor
#> 40 2 floor
#> 40 3 floor
#> 40 4 floor
#> 40 5 floor
#> 41 1 floor
#> 41 2 floor
#> 41 3 floor
#> 41 4 floor
#> 41 5 floor
#> 42 1 floor
#> 42 2 floor
#> 42 3 floor
#> 42 4 floor
#> 42 5 floor
#> 43 1 floor
#> 43 2 floor
#> 43 3 floor
#> 43 4 floor
#> 43 5 floor
#> 44 1 floor
#> 44 2 floor
#> 44 3 floor
#> 44 4 floor
#> 44 5 floor
#> 45 1 floor
#> 45 2 floor
#> 45 3 floor
#> 45 4 floor
#> 45 5 floor
#> 46 1 floor
#> 46 2 floor
#> 46 3 floor
#> 46 4 floor
#> 46 5 floor
#> 47 1 floor
#> 47 2 floor
#> 47 3 floor
#> 47 4 floor
#> 47 5 floor
#> 48 1 floor
#> 48 2 floor
#> 48 3 floor
#> 48 4 floor
#> 48 5 floor
#> 49 1 floor
#> 49 2 floor
#> 49 3 floor
#> 49 4 floor
#> 49 5 floor
#> 50 1 floor
#> 50 2 floor
#> 50 3 floor
#> 50 4 floor
#> 50 5 floor
datos$floor <- complete(imputed_data)$floor
summary(datos$floor)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> -0.500 1.000 3.000 3.375 5.000 30.000
Convertimos la variable change a numérica y tratamos valores vacíos.
datos[datos$change == "{}", "change"] <- NA
datos$change <- as.numeric(datos$change)
summary(datos$change)
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 455 2329 3000 3233 3832 10385 2783
Calculamos el coeficiente de variación para identificar variables poco informativas o muy dispersas.
mySD = apply(datos[,descProp$variable[descProp$tipo == "numerical"]], 2, sd, na.rm = TRUE)
myMU = colMeans(datos[,descProp$variable[descProp$tipo == "numerical"]], na.rm = TRUE)
myCV = mySD/myMU
sort(myCV)
#> longitude latitude priceByArea
#> -0.0616735003 0.0003282662 0.4514389140
#> parkingSpacePrice rooms change
#> 0.4683234831 0.4687391919 0.4795358918
#> bathrooms distance numPhotos
#> 0.5220271471 0.5572306039 0.6323648760
#> distancia_min_estacion_m former_price price_drop_percentage
#> 0.6608994517 0.7509810570 0.7679294571
#> size price amount
#> 0.8064627304 0.8459890212 0.8459890212
#> floor price_drop_value
#> 0.9311653938 1.4365141663
Analizamos la distribución de las variables binarias.
apply(datos[,descProp$variable[descProp$tipo == "binary"]], 2, table, useNA = "i")
#> $exterior
#>
#> 0 1 <NA>
#> 173 2663 312
#>
#> $showAddress
#>
#> 0 1
#> 2691 457
#>
#> $hasVideo
#>
#> 0 1
#> 2107 1041
#>
#> $newDevelopment
#>
#> 0 1
#> 3024 124
#>
#> $hasLift
#>
#> 0 1 <NA>
#> 782 2185 181
#>
#> $hasPlan
#>
#> 0 1 <NA>
#> 1552 1231 365
#>
#> $has3DTour
#>
#> 0 1
#> 2856 292
#>
#> $has360
#>
#> 0 1
#> 2742 406
#>
#> $hasStaging
#>
#> 0 1
#> 3016 132
#>
#> $topNewDevelopment
#>
#> 0 <NA>
#> 2783 365
#>
#> $topPlus
#>
#> 0 1
#> 3108 40
#>
#> $newDevelopmentFinished
#>
#> 0 1 <NA>
#> 122 2 3024
#>
#> $hasParkingSpace
#>
#> 1 <NA>
#> 780 2368
#>
#> $isParkingSpaceIncludedInPrice
#>
#> 0 1 <NA>
#> 148 632 2368
apply(datos[,descProp$variable[descProp$tipo == "binary"]], 2,
function (x) round(100*table(x)/sum(table(x)), 2))
#> $exterior
#> x
#> 0 1
#> 6.1 93.9
#>
#> $showAddress
#> x
#> 0 1
#> 85.48 14.52
#>
#> $hasVideo
#> x
#> 0 1
#> 66.93 33.07
#>
#> $newDevelopment
#> x
#> 0 1
#> 96.06 3.94
#>
#> $hasLift
#> x
#> 0 1
#> 26.36 73.64
#>
#> $hasPlan
#> x
#> 0 1
#> 55.77 44.23
#>
#> $has3DTour
#> x
#> 0 1
#> 90.72 9.28
#>
#> $has360
#> x
#> 0 1
#> 87.1 12.9
#>
#> $hasStaging
#> x
#> 0 1
#> 95.81 4.19
#>
#> $topNewDevelopment
#> x
#> 0
#> 100
#>
#> $topPlus
#> x
#> 0 1
#> 98.73 1.27
#>
#> $newDevelopmentFinished
#> x
#> 0 1
#> 98.39 1.61
#>
#> $hasParkingSpace
#> x
#> 1
#> 100
#>
#> $isParkingSpaceIncludedInPrice
#> x
#> 0 1
#> 18.97 81.03
Analizamos la distribución de las variables categóricas.
apply(datos[,descProp$variable[descProp$tipo == "categorical"]], 2, table, useNA = "i")
#> $propertyType
#>
#> chalet countryHouse duplex flat penthouse studio
#> 135 6 126 2620 184 77
#>
#> $operation
#>
#> sale
#> 3148
#>
#> $province
#>
#> València
#> 3148
#>
#> $municipality
#>
#> Alboraya València
#> 67 3081
#>
#> $district
#>
#> Alboraya Centro Algirós Benicalap Benimaclet
#> 2 79 110 35
#> Camins al Grau Campanar Ciutat Vella El Pla del Real
#> 246 150 356 83
#> Extramurs Jesús L'Eixample L'Olivereta
#> 226 161 344 156
#> La Patacona La Saïdia Patraix Poblats Marítims
#> 65 120 156 399
#> Quatre Carreres Rascanya
#> 287 173
#>
#> $country
#>
#> es
#> 3148
#>
#> $neighborhood
#>
#> Aiora Albors
#> 88 16
#> Arrancapins Barrio de Favara
#> 92 12
#> Benicalap Beniferri
#> 76 5
#> Benimaclet Beteró
#> 31 27
#> Camí de Vera Camí Fondo
#> 4 16
#> Camí Reial Campanar
#> 9 46
#> Ciutat de les Arts i de les Ciencies Ciutat Fallera
#> 53 20
#> Ciutat Jardí Ciutat Universitària
#> 24 11
#> El Botànic El Cabanyal-El Canyamelar
#> 45 139
#> El Calvari El Carme
#> 16 61
#> El Grau El Mercat
#> 56 55
#> El Pilar El Pla del Remei
#> 39 102
#> Els Orriols En Corts
#> 44 57
#> Exposició Fonteta de Sant Lluìs
#> 30 28
#> Gran Vía Jaume Roig
#> 78 17
#> L'Amistat L'Hort de Senabre
#> 19 44
#> L'Illa Perduda La Carrasca
#> 15 17
#> La Creu Coberta La Creu del Grau
#> 11 36
#> La Fontsanta La Llum
#> 22 15
#> La Petxina La Punta
#> 44 25
#> La Raiosa La Roqueta
#> 75 45
#> La Seu La Vega Baixa
#> 36 4
#> La Xerea Les Tendetes
#> 35 12
#> Malilla Marxalenes
#> 54 22
#> Mestalla Mont-Olivet
#> 25 54
#> Morvedre Na Rovella
#> 35 16
#> Natzaret Nou Benicalap
#> 46 14
#> Nou Campanar Nou Moles
#> 18 65
#> Patraix Penya-Roja
#> 58 90
#> Playa de la Malvarrosa Russafa
#> 131 164
#> Safranar Sant Antoni
#> 23 20
#> Sant Francesc Sant Isidre
#> 130 11
#> Sant Llorenç Sant Marcellí
#> 13 22
#> Sant Pau Soternes
#> 53 19
#> Tormos Torrefiel
#> 32 116
#> Tres Forques Trinitat
#> 35 11
#> Vara de Quart <NA>
#> 52 67
#>
#> $status
#>
#> good newdevelopment renew <NA>
#> 2640 95 384 29
#>
#> $highlight
#>
#> Destacado False Top Top+ True <NA>
#> 696 356 403 40 9 1644
#>
#> $savedAd
#>
#> {'groupDescription': 'Destacado'} {'groupDescription': 'Top'}
#> 89 40
#> {} <NA>
#> 2783 236
#>
#> $notes
#>
#> [] {}
#> 2783 365
#>
#> $currency
#>
#> €
#> 3148
#>
#> $typology
#>
#> chalet countryHouse flat <NA>
#> 82 6 2695 365
#>
#> $subTypology
#>
#> casaDePueblo countryHouse duplex independantHouse
#> 4 1 110 36
#> masia penthouse semidetachedHouse studio
#> 1 174 8 75
#> terracedHouse <NA>
#> 35 2704
apply(datos[,descProp$variable[descProp$tipo == "categorical"]], 2,
function (x) round(100*table(x)/sum(table(x)), 2))
#> $propertyType
#> x
#> chalet countryHouse duplex flat penthouse studio
#> 4.29 0.19 4.00 83.23 5.84 2.45
#>
#> $operation
#> x
#> sale
#> 100
#>
#> $province
#> x
#> València
#> 100
#>
#> $municipality
#> x
#> Alboraya València
#> 2.13 97.87
#>
#> $district
#> x
#> Alboraya Centro Algirós Benicalap Benimaclet
#> 0.06 2.51 3.49 1.11
#> Camins al Grau Campanar Ciutat Vella El Pla del Real
#> 7.81 4.76 11.31 2.64
#> Extramurs Jesús L'Eixample L'Olivereta
#> 7.18 5.11 10.93 4.96
#> La Patacona La Saïdia Patraix Poblats Marítims
#> 2.06 3.81 4.96 12.67
#> Quatre Carreres Rascanya
#> 9.12 5.50
#>
#> $country
#> x
#> es
#> 100
#>
#> $neighborhood
#> x
#> Aiora Albors
#> 2.86 0.52
#> Arrancapins Barrio de Favara
#> 2.99 0.39
#> Benicalap Beniferri
#> 2.47 0.16
#> Benimaclet Beteró
#> 1.01 0.88
#> Camí de Vera Camí Fondo
#> 0.13 0.52
#> Camí Reial Campanar
#> 0.29 1.49
#> Ciutat de les Arts i de les Ciencies Ciutat Fallera
#> 1.72 0.65
#> Ciutat Jardí Ciutat Universitària
#> 0.78 0.36
#> El Botànic El Cabanyal-El Canyamelar
#> 1.46 4.51
#> El Calvari El Carme
#> 0.52 1.98
#> El Grau El Mercat
#> 1.82 1.79
#> El Pilar El Pla del Remei
#> 1.27 3.31
#> Els Orriols En Corts
#> 1.43 1.85
#> Exposició Fonteta de Sant Lluìs
#> 0.97 0.91
#> Gran Vía Jaume Roig
#> 2.53 0.55
#> L'Amistat L'Hort de Senabre
#> 0.62 1.43
#> L'Illa Perduda La Carrasca
#> 0.49 0.55
#> La Creu Coberta La Creu del Grau
#> 0.36 1.17
#> La Fontsanta La Llum
#> 0.71 0.49
#> La Petxina La Punta
#> 1.43 0.81
#> La Raiosa La Roqueta
#> 2.43 1.46
#> La Seu La Vega Baixa
#> 1.17 0.13
#> La Xerea Les Tendetes
#> 1.14 0.39
#> Malilla Marxalenes
#> 1.75 0.71
#> Mestalla Mont-Olivet
#> 0.81 1.75
#> Morvedre Na Rovella
#> 1.14 0.52
#> Natzaret Nou Benicalap
#> 1.49 0.45
#> Nou Campanar Nou Moles
#> 0.58 2.11
#> Patraix Penya-Roja
#> 1.88 2.92
#> Playa de la Malvarrosa Russafa
#> 4.25 5.32
#> Safranar Sant Antoni
#> 0.75 0.65
#> Sant Francesc Sant Isidre
#> 4.22 0.36
#> Sant Llorenç Sant Marcellí
#> 0.42 0.71
#> Sant Pau Soternes
#> 1.72 0.62
#> Tormos Torrefiel
#> 1.04 3.77
#> Tres Forques Trinitat
#> 1.14 0.36
#> Vara de Quart
#> 1.69
#>
#> $status
#> x
#> good newdevelopment renew
#> 84.64 3.05 12.31
#>
#> $highlight
#> x
#> Destacado False Top Top+ True
#> 46.28 23.67 26.80 2.66 0.60
#>
#> $savedAd
#> x
#> {'groupDescription': 'Destacado'} {'groupDescription': 'Top'}
#> 3.06 1.37
#> {}
#> 95.57
#>
#> $notes
#> x
#> [] {}
#> 88.41 11.59
#>
#> $currency
#> x
#> €
#> 100
#>
#> $typology
#> x
#> chalet countryHouse flat
#> 2.95 0.22 96.84
#>
#> $subTypology
#> x
#> casaDePueblo countryHouse duplex independantHouse
#> 0.90 0.23 24.77 8.11
#> masia penthouse semidetachedHouse studio
#> 0.23 39.19 1.80 16.89
#> terracedHouse
#> 7.88
Eliminamos variables que no aportan información relevante al análisis.
datos = datos[,setdiff(colnames(datos), c("operation", "province", "municipality", "country", "hasVideo", "highlight", "savedAd", "notes", "change", "has3DTour", "has360", "hasStaging", "topNewDevelopment", "topPlus", "newDevelopmentFinished", "amount", "currency", "hasParkingSpace", "isParkingSpaceIncludedInPrice", "parkingSpacePrice"))]
descProp = descProp[colnames(datos),]
table(datos$exterior)
#>
#> 0 1
#> 173 2663
table(datos$showAddress)
#>
#> FALSE TRUE
#> 2691 457
table(datos$newDevelopment)
#>
#> FALSE TRUE
#> 3024 124
table(datos$hasLift)
#>
#> 0 1
#> 782 2185
table(datos$hasPlan)
#>
#> FALSE TRUE
#> 1552 1231
nrow(datos)
#> [1] 3148
Se identifican y tratan valores inconsistentes y anómalos en las variables numéricas y categóricas.
ggplot(datos, aes(x = as.factor(1), y = rooms)) +
geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
coord_flip() +
labs(title = "Boxplot del Número de Habitaciones",
x = "",
y = "Número de Habitaciones") +
theme_minimal()
muchas_habitaciones <- datos %>%
filter(rooms >= 7) %>%
arrange(desc(rooms)) %>%
slice_head(n = 6)
muchas_habitaciones
#> propertyCode numPhotos floor price propertyType size exterior rooms
#> 1 107160987 17 1 985000 flat 375 1 16
#> 2 106981181 29 1 985000 flat 380 1 16
#> 3 107167848 49 9 1250000 flat 303 1 15
#> 4 105151055 37 2 2100000 chalet 1000 NA 14
#> 5 102114500 36 6 1200000 chalet 734 NA 10
#> 6 100885871 26 1 850000 flat 321 1 10
#> bathrooms district neighborhood latitude longitude showAddress
#> 1 9 La Saïdia Morvedre 39.48414 -0.3752460 FALSE
#> 2 9 La Saïdia Morvedre 39.48003 -0.3742156 FALSE
#> 3 5 Poblats Marítims Beteró 39.47341 -0.3341399 FALSE
#> 4 8 Ciutat Vella La Xerea 39.47514 -0.3692382 FALSE
#> 5 0 Extramurs Arrancapins 39.46271 -0.3817061 FALSE
#> 6 2 El Pla del Real Exposició 39.47702 -0.3620727 FALSE
#> distance status newDevelopment hasLift priceByArea hasPlan
#> 1 1585.5427 good FALSE 1 2627 FALSE
#> 2 1140.0629 good FALSE 1 2592 TRUE
#> 3 3639.8176 good FALSE 0 4125 TRUE
#> 4 840.7786 good FALSE NA 2100 TRUE
#> 5 924.6782 renew FALSE NA 1635 TRUE
#> 6 1455.6163 good FALSE 1 2648 TRUE
#> distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1 261.6994 NA NA NA
#> 2 264.9308 NA NA NA
#> 3 367.2378 NA NA NA
#> 4 357.0123 NA NA NA
#> 5 230.8475 NA NA NA
#> 6 121.1812 NA NA NA
#> typology subTypology
#> 1 flat <NA>
#> 2 flat <NA>
#> 3 flat <NA>
#> 4 chalet independantHouse
#> 5 chalet terracedHouse
#> 6 flat <NA>
datos <- datos %>% filter(rooms < 15)
ggplot(datos, aes(x = as.factor(1), y = bathrooms)) +
geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
coord_flip() +
labs(title = "Boxplot del Número de Baños",
x = "",
y = "Número de Baños") +
theme_minimal()
#> Notch went outside hinges
#> ℹ Do you want `notch = FALSE`?
muchos_bathrooms <- datos %>%
filter(bathrooms > 5) %>%
arrange(desc(bathrooms)) %>%
slice_head(n = 6)
muchos_bathrooms
#> propertyCode numPhotos floor price propertyType size exterior rooms
#> 1 107233131 29 3 980000 flat 265 1 9
#> 2 107264893 22 0 700000 flat 195 1 5
#> 3 106983291 34 1 985000 flat 375 1 9
#> 4 103259850 122 5 4700000 penthouse 590 1 6
#> 5 106880590 34 0 950000 flat 360 1 8
#> 6 105151055 37 2 2100000 chalet 1000 NA 14
#> bathrooms district neighborhood latitude longitude showAddress
#> 1 10 Ciutat Vella Sant Francesc 39.47012 -0.3738187 TRUE
#> 2 10 Jesús La Raiosa 39.45770 -0.3877716 FALSE
#> 3 9 La Saïdia Morvedre 39.48052 -0.3752743 FALSE
#> 4 8 Ciutat Vella La Xerea 39.47662 -0.3725033 FALSE
#> 5 8 Benicalap Benicalap 39.49108 -0.3928557 FALSE
#> 6 8 Ciutat Vella La Xerea 39.47514 -0.3692382 FALSE
#> distance status newDevelopment hasLift priceByArea hasPlan
#> 1 214.3743 renew FALSE 1 3698 TRUE
#> 2 1675.8956 good FALSE 0 3590 TRUE
#> 3 1184.0674 good FALSE 1 2627 TRUE
#> 4 814.7225 good FALSE 1 7966 TRUE
#> 5 2750.6246 good FALSE 1 2639 TRUE
#> 6 840.7786 good FALSE NA 2100 TRUE
#> distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1 255.9972 NA NA NA
#> 2 258.3893 NA NA NA
#> 3 274.8149 NA NA NA
#> 4 619.1010 NA NA NA
#> 5 162.0399 NA NA NA
#> 6 357.0123 NA NA NA
#> typology subTypology
#> 1 flat <NA>
#> 2 flat <NA>
#> 3 flat <NA>
#> 4 flat penthouse
#> 5 flat <NA>
#> 6 chalet independantHouse
datos <- datos %>% filter(bathrooms < 9)
ggplot(datos, aes(x = as.factor(1), y = size)) +
geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
coord_flip() +
labs(title = "Boxplot de size",
x = "",
y = "Size") +
theme_minimal()
grandes <- datos %>%
filter(size > 500) %>%
arrange(desc(size)) %>%
slice_head(n = 5)
grandes_property_code_1 <- grandes$propertyCode[1]
grandes_property_code_2 <- grandes$propertyCode[2]
datos <- datos %>% filter(propertyCode != grandes_property_code_1)
datos <- datos %>% filter(propertyCode != grandes_property_code_2)
ggplot(datos, aes(x = as.factor(1), y = distancia_min_estacion_m)) +
geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
coord_flip() +
labs(title = "Boxplot de la Distancia al Metro",
x = "",
y = "Distancia al Metro") +
theme_minimal()
lejos <- datos %>%
filter(distancia_min_estacion_m > 1000) %>%
arrange(desc(size)) %>%
slice_head(n = 5)
lejos
#> propertyCode numPhotos floor price propertyType size exterior rooms
#> 1 105867486 61 0.5 2000000 flat 372 1 3
#> 2 106628414 38 8.0 1100000 penthouse 353 1 3
#> 3 107600722 39 5.0 390000 penthouse 300 1 2
#> 4 103209543 53 0.0 2000000 flat 300 1 3
#> 5 106965933 47 5.0 1100000 chalet 267 NA 3
#> bathrooms district neighborhood latitude longitude showAddress distance
#> 1 3 La Patacona <NA> 39.49192 -0.3247711 FALSE 5054.777
#> 2 2 La Patacona <NA> 39.48796 -0.3305762 FALSE 4408.508
#> 3 3 Patraix Vara de Quart 39.46229 -0.4141896 TRUE 3360.682
#> 4 3 La Patacona <NA> 39.49114 -0.3240358 FALSE 5069.203
#> 5 4 La Patacona <NA> 39.48843 -0.3276807 FALSE 4653.836
#> status newDevelopment hasLift priceByArea hasPlan distancia_min_estacion_m
#> 1 good FALSE 1 NA NA 1881.744
#> 2 good FALSE 1 NA NA 1292.388
#> 3 good FALSE 1 1300 TRUE 1131.892
#> 4 good FALSE 1 NA NA 1822.884
#> 5 good FALSE NA NA NA 1424.760
#> former_price price_drop_value price_drop_percentage typology subTypology
#> 1 NA NA NA <NA> <NA>
#> 2 NA NA NA <NA> <NA>
#> 3 NA NA NA flat penthouse
#> 4 NA NA NA <NA> <NA>
#> 5 NA NA NA <NA> <NA>
ggplot(datos, aes(x = as.factor(1), y = numPhotos)) +
geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
coord_flip() +
labs(title = "Boxplot del Número de Fotos",
x = "",
y = "Número de Fotos") +
theme_minimal()
fotos <- datos %>%
filter(numPhotos > 100) %>%
arrange(desc(size)) %>%
slice_head(n = 10)
fotos
#> propertyCode numPhotos floor price propertyType size exterior rooms
#> 1 103259850 122 5 4700000 penthouse 590 1 6
#> 2 106644557 113 7 490000 flat 177 1 4
#> 3 107137950 110 3 999000 flat 165 1 3
#> 4 106256113 112 8 630000 flat 135 1 4
#> 5 104983712 125 3 750000 flat 130 1 3
#> 6 107431261 109 1 480000 flat 127 1 2
#> bathrooms district neighborhood latitude longitude showAddress
#> 1 8 Ciutat Vella La Xerea 39.47662 -0.3725033 FALSE
#> 2 3 Campanar Sant Pau 39.48894 -0.4013619 TRUE
#> 3 3 Campanar Sant Pau 39.49566 -0.4021869 FALSE
#> 4 2 Camins al Grau Penya-Roja 39.46005 -0.3523510 TRUE
#> 5 2 L'Eixample Russafa 39.45919 -0.3727697 FALSE
#> 6 2 Ciutat Vella La Seu 39.47691 -0.3802234 FALSE
#> distance status newDevelopment hasLift priceByArea hasPlan
#> 1 814.7225 good FALSE 1 7966 TRUE
#> 2 3017.8600 good FALSE 1 2768 TRUE
#> 3 3625.1553 good FALSE 1 6055 TRUE
#> 4 2329.6240 good FALSE 1 4667 FALSE
#> 5 1228.4116 good FALSE 1 5769 TRUE
#> 6 849.1217 good FALSE 1 3780 TRUE
#> distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1 619.1010 NA NA NA
#> 2 273.6567 NA NA NA
#> 3 252.3325 NA NA NA
#> 4 817.9295 NA NA NA
#> 5 609.5870 NA NA NA
#> 6 786.4972 NA NA NA
#> typology subTypology
#> 1 flat penthouse
#> 2 flat <NA>
#> 3 flat <NA>
#> 4 flat <NA>
#> 5 flat <NA>
#> 6 flat <NA>
ggplot(datos, aes(x = as.factor(1), y = floor)) +
geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
coord_flip() +
labs(title = "Boxplot del Número de Piso",
x = "",
y = "Número de Piso") +
theme_minimal()
altos <- datos %>%
filter(floor > 20) %>%
arrange(desc(size)) %>%
slice_head(n = 10)
altos
#> propertyCode numPhotos floor price propertyType size exterior rooms
#> 1 106323518 28 30 650000 flat 131 1 4
#> 2 107563294 35 27 612000 flat 105 1 2
#> 3 99710198 40 22 555000 flat 82 1 2
#> bathrooms district neighborhood latitude longitude showAddress
#> 1 2 Camins al Grau Penya-Roja 39.46005 -0.3523510 TRUE
#> 2 2 Campanar Sant Pau 39.49582 -0.4032308 FALSE
#> 3 2 Campanar Sant Pau 39.48907 -0.4004194 FALSE
#> distance status newDevelopment hasLift priceByArea hasPlan
#> 1 2329.624 good FALSE 1 NA NA
#> 2 3694.741 good FALSE 1 5829 FALSE
#> 3 2971.371 good FALSE 1 6768 FALSE
#> distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1 817.9295 NA NA NA
#> 2 313.5163 NA NA NA
#> 3 218.8787 NA NA NA
#> typology subTypology
#> 1 <NA> <NA>
#> 2 flat <NA>
#> 3 flat <NA>
ggplot(datos, aes(x = as.factor(1), y = price)) +
geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
coord_flip() +
labs(title = "Boxplot del Precio",
x = "",
y = "Precio") +
theme_minimal()
caros <- datos %>%
filter(price > 2500000) %>%
arrange(desc(size)) %>%
slice_head(n = 10)
caros
#> propertyCode numPhotos floor price propertyType size exterior rooms
#> 1 103259850 122 5 4700000 penthouse 590 1 6
#> 2 106413311 51 3 2599000 penthouse 457 1 5
#> 3 106261302 40 3 2600000 duplex 457 1 5
#> 4 104867597 68 3 2600000 penthouse 398 1 5
#> 5 107260483 1 4 2800000 flat 377 1 7
#> 6 103488287 0 1 2650000 penthouse 365 1 3
#> 7 105204408 85 5 3500000 flat 352 1 4
#> 8 107211693 50 6 3500000 flat 350 1 4
#> 9 105973487 47 7 3500000 flat 350 1 4
#> 10 105367713 69 5 3500000 flat 342 1 4
#> bathrooms district neighborhood latitude longitude showAddress
#> 1 8 Ciutat Vella La Xerea 39.47662 -0.3725033 FALSE
#> 2 6 L'Eixample El Pla del Remei 39.46382 -0.3724053 FALSE
#> 3 2 L'Eixample El Pla del Remei 39.46621 -0.3742678 FALSE
#> 4 6 L'Eixample El Pla del Remei 39.46429 -0.3742040 FALSE
#> 5 4 L'Eixample El Pla del Remei 39.46648 -0.3683944 FALSE
#> 6 2 L'Eixample El Pla del Remei 39.46786 -0.3695561 FALSE
#> 7 5 L'Eixample El Pla del Remei 39.47025 -0.3696776 FALSE
#> 8 5 L'Eixample El Pla del Remei 39.46938 -0.3698621 FALSE
#> 9 6 L'Eixample El Pla del Remei 39.46881 -0.3690677 FALSE
#> 10 5 L'Eixample El Pla del Remei 39.47116 -0.3681202 FALSE
#> distance status newDevelopment hasLift priceByArea hasPlan
#> 1 814.7225 good FALSE 1 7966 TRUE
#> 2 754.1344 good FALSE 1 5687 TRUE
#> 3 445.9856 good FALSE 1 5689 FALSE
#> 4 649.6709 good FALSE 1 6533 TRUE
#> 5 777.7054 good FALSE 1 7427 FALSE
#> 6 621.6540 good FALSE 1 7260 FALSE
#> 7 569.7504 good FALSE 1 9943 TRUE
#> 8 555.6693 good FALSE 1 10000 FALSE
#> 9 632.5821 good FALSE 1 10000 FALSE
#> 10 716.0182 good FALSE 1 10234 FALSE
#> distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1 619.10097 NA NA NA
#> 2 247.64045 NA NA NA
#> 3 262.84052 NA NA NA
#> 4 282.91430 NA NA NA
#> 5 280.16763 NA NA NA
#> 6 280.75340 NA NA NA
#> 7 99.79849 NA NA NA
#> 8 122.84038 NA NA NA
#> 9 215.75330 NA NA NA
#> 10 257.30206 NA NA NA
#> typology subTypology
#> 1 flat penthouse
#> 2 flat penthouse
#> 3 flat duplex
#> 4 flat penthouse
#> 5 flat <NA>
#> 6 flat penthouse
#> 7 flat <NA>
#> 8 flat <NA>
#> 9 flat <NA>
#> 10 flat <NA>
datos$priceByArea[is.na(datos$priceByArea)] <- datos$price[is.na(datos$priceByArea)] / datos$size[is.na(datos$priceByArea)]
ggplot(datos, aes(x = as.factor(1), y = priceByArea)) +
geom_boxplot(fill = "lightblue", color = "darkblue", notch = TRUE) +
coord_flip() +
labs(title = "Boxplot del Precio por m2",
x = "",
y = "Precio m2") +
theme_minimal()
caros_m2 <- datos %>%
filter(priceByArea > 10000) %>%
arrange(desc(size)) %>%
slice_head(n = 10)
caros_m2
#> propertyCode numPhotos floor price propertyType size exterior rooms
#> 1 105367713 69 5 3500000 flat 342 1 4
#> 2 107049698 63 5 3500000 duplex 342 1 4
#> 3 107537312 60 6 3500000 flat 342 1 4
#> 4 101942142 50 3 2150000 flat 208 1 3
#> 5 107458992 10 2 2000000 flat 185 1 3
#> 6 104639503 76 6 1800000 penthouse 148 1 3
#> 7 93697954 19 2 1900000 flat 140 1 2
#> 8 104652530 22 7 1300000 penthouse 121 1 3
#> 9 106749590 72 7 1100000 flat 109 1 1
#> 10 103340081 37 9 1190000 penthouse 102 1 2
#> bathrooms district neighborhood latitude longitude showAddress
#> 1 5 L'Eixample El Pla del Remei 39.47116 -0.3681202 FALSE
#> 2 5 L'Eixample El Pla del Remei 39.46971 -0.3700135 FALSE
#> 3 5 L'Eixample El Pla del Remei 39.46928 -0.3693383 FALSE
#> 4 3 L'Eixample El Pla del Remei 39.47053 -0.3651211 FALSE
#> 5 4 Ciutat Vella La Xerea 39.47150 -0.3763945 FALSE
#> 6 2 L'Eixample Russafa 39.46321 -0.3676446 FALSE
#> 7 2 L'Eixample El Pla del Remei 39.46878 -0.3682352 FALSE
#> 8 2 Camins al Grau Penya-Roja 39.46327 -0.3532254 TRUE
#> 9 2 Camins al Grau Penya-Roja 39.45817 -0.3478155 FALSE
#> 10 2 Ciutat Vella Sant Francesc 39.47236 -0.3763066 FALSE
#> distance status newDevelopment hasLift priceByArea hasPlan
#> 1 716.0182 good FALSE 1 10234 FALSE
#> 2 540.0503 good FALSE 1 10234 TRUE
#> 3 601.6053 good FALSE 1 10234 FALSE
#> 4 962.1120 good FALSE 1 10337 TRUE
#> 5 178.2078 good FALSE 1 10811 TRUE
#> 6 1051.3122 good FALSE 1 12162 TRUE
#> 7 703.4529 good FALSE 1 13571 FALSE
#> 8 2113.3477 newdevelopment TRUE 1 10744 TRUE
#> 9 2771.5469 good FALSE 1 10092 TRUE
#> 10 273.4178 good FALSE 1 11667 TRUE
#> distancia_min_estacion_m former_price price_drop_value price_drop_percentage
#> 1 257.30206 NA NA NA
#> 2 88.54874 NA NA NA
#> 3 163.71948 NA NA NA
#> 4 296.96193 NA NA NA
#> 5 498.91799 NA NA NA
#> 6 186.69132 NA NA NA
#> 7 272.83861 NA NA NA
#> 8 819.22002 NA NA NA
#> 9 692.02661 NA NA NA
#> 10 528.00360 NA NA NA
#> typology subTypology
#> 1 flat <NA>
#> 2 flat duplex
#> 3 flat <NA>
#> 4 flat <NA>
#> 5 flat <NA>
#> 6 flat penthouse
#> 7 flat <NA>
#> 8 flat penthouse
#> 9 flat <NA>
#> 10 flat penthouse
binary_vars <- descProp$variable[descProp$tipo == "binary"]
datos <- datos %>%
mutate(across(all_of(binary_vars), ~ ifelse(. == TRUE, 1, ifelse(. == FALSE, 0, NA))))
datos_pca = dplyr::select(datos, - which(descProp$tipo == "binary"), - which(colnames(datos) %in% c("former_price", "price_drop_value", "price_drop_percentage", "propertyCode", "latitude", "longitude", "subTypology")))
res.pca <- PCA(datos_pca, scale.unit = TRUE, graph = FALSE, ncp = 10, quali.sup = c(4, 8, 9, 11, 14))
#> Warning in PCA(datos_pca, scale.unit = TRUE, graph = FALSE, ncp = 10, quali.sup
#> = c(4, : Missing values are imputed by the mean of the variable: you should use
#> the imputePCA function of the missMDA package
eig.val <- get_eigenvalue(res.pca)
VPmedio = 100 * (1/nrow(eig.val))
fviz_eig(res.pca, addlabels = TRUE) +
geom_hline(yintercept=VPmedio, linetype=2, color="red")
K = 3
res.pca <- PCA(datos_pca, scale.unit = TRUE, graph = FALSE, ncp = K, quali.sup = c(4, 8, 9, 11, 14))
#> Warning in PCA(datos_pca, scale.unit = TRUE, graph = FALSE, ncp = K, quali.sup
#> = c(4, : Missing values are imputed by the mean of the variable: you should use
#> the imputePCA function of the missMDA package
misScores = res.pca$ind$coord[,1:K]
miT2 = colSums(t(misScores**2)/eig.val[1:K,1])
I = nrow(datos_pca)
F95 = K*(I**2 - 1)/(I*(I - K)) * qf(0.95, K, I-K)
F99 = K*(I**2 - 1)/(I*(I - K)) * qf(0.99, K, I-K)
plot(1:length(miT2), miT2, type = "p", xlab = "Pisos", ylab = "T2")
abline(h = F95, col = "orange", lty = 2, lwd = 2)
abline(h = F99, col = "red3", lty = 2, lwd = 2)
anomalas = which(miT2 > F95)
anomalas
#> 7 23 37 57 71 144 145 149 152 154 157 161 168 190 195 207
#> 7 23 37 57 71 144 145 149 152 154 157 161 168 190 195 207
#> 213 243 249 252 275 279 286 288 289 321 358 366 375 423 432 441
#> 213 243 249 252 275 279 286 288 289 321 358 366 375 423 432 441
#> 449 452 465 509 510 551 571 597 613 653 657 663 698 740 767 771
#> 449 452 465 509 510 551 571 597 613 653 657 663 698 740 767 771
#> 806 861 873 890 966 975 1012 1022 1023 1035 1049 1056 1059 1068 1111 1112
#> 806 861 873 890 966 975 1012 1022 1023 1035 1049 1056 1059 1068 1111 1112
#> 1152 1153 1159 1160 1164 1205 1225 1228 1254 1284 1288 1307 1310 1333 1363 1372
#> 1152 1153 1159 1160 1164 1205 1225 1228 1254 1284 1288 1307 1310 1333 1363 1372
#> 1377 1385 1392 1402 1420 1425 1451 1470 1506 1537 1571 1589 1598 1659 1826 2052
#> 1377 1385 1392 1402 1420 1425 1451 1470 1506 1537 1571 1589 1598 1659 1826 2052
#> 2058 2067 2072 2073 2074 2077 2079 2084 2085 2087 2088 2089 2092 2093 2094 2095
#> 2058 2067 2072 2073 2074 2077 2079 2084 2085 2087 2088 2089 2092 2093 2094 2095
#> 2096 2098 2099 2100 2101 2103 2107 2111 2122 2123 2125 2130 2131 2136 2144 2145
#> 2096 2098 2099 2100 2101 2103 2107 2111 2122 2123 2125 2130 2131 2136 2144 2145
#> 2146 2154 2159 2160 2163 2167 2168 2169 2176 2179 2180 2182 2194 2196 2197 2198
#> 2146 2154 2159 2160 2163 2167 2168 2169 2176 2179 2180 2182 2194 2196 2197 2198
#> 2208 2216 2229 2230 2248 2254 2257 2259 2274 2280 2297 2298 2304 2313 2317 2331
#> 2208 2216 2229 2230 2248 2254 2257 2259 2274 2280 2297 2298 2304 2313 2317 2331
#> 2342 2345 2354 2358 2363 2368 2370 2381 2386 2389 2397 2412 2416 2431 2432 2434
#> 2342 2345 2354 2358 2363 2368 2370 2381 2386 2389 2397 2412 2416 2431 2432 2434
#> 2436 2442 2454 2468 2469 2472 2500 2531 2554 2585 2623 2643 2651 2720 2726 2730
#> 2436 2442 2454 2468 2469 2472 2500 2531 2554 2585 2623 2643 2651 2720 2726 2730
#> 2738 2744 2761 2763 2805 2819 2872 2888 2896 2905 2919 2955 3007 3113
#> 2738 2744 2761 2763 2805 2819 2872 2888 2896 2905 2919 2955 3007 3113
contribT2 = function (X, scores, loadings, eigenval, observ, cutoff = 2) {
# X is data matrix and must be centered (or centered and scaled if data were scaled)
misScoresNorm = t(t(scores**2) / eigenval)
misContrib = NULL
for (oo in observ) {
print(rownames(scores)[oo])
print(scores[oo,])
misPCs = which(as.numeric(misScoresNorm[oo,]) > cutoff)
lacontri = sapply(misPCs, function (cc) (scores[oo,cc]/eigenval[cc])*loadings[,cc]*X[oo,])
lacontri = rowSums((1*(sign(lacontri) == 1))*lacontri)
misContrib = cbind(misContrib, lacontri)
}
colnames(misContrib) = rownames(misScoresNorm[observ,])
return(misContrib)
}
# Recuperamos los datos utilizados en el modelo PCA, centrados y escalados
pisosCE = datos_pca[, c(1, 2, 3, 5, 6, 7, 10, 12, 13)]
pisosCE = pisosCE[,setdiff(colnames(pisosCE), c("rating", "weight", "cups"))]
pisosCE = scale(pisosCE, center = TRUE, scale = TRUE)
X = as.matrix(pisosCE)
# Calculamos los loadings a partir de las coordenadas de las variables
misLoadings = sweep(res.pca$var$coord, 2, sqrt(res.pca$eig[1:K,1]), FUN="/")
# Calculamos las contribuciones
mycontrisT2 = contribT2(X = X, scores = misScores, loadings = misLoadings,
eigenval = eig.val[1:K,1], observ = which.max(miT2),
cutoff = 2)
#> [1] "2761"
#> Dim.1 Dim.2 Dim.3
#> 5.908302 -10.994043 3.877539
par(mar = c(10,2.3,3,1))
barplot(mycontrisT2[,1],las=2,
main= paste0("Observación: ", rownames(datos_pca)[which.max(miT2)]))
Se trata de un lote de 6 pisos, por tanto eliminamos el anuncio.
datos <- datos[-which.max(miT2), ]
datos_pca <- datos_pca[-which.max(miT2), ]
par(mar = c(10,2.3,3,1))
barplot(mycontrisT2[,1],las=2,
main= paste0("Observación: ", rownames(datos_pca)[which.max(miT2)]))
myE = X - misScores %*% t(misLoadings)
mySCR = rowSums(myE^2)
plot(1:length(mySCR), mySCR, type = "l", main = "Distancia al modelo",
ylab = "SCR", xlab = "Propiedades", ylim = c(0,11))
g = var(mySCR)/(2*mean(mySCR))
h = (2*mean(mySCR)^2)/var(mySCR)
chi2lim = g*qchisq(0.95, df = h)
chi2lim99 = g*qchisq(0.99, df = h)
abline(h = chi2lim, col = "orange", lty = 2, lwd = 2)
abline(h = chi2lim99, col = "red3", lty = 2, lwd = 2)
anomalosSCR = which(mySCR > chi2lim99)
anomalosSCR
#> [1] 168 184 213 236 321 432 548 597 613 1049 1056 1225 1228 1263 1571
#> [16] 1826 1977 2058 2067 2088 2163 2229 2313 2368 2447 2495 2687 2761 2826
Menos de un 1% de las propiedades, asumible.
## Función para calcular las contribuciones a la SCR
ContriSCR = function(E, SCR) {
# E es la matriz de residuos del modelo
# SCR es la suma de cuadrados residual
contribucion = NULL
for (j in 1:length(SCR)){
eind<-E[j,]
signo<-sign(eind)
contri<-(signo*(eind^2)/SCR[j])*100
contribucion<-rbind(contribucion,contri)
}
rownames(contribucion) = rownames(E)
return(contribucion)
}
## Calculamos las contribuciones de todas las observaciones
mycontris = ContriSCR(E = myE, SCR = mySCR)
## Gráfico para Special_K
barplot(mycontris[which.max(mySCR),],las=2, cex.names = 0.7,
main=paste0('Contribuciones a SCR para ', which.max(mySCR)))
Error no tiene baños
datos[2311,]$bathrooms = 3
datos_pca[2311,]$bathrooms = 3
for (i in order(mySCR, decreasing = TRUE)[1:10]) {
barplot(mycontris[i,], las = 2, cex.names = 0.7,
main = paste0('Contribuciones a SCR para ', i))
}