1. Objetivo y resumen ejecutivo

Este informe realiza un análisis holístico del mercado de vivienda urbana utilizando la base de datos vivienda del paquete paqueteMODELOS. Se abordan: (i) Análisis Exploratorio, (ii) Análisis de Componentes Principales (PCA), (iii) Análisis de Conglomerados (clústeres), (iv) Análisis de Correspondencia (CA/MCA) y (v) visualizaciones, incluyendo un mapa interactivo. Al final se presentan conclusiones y recomendaciones para decisiones estratégicas (compra, venta y valoración).

2. Paquetes y datos

Se instalan los paquetes necesarios y se accede a los datos:

data("vivienda", package = "paqueteMODELOS")
# Copia de trabajo
viv <- vivienda
str(viv)

## spc_tbl_ [8,322 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id          : num [1:8322] 1147 1169 1350 5992 1212 ...
##  $ zona        : chr [1:8322] "Zona Oriente" "Zona Oriente" "Zona Oriente" "Zona Sur" ...
##  $ piso        : chr [1:8322] NA NA NA "02" ...
##  $ estrato     : num [1:8322] 3 3 3 4 5 5 4 5 5 5 ...
##  $ preciom     : num [1:8322] 250 320 350 400 260 240 220 310 320 780 ...
##  $ areaconst   : num [1:8322] 70 120 220 280 90 87 52 137 150 380 ...
##  $ parqueaderos: num [1:8322] 1 1 2 3 1 1 2 2 2 2 ...
##  $ banios      : num [1:8322] 3 2 2 5 2 3 2 3 4 3 ...
##  $ habitaciones: num [1:8322] 6 3 4 3 3 3 3 4 6 3 ...
##  $ tipo        : chr [1:8322] "Casa" "Casa" "Casa" "Casa" ...
##  $ barrio      : chr [1:8322] "20 de julio" "20 de julio" "20 de julio" "3 de julio" ...
##  $ longitud    : num [1:8322] -76.5 -76.5 -76.5 -76.5 -76.5 ...
##  $ latitud     : num [1:8322] 3.43 3.43 3.44 3.44 3.46 ...
##  - attr(*, "spec")=List of 3
##   ..$ cols   :List of 13
##   .. ..$ id          : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
##   .. ..$ zona        : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
##   .. ..$ piso        : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
##   .. ..$ estrato     : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
##   .. ..$ preciom     : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
##   .. ..$ areaconst   : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
##   .. ..$ parqueaderos: list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
##   .. ..$ banios      : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
##   .. ..$ habitaciones: list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
##   .. ..$ tipo        : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
##   .. ..$ barrio      : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
##   .. ..$ longitud    : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
##   .. ..$ latitud     : list()
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr [1:2] "collector_guess" "collector"
##   ..$ delim  : chr ";"
##   ..- attr(*, "class")= chr "col_spec"
##  - attr(*, "problems")=<externalptr>

3. Exploración y limpieza

3.1. Perfilado rápido

skimr::skim(viv)

Data summary
Name	viv
Number of rows	8322
Number of columns	13
_______________________
Column type frequency:
character	4
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
zona	3	1.00	8	12	5
piso	2638	0.68	2	2	12
tipo	3	1.00	4	11	2
barrio	3	1.00	4	29	436

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	3	1.00	4160.00	2401.63	1.00	2080.50	4160.00	6239.50	8319.00	▇▇▇▇▇
estrato	3	1.00	4.63	1.03	3.00	4.00	5.00	5.00	6.00	▅▆▁▇▆
preciom	2	1.00	433.89	328.65	58.00	220.00	330.00	540.00	1999.00	▇▂▁▁▁
areaconst	3	1.00	174.93	142.96	30.00	80.00	123.00	229.00	1745.00	▇▁▁▁▁
parqueaderos	1605	0.81	1.84	1.12	1.00	1.00	2.00	2.00	10.00	▇▁▁▁▁
banios	3	1.00	3.11	1.43	0.00	2.00	3.00	4.00	10.00	▇▇▃▁▁
habitaciones	3	1.00	3.61	1.46	0.00	3.00	3.00	4.00	10.00	▂▇▂▁▁
longitud	3	1.00	-76.53	0.02	-76.59	-76.54	-76.53	-76.52	-76.46	▁▅▇▂▁
latitud	3	1.00	3.42	0.04	3.33	3.38	3.42	3.45	3.50	▃▇▅▇▅

DataExplorer::plot_missing(viv)

3.2. Normalización de nombres, eliminación de duplicados y tipos

suppressPackageStartupMessages(library(dplyr))

viv <- viv %>%
  janitor::clean_names() %>%
  distinct()  # eliminación de duplicados exactos

# --- Detectar columna de ID (varios nombres posibles) ---
pick_first <- function(cands, nm) {
  hit <- cands[cands %in% nm]
  if (length(hit) == 0) NA_character_ else hit[1]
}
id_var <- pick_first(
  c("id","id_inmueble","id_inmu","id_propiedad","cod_inmueble","codigo","cod"),
  names(viv)
)

# --- Eliminar registros con ID NA o vacío (tras trim) ---
if (!is.na(id_var)) {
  if (is.numeric(viv[[id_var]]) || is.integer(viv[[id_var]])) {
    # Para ID numérico: solo quitar NA
    viv <- viv %>% filter(!is.na(.data[[id_var]]))
  } else {
    # Para ID carácter/factor: quitar NA y cadenas vacías/espacios
    viv <- viv %>%
      filter(!is.na(.data[[id_var]])) %>%
      filter(trimws(as.character(.data[[id_var]])) != "")
  }
} else {
  message("No se encontró una columna de ID reconocible. Se omite el filtrado por ID.")
}

# --- Heurística de tipos ---
num_cols <- names(viv)[sapply(viv, is.numeric)]
cat_cols <- names(viv)[sapply(viv, function(x) is.character(x) || is.factor(x))]

# --- Convertir caracteres a factor ---
viv <- viv %>% mutate(across(all_of(cat_cols), ~ as.factor(.)))

# --- Variable derivada (si existen columnas esperadas) ---
viv <- viv %>% mutate(
  precio_m2 = dplyr::if_else(
    ("preciom" %in% names(.)) & ("areaconst" %in% names(.)) & areaconst > 0,
    preciom/areaconst, NA_real_
  )
)

# --- Recalcular listas tras transformaciones ---
num_cols <- names(viv)[sapply(viv, is.numeric)]
cat_cols <- names(viv)[sapply(viv, is.factor)]

3.3. Imputación (métodos múltiples) y selección de variables

# Imputación simple (mediana para numéricas, moda para categóricas)
impute_mode <- function(x) {
  ux <- stats::na.omit(x)
  if (length(ux) == 0) return(NA)
  as.vector(names(sort(table(ux), decreasing = TRUE)))[1]
}

# Versiones imputadas
viv_mediana <- viv
viv_mediana[num_cols] <- lapply(viv_mediana[num_cols], function(x) {x[is.na(x)] <- median(x, na.rm = TRUE); x})
viv_mediana[cat_cols] <- lapply(viv_mediana[cat_cols], function(x) {x[is.na(x)] <- impute_mode(x); factor(x)})

viv_media <- viv
viv_media[num_cols] <- lapply(viv_media[num_cols], function(x) {x[is.na(x)] <- mean(x, na.rm = TRUE); x})
viv_media[cat_cols] <- lapply(viv_media[cat_cols], function(x) {x[is.na(x)] <- impute_mode(x); factor(x)})

# KNN con VIM::kNN (añade columnas *.imp; las retiramos)
viv_knn <- VIM::kNN(viv, k = 5)
viv_knn <- viv_knn %>% dplyr::select(-tidyselect::ends_with(".imp"))

# Dataset de referencia para modelado (usaremos KNN)
viv_imp <- viv_knn

# Selección numéricas para PCA (excluye ids/coords y estrato si apareciera como numérica)
num_for_pca <- setdiff(num_cols, c("id","codigo","lat","lng","longitud","latitud","latitude","longitude","estrato"))

4. Análisis exploratorio (EDA)

# Distribuciones de variables numéricas (KNN por defecto)
viv_imp %>%
  dplyr::select(all_of(num_cols)) %>%
  DataExplorer::plot_histogram(ncol = 3L)

# Matriz de correlación para numéricas (si >= 2 columnas)
if (length(num_cols) >= 2) {
  cm <- cor(viv_imp[, num_cols], use = "pairwise.complete.obs")
  corrplot::corrplot(cm, method = "color", type = "upper", tl.cex = 0.7)
}

4.1. Boxplots comparando métodos de imputación

# --- Elegimos columna de "parqueadero" y "tipo" con nombres alternativos ---
pick_first <- function(cands, nm) {
  hit <- cands[cands %in% nm]
  if (length(hit) == 0) NA_character_ else hit[1]
}

parqueadero_var <- pick_first(
  c("parqueadero","parqueaderos","parq","parking","parqueo","estacionamiento","garaje","garajes"),
  names(viv)
)

tipo_var <- pick_first(c("tipo","tipo_inmueble","clase","categoria","uso"), names(viv))

if (is.na(parqueadero_var) || is.na(tipo_var)) {
  message("No se encontraron columnas claras para 'parqueadero' y/o 'tipo'.")
} else {
  # --- Construir dataset largo con tres escenarios ---
  df_sin  <- viv        %>% dplyr::select(all_of(c(tipo_var, parqueadero_var))) %>% mutate(metodo = "Sin imputar")
  df_knn  <- viv_knn    %>% dplyr::select(all_of(c(tipo_var, parqueadero_var))) %>% mutate(metodo = "KNN")
  df_med  <- viv_mediana%>% dplyr::select(all_of(c(tipo_var, parqueadero_var))) %>% mutate(metodo = "Mediana")

  dx <- bind_rows(df_sin, df_knn, df_med)

  # Limpiar NAs de tipo
  dx <- dx %>% filter(!is.na(.data[[tipo_var]]))

  # ¿Parqueadero numérico o categórico?
  es_numerico <- is.numeric(dx[[parqueadero_var]]) || is.integer(dx[[parqueadero_var]])

  if (es_numerico) {
    # --- BOXPLOTS (parqueadero numérico) ---
    dx_num <- dx %>% filter(!is.na(.data[[parqueadero_var]]))
    ggplot(dx_num, aes(x = .data[[tipo_var]], y = .data[[parqueadero_var]], fill = .data[[tipo_var]])) +
      geom_boxplot(outlier.alpha = 0.25) +
      facet_wrap(~ metodo) +
      labs(title = "Parqueadero por tipo de inmueble y método de imputación",
           x = "Tipo", y = "Parqueadero (valor numérico)") +
      theme_minimal() +
      theme(legend.position = "none")
  } else {
    # --- BARRAS APILADAS EN PROPORCIÓN (parqueadero categórico) ---
    # Normalizamos a factor
    dx_cat <- dx %>%
      mutate(
        across(all_of(parqueadero_var), ~ as.factor(.x)),
        across(all_of(tipo_var), ~ as.factor(.x))
      ) %>%
      filter(!is.na(.data[[parqueadero_var]]))

    # Resumen de proporciones por tipo y método
    sum_cat <- dx_cat %>%
      group_by(metodo, !!sym(tipo_var), !!sym(parqueadero_var)) %>%
      summarise(n = n(), .groups = "drop_last") %>%
      mutate(prop = n / sum(n)) %>%
      ungroup()

    ggplot(sum_cat,
           aes(x = .data[[tipo_var]], y = prop, fill = .data[[parqueadero_var]])) +
      geom_col(position = "fill") +
      facet_wrap(~ metodo) +
      scale_y_continuous(labels = percent_format(accuracy = 1)) +
      labs(title = "Distribución de parqueadero por tipo y método de imputación",
           x = "Tipo",
           y = "Proporción",
           fill = parqueadero_var) +
      theme_minimal() +
      theme(legend.position = "right")
  }
}

5. PCA – Componentes Principales (sin estrato)

5.1. Preparación y escalado

pca_data <- viv_imp %>% dplyr::select(all_of(num_for_pca)) %>% scale() %>% as.data.frame()
pca_fit <- FactoMineR::PCA(pca_data, graph = FALSE)

5.2. Varianza explicada (objetivo 70–80%) y selección de componentes

factoextra::fviz_eig(pca_fit, addlabels = TRUE, barfill = "#2c7fb8", barcolor = "#2c7fb8")

var_cum <- cumsum(pca_fit$eig[,2]) # porcentaje acumulado
# índices que logran entre 70 y 80
idx_70 <- which(var_cum >= 70)[1]
idx_80 <- which(var_cum >= 80)[1]
# escoger el más cercano a 75 en ese rango; si no existe 80, tomar >=70
cand <- idx_70
if (!is.na(idx_80)) {
  idx_seq <- idx_70:idx_80
  if (length(idx_seq) > 0) {
    d <- abs(var_cum[idx_seq] - 75)
    cand <- idx_seq[which.min(d)]
  }
}
q <- cand
message(sprintf("PCs seleccionadas: %d (%.2f%% varianza acumulada)", q, var_cum[q]))

5.3. Biplot y contribuciones

# Variables
factoextra::fviz_pca_var(
  pca_fit,
  col.var  = "contrib",
  repel    = FALSE,
  labelsize = 3
)

# Individuos (muestra si hay muchos)
set.seed(123)
rn_active <- rownames(pca_fit$ind$coord)
n_active  <- nrow(pca_fit$ind$coord)
idx <- if (n_active > 2000) sample.int(n_active, 2000) else seq_len(n_active)
names_sel <- rn_active[idx]

hab <- NULL
if ("estrato" %in% names(viv_imp)) {
  e <- viv_imp$estrato
  if (is.numeric(e)) {
    if (length(unique(e[!is.na(e)])) <= 10) e <- factor(e) else e <- NULL
  } else if (is.character(e)) {
    e <- factor(e)
  }
  if (!is.null(e)) {
    e_full <- e; names(e_full) <- rownames(viv_imp)
    hab <- droplevels(e_full[rn_active])
  }
}

factoextra::fviz_pca_ind(
  pca_fit,
  geom        = "point",
  select.ind  = list(name = names_sel),
  pointsize   = 0.6,
  alpha       = 0.4,
  habillage   = hab,
  addEllipses = FALSE
)

# Contribuciones
fviz_contrib(pca_fit, choice = "var", axes = 1, top = 15)

fviz_contrib(pca_fit, choice = "var", axes = 2, top = 15)

6. Clustering – Segmentación de propiedades (en PCs)

6.1. Determinación de k (WSS y Silhouette)

pcs <- pca_fit$ind$coord[, 1:q, drop = FALSE]

# Elbow/WSS
fviz_nbclust(as.data.frame(pcs), kmeans, method = "wss") + ggtitle("Metodo del codo (WSS)")

# Silhouette
fviz_nbclust(as.data.frame(pcs), kmeans, method = "silhouette") + ggtitle("Indice Silhouette")

6.2. K-means y evaluación

# Elegir k = 3 por defecto; ajustar según gráficos anteriores
k_opt <- 3
km <- kmeans(pcs, centers = k_opt, nstart = 50)
viv_seg <- viv_imp %>% mutate(cluster = factor(km$cluster))

# Visualización en el plano PC1–PC2
fviz_cluster(list(data = as.data.frame(pcs), cluster = km$cluster),
             geom = "point", ellipse.type = "norm")

# Tamaños y promedios por clúster
sizes <- dplyr::count(viv_seg, cluster)
knitr::kable(sizes, caption = "Tamano de clusteres") %>% kableExtra::kable_styling(full_width = FALSE)

Tamano de clusteres
cluster	n
1	5371
2	1258
3	1690

means_by_cluster <- viv_seg %>% dplyr::group_by(cluster) %>% dplyr::summarise(across(all_of(num_for_pca), \(x) mean(x, na.rm = TRUE)))
knitr::kable(means_by_cluster, caption = "Promedios por cluster (variables numericas)") %>% kableExtra::kable_styling(full_width = TRUE)

Promedios por cluster (variables numericas)
cluster	preciom	areaconst	parqueaderos	banios	habitaciones	precio_m2
1	278.0875	100.8915	1.315770	2.355241	2.948241	2.802474
2	1039.2401	312.0221	3.266296	4.814785	3.932432	3.787635
3	478.5083	308.2082	1.928994	4.246154	5.450296	1.674076

6.3. Clustering jerárquico

hc <- hclust(dist(scale(pcs)), method = "ward.D2")
plot(hc, cex = 0.6)
rect.hclust(hc, k = k_opt, border = 2:5)

7. Análisis de Correspondencia (CA) y Correspondencia Múltiple (MCA)

7.1. CA bivariado (ej.: tipo x zona, zona x barrio) – si existen

# --- Helper robusto para CA ---
safe_ca_biplot <- function(tab, title = "CA", top_cols = NA, repel = TRUE) {
  if (!is.na(top_cols)) {
    keep <- names(sort(colSums(tab), decreasing = TRUE))[1:min(top_cols, ncol(tab))]
    tab  <- tab[, keep, drop = FALSE]
  }
  tab <- tab[rowSums(tab) > 0, , drop = FALSE]
  tab <- tab[, colSums(tab) > 0, drop = FALSE]

  if (nrow(tab) < 2 || ncol(tab) < 2 || sum(tab) == 0) {
    message("CA omitido (", title, "): tabla insuficiente (necesita al menos 2x2 y >0 conteos).")
    return(invisible(NULL))
  }

  ca <- FactoMineR::CA(as.data.frame.matrix(tab), graph = FALSE)
  ndim <- min(nrow(tab) - 1, ncol(tab) - 1)
  if (!is.finite(ndim) || ndim < 2) {
    message("CA omitido (", title, "): ndim < 2 (demasiadas categorías vacías o redundantes).")
    return(invisible(NULL))
  }

  factoextra::fviz_ca_biplot(ca, repel = repel) + ggtitle(title)
}

# Tipo x Zona
if (all(c("tipo","zona") %in% names(viv_imp))) {
  tab_tz <- table(viv_imp$tipo, viv_imp$zona)
  safe_ca_biplot(tab_tz, title = "CA: Tipo × Zona")
}

# Zona x Barrio (Top 20 barrios por frecuencia)
if (all(c("zona","barrio") %in% names(viv_imp))) {
  tab_zb <- table(viv_imp$zona, viv_imp$barrio)
  safe_ca_biplot(tab_zb, title = "CA: Zona por Barrios (Top 20)", top_cols = 20)
}

7.2. MCA para múltiples categóricas

cat_for_mca <- intersect(cat_cols, c("tipo", "zona", "barrio", "estrato"))
if (length(cat_for_mca) >= 2) {
  mca_fit <- FactoMineR::MCA(viv_imp[, cat_for_mca, drop = FALSE], graph = FALSE)
  fviz_mca_biplot(mca_fit, repel = TRUE, ggtheme = theme_minimal())
  fviz_contrib(mca_fit, choice = "var", axes = 1, top = 15)
}

8. Mapa interactivo (coloreado por clúster)

possible_lat <- intersect(names(viv_seg), c("lat","latitud","latitude"))
possible_lng <- intersect(names(viv_seg), c("lng","long","longitud","longitude"))

if (length(possible_lat) >= 1 && length(possible_lng) >= 1) {
  lat_col <- possible_lat[1]; lng_col <- possible_lng[1]

  coords <- viv_seg %>%
    dplyr::filter(!is.na(.data[[lat_col]]), !is.na(.data[[lng_col]])) %>%
    dplyr::rename(lat = dplyr::all_of(lat_col), lng = dplyr::all_of(lng_col))

  if (nrow(coords) > 0) {
    pal <- leaflet::colorFactor(topo.colors(length(levels(viv_seg$cluster))), viv_seg$cluster)

    leaflet::leaflet(coords) %>% leaflet::addTiles() %>%
      leaflet::addCircleMarkers(lng = ~lng, lat = ~lat,
                                radius = 4, stroke = FALSE, fillOpacity = 0.7,
                                color = ~pal(viv_seg$cluster),
                                popup = ~paste0("Cluster: ", viv_seg$cluster)) %>%
      leaflet::addLegend(pal = pal, values = viv_seg$cluster, title = "Cluster")
  } else {
    cat("Sin filas válidas con coordenadas.\n")
  }
} else {
  cat("No se hallaron columnas tipo lat*/lng*; se omite el mapa.\n")
}

EVALUACIÓN MERCADO INMOBILIARIO

Fredy Alejandro Ordonez Marquez

2025-08-08