Análisis de la base de datos

# Preview the first rows of the vivienda data set
head(vivienda)
# A tibble: 6 × 13
     id zona    piso  estrato preciom areaconst parqueaderos banios habitaciones
  <dbl> <chr>   <chr>   <dbl>   <dbl>     <dbl>        <dbl>  <dbl>        <dbl>
1  1147 Zona O… <NA>        3     250        70            1      3            6
2  1169 Zona O… <NA>        3     320       120            1      2            3
3  1350 Zona O… <NA>        3     350       220            2      2            4
4  5992 Zona S… 02          4     400       280            3      5            3
5  1212 Zona N… 01          5     260        90            1      2            3
6  1724 Zona N… 01          5     240        87            1      3            3
# ℹ 4 more variables: tipo <chr>, barrio <chr>, longitud <dbl>, latitud <dbl>

Análisis descriptivo

[1] 8322   13
spc_tbl_ [8,322 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ id          : num [1:8322] 1147 1169 1350 5992 1212 ...
 $ zona        : chr [1:8322] "Zona Oriente" "Zona Oriente" "Zona Oriente" "Zona Sur" ...
 $ piso        : chr [1:8322] NA NA NA "02" ...
 $ estrato     : num [1:8322] 3 3 3 4 5 5 4 5 5 5 ...
 $ preciom     : num [1:8322] 250 320 350 400 260 240 220 310 320 780 ...
 $ areaconst   : num [1:8322] 70 120 220 280 90 87 52 137 150 380 ...
 $ parqueaderos: num [1:8322] 1 1 2 3 1 1 2 2 2 2 ...
 $ banios      : num [1:8322] 3 2 2 5 2 3 2 3 4 3 ...
 $ habitaciones: num [1:8322] 6 3 4 3 3 3 3 4 6 3 ...
 $ tipo        : chr [1:8322] "Casa" "Casa" "Casa" "Casa" ...
 $ barrio      : chr [1:8322] "20 de julio" "20 de julio" "20 de julio" "3 de julio" ...
 $ longitud    : num [1:8322] -76.5 -76.5 -76.5 -76.5 -76.5 ...
 $ latitud     : num [1:8322] 3.43 3.43 3.44 3.44 3.46 ...
 - attr(*, "spec")=List of 3
  ..$ cols   :List of 13
  .. ..$ id          : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
  .. ..$ zona        : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
  .. ..$ piso        : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
  .. ..$ estrato     : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
  .. ..$ preciom     : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
  .. ..$ areaconst   : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
  .. ..$ parqueaderos: list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
  .. ..$ banios      : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
  .. ..$ habitaciones: list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
  .. ..$ tipo        : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
  .. ..$ barrio      : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
  .. ..$ longitud    : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
  .. ..$ latitud     : list()
  .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
  ..$ default: list()
  .. ..- attr(*, "class")= chr [1:2] "collector_guess" "collector"
  ..$ delim  : chr ";"
  ..- attr(*, "class")= chr "col_spec"
 - attr(*, "problems")=<externalptr> 
       id           zona               piso              estrato     
 Min.   :   1   Length:8322        Length:8322        Min.   :3.000  
 1st Qu.:2080   Class :character   Class :character   1st Qu.:4.000  
 Median :4160   Mode  :character   Mode  :character   Median :5.000  
 Mean   :4160                                         Mean   :4.634  
 3rd Qu.:6240                                         3rd Qu.:5.000  
 Max.   :8319                                         Max.   :6.000  
 NA's   :3                                            NA's   :3      
    preciom         areaconst       parqueaderos        banios      
 Min.   :  58.0   Min.   :  30.0   Min.   : 1.000   Min.   : 0.000  
 1st Qu.: 220.0   1st Qu.:  80.0   1st Qu.: 1.000   1st Qu.: 2.000  
 Median : 330.0   Median : 123.0   Median : 2.000   Median : 3.000  
 Mean   : 433.9   Mean   : 174.9   Mean   : 1.835   Mean   : 3.111  
 3rd Qu.: 540.0   3rd Qu.: 229.0   3rd Qu.: 2.000   3rd Qu.: 4.000  
 Max.   :1999.0   Max.   :1745.0   Max.   :10.000   Max.   :10.000  
 NA's   :2        NA's   :3        NA's   :1605     NA's   :3       
  habitaciones        tipo              barrio             longitud     
 Min.   : 0.000   Length:8322        Length:8322        Min.   :-76.59  
 1st Qu.: 3.000   Class :character   Class :character   1st Qu.:-76.54  
 Median : 3.000   Mode  :character   Mode  :character   Median :-76.53  
 Mean   : 3.605                                         Mean   :-76.53  
 3rd Qu.: 4.000                                         3rd Qu.:-76.52  
 Max.   :10.000                                         Max.   :-76.46  
 NA's   :3                                              NA's   :3       
    latitud     
 Min.   :3.333  
 1st Qu.:3.381  
 Median :3.416  
 Mean   :3.418  
 3rd Qu.:3.452  
 Max.   :3.498  
 NA's   :3      
          id         zona         piso      estrato      preciom    areaconst 
           3            3         2638            3            2            3 
parqueaderos       banios habitaciones         tipo       barrio     longitud 
        1605            3            3            3            3            3 
     latitud 
           3 
[1] 3514

Base sin datos faltantes

# Remove records with missing data (any row containing an NA is dropped)
viviendaSNA <- na.omit(vivienda)

# Quick check: dimensions of the NA-free data
dim(viviendaSNA)
[1] 4808   13
# Count the rows that still contain a missing value
sum(!complete.cases(viviendaSNA))
[1] 0

Visualización en Viewer

# Note: View() opens the data viewer (e.g. in RStudio) and is not available
# in the knitted HTML, so it is only called from an interactive session.
if (interactive()) {
  View(vivienda)
  View(viviendaSNA)
}



Análisis de componentes principales

# Standardise (centre and scale) the five numeric variables used in the PCA.
# The columns are selected by name instead of by position (5:9) so the
# selection cannot silently change if the column order of viviendaSNA does.
viviendaZ <- scale(
  viviendaSNA[
    ,
    c("preciom", "areaconst", "parqueaderos", "banios", "habitaciones")
  ]
)
head(viviendaZ)
        preciom  areaconst parqueaderos     banios habitaciones
[1,] -0.1756310  0.7609789    1.0779092  1.3178809   -0.4241459
[2,] -0.6055839 -0.6129041   -0.7415001 -0.9022913   -0.4241459
[3,] -0.6670057 -0.6345970   -0.7415001 -0.1622339   -0.4241459
[4,] -0.7284276 -0.8876807    0.1682046 -0.9022913   -0.4241459
[5,] -0.4520293 -0.2730489    0.1682046 -0.1622339    0.3272519
[6,] -0.4213184 -0.1790463    0.1682046  0.5778235    1.8300475
# Summary statistics of each standardised column
summary(viviendaZ)
    preciom          areaconst        parqueaderos         banios       
 Min.   :-1.2259   Min.   :-0.9745   Min.   :-0.7415   Min.   :-2.3824  
 1st Qu.:-0.6532   1st Qu.:-0.6491   1st Qu.:-0.7415   1st Qu.:-0.9023  
 Median :-0.3292   Median :-0.3743   Median : 0.1682   Median :-0.1622  
 Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
 3rd Qu.: 0.3157   3rd Qu.: 0.3633   3rd Qu.: 0.1682   3rd Qu.: 0.5778  
 Max.   : 4.7350   Max.   : 9.5828   Max.   : 7.4458   Max.   : 5.0182  
  habitaciones    
 Min.   :-2.6783  
 1st Qu.:-0.4241  
 Median :-0.4241  
 Mean   : 0.0000  
 3rd Qu.: 0.3273  
 Max.   : 4.8356  
library(mice)
# Display the missing-data pattern of viviendaZ
md.pattern(viviendaZ)
 /\     /\
{  `---'  }
{  O   O  }
==>  V <==  No need for mice. This data set is completely observed.
 \  \|/  /
  `-----'

     preciom areaconst parqueaderos banios habitaciones  
4808       1         1            1      1            1 0
           0         0            0      0            0 0
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Principal component analysis of viviendaZ
res.pca <- prcomp(viviendaZ)

# Scree plot of the components, with a label added to each bar
fviz_eig(res.pca, addlabels = TRUE)
Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
Ignoring empty aesthetic: `width`.

# Variable plot of the PCA. Variables are coloured by their contribution to
# the principal components, using a gradient between the two colours given;
# repel = TRUE keeps the text labels from overlapping.
fviz_pca_var(
  res.pca,
  col.var = "contrib",
  gradient.cols = c("#FF7F00", "#034D94"),
  repel = TRUE
)
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ℹ The deprecated feature was likely used in the ggpubr package.
  Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
This warning is displayed once per session.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>.
This warning is displayed once per session.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.

Análisis de conglomerados

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ lubridate 1.9.5     ✔ tibble    3.3.1
✔ purrr     1.2.1     ✔ tidyr     1.3.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::combine() masks gridExtra::combine()
✖ dplyr::filter()  masks mice::filter(), stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ tibble::view()   masks summarytools::view()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
viviendaZ <- as.data.frame(viviendaZ)

# Settings shared by every step below. They are defined once so the tree cut,
# the dendrogram rectangles and the scatter plot always use the same number
# of clusters, and so the sample size is changed in a single place.
n_clusters <- 4
tam_muestra <- 200

# Euclidean distance between the rows of viviendaZ
dist_viv <- dist(viviendaZ, method = "euclidean")

# Hierarchical clustering using the complete-linkage method
hc_viv <- hclust(dist_viv, method = "complete")

# Cluster each observation belongs to.
# NOTE(review): the name is misspelt ("assigments"); it is kept unchanged
# because code outside this section may refer to it.
cluster_assigments <- cutree(hc_viv, k = n_clusters)

# Attach the cluster membership to the data as a factor column
assigned_cluster <- viviendaZ %>%
  mutate(cluster = as.factor(cluster_assigments))

# Fix the random seed so the sample drawn below is reproducible
set.seed(123)

# Work on a random sample so the dendrogram stays readable; the whole data
# set is used when it has fewer rows than the requested sample size.
viv_muestra <- viviendaZ %>%
  dplyr::slice_sample(n = min(tam_muestra, nrow(viviendaZ)))
dist_muestra <- dist(viv_muestra, method = "euclidean")
hc_muestra <- hclust(dist_muestra, method = "complete")

plot(
  hc_muestra,
  cex = 0.6,
  main = "Dendrograma (muestra viviendaZ)",
  las = 1,
  ylab = "Distancia euclidiana",
  xlab = "Observaciones"
)
# One rectangle per cluster; border colours 2, 3, ... (2:5 when there are
# four clusters).
rect.hclust(hc_muestra, k = n_clusters, border = seq_len(n_clusters) + 1L)

# 2D view of the clusters of the sample on its first two principal
# components. The clusters shown come from the tree built on the sample
# (hc_muestra), not from the tree built on the full data (hc_viv).
res_pca_m <- prcomp(viv_muestra)
cluster_m <- cutree(hc_muestra, k = n_clusters)
pca_df <- data.frame(
  PC1 = res_pca_m$x[, 1],
  PC2 = res_pca_m$x[, 2],
  cluster = factor(cluster_m)
)

ggplot(pca_df, aes(PC1, PC2, color = cluster)) +
  geom_point(alpha = 0.8, size = 2) +
  labs(
    title = "Clusters de viviendaZ (muestra)",
    x = "PC1",
    y = "PC2",
    color = "Cluster"
  ) +
  theme_minimal()

Análisis de correspondencia