# Knitr chunk defaults: echo the code and keep warnings in the rendered report.
knitr::opts_chunk$set(warning = TRUE, echo = TRUE)

# Project folder, defined once so that the working directory and the input
# file path cannot drift apart.
dir_taller <- "C:/Users/ruber/OneDrive/Documentos/Taller 2"

getwd()
## [1] "C:/Users/ruber/OneDrive/Documentos/Taller 2"
setwd(dir_taller)

# Load the listings CSV from the project folder.
datos <- read.csv(
  file.path(dir_taller, "Inmuebles_Disponibles_Para_La_Venta_20250520.csv")
)
dim(datos)
## [1] 448  12

# Strongly penalise scientific notation so large prices print in full.
options(scipen = 999)

# Column types and a preview of each variable.
str(datos)
## 'data.frame':    448 obs. of  12 variables:
##  $ Codigo                : int  17745 17778 17797 17798 12115 12116 12117 17800 2330 2363 ...
##  $ Ciudad                : chr  "BOGOTA" "BOGOTA" "BOGOTA" "PEREIRA" ...
##  $ Departamento          : chr  "CUNDINAMARCA" "CUNDINAMARCA" "CUNDINAMARCA" "RISARALDA" ...
##  $ Barrio                : chr  "" "" "" "" ...
##  $ Direccion             : chr  "CALLE 22D NO. 120-19  -FONTIBÓN" "KR 77 H  No. 65 C - 33 SUR" "CR 70 No. 49-77" "CALLES 18 Y 19  CARRERAS 5 Y 6  SALIDA CARRERA 5  No. 18 - 43 / CARRERA 5  CALLES 18 Y 19  No. 18 - 49 / CALLE "| __truncated__ ...
##  $ Area.Terreno          : int  0 0 0 0 6400000 13162700 13400000 0 559804 302079 ...
##  $ Area.Construida       : int  0 0 0 0 70000 0 76306 0 0 0 ...
##  $ Detalle.Disponibilidad: chr  "COMERCIALIZABLE CON RESTRICCION" "COMERCIALIZABLE CON RESTRICCION" "COMERCIALIZABLE CON RESTRICCION" "COMERCIALIZABLE CON RESTRICCION" ...
##  $ Estrato               : chr  "INDUSTRIAL" "DOS" "CUATRO" "CINCO" ...
##  $ Precio                : num  274462556400000 27076410000000 14482416000000 43343535000000 10428866940000 ...
##  $ Tipo.de.Inmueble      : chr  "BODEGA" "LOTE CON CONSTRUCCION" "CASA" "CLINICA" ...
##  $ Datos.Adicionales     : chr  "" "" "" "" ...

# Recode empty strings and zeros as missing, in every column at once.
datos[datos == "" | datos == "0"] <- NA

# Number of missing cells and their share of the whole table (percent).
Total_NA <- sum(is.na(datos))
print(Total_NA)
## [1] 1623
Total_NA / prod(dim(datos)) * 100
## [1] 30.18973

# Is any row an exact duplicate of an earlier one?
anyDuplicated(datos) > 0
## [1] FALSE
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'stringr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.4.3
library(survival)


# Free-standing numeric copy of the price column. The pipelines below are fed
# `datos`, so inside them `Precio` resolves to the data-frame column.
Precio <- as.numeric(datos$Precio)

# Quartiles of the price and the interquartile range.
Q1 <- quantile(datos$Precio, probs = 0.25, na.rm = TRUE)
Q3 <- quantile(datos$Precio, probs = 0.75, na.rm = TRUE)
IQR_Valor <- Q3 - Q1

# Lower / upper fences at 1.5 * IQR beyond the quartiles.
LimF <- Q1 - IQR_Valor * 1.5
LimS <- Q3 + IQR_Valor * 1.5

# Price and department for the rows that have both values and whose price
# lies within the fences.
datos_filtrados <- datos %>%
  select(Precio, Departamento) %>%
  filter(!is.na(Precio) & !is.na(Departamento)) %>%
  filter(Precio >= LimF & Precio <= LimS)

# One box per department, ordered by the department's median price.
ggplot(
  datos_filtrados,
  aes(
    x = fct_reorder(Departamento, Precio, .fun = median),
    y = Precio,
    fill = Departamento
  )
) +
  geom_boxplot(outlier.shape = 16, outlier.colour = "blue") +
  labs(
    x = "Departamento",
    y = "Precio",
    title = "Distribución de Precios de Inmuebles Por Departamento"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
# Median listing price, used as the reference for the histogram's upper cut.
mediana <- median(datos$Precio, na.rm = TRUE)

# Positive prices up to 1.5 times the median.
Datos_Dstr <- datos %>%
  filter(!is.na(Precio), Precio > 0, Precio <= 1.5 * mediana)

# Skewness of the retained prices (e1071::skewness).
Distribucion1 <- skewness(Datos_Dstr$Precio, na.rm = TRUE)

# Histogram of the retained prices with the skewness printed in the top-right
# corner. The aesthetic refers to the column by name: `Datos_Dstr$Precio`
# inside aes() bypasses ggplot2's data masking and raises a warning.
ggplot(Datos_Dstr, aes(x = Precio)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  labs(
    title = "Distribución de Precios de Inmuebles (COP)",
    x = "Precio",
    y = "Frecuencia"
  ) +
  annotate(
    "text",
    x = Inf, y = Inf,
    label = paste0("Skewness: ", round(Distribucion1, 2)),
    hjust = 1.1, vjust = 2, size = 5, color = "#301"
  )

# Ten most frequent property types with their listing count and mean price.
datos_grafico <- datos %>%
  filter(!is.na(Tipo.de.Inmueble), !is.na(Precio)) %>%
  group_by(Tipo.de.Inmueble) %>%
  summarise(
    conteo = n(),
    precio_medio = mean(Precio, na.rm = TRUE)
  ) %>%
  arrange(desc(conteo)) %>%
  slice_head(n = 10)

# Price units per count unit. Mean prices are divided by this factor to be
# drawn on the count axis, and the secondary axis multiplies by it to label
# them back in COP, so both directions share one definition.
factor_escala <- max(datos_grafico$precio_medio) / max(datos_grafico$conteo)

# Bars: number of listings (left axis). Line and points: mean price (right axis).
ggplot(datos_grafico, aes(x = reorder(Tipo.de.Inmueble, -conteo))) +
  geom_col(aes(y = conteo), fill = "purple", alpha = 0.7) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(
    aes(y = precio_medio / factor_escala),
    group = 1, color = "#300", linewidth = 1
  ) +
  geom_point(
    aes(y = precio_medio / factor_escala),
    color = "#300", size = 3
  ) +
  scale_y_continuous(
    name = "Cantidad de Inmuebles",
    sec.axis = sec_axis(
      ~ . * factor_escala,
      name = "Precio Promedio (COP)"
    )
  ) +
  labs(
    title = "Tipos de Inmueble Más Comunes y su Precio Promedio",
    x = "Tipo de Inmueble"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Key summary statistics of the listing price: median, mean, standard
# deviation and the three quartiles. Column labels match the statistic they
# hold (`Mediana` = median, `Media` = mean).
Estadisticas_Clave <- datos %>%
  summarise(
    Mediana = median(Precio, na.rm = TRUE),
    Media = mean(Precio, na.rm = TRUE),
    SD = sd(Precio, na.rm = TRUE),
    Q1 = quantile(Precio, 0.25, na.rm = TRUE),
    Q2 = quantile(Precio, 0.50, na.rm = TRUE),
    Q3 = quantile(Precio, 0.75, na.rm = TRUE)
  )

print(Estadisticas_Clave)
##        Mediana         Media             SD           Q1           Q2
## 1 165205000000 9393874307823 38219834168449 125725000000 165205000000
##              Q3
## 1 1861440000000
# Absolute and relative frequency of each property type, most frequent first.
# The relative frequency is taken over all rows of `datos`.
Tabla_F <- datos %>%
  group_by(Tipo.de.Inmueble) %>%
  summarise(Frecuencia = n()) %>%
  mutate(Frecuencia_Relativa = Frecuencia / nrow(datos)) %>%
  arrange(desc(Frecuencia))

print(Tabla_F)
## # A tibble: 19 × 3
##    Tipo.de.Inmueble      Frecuencia Frecuencia_Relativa
##    <chr>                      <int>               <dbl>
##  1 LOCAL                        304             0.679  
##  2 LOTE VIVIENDA                 38             0.0848 
##  3 OFICINA                       28             0.0625 
##  4 CASA                          14             0.0312 
##  5 BODEGA                         8             0.0179 
##  6 FINCA                          8             0.0179 
##  7 LOTE CON CONSTRUCCION          8             0.0179 
##  8 LOTE MIXTO                     6             0.0134 
##  9 APARTAMENTO                    5             0.0112 
## 10 LOTE AGRICOLA                  5             0.0112 
## 11 EDIFICIO                       4             0.00893
## 12 LOTE                           4             0.00893
## 13 LOTE INDUSTRIAL                4             0.00893
## 14 CLINICA                        3             0.00670
## 15 LOTE COMERCIAL                 3             0.00670
## 16 EDIFICIO VIVIENDA              2             0.00446
## 17 GARAJE                         2             0.00446
## 18 HOTEL                          1             0.00223
## 19 LOTE NO URBANIZABLE            1             0.00223
options(scipen = 999)

#' Summarise listing prices by a grouping column.
#'
#' Drops rows where the grouping column or `Precio` is missing, then returns
#' one row per group with the number of listings and the mean and median
#' price, sorted by the grouping column in descending order.
#'
#' @param df A data frame with a `Precio` column.
#' @param grupo Unquoted name of the column to group by.
#' @return A tibble with columns `<grupo>`, `Inmuebles`, `precio_medio` and
#'   `precio_mediana`.
resumir_precio_por <- function(df, grupo) {
  df %>%
    filter(!is.na({{ grupo }}), !is.na(Precio)) %>%
    group_by({{ grupo }}) %>%
    summarise(
      Inmuebles = n(),
      precio_medio = mean(Precio, na.rm = TRUE),
      precio_mediana = median(Precio, na.rm = TRUE)
    ) %>%
    arrange(desc({{ grupo }}))
}

# Price summary per stratum.
Promedio_P_Estrato <- resumir_precio_por(datos, Estrato)

print(Promedio_P_Estrato)
## # A tibble: 9 × 4
##   Estrato    Inmuebles precio_medio precio_mediana
##   <chr>          <int>        <dbl>          <dbl>
## 1 UNO                1      9.82e11        9.82e11
## 2 TRES              16      3.87e13        7.01e12
## 3 SEIS              18      7.06e12        2.13e12
## 4 RURAL             36      3.08e13        1.54e13
## 5 INDUSTRIAL         3      1.22e14        7.09e13
## 6 DOS               32      1.30e13        3.96e11
## 7 CUATRO            13      7.84e12        1.48e12
## 8 COMERCIAL        320      4.16e12        1.27e11
## 9 CINCO              8      1.60e13        9.59e12

# Price summary per city.
Promedio_P_Ciudad <- resumir_precio_por(datos, Ciudad)

print(Promedio_P_Ciudad)
## # A tibble: 48 × 4
##    Ciudad                 Inmuebles precio_medio precio_mediana
##    <chr>                      <int>        <dbl>          <dbl>
##  1 YUMBO                          1      6.88e12        6.88e12
##  2 VILLAVICENCIO                285      1.76e11        1.26e11
##  3 VILLA RICA                     1      7.09e13        7.09e13
##  4 TURBO                          1      1.70e13        1.70e13
##  5 TIBU                           1      5.81e12        5.81e12
##  6 TENJO                          1      6.37e12        6.37e12
##  7 TARAZA                         1      8.08e11        8.08e11
##  8 SOGAMOSO                       1      1.47e14        1.47e14
##  9 SOATA                          1      6.83e12        6.83e12
## 10 SANTANDER DE QUILICHAO         2      7.25e12        7.25e12
## # ℹ 38 more rows
library(dplyr)
library(sf)
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.4.3
library(stringr)
library(ggmap)
## Warning: package 'ggmap' was built under R version 4.4.3
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
##   Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service>
##   OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(tidygeocoder)
## Warning: package 'tidygeocoder' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'tidygeocoder'
## The following object is masked from 'package:ggmap':
## 
##     geocode
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Cargando paquete requerido: viridisLite
# Read the department polygons and keep only the department name (renamed to
# `Departamento` so it matches the CSV column) and the geometry.
base_mapa <- st_read("C:/Users/ruber/OneDrive/Documentos/Taller 2/Colombia Datos") %>%
  select(DPTO_CNMBR, geometry) %>%
  rename(Departamento = DPTO_CNMBR)
## Reading layer `COLOMBIA' from data source 
##   `C:\Users\ruber\OneDrive\Documentos\Taller 2\Colombia Datos' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 33 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -81.73562 ymin: -4.229406 xmax: -66.84722 ymax: 13.39473
## Geodetic CRS:  MAGNA-SIRGAS

# Total listed price per department, divided by 1e12, largest first.
inmuebles_P <- datos %>%
  group_by(Departamento) %>%
  summarise(
    Precio_Bill = sum(Precio, na.rm = TRUE) / 1e12
  ) %>%
  arrange(desc(Precio_Bill))

# Department names on each side of the upcoming join. The shapefile column
# is `Departamento` after the rename above; `DPTO_CNMBR` no longer exists.
deptos_shapefile <- unique(base_mapa$Departamento)

deptos_csv <- unique(inmuebles_P$Departamento)

st_geometry(base_mapa)
## Geometry set for 33 features 
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -81.73562 ymin: -4.229406 xmax: -66.84722 ymax: 13.39473
## Geodetic CRS:  MAGNA-SIRGAS
## First 5 geometries:
## MULTIPOLYGON (((-76.41355 8.873829, -76.40465 8...
## MULTIPOLYGON (((-74.84946 11.09778, -74.84938 1...
## MULTIPOLYGON (((-74.07059 4.828562, -74.07036 4...
## MULTIPOLYGON (((-76.17318 9.387846, -76.17287 9...
## MULTIPOLYGON (((-72.17368 7.053077, -72.17277 7...
names(base_mapa)
## [1] "Departamento" "geometry"

# Plain base map of the department polygons.
ggplot(base_mapa) +
  geom_sf(fill = "red", color = "white") +
  theme_void() +
  labs(title = "Mapa de Colombia")
# `inmuebles_P` (total price per department, divided by 1e12) is already
# computed earlier in the script from the same, unchanged `datos`, so it is
# not rebuilt here.

# Attach the departmental totals to the polygons, classify them into price
# ranges and reproject to EPSG:4326.
Mapa_Final <- base_mapa %>%
  # Explicit key: no reliance on (or message about) automatic key detection.
  left_join(inmuebles_P, by = "Departamento") %>%
  mutate(
    # Departments without any matching listing get a total of 0.
    Precio_Bill = coalesce(Precio_Bill, 0),
    Rango_Precio = case_when(
      Precio_Bill == 0 ~ "Sin datos (0 B)",
      Precio_Bill > 0 & Precio_Bill <= 100 ~ "Bajo (0-100 B)",
      Precio_Bill > 100 & Precio_Bill <= 500 ~ "Medio (100-500 B)",
      Precio_Bill > 500 ~ "Alto (>500 B)",
      TRUE ~ NA_character_
    ),
    # Fixed level order, from "no data" up to the highest range.
    Rango_Precio = factor(
      Rango_Precio,
      levels = c(
        "Sin datos (0 B)",
        "Bajo (0-100 B)",
        "Medio (100-500 B)",
        "Alto (>500 B)"
      )
    )
  ) %>%
  st_transform(crs = 4326)

# Continuous colour scale over the departmental totals. The ramp runs from
# light to dark green, the object's name notwithstanding.
paleta_roja <- colorNumeric(
  domain = Mapa_Final$Precio_Bill,
  palette = c(
    "#C6F4D6", "#8BC34A", "#3E8E41", "#2E865F",
    "#228B22", "#1A6D1A", "#145214", "#032B03"
  )
)

# One HTML hover label per department: name plus total rounded to 1 decimal.
etiquetas <- lapply(
  sprintf(
    "<strong>%s</strong><br/>Precio: <b>%s billones</b>",
    Mapa_Final$Departamento,
    round(Mapa_Final$Precio_Bill, 1)
  ),
  htmltools::HTML
)

# Interactive choropleth: polygons filled by total price, highlighted on
# hover, with a legend for the colour scale.
leaflet(Mapa_Final) |>
  addProviderTiles(providers$CartoDB.Positron) |>
  addPolygons(
    label = etiquetas,
    labelOptions = labelOptions(
      direction = "auto",
      textsize = "15px",
      style = list("font-weight" = "normal", padding = "3px 8px")
    ),
    highlightOptions = highlightOptions(
      color = "#00FF00",
      weight = 3,
      fillOpacity = 1,
      sendToBack = TRUE,
      bringToFront = TRUE
    ),
    color = "white",
    weight = 1,
    fillColor = ~paleta_roja(Precio_Bill),
    fillOpacity = 0.8,
    smoothFactor = 0.5
  ) |>
  addLegend(
    title = "Precio (Billones COP)",
    pal = paleta_roja,
    values = ~Precio_Bill,
    labFormat = labelFormat(suffix = "B"),
    opacity = 0.9
  )
library(ggplot2)
library(dplyr)

# Listings with both a price and a department, plus the price divided by 1e12.
datos2 <- datos %>%
  filter(!is.na(Precio) & !is.na(Departamento)) %>%
  mutate(Precio_Bill = Precio / 1e12)

# Box plot per department (ordered by median) with the individual listings
# jittered on top of the boxes.
ggplot(
  data = datos2,
  mapping = aes(
    x = reorder(Departamento, Precio_Bill, FUN = median),
    y = Precio_Bill,
    fill = Departamento
  )
) +
  geom_boxplot(alpha = 0.8, width = 0.6, outlier.shape = 1) +
  geom_jitter(color = "#333333", alpha = 0.3, size = 1.5, width = 0.15) +
  scale_fill_viridis_d(option = "plasma", begin = 0.2, end = 0.8) +
  labs(
    x = "",
    y = "Precio (Billones COP)",
    title = "Distribucion de Precios de Inmuebles por Departamento",
    subtitle = "Valores en Billones de COP"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position = "none"
  )

# Keep listings that report a positive built area or a positive land area.
# `Area_Util` is the built area when it is positive, otherwise the land area;
# `precio_m2` is the price divided by that area.
datos_filtrados <- datos %>%
  filter(
    coalesce(Area.Construida > 0, FALSE) | coalesce(Area.Terreno > 0, FALSE)
  ) %>%
  mutate(
    Area_Util = if_else(
      coalesce(Area.Construida > 0, FALSE),
      Area.Construida,
      Area.Terreno
    ),
    precio_m2 = Precio / Area_Util,
    Barrio_Depto1 = paste0(Barrio, " (", Departamento, ")")
  )

# Per neighbourhood label and stratum: number of listings plus the median and
# mean price per unit of area.
analisis_barrios <- datos_filtrados %>%
  filter(!is.na(Barrio)) %>%  # exclude listings with an unknown neighbourhood
  group_by(Barrio_Depto1, Estrato) %>%
  summarise(
    n_inmuebles = n(),
    precio_mediano = median(precio_m2, na.rm = TRUE),
    precio_promedio = mean(precio_m2, na.rm = TRUE),
    .groups = "drop"
  )

# Horizontal bars of the median price per area unit (divided by 1e6),
# ordered by that median and coloured by stratum.
ggplot(
  analisis_barrios,
  aes(
    x = reorder(Barrio_Depto1, precio_mediano),
    y = precio_mediano / 1e6,
    fill = Estrato
  )
) +
  geom_col(position = "dodge") +
  coord_flip() +
  scale_fill_brewer(palette = "Reds") +
  labs(
    title = "Precio mediano por m^2 por Barrio y Estrato",
    x = "",
    y = "Precio por m^2 (millones COP)",
    fill = "Estrato"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.y = element_text(size = 8)
  )

library(dplyr)

#
# Price and the two area measures, with non-positive areas set to missing;
# rows in which all three values are missing are dropped.
datos_correlacion <- datos %>%
  mutate(
    across(c(Area.Construida, Area.Terreno), ~ ifelse(. <= 0, NA, .))
  ) %>%
  select(Precio, Area.Construida, Area.Terreno) %>%
  filter(rowSums(!is.na(.)) > 0)

# Pairwise-complete correlation matrix. It stays NULL when there are too few
# rows (10 or fewer) or when cor() fails, so the plotting step below never
# touches an undefined object.
matriz_cor <- NULL
if (nrow(datos_correlacion) > 10) {
  matriz_cor <- tryCatch(
    cor(datos_correlacion, use = "pairwise.complete.obs"),
    error = function(e) {
      message("Error: ", conditionMessage(e))
      NULL
    }
  )
}

# Draw the correlogram exactly once, and only for a complete matrix.
if (is.null(matriz_cor)) {
  message("No se pudo calcular la matriz de correlacion")
} else if (anyNA(matriz_cor)) {
  message("Advertencia: Algunas correlaciones no pudieron calcularse")
  print(matriz_cor)
} else {
  corrplot::corrplot(matriz_cor, method = "color")
}
# Listings with a positive price, a stratum and a property type, plus a coarse
# `Tipo` category derived from the property-type text (first match wins).
datos_plot <- datos %>%
  filter(!is.na(Precio) & !is.na(Estrato) & !is.na(Tipo.de.Inmueble)) %>%
  filter(Precio > 0) %>%
  mutate(
    Tipo = case_when(
      grepl("APART|APTO|APARTAMENTO", Tipo.de.Inmueble, ignore.case = TRUE) ~
        "Apartamento",
      grepl("CASA", Tipo.de.Inmueble, ignore.case = TRUE) ~
        "Casa",
      grepl("LOTE|TERRENO", Tipo.de.Inmueble, ignore.case = TRUE) ~
        "Terreno",
      TRUE ~
        "Otros"
    )
  )

# Jittered scatter of price (divided by 1e6, log10 axis) against stratum,
# one panel per `Tipo`.
ggplot(
  datos_plot,
  aes(x = Estrato, y = Precio / 1e6, color = Tipo)
) +
  geom_point(position = position_jitter(width = 0.2), alpha = 0.6) +
  facet_wrap(~ Tipo, ncol = 2) +
  scale_y_log10(labels = scales::comma) +
  labs(
    x = "Estrato",
    y = "Precio (millones COP)",
    title = "Relacion Precio-Estrato por Tipo de Inmueble"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(
      size = 10,
      angle = 45,
      hjust = 1,
      margin = margin(t = 5)
    )
  )

# Report whether "T.html" exists, resolved against the current working
# directory (presumably the rendered report -- confirm).
file.exists("T.html")
## [1] TRUE