# Install stringi if not installed
if (!requireNamespace("stringi", quietly = TRUE)) {
  install.packages("stringi")
}

# Load the stringi package
library(stringi)

# Assuming 'datos' is your dataset
datos$ciudad <- stri_replace_all_fixed(datos$ciudad, " ", "")

These are the maps of the cities with the data points

V_suelo is equal to the price per square meter in USD.

mapview(bogota,  zcol = "v_suelo")
mapview(Medellin, zcol = "v_suelo")
mapview(Cordoba, zcol = "v_suelo")
mapview(Curitiba, zcol = "v_suelo")
mapview(Belo_Horizonte, zcol = "v_suelo")
mapview(La_plata, zcol = "v_suelo")
mapview(Buenos_Aires, zcol = "v_suelo")
mapview(Loja, zcol = "v_suelo")
mapview(Cochabamba, zcol = "v_suelo")

Descriptive statistics of each dataset.

We are first going to commence and see what are the different characteristics of each dataset in every city, to test for robustness and also to chech if there are any outliers we have to be careful about.

The first characteristic we are going to analyze is the distribution per year.

filtered_cities <- bind_rows(
  bogota, Medellin, Cordoba, Curitiba,
  Belo_Horizonte, La_plata, Fortaleza,
  Buenos_Aires, Loja, Cochabamba
)
# Assuming your data frame is named filtered_data
grouped_table <- filtered_cities %>%
  group_by(ciudad, anio) %>%
  summarize(count = n()) 
## `summarise()` has grouped output by 'ciudad'. You can override using the
## `.groups` argument.
pivoted_table <- grouped_table %>%
  pivot_wider(names_from = "anio", values_from = "count", values_fill = 0)
# Print or view the resulting transposed table
print(grouped_table)
## Simple feature collection with 59 features and 3 fields
## Geometry type: MULTIPOINT
## Dimension:     XY
## Bounding box:  xmin: -8822408 ymin: -4190545 xmax: -4275832 ymax: 710716.2
## Projected CRS: WGS 84 / Pseudo-Mercator
## # A tibble: 59 × 4
## # Groups:   ciudad [10]
##    ciudad          anio count                                               geom
##    <chr>          <dbl> <int>                                   <MULTIPOINT [m]>
##  1 BELO HORIZONTE  2016     8 ((-4898219 -2257114), (-4897645 -2260735), (-4897…
##  2 BELO HORIZONTE  2018   626 ((-4903848 -2271153), (-4903506 -2271711), (-4903…
##  3 BELO HORIZONTE  2019   784 ((-4904377 -2270111), (-4903652 -2272585), (-4903…
##  4 BELO HORIZONTE  2020   846 ((-4904240 -2270237), (-4903998 -2270492), (-4903…
##  5 BELO HORIZONTE  2021  1073 ((-4904232 -2270147), (-4903564 -2272448), (-4903…
##  6 BELO HORIZONTE  2022   756 ((-4902996 -2272180), (-4902984 -2272806), (-4902…
##  7 BOGOTA          2016     7 ((-8253593 525729.1), (-8247995 529760), (-824420…
##  8 BOGOTA          2017    19 ((-8256341 514867.9), (-8253965 517609.5), (-8251…
##  9 BOGOTA          2019  1234 ((-8261331 514014.4), (-8261260 513856.5), (-8261…
## 10 BOGOTA          2020  2543 ((-8261498 514263.3), (-8261461 514257.3), (-8261…
## # ℹ 49 more rows
grouped_data <- pivoted_table %>%
  group_by(ciudad) %>%
  summarize(across(starts_with("20"), sum, na.rm = TRUE))
## Warning: There was 1 warning in `summarize()`.
## ℹ In argument: `across(starts_with("20"), sum, na.rm = TRUE)`.
## ℹ In group 1: `ciudad = "BELO HORIZONTE"`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
grouped_data <- grouped_data %>%
  select(ciudad, `2016`, `2017`, starts_with("20"), geom)
head(grouped_data)
## Simple feature collection with 6 features and 8 fields
## Geometry type: MULTIPOINT
## Dimension:     XY
## Bounding box:  xmin: -8261550 ymin: -4122479 xmax: -4883509 ymax: 535208.4
## Projected CRS: WGS 84 / Pseudo-Mercator
## # A tibble: 6 × 9
##   ciudad         `2016` `2017` `2018` `2019` `2020` `2021` `2022`
##   <chr>           <int>  <int>  <int>  <int>  <int>  <int>  <int>
## 1 BELO HORIZONTE      8      0    626    784    846   1073    756
## 2 BOGOTA              7     19      0   1234   2543   9827  11387
## 3 BUENOS AIRES       12      0    439    650    567    129    177
## 4 COCHABAMBA         26     23     82    383    125     80    293
## 5 CORDOBA            44     92   2701    340    899    913    733
## 6 CURITIBA          251      0      0    319   1281    514   1474
## # ℹ 1 more variable: geom <MULTIPOINT [m]>

I dont like this data, not at all, so we are going to filter the cities that have more than 200 observations.

city_counts_ciudad <- datos %>%
    group_by(pais , ciudad ) %>%
    summarize(count = n())
## `summarise()` has grouped output by 'pais'. You can override using the
## `.groups` argument.
filtered_city_counts <- city_counts_ciudad %>%
  filter(count >= 200)

We are just going to keep the names and filter the dataset with that.

cities = select(filtered_city_counts, 2)


cities =  st_drop_geometry(cities)
cities_values <- datos %>%
  filter(ciudad %in% cities$ciudad)

We run the same analyses as before to find out the best cities.

# Assuming your data frame is named filtered_data
grouped_table <- cities_values %>%
  group_by(ciudad, anio) %>%
  summarize(count = n()) 
## `summarise()` has grouped output by 'ciudad'. You can override using the
## `.groups` argument.
pivoted_table <- grouped_table %>%
  pivot_wider(names_from = "anio", values_from = "count", values_fill = 0)
# Print or view the resulting transposed table
print(grouped_table)
## Simple feature collection with 378 features and 3 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: -11965140 ymin: -4536440 xmax: -4122947 ymax: 2860694
## Projected CRS: WGS 84 / Pseudo-Mercator
## # A tibble: 378 × 4
## # Groups:   ciudad [99]
##    ciudad          anio count                                               geom
##    <chr>          <dbl> <int>                                   <MULTIPOINT [m]>
##  1 AGUASCALIENTES  2016     4 ((-11393322 2494924), (-11388337 2493122), (-1138…
##  2 AGUASCALIENTES  2018   114 ((-11394103 2493283), (-11392623 2494978), (-1139…
##  3 AGUASCALIENTES  2019   281 ((-11394028 2493625), (-11394022 2493640), (-1139…
##  4 AGUASCALIENTES  2020   236 ((-11392851 2497032), (-11392600 2493889), (-1139…
##  5 ALAJUELITA      2018   223 ((-9364883 1105744), (-9364633 1106111), (-936454…
##  6 ALAJUELITA      2019     3 ((-9364210 1110637), (-9364179 1110661), (-936417…
##  7 ALTAGRACIA      2018   126 ((-7179088 -3717121), (-7179061 -3717324), (-7178…
##  8 ALTAGRACIA      2019    44 ((-7178748 -3717309), (-7178470 -3717162), (-7178…
##  9 ALTAGRACIA      2020   120 ((-7179094 -3717080), (-7179030 -3717352), (-7178…
## 10 ALTAGRACIA      2021    79 ((-7178458 -3716419), (-7178427 -3716297), (-7178…
## # ℹ 368 more rows
grouped_data <- pivoted_table %>%
  group_by(ciudad) %>%
  summarize(across(starts_with("20"), sum, na.rm = TRUE))
grouped_data <- grouped_data %>%
  select(ciudad, `2016`, `2017`, starts_with("20"), geom)

grouped_data = st_drop_geometry(grouped_data)

grouped_data <- grouped_data %>%
  mutate(total = rowSums(select(., `2016`:`2022`), na.rm = TRUE))

grouped_data <- grouped_data %>%
  arrange(desc(total))
print(grouped_data)
## # A tibble: 99 × 9
##    ciudad        `2016` `2017` `2018` `2019` `2020` `2021` `2022` total
##    <chr>          <int>  <int>  <int>  <int>  <int>  <int>  <int> <dbl>
##  1 BOGOTA             7     19      0   1234   2543   9827  11387 25017
##  2 MEDELLIN          58      0      0   3814   2226   1026    903  8027
##  3 CORDOBA           44     92   2701    340    899    913    733  5722
##  4 LAPLATA           58      0      0    913    383   1995   1406  4755
##  5 BELOHORIZONTE      8      0    626    784    846   1073    756  4093
##  6 CURITIBA         251      0      0    319   1281    514   1474  3839
##  7 FORTALEZA          6      0    365    547    400    715    731  2764
##  8 BUENOSAIRES       12      0    655    650    567    129    177  2190
##  9 SANSALVADOR        8    246    312    192    157      0    312  1227
## 10 LOJA               3     62    425    419    227      0     40  1176
## # ℹ 89 more rows
result_df <- result_df %>%
  mutate(levenshtein_difference = adist(ciudad, closest_word))

So, now we are going to start subsetting our dataset to check for quality filters.

First we are going to remove cities that do not have points in 2019,2020,2021

filtered_data <- grouped_data %>%
  filter(`2019` > 0 & `2020` > 0 & `2021` > 0)
filtered_data <- filtered_data %>%
  mutate(
    Mean = apply(select(., `2016`:`2022`), 1, mean, na.rm = TRUE),
    sd = apply(select(., `2016`:`2022`), 1, sd, na.rm = TRUE),
    CV = sd / Mean * 100
  )

So now we have the different CV for all the dataset, we can calculate then, the mean and SD of the CV to know the outliers.

overall_sd_cv <- sd(filtered_data$CV, na.rm = TRUE)
overall_mean_cv <- mean(filtered_data$CV, na.rm = TRUE)
lower_bound <- overall_mean_cv - 3 * overall_sd_cv
upper_bound <- overall_mean_cv + 3 * overall_sd_cv
cat("Overall Mean of CV:", overall_mean_cv, "\n")
## Overall Mean of CV: 89.38371
cat("Overall SD of CV:", overall_sd_cv, "\n")
## Overall SD of CV: 23.01718
cat("Lower Bound (3 SD):", lower_bound, "\n")
## Lower Bound (3 SD): 20.33217
cat("Upper Bound (3 SD):", upper_bound, "\n")
## Upper Bound (3 SD): 158.4352

Now we are going to see the spatial distribution for some maps. What we are trying to do is to understand if there is spatial autocorrelation in the distribuiton of the datasets.

So, what we are going to do is to plot for each year the data, then we are going to divide the boundaires in hexagons, and count how many datapoints there are in each dataset, then we are going to do a colour map and a Moran diagram to see it’s distribution.

First, we are going to start with bogotá since it has the most datapoints.

First we are going to download the area of Bogota with OSM.

ciudad = "Bogota D.C, Colombia"
ciudad_bb =  getbb(ciudad, format_out = "sf_polygon") 

ciudad_bb = ciudad_bb$polygon
ciudad_bb_ <- ciudad_bb[1, ]
mapview(ciudad_bb_)
library(sf)
library(dplyr)

nc = ciudad_bb_

g <- st_make_grid(nc,cellsize = c(0.02, 0.02) ,square = FALSE, crs = 4326) 
  
  ciudad_grid = st_intersection(ciudad_bb_, g)
  
  mapview(ciudad_grid)
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
## Warning in split.default(ls$popup, f = as.character(sf::st_dimension(data))):
## largo de datos no es múltiplo de la variable de separación
print(st_crs(ciudad_grid))
## Coordinate Reference System:
##   User input: EPSG:4326 
##   wkt:
## GEOGCRS["WGS 84",
##     ENSEMBLE["World Geodetic System 1984 ensemble",
##         MEMBER["World Geodetic System 1984 (Transit)"],
##         MEMBER["World Geodetic System 1984 (G730)"],
##         MEMBER["World Geodetic System 1984 (G873)"],
##         MEMBER["World Geodetic System 1984 (G1150)"],
##         MEMBER["World Geodetic System 1984 (G1674)"],
##         MEMBER["World Geodetic System 1984 (G1762)"],
##         ELLIPSOID["WGS 84",6378137,298.257223563,
##             LENGTHUNIT["metre",1]],
##         ENSEMBLEACCURACY[2.0]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["geodetic latitude (Lat)",north,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["geodetic longitude (Lon)",east,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]],
##     USAGE[
##         SCOPE["Horizontal component of 3D system."],
##         AREA["World."],
##         BBOX[-90,-180,90,180]],
##     ID["EPSG",4326]]
print(st_crs(bogota))
## Coordinate Reference System:
##   User input: WGS 84 / Pseudo-Mercator 
##   wkt:
## PROJCRS["WGS 84 / Pseudo-Mercator",
##     BASEGEOGCRS["WGS 84",
##         ENSEMBLE["World Geodetic System 1984 ensemble",
##             MEMBER["World Geodetic System 1984 (Transit)"],
##             MEMBER["World Geodetic System 1984 (G730)"],
##             MEMBER["World Geodetic System 1984 (G873)"],
##             MEMBER["World Geodetic System 1984 (G1150)"],
##             MEMBER["World Geodetic System 1984 (G1674)"],
##             MEMBER["World Geodetic System 1984 (G1762)"],
##             MEMBER["World Geodetic System 1984 (G2139)"],
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]],
##             ENSEMBLEACCURACY[2.0]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4326]],
##     CONVERSION["Popular Visualisation Pseudo-Mercator",
##         METHOD["Popular Visualisation Pseudo Mercator",
##             ID["EPSG",1024]],
##         PARAMETER["Latitude of natural origin",0,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",0,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["False easting",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["Web mapping and visualisation."],
##         AREA["World between 85.06°S and 85.06°N."],
##         BBOX[-85.06,-180,85.06,180]],
##     ID["EPSG",3857]]
bogota_1 <- st_transform(bogota, crs = st_crs(ciudad_grid))
mapview(bogota_1[1:100, ])
ciudad_grid <- ciudad_grid %>%
  mutate(row_number = row_number())
heterogeinity = st_join( bogota_1, ciudad_grid)
heterogeinity_summary = heterogeinity %>% 
    group_by(row_number) %>% 
    summarise(cantidad = n())
heterogeinity_summary <- heterogeinity_summary %>%
  st_set_geometry(NULL)
heterogeinity_summary <- ciudad_grid %>%
  left_join(heterogeinity_summary, by="row_number")
mapview(heterogeinity_summary, zcol ="cantidad") + mapview(bogota)
mapview(heterogeinity_summary, zcol ="cantidad")
bogota_1 <- st_transform(bogota, crs = st_crs(ciudad_grid))

str(bogota_1)
## Classes 'sf' and 'data.frame':   25017 obs. of  5 variables:
##  $ pais   : chr  "COLOMBIA" "COLOMBIA" "COLOMBIA" "COLOMBIA" ...
##  $ ciudad : chr  "BOGOTA" "BOGOTA" "BOGOTA" "BOGOTA" ...
##  $ v_suelo: int  208 210 300 373 582 265 282 795 1941 462 ...
##  $ anio   : num  2016 2016 2016 2016 2016 ...
##  $ geom   :sfc_POINT of length 25017; first list element:  'XY' num  -74.06 4.75
##  - attr(*, "sf_column")= chr "geom"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA
##   ..- attr(*, "names")= chr [1:4] "pais" "ciudad" "v_suelo" "anio"
# Extract coordinates into separate lon and lat columns without losing other variables
bogota_1 <- bogota_1 %>%
  st_coordinates() %>%
  as.data.frame() %>%
  setNames(c("lon", "lat")) %>%
  cbind(bogota_1, .)

# Display the resulting data frame with separate lon and lat columns along with other variables
print(bogota_1)
## Simple feature collection with 25017 features and 6 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.21477 ymin: 4.469681 xmax: -74.0141 ymax: 4.802227
## Geodetic CRS:  WGS 84
## First 10 features:
##        pais ciudad v_suelo anio       lon      lat                       geom
## 1  COLOMBIA BOGOTA     208 2016 -74.05640 4.750277  POINT (-74.0564 4.750277)
## 2  COLOMBIA BOGOTA     210 2016 -74.05632 4.750635 POINT (-74.05632 4.750635)
## 3  COLOMBIA BOGOTA     300 2016 -74.09300 4.753452   POINT (-74.093 4.753452)
## 4  COLOMBIA BOGOTA     373 2016 -74.05898 4.750523 POINT (-74.05898 4.750523)
## 5  COLOMBIA BOGOTA     582 2016 -74.04484 4.756264 POINT (-74.04484 4.756264)
## 6  COLOMBIA BOGOTA     265 2016 -74.03893 4.758331 POINT (-74.03893 4.758331)
## 7  COLOMBIA BOGOTA     282 2016 -74.14328 4.717366 POINT (-74.14328 4.717366)
## 8  COLOMBIA BOGOTA     795 2017 -74.09999 4.692578 POINT (-74.09999 4.692578)
## 9  COLOMBIA BOGOTA    1941 2017 -74.06428 4.640462 POINT (-74.06428 4.640462)
## 10 COLOMBIA BOGOTA     462 2017 -74.16797 4.620122 POINT (-74.16797 4.620122)
ggplot(st_drop_geometry(bogota_1), aes(x = lon, y = lat)) + 
  coord_equal() + 
  xlab('Longitude') + 
  ylab('Latitude') + 
  stat_density2d(aes(fill = ..level..), alpha = .5,
                 geom = "polygon", data = st_drop_geometry(bogota_1)) + 
  scale_fill_viridis_c() + 
  theme(legend.position = 'none')
## Warning: The dot-dot notation (`..level..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(level)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Load necessary libraries
library(sf)
library(ggplot2)
library(viridis)
## Loading required package: viridisLite
# Assuming you have an 'sf' object named bogota_1 with point geometries and 'anio' column representing years

# Check the structure of the data
print(bogota_1)
## Simple feature collection with 25017 features and 6 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.21477 ymin: 4.469681 xmax: -74.0141 ymax: 4.802227
## Geodetic CRS:  WGS 84
## First 10 features:
##        pais ciudad v_suelo anio       lon      lat                       geom
## 1  COLOMBIA BOGOTA     208 2016 -74.05640 4.750277  POINT (-74.0564 4.750277)
## 2  COLOMBIA BOGOTA     210 2016 -74.05632 4.750635 POINT (-74.05632 4.750635)
## 3  COLOMBIA BOGOTA     300 2016 -74.09300 4.753452   POINT (-74.093 4.753452)
## 4  COLOMBIA BOGOTA     373 2016 -74.05898 4.750523 POINT (-74.05898 4.750523)
## 5  COLOMBIA BOGOTA     582 2016 -74.04484 4.756264 POINT (-74.04484 4.756264)
## 6  COLOMBIA BOGOTA     265 2016 -74.03893 4.758331 POINT (-74.03893 4.758331)
## 7  COLOMBIA BOGOTA     282 2016 -74.14328 4.717366 POINT (-74.14328 4.717366)
## 8  COLOMBIA BOGOTA     795 2017 -74.09999 4.692578 POINT (-74.09999 4.692578)
## 9  COLOMBIA BOGOTA    1941 2017 -74.06428 4.640462 POINT (-74.06428 4.640462)
## 10 COLOMBIA BOGOTA     462 2017 -74.16797 4.620122 POINT (-74.16797 4.620122)
# Get unique years from the 'anio' column
unique_years <- unique(bogota_1$anio)

# Create a list to store ggplot objects for each year
plots <- list()

# Iterate through each unique year
for (year in unique_years) {
  # Subset data for the current year
  subset_data <- bogota_1 %>%
    filter(anio == year)
  
  # Create ggplot for current year
  plots[[as.character(year)]] <- ggplot(st_drop_geometry(subset_data), aes(x = lon, y = lat)) +
    coord_equal() +
    xlab('Longitude') +
    ylab('Latitude') +
    stat_density_2d(aes(fill = ..level..), alpha = .5, geom = "polygon") +
    scale_fill_viridis_c() +
    theme(legend.position = 'none') +
    ggtitle(paste("Density Plot for Year", year))
}

# Show or save each plot
for (i in 1:length(plots)) {
  print(plots[[i]])
  # Save the plots if needed: ggsave(filename = paste("density_plot_", unique_years[i], ".png"), plot = plots[[i]])
}