packages = c('olsrr', 'corrplot', 'ggpubr', 'sf', 'spdep', 'GWmodel', 'tmap', 'tidyverse','geobr','ggplot2','dplyr','rio','data.table')
for (p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p,character.only = T)
}
## Loading required package: olsrr
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
## Loading required package: corrplot
## corrplot 0.84 loaded
## Loading required package: ggpubr
## Loading required package: ggplot2
## Loading required package: sf
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
## Loading required package: spdep
## Loading required package: sp
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: GWmodel
## Loading required package: maptools
## Checking rgeos availability: TRUE
## Loading required package: robustbase
## Loading required package: Rcpp
## Loading required package: spatialreg
## Loading required package: Matrix
## Registered S3 methods overwritten by 'spatialreg':
##   method                   from 
##   residuals.stsls          spdep
##   deviance.stsls           spdep
##   coef.stsls               spdep
##   print.stsls              spdep
##   summary.stsls            spdep
##   print.summary.stsls      spdep
##   residuals.gmsar          spdep
##   deviance.gmsar           spdep
##   coef.gmsar               spdep
##   fitted.gmsar             spdep
##   print.gmsar              spdep
##   summary.gmsar            spdep
##   print.summary.gmsar      spdep
##   print.lagmess            spdep
##   summary.lagmess          spdep
##   print.summary.lagmess    spdep
##   residuals.lagmess        spdep
##   deviance.lagmess         spdep
##   coef.lagmess             spdep
##   fitted.lagmess           spdep
##   logLik.lagmess           spdep
##   fitted.SFResult          spdep
##   print.SFResult           spdep
##   fitted.ME_res            spdep
##   print.ME_res             spdep
##   print.lagImpact          spdep
##   plot.lagImpact           spdep
##   summary.lagImpact        spdep
##   HPDinterval.lagImpact    spdep
##   print.summary.lagImpact  spdep
##   print.sarlm              spdep
##   summary.sarlm            spdep
##   residuals.sarlm          spdep
##   deviance.sarlm           spdep
##   coef.sarlm               spdep
##   vcov.sarlm               spdep
##   fitted.sarlm             spdep
##   logLik.sarlm             spdep
##   anova.sarlm              spdep
##   predict.sarlm            spdep
##   print.summary.sarlm      spdep
##   print.sarlm.pred         spdep
##   as.data.frame.sarlm.pred spdep
##   residuals.spautolm       spdep
##   deviance.spautolm        spdep
##   coef.spautolm            spdep
##   fitted.spautolm          spdep
##   print.spautolm           spdep
##   summary.spautolm         spdep
##   logLik.spautolm          spdep
##   print.summary.spautolm   spdep
##   print.WXImpact           spdep
##   summary.WXImpact         spdep
##   print.summary.WXImpact   spdep
##   predict.SLX              spdep
## 
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
## 
##     anova.sarlm, as_dgRMatrix_listw, as_dsCMatrix_I, as_dsCMatrix_IrW,
##     as_dsTMatrix_listw, as.spam.listw, bptest.sarlm, can.be.simmed,
##     cheb_setup, coef.gmsar, coef.sarlm, coef.spautolm, coef.stsls,
##     create_WX, deviance.gmsar, deviance.sarlm, deviance.spautolm,
##     deviance.stsls, do_ldet, eigen_pre_setup, eigen_setup, eigenw,
##     errorsarlm, fitted.gmsar, fitted.ME_res, fitted.sarlm,
##     fitted.SFResult, fitted.spautolm, get.ClusterOption,
##     get.coresOption, get.mcOption, get.VerboseOption,
##     get.ZeroPolicyOption, GMargminImage, GMerrorsar, griffith_sone,
##     gstsls, Hausman.test, HPDinterval.lagImpact, impacts, intImpacts,
##     Jacobian_W, jacobianSetup, l_max, lagmess, lagsarlm, lextrB,
##     lextrS, lextrW, lmSLX, logLik.sarlm, logLik.spautolm, LR.sarlm,
##     LR1.sarlm, LR1.spautolm, LU_prepermutate_setup, LU_setup,
##     Matrix_J_setup, Matrix_setup, mcdet_setup, MCMCsamp, ME, mom_calc,
##     mom_calc_int2, moments_setup, powerWeights, predict.sarlm,
##     predict.SLX, print.gmsar, print.ME_res, print.sarlm,
##     print.sarlm.pred, print.SFResult, print.spautolm, print.stsls,
##     print.summary.gmsar, print.summary.sarlm, print.summary.spautolm,
##     print.summary.stsls, residuals.gmsar, residuals.sarlm,
##     residuals.spautolm, residuals.stsls, sacsarlm, SE_classic_setup,
##     SE_interp_setup, SE_whichMin_setup, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption, similar.listw, spam_setup, spam_update_setup,
##     SpatialFiltering, spautolm, spBreg_err, spBreg_lag, spBreg_sac,
##     stsls, subgraph_eigenw, summary.gmsar, summary.sarlm,
##     summary.spautolm, summary.stsls, trW, vcov.sarlm, Wald1.sarlm
## Welcome to GWmodel version 2.1-4.
## The new version of GWmodel 2.1-4 now is readyLoading required package: tmap
## Loading required package: tidyverse
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ✓ purrr   0.3.3     
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x tidyr::unpack() masks Matrix::unpack()
## Loading required package: geobr
## Loading required package: rio
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
install_formats()
## [1] TRUE

Data Import

muni_sf <- st_read(dsn = "data/geospatial", layer = "muni_sf")
## Reading layer `muni_sf' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex04/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 5572 features and 4 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -73.99045 ymin: -33.75118 xmax: -28.83594 ymax: 5.271841
## CRS:            4674
brazil_cities <- fread("data/aspatial/BRAZIL_CITIES.csv")
## Warning in require_bit64_if_needed(ans): Some columns are type 'integer64'
## but package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print the
## data again.
dataDict <- fread ("data/aspatial/Data_Dictionary.csv")

Check missing fields

sum(is.na(brazil_cities))
## [1] 35131
colnames(brazil_cities)
##  [1] "CITY"                   "STATE"                  "CAPITAL"               
##  [4] "IBGE_RES_POP"           "IBGE_RES_POP_BRAS"      "IBGE_RES_POP_ESTR"     
##  [7] "IBGE_DU"                "IBGE_DU_URBAN"          "IBGE_DU_RURAL"         
## [10] "IBGE_POP"               "IBGE_1"                 "IBGE_1-4"              
## [13] "IBGE_5-9"               "IBGE_10-14"             "IBGE_15-59"            
## [16] "IBGE_60+"               "IBGE_PLANTED_AREA"      "IBGE_CROP_PRODUCTION_$"
## [19] "IDHM Ranking 2010"      "IDHM"                   "IDHM_Renda"            
## [22] "IDHM_Longevidade"       "IDHM_Educacao"          "LONG"                  
## [25] "LAT"                    "ALT"                    "PAY_TV"                
## [28] "FIXED_PHONES"           "AREA"                   "REGIAO_TUR"            
## [31] "CATEGORIA_TUR"          "ESTIMATED_POP"          "RURAL_URBAN"           
## [34] "GVA_AGROPEC"            "GVA_INDUSTRY"           "GVA_SERVICES"          
## [37] "GVA_PUBLIC"             "GVA_TOTAL"              "TAXES"                 
## [40] "GDP"                    "POP_GDP"                "GDP_CAPITA"            
## [43] "GVA_MAIN"               "MUN_EXPENDIT"           "COMP_TOT"              
## [46] "COMP_A"                 "COMP_B"                 "COMP_C"                
## [49] "COMP_D"                 "COMP_E"                 "COMP_F"                
## [52] "COMP_G"                 "COMP_H"                 "COMP_I"                
## [55] "COMP_J"                 "COMP_K"                 "COMP_L"                
## [58] "COMP_M"                 "COMP_N"                 "COMP_O"                
## [61] "COMP_P"                 "COMP_Q"                 "COMP_R"                
## [64] "COMP_S"                 "COMP_T"                 "COMP_U"                
## [67] "HOTELS"                 "BEDS"                   "Pr_Agencies"           
## [70] "Pu_Agencies"            "Pr_Bank"                "Pu_Bank"               
## [73] "Pr_Assets"              "Pu_Assets"              "Cars"                  
## [76] "Motorcycles"            "Wheeled_tractor"        "UBER"                  
## [79] "MAC"                    "WAL-MART"               "POST_OFFICES"
popData <- brazil_cities %>% select('CITY','ESTIMATED_POP','GDP_CAPITA','LAT','LONG')

Check for missing fields

sum(is.na(popData))
## [1] 24

Check missing values in popData

popData[!complete.cases(popData), ]
##                  CITY ESTIMATED_POP GDP_CAPITA       LAT      LONG
##  1:  Balneário Rincão         12570   17788.63        NA        NA
##  2:   Lagoa Dos Patos            NA         NA        NA        NA
##  3:  Mojuí Dos Campos         15982    8831.56        NA        NA
##  4: Paraíso Das Águas          5455   92163.92        NA        NA
##  5:    Pescaria Brava         10022    8341.33        NA        NA
##  6:   Pinhal Da Serra          1965  181845.18        NA        NA
##  7:    Pinto Bandeira          2968   18184.45        NA        NA
##  8:   Santa Teresinha         10345         NA -12.77285 -39.52114
##  9:   Santa Terezinha            NA    6796.16        NA        NA
## 10:       São Caetano            NA         NA        NA        NA

Fill missing values in popData

popData.cleaned <- popData
# Add missing lat long
popData.cleaned$LAT[popData$CITY == "Balneário Rincão"] <- -28.8344
popData.cleaned$LONG[popData$CITY == "Balneário Rincão"] <- -49.2361

popData.cleaned$LAT[popData$CITY == "Lagoa Dos Patos"] <- -31.0697
popData.cleaned $LONG[popData$CITY == "Lagoa Dos Patos"] <- -51.4725

popData.cleaned$LAT[popData$CITY == "Mojuí Dos Campos"] <- -2.68472
popData.cleaned$LONG[popData$CITY == "Mojuí Dos Campos"] <- -54.6431

popData.cleaned$LAT[popData$CITY == "Paraíso Das Águas"] <- -19.0257
popData.cleaned$LONG[popData$CITY == "Paraíso Das Águas"] <- -53.0102

popData.cleaned$LAT[popData$CITY == "Pescaria Brava"] <- -28.4247
popData.cleaned$LONG[popData$CITY == "Pescaria Brava"] <- -48.8956

popData.cleaned$LAT[popData$CITY == "Pinhal Da Serra"] <- -27.8747
popData.cleaned$LONG[popData$CITY == "Pinhal Da Serra"] <- -51.1733

popData.cleaned$LAT[popData$CITY == "Pinto Bandeira"] <- -29.0978
popData.cleaned$LONG[popData$CITY == "Pinto Bandeira"] <- -51.4503

popData.cleaned$LAT[popData$CITY == "Santa Terezinha"] <- -12.7498
popData.cleaned$LONG[popData$CITY == "Santa Terezinha"] <- -39.5183

popData.cleaned$LAT[popData$CITY == "São Caetano"] <- -8.33
popData.cleaned$LONG[popData$CITY == "São Caetano"] <- -36.1459

# Add population values
# https://www.citypopulation.de/en/brazil/regiaosudeste/admin/minas_gerais/3137304__lagoa_dos_patos/
popData.cleaned$ESTIMATED_POP[popData$CITY == "Lagoa Dos Patos"] <- 4102

# https://www.citypopulation.de/en/brazil/regiaonordeste/admin/pernambuco/2612802__santa_terezinha/
popData.cleaned$ESTIMATED_POP[popData$CITY == "Santa Terezinha"] <- 11815
# https://www.citypopulation.de/en/brazil/saopaulo/3548807__s%C3%A3o_caetano_do_sul/
popData.cleaned$ESTIMATED_POP[popData$CITY == "São Caetano"] <- 161127

# Add GDP capita values

# https://cidades.ibge.gov.br/brasil/mg/lagoa-dos-patos/panorama
popData.cleaned$GDP_CAPITA[popData$CITY == "Lagoa Dos Patos"] <- 10518.36
# https://cidades.ibge.gov.br/brasil/pb/santa-teresinha/panorama
popData.cleaned$GDP_CAPITA[popData$CITY == "Santa Teresinha"] <- 9757.39
popData.cleaned$GDP_CAPITA[popData$CITY == "São Caetano"] <- 82119.69

Check if there is any missing fields

popData.cleaned[!complete.cases(popData.cleaned), ]
## Empty data.table (0 rows and 5 cols): CITY,ESTIMATED_POP,GDP_CAPITA,LAT,LONG

No missing fields

popData.cleaned.sf <- st_as_sf(popData.cleaned,
                            coords = c("LONG", "LAT"),
                            crs=4326)
head(popData.cleaned.sf)
## Simple feature collection with 6 features and 3 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -49.44055 ymin: -19.15585 xmax: -39.04755 ymax: -1.72347
## CRS:            EPSG:4326
##                  CITY ESTIMATED_POP GDP_CAPITA                    geometry
## 1     Abadia De Goiás          8583   20664.57 POINT (-49.44055 -16.75881)
## 2 Abadia Dos Dourados          6972   25591.70 POINT (-47.39683 -18.48756)
## 3           Abadiânia         19614   15628.40 POINT (-48.71881 -16.18267)
## 4              Abaeté         23223   18250.42 POINT (-45.44619 -19.15585)
## 5          Abaetetuba        156292    8222.36   POINT (-48.8844 -1.72347)
## 6             Abaiara         11663    6370.41 POINT (-39.04755 -7.356977)

Clean muni_sf

Join GDP capita with muni_sf

popData.cleaned.sf.setcrs <- st_transform(popData.cleaned.sf, 4674)
muniGDP <- st_join(muni_sf, popData.cleaned.sf.setcrs)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar

Set geometry null, use data tables to find duplicates

# duplicated(muniGDP.GeoNull) | duplicated(muniGDP.GeoNull, fromLast = TRUE)

# muniGDP.GeoNull <- muniGDP %>% st_set_geometry(NULL) 

Prepare a choropleth map showing the distribution of GDP per capita, 2016 at municipality level.

tm_shape(muniGDP) + tm_polygons(col= "GDP_CAPITA")

Check missing values

muniGDP[!complete.cases(muniGDP$GDP_CAPITA), ]
## Simple feature collection with 6 features and 7 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -53.53197 ymin: -33.61653 xmax: -37.3611 ymax: -7.356656
## CRS:            4674
##      code_mn         name_mn cod_stt abbrv_s CITY ESTIMATED_POP GDP_CAPITA
## 1612 2612802 Santa Terezinha      26      PE <NA>            NA         NA
## 2671 3137304 Lagoa Dos Patos      31      MG <NA>            NA         NA
## 4544 4215679 Santa Terezinha      42      SC <NA>            NA         NA
## 4607 4300001     Lagoa Mirim      43      RS <NA>            NA         NA
## 4608 4300002 Lagoa Dos Patos      43      RS <NA>            NA         NA
## 5303 5107776 Santa Terezinha      51      MT <NA>            NA         NA
##                            geometry
## 1612 MULTIPOLYGON (((-37.43182 -...
## 2671 MULTIPOLYGON (((-44.56744 -...
## 4544 MULTIPOLYGON (((-50.01573 -...
## 4607 MULTIPOLYGON (((-52.62241 -...
## 4608 MULTIPOLYGON (((-51.2719 -3...
## 5303 MULTIPOLYGON (((-50.84289 -...
# apply(muniGDP, 2, function(x) any(is.na(x)))

Plot map

tm_shape(muni_sf)+
   tm_polygons() +
tm_shape(popData.cleaned.sf)+
  tm_dots(col="orange")

Download all municipalities of Rio

# Available data sets
datasets <- list_geobr()
print(datasets, n=21)
## # A tibble: 22 x 4
##    `function`       geography            years                         source   
##    <chr>            <chr>                <chr>                         <chr>    
##  1 `read_country`   Country              1872, 1900, 1911, 1920, 1933… IBGE     
##  2 `read_region`    Region               2000, 2001, 2010, 2013, 2014… IBGE     
##  3 `read_state`     States               1872, 1900, 1911, 1920, 1933… IBGE     
##  4 `read_meso_regi… Meso region          2000, 2001, 2010, 2013, 2014… IBGE     
##  5 `read_micro_reg… Micro region         2000, 2001, 2010, 2013, 2014… IBGE     
##  6 `read_intermedi… Intermediate region  2017                          IBGE     
##  7 `read_immediate… Immediate region     2017                          IBGE     
##  8 `read_municipal… Municipality         1872, 1900, 1911, 1920, 1933… IBGE     
##  9 `read_weighting… Census weighting ar… 2010                          IBGE     
## 10 `read_census_tr… Census tract (setor… 2000, 2010                    IBGE     
## 11 `read_municipal… Municipality seats … 1872, 1900, 1911, 1920, 1933… IBGE     
## 12 `read_statistic… Statistical Grid of… 2010                          IBGE     
## 13 `read_metro_are… Metropolitan areas   1970, 2001, 2002, 2003, 2005… IBGE     
## 14 `read_urban_are… Urban footprints     2005, 2015                    IBGE     
## 15 `read_amazon`    Brazil's Legal Amaz… 2012                          MMA      
## 16 `read_biomes`    Biomes               2004, 2019                    IBGE     
## 17 `read_conservat… Environmental Conse… 201909                        MMA      
## 18 `read_disaster_… Disaster risk areas  2010                          CEMADEN …
## 19 `read_indigenou… Indigenous lands     201907                        FUNAI    
## 20 `read_semiarid`  Semi Arid region     2005, 2017                    IBGE     
## 21 `read_health_fa… Health facilities    2015                          CNES, Da…
## # … with 1 more row

Geospatial Data Wrangling:

The Task

Prepare a choropleth map showing the distribution of GDP per capita, 2016 at municipality level.

Calibrate an explanatory model to explain factors affecting the GDP per capita at the municipality level by using multiple linear regression method.

Prepare a choropleth map showing the distribution of the residual of the GDP per capita.

Calibrate an explanatory model to explain factors affecting the GDP per capita at the municipality level by using geographically weighted regression method.