In this example I will use QGIS geoprocessing scripts through the RQGIS library in R. I will use data from the 2005 Peru Demographic and Health Survey, where I construct Voronoi polygons from the primary sampling unit locations and generate estimates of educational attainment for women.

library(sf)
## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3
library(RQGIS3) 
## Loading required package: reticulate
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(mapview)
peru_dhs_points<-st_read("C:/Users/ozd504/Google Drive/classes/dem7093/dem7093_20/data/PEGE5BFL", "PEGE52FL")
## Reading layer `PEGE52FL' from data source `C:\Users\ozd504\Google Drive\classes\dem7093\dem7093_20\data\PEGE5BFL' using driver `ESRI Shapefile'
## Simple feature collection with 1414 features and 20 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -81.30633 ymin: -18.22461 xmax: 0 ymax: 0
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
peru_dhs_points<-st_transform(peru_dhs_points, crs=24892)
#project the data into a meter based system
mapview(peru_dhs_points["DHSCLUST"])

##Set up QGIS environment This lets R find where your QGIS binaries are located. set_env() should work without you specifying the path

my_env<-set_env()
## Trying to find QGIS in C:/OSGEO4~1
open_app()

#Buffer analysis First we find the name of the algorithm for point buffering

find_algorithms(search_term = "buffer", qgis_env = my_env)
##  [1] "Buffer vectors--------------------------------------->gdal:buffervectors"            
##  [2] "One side buffer-------------------------------------->gdal:onesidebuffer"            
##  [3] "r.buffer--------------------------------------------->grass7:r.buffer"               
##  [4] "r.buffer.lowmem-------------------------------------->grass7:r.buffer.lowmem"        
##  [5] "v.buffer--------------------------------------------->grass7:v.buffer"               
##  [6] "Buffer----------------------------------------------->native:buffer"                 
##  [7] "Variable width buffer (by m-value)------------------->native:bufferbym"              
##  [8] "Multi-ring buffer (constant distance)---------------->native:multiringconstantbuffer"
##  [9] "Tapered buffers-------------------------------------->native:taperedbuffer"          
## [10] "Create wedge buffers--------------------------------->native:wedgebuffers"           
## [11] "Single sided buffer---------------------------------->qgis:singlesidedbuffer"        
## [12] "Variable distance buffer----------------------------->qgis:variabledistancebuffer"   
## [13] "Fixed distance buffer-------------------------------->saga:fixeddistancebuffer"      
## [14] "Raster buffer---------------------------------------->saga:rasterbuffer"             
## [15] "Raster proximity buffer------------------------------>saga:rasterproximitybuffer"    
## [16] "Threshold raster buffer------------------------------>saga:thresholdrasterbuffer"    
## [17] "Variable distance buffer----------------------------->saga:variabledistancebuffer"

For this, we’ll use the native:buffer function, but we need to see what the arguments to the function are:

#get_usage(alg="native:buffer")

so we have 5 arguments, when we use the function, we need to specify all of these:

params <- get_args_man(alg = "native:buffer")
## Choosing default values for following parameters:
## END_CAP_STYLE: 0
## JOIN_STYLE: 0
## See get_options('native:buffer') for all available options.

Here I do a 5km buffer around each PSU location.

wd<-"C:/Users/ozd504/OneDrive - University of Texas at San Antonio/gis_classwork/"
params$INPUT <-peru_dhs_points
params$DISTANCE<-5000 #5km around each point
params$OUTPUT<-file.path(wd, "peru_psu_buffer_5k.shp") # path to the output shapefile

now we have our parameters defined, we run the script:

peru_buff <- run_qgis(alg = "native:buffer",
                    params = params,
                    load_output = TRUE,
                    qgis_env = my_env)
## $OUTPUT
## [1] "C:/Users/ozd504/OneDrive - University of Texas at San Antonio/gis_classwork/peru_psu_buffer_5k.shp"
st_crs(peru_buff)<-24892
## Warning: st_crs<- : replacing crs does not reproject data; use st_transform for
## that
mapview(peru_buff["DHSCLUST"], legend=T,map.types="OpenStreetMap")

buffer using R

buf2<-st_buffer(peru_dhs_points, dist = 5000)
mapview(buf2["DHSCLUST"])

5km buffers done! Now, If I only had some other point data, I could do some point in polygon operations!

##Vonoroi polygons

find_algorithms(search_term = "Voronoi")
## [1] "Voronoi polygons------------------------------------->qgis:voronoipolygons"
#get_usage(alg="qgis:voronoipolygons", qgis_env = my_env, intern = F)

so we have 3 arguments, when we use the function,:

params <- get_args_man(alg = "qgis:voronoipolygons", qgis_env = my_env)
params$INPUT <-peru_dhs_points
params$OUTPUT<-file.path(wd, "peru_psu_von_poly.shp") # path to the output shapefile

now we have our parameters defined, we run the script:

peru_von <- run_qgis(alg = "qgis:voronoipolygons",
                    params = params,
                    load_output = TRUE,
                    qgis_env = my_env)
## $OUTPUT
## [1] "C:/Users/ozd504/OneDrive - University of Texas at San Antonio/gis_classwork/peru_psu_von_poly.shp"
st_crs(peru_von)<-24892
## Warning: st_crs<- : replacing crs does not reproject data; use st_transform for
## that
mapview(peru_von["DHSCLUST"],map.types="OpenStreetMap")

I can’t figure out how to use R for this

The polygons obviously have issues, so we can clip them to the Peruvian border:

find_algorithms(search_term = "clip")
## [1] "Clip raster by extent-------------------------------->gdal:cliprasterbyextent"    
## [2] "Clip raster by mask layer---------------------------->gdal:cliprasterbymasklayer" 
## [3] "Clip vector by extent-------------------------------->gdal:clipvectorbyextent"    
## [4] "Clip vector by mask layer---------------------------->gdal:clipvectorbypolygon"   
## [5] "Clip------------------------------------------------->native:clip"                
## [6] "Extract/clip by extent------------------------------->native:extractbyextent"     
## [7] "Clip points with polygons---------------------------->saga:clippointswithpolygons"
## [8] "Clip raster with polygon----------------------------->saga:cliprasterwithpolygon" 
## [9] "Polygon clipping------------------------------------->saga:polygonclipping"
#get_usage(alg="native:clip")

so we have 3 arguments, when we use the function,:

params <- get_args_man(alg = "native:clip")
peru_border<-st_read("C:/Users/ozd504/Google Drive/classes/dem7093/dem7093_20/data/PER_adm", "PER_adm0")
## Reading layer `PER_adm0' from data source `C:\Users\ozd504\Google Drive\classes\dem7093\dem7093_20\data\PER_adm' using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 70 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -81.3307 ymin: -18.3518 xmax: -68.65311 ymax: -0.03747
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
peru_border<-st_transform(peru_border,crs=24892)

params$INPUT <-peru_von
params$OVERLAY<-peru_border
params$OUTPUT<-file.path(wd, "peru_psu_von_poly_clip.shp") # path to the output shapefile

now we have our parameters defined, we run the script:

peru_von_clip <- run_qgis(alg = "native:clip",
                    params = params,
                    load_output = TRUE,
                    qgis_env = my_env)
## $OUTPUT
## [1] "C:/Users/ozd504/OneDrive - University of Texas at San Antonio/gis_classwork/peru_psu_von_poly_clip.shp"
mapview(peru_von_clip["DHSCLUST"],map.types="OpenStreetMap")

Using R

You can use st_clip to clip observations to a polygon

pclip<-st_intersection(peru_von, peru_border)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
# pclip2<-peru_von%>%
#   filter(st_contains( peru_border, ., sparse = F))

plot(pclip["DHSCLUST"])

##points in polygons See what the right script is, and what arguments it needs:

find_algorithms(search_term = "in polygon")
## [1] "Count points in polygon------------------------------>qgis:countpointsinpolygon"
## [2] "Fit n points in polygon------------------------------>saga:fitnpointsinpolygon"
#get_usage(alg="qgis:countpointsinpolygon", qgis_env = my_env, intern = F)
params <- get_args_man(alg = "qgis:countpointsinpolygon", qgis_env = my_env)

5km buffers done! Now, we need to do our point in polygon operation. So I read in the Peruvian school data. These are a csv file, but have lat/long specified, so I can read in the text and make a simple feature layer from it. You can’t have missing information in the coordinate data to do this.

schools<-read.csv("C:/Users/ozd504/Google Drive/classes/dem7093/dem7093_20/data/perusecondaryschools.csv", header=T)
schools<-schools%>%
  filter(complete.cases(Latitud, Longitud))

persch<-st_as_sf(schools, coords=c("Longitud", "Latitud"), crs=4326,agr="constant")
persch<-st_transform(persch,crs=24892 )

mapview(persch["Departamento"])

##make the parameter block

params$POLYGONS <-peru_buff
params$POINTS<-persch
params$OUTPUT<-file.path(wd, "peru_schoolsinbuffer.shp") # path to the output shapefile

now we have our parameters defined, we run the script:

peru_pip <- run_qgis(alg = "qgis:countpointsinpolygon",
                    params = params,
                    load_output = TRUE,
                    qgis_env = my_env)
## Warning in abbreviate(fld_names, minlength = 7): abbreviate used with non-ASCII
## chars
## Warning in abbreviate_shapefile_names(obj): Field names abbreviated for ESRI
## Shapefile driver
## Warning in CPL_write_ogr(obj, dsn, layer, driver,
## as.character(dataset_options), : GDAL Message 1: One or several characters
## couldn't be converted correctly from UTF-8 to ISO-8859-1. This warning will not
## be emitted anymore.
## $OUTPUT
## [1] "C:/Users/ozd504/OneDrive - University of Texas at San Antonio/gis_classwork/peru_schoolsinbuffer.shp"
mapview(peru_pip["NUMPOINTS"],map.types="OpenStreetMap", legend=T)
#recode the count of schools to be binary (0 or more than 0 schools)
peru_pip$closeschool<-ifelse(peru_pip$NUMPOINTS>0,1,0)
summary(peru_pip$closeschool)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.9279  1.0000  1.0000

So we see that 92.8% of communities have a secondary school within 5km, that’s great, but what about the communities that don’t?

using R

sfpip<-lengths(st_intersects( peru_buff,persch))
peru_buff$nschools<-sfpip
hist(peru_buff$nschools)

use these in an analysis

Here we load the 2005 DHS data, and do some recodes for educational attainment, age, and get the survey design variables.

library(haven)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(dplyr)

peru_data<-read_dta("C:/Users/ozd504/Google Drive/classes/dem7093/dem7093_20//data/PEIR51FL.DTA")

peru_data<-peru_data%>%
  mutate(eduprim=ifelse(v106==1,1,0),
  edusecplus=ifelse(v106!=9&v106>=2,1,0),
  pwt=v005/1000000, 
  age=v012)%>%
  select(v000, v021, v022,v024,age, pwt, eduprim, edusecplus )

head(peru_data)
## # A tibble: 6 x 8
##   v000   v021  v022         v024   age   pwt eduprim edusecplus
##   <chr> <dbl> <dbl>    <dbl+lbl> <dbl> <dbl>   <dbl>      <dbl>
## 1 PE5       5     1 1 [amazonas]    38 0.437       0          1
## 2 PE5       5     1 1 [amazonas]    20 0.437       0          1
## 3 PE5       5     1 1 [amazonas]    34 0.437       0          1
## 4 PE5       5     1 1 [amazonas]    30 0.437       0          1
## 5 PE5       5     1 1 [amazonas]    35 0.437       0          1
## 6 PE5       5     1 1 [amazonas]    26 0.437       0          1
peru_data$one<-1
des<-svydesign(ids=~v021, strata=~v022, weights=~pwt, data=peru_data, nest = T)
counts<-svyby(~edusecplus+one, ~v021, des, FUN = svytotal)
head(counts)
##   v021 edusecplus       one se.edusecplus    se.one
## 1    1   4.742290  6.097230      4.742290  6.097230
## 2    2   6.120072  8.451528      6.120072  8.451528
## 3    3   6.675930  6.675930      6.675930  6.675930
## 4    4  12.177576 12.177576     12.177576 12.177576
## 5    5   3.936816  5.249088      3.936816  5.249088
## 6    6   7.113205 11.639790      7.113205 11.639790

Now we merge the survey to the spatial data:

peru_merge<-left_join(peru_data,peru_pip,by=c("v021"="DHSCLUST"))
## Warning: Column `v021`/`DHSCLUST` has different attributes on LHS and RHS of
## join

In order to test for the effect of school access on women’s educational attainment, we use a binomial generalized linear mixed model. We control for the survey design using a nested random effect specification (PSUs within strata).

des<-svydesign(ids=~v021, strata=~v022, weights=~pwt, data=peru_merge, nest = T)

fit<-svyglm(edusecplus~closeschool+scale(age), family = binomial, design=des)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit)
## 
## Call:
## svyglm(formula = edusecplus ~ closeschool + scale(age), design = des, 
##     family = binomial)
## 
## Survey design:
## svydesign(ids = ~v021, strata = ~v022, weights = ~pwt, data = peru_merge, 
##     nest = T)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.64062    0.17158  -3.734 0.000196 ***
## closeschool  1.75444    0.17805   9.853  < 2e-16 ***
## scale(age)  -0.57474    0.02193 -26.206  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.052487)
## 
## Number of Fisher Scoring iterations: 4

So, we see in this case that if women lived within 5km of a secondary school, they are much more likely to have a secondary education or more, compared to woman living in a community without a school within 5km.