knitr::opts_chunk$set(eval = FALSE)

1. Load CSV data

library(readr)
data <- read_csv("Raw_data/merged_data.csv")

1.1 see variables

data.frame(
  index = 1:length(colnames(data)),  
  name = colnames(data)
)

1.2 clean data

library(dplyr)
library(lubridate)
library(stringr)

1.3 (corregir errores)

problems(data)

1.4 NA summary by columnns

na_report <- data %>%
  summarise(across(everything(),
                   list(
                     NAs = ~ sum(is.na(.)),
                     Perc_NA = ~ mean(is.na(.)) * 100
                   ),
                   .names = "{.col}_{.fn}"
  ))

1.5 Transpose for a better view

na_report_tidy <- tibble(
  Column = names(data),
  NAs = as.numeric(na_report[1, seq(1, ncol(na_report), 2)]),
  Perc_NA = as.numeric(na_report[1, seq(2, ncol(na_report), 2)])
)

1.6 Order from highest to lowest percentage of NA

na_report_tidy <- na_report_tidy %>%
  arrange(desc(Perc_NA))

1.7 Show first 15 columns with more NAs

head(na_report_tidy, 15)

1.8 Eliminate columns with more than 90% of NAs

data <- data %>%
  select(where(~ mean(is.na(.)) < 0.9))

2. download PLSS data in batches

library(httr)
library(sf)
library(dplyr)

url_base <- "https://gis.blm.gov/arcgis/rest/services/Cadastral/BLM_Natl_PLSS_CadNSDI/MapServer/2/query"

params <- list(
  where = "1=1",
  outFields = "*",
  returnGeometry = "true",
  f = "geojson",
  resultType = "standard",
  resultOffset = 0,
  resultRecordCount = 1000
)

directorio_salida <- "plss_secciones_total"
if (!dir.exists(directorio_salida)) {
  dir.create(directorio_salida)
}

offset <- 739000
lote_num <- 740
descargas_completas <- FALSE

2.1 Discharge loop

while (!full_downloads) {
  params$resultOffset <- offset
  response <- GET(url_base, query = params)
  
  if (http_error(response)) {
    stop("HTTP request error: ", content(response, "text"))
  }
  
  response_text <- content(response, "text", encoding = "UTF-8")
  current_sf <- st_read(response_text, quiet = TRUE)
  
  if (nrow(current_sf) == 0) {
    full_downloads <- TRUE
    cat("Download of all sections completed.\n")
  } else {
    file_name <- paste0(output_directory, "/secciones_lote_", lote_num, ".geojson")
    st_write(current_sf, file_name, driver = "GeoJSON", append = FALSE, delete_dsn = TRUE)
    
    cat("Downloaded and saved ", nrow(current_sf), " registros. Lote #", lote_num, "\n")
    
    offset <- offset + 1000
    lote_num <- lote_num + 1
  }
}

2.3 State filter

states_to_maintain <- c("MT","OK","NE","ND","AZ","MO","SD","KS","MN","CA","WA","NM","ID","WY","WI","OR","CO","UT","AL","MS","NV")

section_files <- list.files(path = output_directory, pattern = "\\.geojson$", full.names = TRUE)
complete_sections <- do.call(bind_rows, lapply(section_files, st_read))

filtered_sections <- complete_sections %>%
  filter(STATEABBR %in% states_to_maintain)

2.4 Batch bonding and column cleaning

library(purrr)

files_geojson <- list.files(path = output_directory, pattern = "\\.geojson$", full.names = TRUE)
file_batches <- split(files_geojson, ceiling(seq_along(files_geojson) / 51))

temporary_files <- list()
for (i in 1:length(file_batches)) {
  cat("Processing batch bonding", i, "/", length(file_batches), "...\n")
  
  united_lot <- map_dfr(file_batches[[i]], function(file) {
    st_read(file, quiet = TRUE) %>%
      mutate(across(matches("SOURCEDATE|last_edited_date|created_date|SRVNAME"), ~ as.character(.)))
  })
  
  name_temp <- paste0("lote_temporal_", i, ".geojson")
  st_write(united_lot, name_temp, driver = "GeoJSON", append = FALSE, delete_dsn = TRUE, quiet = TRUE)
  artemporary_files[[i]] <- name_temp
}

2.5 Final filtering and export

geojson_filtered <- geojson %>%
  filter(state %in% estados_a_mantener) %>%
  select(state, meridian_code, township_number, township_direction,
         range_number, range_direction, section_number,
         Shape.STArea.., geometry)

st_write(geojson_filtered, "Filtered_PLSS.geojson", driver = "GeoJSON", append = FALSE)

3. Load PLSS geometries

geojson_filtered <- st_read("Filtered_PLSS.geojson")

3.1 Eliminate unnecessary “attribute” columns

geojson_filtered <- geojson_filtered %>% select(FRSTDIVID, geometry, Shape.STArea..)  
names(geojson_filtrado)

3.2 make key

data_std <- data %>%
  mutate(
    township_number = str_extract(township_number, "[0-9]+"),
    range_number    = str_extract(range_number, "[0-9]+"),
    section_number  = str_extract(section_number, "[0-9]+"),
    meridian_code   = str_extract(meridian_code, "[0-9]+"),
    
    township_number = ifelse(is.na(township_number), "000", sprintf("%03d", as.integer(township_number))),
    range_number    = ifelse(is.na(range_number), "000", sprintf("%03d", as.integer(range_number))),
    section_number  = ifelse(is.na(section_number), "00",  sprintf("%02d", as.integer(section_number))),
    meridian_code   = ifelse(is.na(meridian_code), "00",  sprintf("%02d", as.integer(meridian_code))),
    
    FRSTDIVID = paste0(
      toupper(substr(state.y, 1, 2)), meridian_code,
      township_number, "0", toupper(township_direction),
      range_number, "0", toupper(range_direction),
      "0SN", section_number, "0"
    )
  )

3.3 Join Geometries to CSV

CSV_plus_geometries <- left_join(data_std,
                                 geojson_filtrado,
                                 by = "FRSTDIVID",
                                 relationship = "many-to-many"
                                 )

st_write("CSV_plus_geometries.gpkg") #save geometries in a gpkg

3.4 Lista de estados a procesar

states <- c("MT", "OK", "NE", "ND", "AZ", "MO", "SD", "KS", "MN", 
             "CA", "WA", "NM", "ID", "WY", "WI", "OR", "CO", "UT", 
             "AL", "MS", "NV")

4. Download data soils

After manually downloading the soil data (gssurgo_conus) from the Natural Resource Conservation Service website, we proceed to load the data.

5. Load data soils

library(sf)

sf_use_s2(FALSE)

state_files <- list.files(gpkg_dir, pattern = "_clip.gpkg$", full.names = TRUE)
out_dir <- file.path(gpkg_dir, "Recortados_PLSS")
dir.create(out_dir, showWarnings = FALSE)

big_states <- c("MT", "SD", "CA", "AZ", "NV", "NM")

Disable s2 (to avoid spherical geometries).
List .gpkg files ending in _clip.gpkg in the gpkg_dir folder.
Create the Recortados_PLSS output folder.
Define which states are large (to handle smaller blocks and not saturate memory).

5.1 Block cropping

n <- ifelse(any(big_states %in% state_name), 200, 1000)
n_blocks <- ceiling(nrow(g)/n)

If the status is in big_states, process in blocks of 200 rows.
If not, use 1000.
Calculate how many blocks (n_blocks) are needed.

5.2 Iterate over files by status

The generated files follow the naming convention NAME_clip_NUMBER.gpkg, where:

NAME corresponds to the name of the state or region processed.
clip indicates that it is a spatially clipped file.
NUMBER is a unique numerical identifier to differentiate multiple clips from the same state.
The .gpkg extension corresponds to the GeoPackage format.
Examples of file names that comply with this convention are:
Minnesota_clip_01.gpkg
Texas_clip_2.gpkg
California_clip_123.gpkg In all cases, the pattern ensures that the file contains the word clip, followed by a number, and ends with the .gpkg extension.

state_files <- list.files(gpkg_dir, pattern = "_clip_[0-9]+\\.gpkg$", full.names = TRUE) 
...
for (f in state_files) {
  fname_out <- file.path(out_dir, gsub("_clip_(\\d+)\\.gpkg$", "_clip_\\1_PLSS.gpkg", basename(f)))
  
  if(file.exists(fname_out)){
    cat("It already exists.:", fname_out, "\n")
    next
  }
  
  cat("Processing:", basename(f), "\n")
  
  g <- st_read(f, quiet = TRUE)
  g <- st_make_valid(g)
  g <- g[st_is_valid(g), ]

Search for files that are already cut into numbered blocks (_clip_1.gpkg, _clip_2.gpkg, etc.).
Create a new output name, adding PLSS at the end (_clip_1_PLSS.gpkg).
Skip the file if it already exists on disk.
Read the geometries and validate.

5.3 Filter and crop against PLSS

plss_block <- st_crop(plss, st_bbox(g))
hits <- st_intersects(g, plss_block, sparse = FALSE)
g <- g[rowSums(hits) > 0, ]
  
g_clip <- st_intersection(g, plss_block)

plss_block: trims the PLSS to the extent of the state block (st_bbox(g) → bounding box).
hits: identifies which polygons in the block intersect the PLSS.
Filters only those that have an intersection.
Performs the complete geometric intersection (st_intersection).

5.4 save results

 st_write(g_clip, fname_out, delete_dsn = TRUE, quiet = TRUE)
  cat("Load in:", fname_out, "\n")
  
  rm(g, plss_block, hits, g_clip)
} # this begin in the line code 5.2

Writes the intersected block clipping to disk.
Frees memory by removing intermediate objects.

5.5 Save final file (complete by status)

out_dir <- "C:/Users/galle/Exports_gSSURGO/Recortados_PLSS"
dir.create(out_dir, showWarnings = FALSE)

out_file <- file.path(out_dir, paste0(state, "_clip_PLSS.gpkg"))

st_write(g_clip, "g_clip.gpkg", delete_dsn = TRUE)

cat("Load:", out_file, "\n")

Define a fixed output folder.
Create a final name (MT_clip_PLSS.gpkg, for example).

This code (5 to 5.5) clips soil files (by state and by block) against PLSS geometries to leave only the intersecting part. It then writes the results to disk (_clip_X_PLSS.gpkg), and finally attempts to save them with a unique name for each state… although that part has a bug because it uses “g_clip.gpkg” instead of out_file.

6. Extract attribute table in CSV format

library(sf)
library(data.table)

6.1 Directories

gpkg_dir <- "C:/Users/galle/Exports_gSSURGO"   # folder where the GPKG files are located
csv_dir  <- "C:/Users/galle/Exports_CSV"       # out folder

if (!dir.exists(csv_dir)) dir.create(csv_dir)

6.2 List all .gpkg files in the directory

gpkg_files <- list.files(gpkg_dir, pattern = "\\.gpkg$", full.names = TRUE)

6.3 Extraer todas las tablas de cada GPKG

for (f in gpkg_files) {
  cat("Processing:", f, "\n")
  
  # List tables within GPKG
  tablas <- st_layers(f)$name
  
  for (t in tablas) {
    cat("   - Tabla:", t, "\n")
    
    # read table
    df <- tryCatch({
      st_read(f, layer = t, quiet = TRUE) 
    }, error = function(e) {
      fread(paste0("SQLite:", f, "?", t))  
    })
    
    # Remove geometry if it exists (to reduce CSV file size)
    if ("geometry" %in% names(df)) {
      df <- st_drop_geometry(df)
    }
    
    # Out name
    out_file <- file.path(
      csv_dir,
      paste0(tools::file_path_sans_ext(basename(f)), "_", t, ".csv")
    )
    
    # Save CSV
    fwrite(df, out_file)
  }
}

7. consolidate and organize the soil database

7.1. Folder with CSVs

csv_dir <- "C:/Users/galle/Exports_CSV"
  out_csv <- "C:/Users/galle/SSURGO_master.csv"

7.2 Read and merge all CSVs

 file_list <- list.files(csv_dir, pattern = "\\.csv$", full.names = TRUE)
  dt_list <- lapply(file_list, fread)
  ssurgo_master <- rbindlist(dt_list, fill = TRUE)

7.3 Save unified CSV

fwrite(ssurgo_master, out_csv)

7.4 Load CSV component and chorizon from gssurgo_conus

7.4.1 Run GDAL from R

system('ogr2ogr -f "CSV" C:/Users/galle/Downloads/ssurgo_recortes/mapunit.csv 
       C:/Users/galle/Downloads/gSSURGO_CONUS.gdb mapunit')

system('ogr2ogr -f "CSV" C:/Users/galle/Downloads/ssurgo_recortes/component.csv 
       C:/Users/galle/Downloads/gSSURGO_CONUS.gdb component')

system('ogr2ogr -f "CSV" C:/Users/galle/Downloads/ssurgo_recortes/chorizon.csv 
       C:/Users/galle/Downloads/gSSURGO_CONUS.gdb chorizon')

7.4.2 Load component and chorizon

component <- fread("C:/Users/galle/Downloads/ssurgo_recortes/component.csv")
chorizon  <- fread("C:/Users/galle/Downloads/ssurgo_recortes/chorizon.csv")
mapunit  <- fread("C:/Users/galle/Downloads/ssurgo_recortes/mapunit.csv")

setnames(ssurgo_master, "MUKEY", "mukey")
setnames(component, "MUKEY", "mukey")
setnames(chorizon, "COKEY", "cokey")

7.5 Join (merge) for mukey

First merge with component

ssurgo_comp <- merge(ssurgo_master, component, by = "mukey", all.x = TRUE)

After merge with chorizon

ssurgo_full <- merge(ssurgo_comp, chorizon, by = "cokey", all.x = TRUE)

7.6 Save final result

fwrite(ssurgo_full, "C:/Users/galle/SSURGO_master_full.csv")

This script (7 to 7.6) was designed to integrate multiple SSURGO output files into a single master dataset. First, all CSV files exported from the geopackages were merged into a single table. Subsequently, the component, chorizon, and mapunit tables were incorporated, ensuring consistency of keys (mukey and cokey) by standardizing column names. The final result was a consolidated database (SSURGO_master_full.csv) that brings together spatial and descriptive information on cartographic units, components, and soil horizons, ready for further analysis.

8. Combine soil variables by “FRSTDIVID”

8.1 summary soil variables

soils_resum1 <- ssurgo_full[, .(
  ph_mean            = mean(ph, na.rm = TRUE),
  om_mean            = mean(om, na.rm = TRUE),
  cec_mean           = mean(cec, na.rm = TRUE),
  basesat_mean       = mean(basesat, na.rm = TRUE),
  sand_mean          = mean(sand, na.rm = TRUE),
  silt_mean          = mean(silt, na.rm = TRUE),
  clay_mean          = mean(clay, na.rm = TRUE),
  bulk_density_mean  = mean(bulk_density, na.rm = TRUE),
  soc_mean           = mean(soc, na.rm = TRUE)
), by = FRSTDIVID]

Here, the soils_full data is grouped by each FRSTDIVID spatial identifier.
Averages of key soil variables (pH, organic matter, texture, bulk density, organic carbon) are calculated.
na.rm = TRUE ensures that NAs do not break the calculation.
The result is a table with one row per FRSTDIVID and its soil property averages.

8.2 Join the summary to the original dataset with

final1 <- merge(g_clip, soils_resum1, by = "FRSTDIVID", all.x = TRUE)

A left join is made between the original table (g_clip) and the soil averages (soils_resum1).
In this way, each polygon/record in the original now has the corresponding soil averages.

8.3 Control of duplicates

length(unique(soils_resum1$FRSTDIVID))
nrow(soils_resum1)
soils_resum1[duplicated(FRSTDIVID), ]

It checks if there really is 1 row per FRSTDIVID.
unique and nrow should match.
If there are duplicates, they are listed with duplicated(FRSTDIVID).

8.4 Categorical overview: dominant drainage class

soils_resum2 <- soils_full[, .(
  drainage_mode = names(which.max(table(drainage_class)))
), by = FRSTDIVID]

In addition to the numerical averages, the categorical variable drainage_class is summarised here.
For each FRSTDIVID, the most frequent category (mode) is selected.
Result: a table with FRSTDIVID and their dominant drainage class.

8.4.1 It is then combined with soils_resum1

soils_resum1 <- merge(soils_resum1, soils_resum2, by = "FRSTDIVID", all = TRUE)

8.5 Missing value allocation with missRanger

First, the numerical columns to be allocated are defined:

library(missRanger)

num_cols1 <- c("ph_mean", "cec_mean", "basesat_mean",
              "sand_mean", "silt_mean", "clay_mean",
              "bulk_density_mean", "om_mean", "soc_mean")

Imputation is then applied:

dt_imputed1 <- missRanger(
  soils_resum1[, ..num_cols1],
  pmm = 0,        # without predictive mean matching
  num.trees = 100, # number of trees for random forest
  maxiter = 5,     # number of iterations
  seed = 123
)

missRanger uses Random Forest to predict missing values from the other columns.
It is robust because it exploits correlations between soil variables.
Result: dt_imputed1, a version of the numeric columns without NA.

8.6 Construction of the imputed table

soils_resum_imputed1 <- copy(soils_resum1)
soils_resum_imputed1[, (num_cols1) := dt_imputed1]
fwrite(soils_resum_imputed1, "soils_resum_imputed1.csv")

The imputed values are replaced in the original table.
Exported as CSV (soils_resum_imputed1.csv).

8.7 Join imputees with the original table

final_imputed1 <- merge(
  original,  ##original is the csv plus geometries thaw we save before in section 3.3
  cbind(soils_resum1[, .(FRSTDIVID)], dt_imputed1), 
  by = "FRSTDIVID",
  all.x = TRUE
)

A new merge is made between the original dataset and the imputed values, ensuring that all original observations have their complete soil attributes.

8.8 Missing value diagnosis

na_pct <- sapply(final_imputed1, function(x) round(100 * sum(is.na(x)) / length(x), 2))
na_pct <- data.frame(Columna = names(na_pct), Porcentaje_NA = na_pct)
na_pct[order(-na_pct$Porcentaje_NA), ]

Calculates the percentage of NA per column after the imputation.
Allows to verify how effective the process was and if there are still information gaps.

9 add geometries in wkt format to csv to reduce file weight

9.1 Read GPKG file with geometries

geo <- st_read("CSV_plus_geometries.gpkg") # .gpkg csv plus geometries plss

A GeoPackage (.gpkg) file containing the PLSS grid along with tabular attributes is loaded here. The sf function st_read() directly opens the spatial dataset with its geometries.

9.2 Convert geometry to text (WKT)

geo$geometry <- st_as_text(geo$geometry)

Each PLSS polygon is originally stored as a geometry object (sfc). In order to save it in a CSV (which does not support spatial columns), it is converted to WKT (Well-Known Text), which is a standard form in text to represent geometries (example: “POLYGON((x1 y1, x2 y2, …))”).

9.3 Convert to data.table

geo_dt <- as.data.table(geo)

The spatial object is converted into an optimized data.table, which makes handling large data more efficient and facilitates its export to CSV.

9.4 Save as CSV (geometry in text)

fwrite(geo_dt, "plss_grid_geom_wkt.csv")

Finally, the dataset is exported to a CSV, keeping the geometries as WKT text. In this way the spatial reference is preserved without the need to rely on a heavy geographic file such as .gpkg.

9.5 Imputed table reading

data1 <- fread("./final_imputed1.csv")

Here we load the file final_imputed1.csv, which contains the project attribute table (already processed with imputations to handle missing data). We use fread() from data.table because it is much more efficient than read.csv when handling large files.

9.6 Rename the geometry column

colnames(geo_dt) <- "geometry_wkt"

The geo object (which came from .gpkg) contains the geometries converted to WKT text. To avoid confusion and maintain consistency, the column is renamed to “geometry_wkt”. This is important because it will later be merged with the attribute data and it must be clear which column represents the geometry.

9.7 Checking rows before joining

nrow(data1) == nrow(geo_dt)

It compares whether the number of data rows (attributes) is equal to the number of geo (geometries). - If it returns TRUE, it means that each row of data has its corresponding geometry. - If it returns FALSE, there is a mismatch (missing or excess rows in any of the tables), which could break the join.

9.8 Attribute binding with geometry

data1_geo <- cbind(data1, geo_dt)

Finally, both tables are combined by row position (not by an identifier). This works because it was checked before that nrow(data) == nrow(geo). The result is data_geo, a table containing both the imputed attributes and the geometry in WKT format.

10. Soil Quality Index

10.1 Normalization function

normalize <- function(x) (x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE))

Min-max scaling is used to transform each variable to a range [0,1].
This is key because each soil indicator is on different scales (pH ≈ 0-14, CEC in cmol/kg, organic matter in %, etc.).
By normalizing, you guarantee that no indicator dominates the rest just because it has large values.

10.2 Standardization of indicators

data1_geo$ph_n   <- normalize(data1_geo$ph_mean)
data1_geo$cec_n  <- normalize(data1_geo$cec_mean)
data_1geo$basesat_n <- normalize(data1_geo$basesat_mean)

Normalized pH (ph_n): very acidic or very basic soils affect productivity, but here you are taking it to a 0-1 scale.
CEC (cation exchange capacity): reflects fertility and nutrient holding capacity. Higher → better.
Base saturation (basesat): measures percentage of Ca, Mg, K, Na exchangeable. Higher → better.

10.3 Inverse indicators (when “more” is not “better”)

data1_geo$bulk_density_n <- 1 - normalize(data1_geo$bulk_density_mean)

Bulk density: high values indicate compaction → poor drainage and aeration.
Reverse with 1 - normalize(…) so that lower density values have higher scores (i.e., better quality).

10.4 Organic matter and carbon

data1_geo$om_n   <- normalize(data1_geo$om_mean)
data1_geo$soc_n  <- normalize(data1_geo$soc_mean)

OM (organic matter): indicator of fertility, structure and water retention.
SOC (soil organic carbon): key variable for productivity and resilience.

10.5 Soil texture

data1_geo$texture_score <- 1 - normalize(
  abs(data1_geo$sand_mean - 40) + 
  abs(data1_geo$silt_mean - 40) + 
  abs(data1_geo$clay_mean - 20))

An optimum loam texture point is sought (≈ 40% sand, 40% silt, 20% clay).
The formula calculates the distance of each soil from the ideal.
It is then normalized and inverted (1 - …) so that soils closer to the loam have higher scores.

10.6 Calculation of the composite index (SQI)

data1_geo$SQI <- with(data1_geo,
  0.15*ph_n + 0.15*cec_n + 0.10*basesat_n +
  0.15*texture_score + 0.10*bulk_density_n +
  0.15*om_n + 0.20*soc_n
)

Here you combine the indicators into a composite index. The weights reflect the relative importance of each variable (sum of 1 = 100%)

Result: an SQI value between 0 and 1 →

0 = low quality soil

1 = ideal soil

Normalization and Synthesis of Edaphic Properties for the Development of a Soil Quality Index.

Santiago Gallego

2025-08-25