knitr::opts_chunk$set(eval = FALSE)
library(readr)
data <- read_csv("Raw_data/merged_data.csv")
data.frame(
index = 1:length(colnames(data)),
name = colnames(data)
)
library(dplyr)
library(lubridate)
library(stringr)
problems(data)
na_report <- data %>%
summarise(across(everything(),
list(
NAs = ~ sum(is.na(.)),
Perc_NA = ~ mean(is.na(.)) * 100
),
.names = "{.col}_{.fn}"
))
na_report_tidy <- tibble(
Column = names(data),
NAs = as.numeric(na_report[1, seq(1, ncol(na_report), 2)]),
Perc_NA = as.numeric(na_report[1, seq(2, ncol(na_report), 2)])
)
na_report_tidy <- na_report_tidy %>%
arrange(desc(Perc_NA))
head(na_report_tidy, 15)
data <- data %>%
select(where(~ mean(is.na(.)) < 0.9))
library(httr)
library(sf)
library(dplyr)
url_base <- "https://gis.blm.gov/arcgis/rest/services/Cadastral/BLM_Natl_PLSS_CadNSDI/MapServer/2/query"
params <- list(
where = "1=1",
outFields = "*",
returnGeometry = "true",
f = "geojson",
resultType = "standard",
resultOffset = 0,
resultRecordCount = 1000
)
directorio_salida <- "plss_secciones_total"
if (!dir.exists(directorio_salida)) {
dir.create(directorio_salida)
}
offset <- 739000
lote_num <- 740
descargas_completas <- FALSE
while (!full_downloads) {
params$resultOffset <- offset
response <- GET(url_base, query = params)
if (http_error(response)) {
stop("HTTP request error: ", content(response, "text"))
}
response_text <- content(response, "text", encoding = "UTF-8")
current_sf <- st_read(response_text, quiet = TRUE)
if (nrow(current_sf) == 0) {
full_downloads <- TRUE
cat("Download of all sections completed.\n")
} else {
file_name <- paste0(output_directory, "/secciones_lote_", lote_num, ".geojson")
st_write(current_sf, file_name, driver = "GeoJSON", append = FALSE, delete_dsn = TRUE)
cat("Downloaded and saved ", nrow(current_sf), " registros. Lote #", lote_num, "\n")
offset <- offset + 1000
lote_num <- lote_num + 1
}
}
states_to_maintain <- c("MT","OK","NE","ND","AZ","MO","SD","KS","MN","CA","WA","NM","ID","WY","WI","OR","CO","UT","AL","MS","NV")
section_files <- list.files(path = output_directory, pattern = "\\.geojson$", full.names = TRUE)
complete_sections <- do.call(bind_rows, lapply(section_files, st_read))
filtered_sections <- complete_sections %>%
filter(STATEABBR %in% states_to_maintain)
library(purrr)
files_geojson <- list.files(path = output_directory, pattern = "\\.geojson$", full.names = TRUE)
file_batches <- split(files_geojson, ceiling(seq_along(files_geojson) / 51))
temporary_files <- list()
for (i in 1:length(file_batches)) {
cat("Processing batch bonding", i, "/", length(file_batches), "...\n")
united_lot <- map_dfr(file_batches[[i]], function(file) {
st_read(file, quiet = TRUE) %>%
mutate(across(matches("SOURCEDATE|last_edited_date|created_date|SRVNAME"), ~ as.character(.)))
})
name_temp <- paste0("lote_temporal_", i, ".geojson")
st_write(united_lot, name_temp, driver = "GeoJSON", append = FALSE, delete_dsn = TRUE, quiet = TRUE)
artemporary_files[[i]] <- name_temp
}
geojson_filtered <- geojson %>%
filter(state %in% estados_a_mantener) %>%
select(state, meridian_code, township_number, township_direction,
range_number, range_direction, section_number,
Shape.STArea.., geometry)
st_write(geojson_filtered, "Filtered_PLSS.geojson", driver = "GeoJSON", append = FALSE)
geojson_filtered <- st_read("Filtered_PLSS.geojson")
geojson_filtered <- geojson_filtered %>% select(FRSTDIVID, geometry, Shape.STArea..)
names(geojson_filtrado)
data_std <- data %>%
mutate(
township_number = str_extract(township_number, "[0-9]+"),
range_number = str_extract(range_number, "[0-9]+"),
section_number = str_extract(section_number, "[0-9]+"),
meridian_code = str_extract(meridian_code, "[0-9]+"),
township_number = ifelse(is.na(township_number), "000", sprintf("%03d", as.integer(township_number))),
range_number = ifelse(is.na(range_number), "000", sprintf("%03d", as.integer(range_number))),
section_number = ifelse(is.na(section_number), "00", sprintf("%02d", as.integer(section_number))),
meridian_code = ifelse(is.na(meridian_code), "00", sprintf("%02d", as.integer(meridian_code))),
FRSTDIVID = paste0(
toupper(substr(state.y, 1, 2)), meridian_code,
township_number, "0", toupper(township_direction),
range_number, "0", toupper(range_direction),
"0SN", section_number, "0"
)
)
CSV_plus_geometries <- left_join(data_std,
geojson_filtrado,
by = "FRSTDIVID",
relationship = "many-to-many"
)
st_write("CSV_plus_geometries.gpkg") #save geometries in a gpkg
states <- c("MT", "OK", "NE", "ND", "AZ", "MO", "SD", "KS", "MN",
"CA", "WA", "NM", "ID", "WY", "WI", "OR", "CO", "UT",
"AL", "MS", "NV")
After manually downloading the soil data (gssurgo_conus) from the Natural Resource Conservation Service website, we proceed to load the data.
library(sf)
sf_use_s2(FALSE)
state_files <- list.files(gpkg_dir, pattern = "_clip.gpkg$", full.names = TRUE)
out_dir <- file.path(gpkg_dir, "Recortados_PLSS")
dir.create(out_dir, showWarnings = FALSE)
big_states <- c("MT", "SD", "CA", "AZ", "NV", "NM")
n <- ifelse(any(big_states %in% state_name), 200, 1000)
n_blocks <- ceiling(nrow(g)/n)
state_files <- list.files(gpkg_dir, pattern = "_clip_[0-9]+\\.gpkg$", full.names = TRUE)
...
for (f in state_files) {
fname_out <- file.path(out_dir, gsub("_clip_(\\d+)\\.gpkg$", "_clip_\\1_PLSS.gpkg", basename(f)))
if(file.exists(fname_out)){
cat("It already exists.:", fname_out, "\n")
next
}
cat("Processing:", basename(f), "\n")
g <- st_read(f, quiet = TRUE)
g <- st_make_valid(g)
g <- g[st_is_valid(g), ]
plss_block <- st_crop(plss, st_bbox(g))
hits <- st_intersects(g, plss_block, sparse = FALSE)
g <- g[rowSums(hits) > 0, ]
g_clip <- st_intersection(g, plss_block)
st_write(g_clip, fname_out, delete_dsn = TRUE, quiet = TRUE)
cat("Load in:", fname_out, "\n")
rm(g, plss_block, hits, g_clip)
} # this begin in the line code 5.2
Writes the intersected block clipping to disk.
Frees memory by removing intermediate objects.
out_dir <- "C:/Users/galle/Exports_gSSURGO/Recortados_PLSS"
dir.create(out_dir, showWarnings = FALSE)
out_file <- file.path(out_dir, paste0(state, "_clip_PLSS.gpkg"))
st_write(g_clip, "g_clip.gpkg", delete_dsn = TRUE)
cat("Load:", out_file, "\n")
This code (5 to 5.5) clips soil files (by state and by block) against PLSS geometries to leave only the intersecting part. It then writes the results to disk (_clip_X_PLSS.gpkg), and finally attempts to save them with a unique name for each state… although that part has a bug because it uses “g_clip.gpkg” instead of out_file.
library(sf)
library(data.table)
gpkg_dir <- "C:/Users/galle/Exports_gSSURGO" # folder where the GPKG files are located
csv_dir <- "C:/Users/galle/Exports_CSV" # out folder
if (!dir.exists(csv_dir)) dir.create(csv_dir)
gpkg_files <- list.files(gpkg_dir, pattern = "\\.gpkg$", full.names = TRUE)
for (f in gpkg_files) {
cat("Processing:", f, "\n")
# List tables within GPKG
tablas <- st_layers(f)$name
for (t in tablas) {
cat(" - Tabla:", t, "\n")
# read table
df <- tryCatch({
st_read(f, layer = t, quiet = TRUE)
}, error = function(e) {
fread(paste0("SQLite:", f, "?", t))
})
# Remove geometry if it exists (to reduce CSV file size)
if ("geometry" %in% names(df)) {
df <- st_drop_geometry(df)
}
# Out name
out_file <- file.path(
csv_dir,
paste0(tools::file_path_sans_ext(basename(f)), "_", t, ".csv")
)
# Save CSV
fwrite(df, out_file)
}
}
csv_dir <- "C:/Users/galle/Exports_CSV"
out_csv <- "C:/Users/galle/SSURGO_master.csv"
file_list <- list.files(csv_dir, pattern = "\\.csv$", full.names = TRUE)
dt_list <- lapply(file_list, fread)
ssurgo_master <- rbindlist(dt_list, fill = TRUE)
fwrite(ssurgo_master, out_csv)
system('ogr2ogr -f "CSV" C:/Users/galle/Downloads/ssurgo_recortes/mapunit.csv
C:/Users/galle/Downloads/gSSURGO_CONUS.gdb mapunit')
system('ogr2ogr -f "CSV" C:/Users/galle/Downloads/ssurgo_recortes/component.csv
C:/Users/galle/Downloads/gSSURGO_CONUS.gdb component')
system('ogr2ogr -f "CSV" C:/Users/galle/Downloads/ssurgo_recortes/chorizon.csv
C:/Users/galle/Downloads/gSSURGO_CONUS.gdb chorizon')
component <- fread("C:/Users/galle/Downloads/ssurgo_recortes/component.csv")
chorizon <- fread("C:/Users/galle/Downloads/ssurgo_recortes/chorizon.csv")
mapunit <- fread("C:/Users/galle/Downloads/ssurgo_recortes/mapunit.csv")
setnames(ssurgo_master, "MUKEY", "mukey")
setnames(component, "MUKEY", "mukey")
setnames(chorizon, "COKEY", "cokey")
First merge with component
ssurgo_comp <- merge(ssurgo_master, component, by = "mukey", all.x = TRUE)
After merge with chorizon
ssurgo_full <- merge(ssurgo_comp, chorizon, by = "cokey", all.x = TRUE)
fwrite(ssurgo_full, "C:/Users/galle/SSURGO_master_full.csv")
This script (7 to 7.6) was designed to integrate multiple SSURGO output files into a single master dataset. First, all CSV files exported from the geopackages were merged into a single table. Subsequently, the component, chorizon, and mapunit tables were incorporated, ensuring consistency of keys (mukey and cokey) by standardizing column names. The final result was a consolidated database (SSURGO_master_full.csv) that brings together spatial and descriptive information on cartographic units, components, and soil horizons, ready for further analysis.
soils_resum1 <- ssurgo_full[, .(
ph_mean = mean(ph, na.rm = TRUE),
om_mean = mean(om, na.rm = TRUE),
cec_mean = mean(cec, na.rm = TRUE),
basesat_mean = mean(basesat, na.rm = TRUE),
sand_mean = mean(sand, na.rm = TRUE),
silt_mean = mean(silt, na.rm = TRUE),
clay_mean = mean(clay, na.rm = TRUE),
bulk_density_mean = mean(bulk_density, na.rm = TRUE),
soc_mean = mean(soc, na.rm = TRUE)
), by = FRSTDIVID]
final1 <- merge(g_clip, soils_resum1, by = "FRSTDIVID", all.x = TRUE)
length(unique(soils_resum1$FRSTDIVID))
nrow(soils_resum1)
soils_resum1[duplicated(FRSTDIVID), ]
soils_resum2 <- soils_full[, .(
drainage_mode = names(which.max(table(drainage_class)))
), by = FRSTDIVID]
soils_resum1 <- merge(soils_resum1, soils_resum2, by = "FRSTDIVID", all = TRUE)
First, the numerical columns to be allocated are defined:
library(missRanger)
num_cols1 <- c("ph_mean", "cec_mean", "basesat_mean",
"sand_mean", "silt_mean", "clay_mean",
"bulk_density_mean", "om_mean", "soc_mean")
Imputation is then applied:
dt_imputed1 <- missRanger(
soils_resum1[, ..num_cols1],
pmm = 0, # without predictive mean matching
num.trees = 100, # number of trees for random forest
maxiter = 5, # number of iterations
seed = 123
)
soils_resum_imputed1 <- copy(soils_resum1)
soils_resum_imputed1[, (num_cols1) := dt_imputed1]
fwrite(soils_resum_imputed1, "soils_resum_imputed1.csv")
final_imputed1 <- merge(
original, ##original is the csv plus geometries thaw we save before in section 3.3
cbind(soils_resum1[, .(FRSTDIVID)], dt_imputed1),
by = "FRSTDIVID",
all.x = TRUE
)
na_pct <- sapply(final_imputed1, function(x) round(100 * sum(is.na(x)) / length(x), 2))
na_pct <- data.frame(Columna = names(na_pct), Porcentaje_NA = na_pct)
na_pct[order(-na_pct$Porcentaje_NA), ]
geo <- st_read("CSV_plus_geometries.gpkg") # .gpkg csv plus geometries plss
A GeoPackage (.gpkg) file containing the PLSS grid along with tabular attributes is loaded here. The sf function st_read() directly opens the spatial dataset with its geometries.
geo$geometry <- st_as_text(geo$geometry)
Each PLSS polygon is originally stored as a geometry object (sfc). In order to save it in a CSV (which does not support spatial columns), it is converted to WKT (Well-Known Text), which is a standard form in text to represent geometries (example: “POLYGON((x1 y1, x2 y2, …))”).
geo_dt <- as.data.table(geo)
The spatial object is converted into an optimized data.table, which makes handling large data more efficient and facilitates its export to CSV.
fwrite(geo_dt, "plss_grid_geom_wkt.csv")
Finally, the dataset is exported to a CSV, keeping the geometries as WKT text. In this way the spatial reference is preserved without the need to rely on a heavy geographic file such as .gpkg.
data1 <- fread("./final_imputed1.csv")
Here we load the file final_imputed1.csv, which contains the project attribute table (already processed with imputations to handle missing data). We use fread() from data.table because it is much more efficient than read.csv when handling large files.
colnames(geo_dt) <- "geometry_wkt"
The geo object (which came from .gpkg) contains the geometries converted to WKT text. To avoid confusion and maintain consistency, the column is renamed to “geometry_wkt”. This is important because it will later be merged with the attribute data and it must be clear which column represents the geometry.
nrow(data1) == nrow(geo_dt)
It compares whether the number of data rows (attributes) is equal to the number of geo (geometries). - If it returns TRUE, it means that each row of data has its corresponding geometry. - If it returns FALSE, there is a mismatch (missing or excess rows in any of the tables), which could break the join.
data1_geo <- cbind(data1, geo_dt)
Finally, both tables are combined by row position (not by an identifier). This works because it was checked before that nrow(data) == nrow(geo). The result is data_geo, a table containing both the imputed attributes and the geometry in WKT format.
normalize <- function(x) (x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE))
data1_geo$ph_n <- normalize(data1_geo$ph_mean)
data1_geo$cec_n <- normalize(data1_geo$cec_mean)
data_1geo$basesat_n <- normalize(data1_geo$basesat_mean)
data1_geo$bulk_density_n <- 1 - normalize(data1_geo$bulk_density_mean)
data1_geo$om_n <- normalize(data1_geo$om_mean)
data1_geo$soc_n <- normalize(data1_geo$soc_mean)
data1_geo$texture_score <- 1 - normalize(
abs(data1_geo$sand_mean - 40) +
abs(data1_geo$silt_mean - 40) +
abs(data1_geo$clay_mean - 20))
data1_geo$SQI <- with(data1_geo,
0.15*ph_n + 0.15*cec_n + 0.10*basesat_n +
0.15*texture_score + 0.10*bulk_density_n +
0.15*om_n + 0.20*soc_n
)
Here you combine the indicators into a composite index. The weights reflect the relative importance of each variable (sum of 1 = 100%)
Result: an SQI value between 0 and 1 →
0 = low quality soil
1 = ideal soil