# [don't need] input_dir <- "/Users/df36/projects/2025_CLO_Whetten/data_2023_NE_ERD/"
# [don't understand; the file isn't a WI file] erd_filename <- "ebird_checklists_northeast_2023.parquet"
# [don't understand; the file isn't a WI file] erd_obs_filename <- "ebird_observations_northeast_2023.parquet"
# [don't understand; the file isn't a WI file] srd_filename <- "prediction-grid_year_northeast_2023.parquet"
solar_filename <- "pv-1000kW-updated_2025.03.27_small.csv"
# pv-1000kW-updated_2025.03.07.csv")
solar_all_filename <- "pv-1000kW-updated_all_2025.03.27_small.csv"
erd2023_filename <- "erd_2023.WI.data.parquet"
srd2023_filename <- "srd_2023.WI.data.parquet"
erd2023_solar_filename <- "erd.WI.PV_data_2025.03.28.parquet"
max_lat <- 47
max_lon <- -86.5
min_lat <- 42.4
min_lon <- -93
day_max <- 210
day_min <- 150
Lets start by identifying which checklists are close to PV
Next, define geographic candidate checklists as those that are further away from PV, but not too far away.
Refining Control to have similar landscapes Idea is to use RF to distinguish between PV & control sites based on the features that describe the local landscape.
For control data, we want landscapes that are as similar / comparable to the PV landscapes as possible.
The data set used to train the RF model includes the checklists close to PV installations and a random selection of geographic candidate control sites.
# [these were #'d in DF's code]
# write_parquet(erd, erd2023_solar_filename )
# erd2023_solar_filename <- "erd.WI.PV_data_2025.03.28.parquet"
erd <- read_parquet(erd2023_solar_filename )
distance_threshold <- 10000 # 10k meters
pv_index <- erd$PV_min_dist <= distance_threshold
sum(pv_index)
## [1] 13962
dim(unique(cbind(
erd$latitude[pv_index],erd$longitude[pv_index])))
## [1] 6786 2
# [these were #'d in DF's code]
# > dim(B_pv_index)
# [1] 13110 7
# > dim(unique(B_pv_index[,c(1:2)]))
# [1] 6543 2
geo_candidate_index <-
erd$PV_min_dist_all > 20000 &
erd$PV_min_dist_all < 50000 &
erd$PV_min_dist > 20000 &
erd$PV_min_dist < 50000
sum(geo_candidate_index)
## [1] 41799
training_index <- as.logical( runif(length(pv_index)) > 0.5 )
train_geo_candidate_index <- geo_candidate_index & training_index
sum(train_geo_candidate_index)
## [1] 20894
test_geo_candidate_index <- geo_candidate_index & !training_index
sum(test_geo_candidate_index)
## [1] 20905
column_nindex <-
c(
30:31, # eastness
34:35, # northness
40:41, # elev
45,47,49, #water
54:55, # road density
2*(30:52) ) # PLANDS
#59:104) # landcovers
rf_data_frame <- as.data.frame( erd[ pv_index, column_nindex ] )
response <- c(
rep(1, sum(pv_index)),
rep(0, sum(train_geo_candidate_index)) )
rf_data_frame <- rbind(rf_data_frame,
as.data.frame( erd[ train_geo_candidate_index, column_nindex ] ) )
rf_data_frame$response <- response
#str(rf_data_frame)
control_ranger <- ranger(
response ~ .,
data = rf_data_frame,
num.trees = 500,
probability = TRUE,
importance = "permutation")
vi <- data.frame(
feature=names(control_ranger$variable.importance),
vi=as.numeric(control_ranger$variable.importance))
xxx_order <- order(vi$vi, decreasing=FALSE)
vi <- vi[xxx_order, ]
par(mfrow= c(1,1), mar = c(5,15,5,5), las=1, cex=0.5)
barplot(t(vi$vi), main = "Control",
xlab = "Relative VI",
ylab = " ",
#xlim = c(0,1),
col = "steelblue",
horiz=TRUE,
names.arg=vi$feature)
box()
control_test_df <- as.data.frame(
erd[ test_geo_candidate_index, column_nindex ])
pred_pv <- predict( control_ranger,
data = control_test_df
)$predictions
hist(pred_pv[,2])
solar = read.csv("pv-1000kW-updated_all_2025.03.27_small.csv")
# [DF's code; I didn't use] solar <- read.csv(solar_filename)
A <- data.frame(
lat = solar$latitude,
lon = solar$longitude )
A <- A[ complete.cases(A), ]
solar <- read.csv(solar_all_filename)
A_all <- data.frame(
lat = solar$latitude,
lon = solar$longitude )
A_all <- A_all[ complete.cases(A_all), ]
par(mfrow = c(1,1), cex = 0.5)
xxx <- erd$longitude
yyy <- erd$latitude
plot(xxx, yyy,
xlab = "Lon",
ylab = "Lat",
main = "Breeding Season ERD2023",
pch = 20,
cex = 0.15,
col = alpha("darkslateblue", 0.20 ))
map("state", add=T, col="black")
# Add points to plot
points(
A_all$lon, A_all$lat,
col=alpha("deeppink", 0.5),
pch = 19, cex = 2.5)
points(
A$lon, A$lat,
col=alpha("chartreuse1", 0.9),
pch = 19, cex = 2.5)
points(
erd$longitude[pv_index], erd$latitude[pv_index],
col=alpha("chartreuse", 0.25),
pch = 19, cex = 0.45)
# Geographic Candidate Checklist locations
points(
erd$longitude[geo_candidate_index],
erd$latitude[geo_candidate_index],
col=alpha("coral", 0.25),
pch = 19, cex = 0.45)
test_geo_lon <- erd$longitude[ test_geo_candidate_index ]
test_geo_lat <- erd$latitude[ test_geo_candidate_index ]
most_like_pv_index <- pred_pv[,2] > 0.97
sum(most_like_pv_index)
## [1] 8029
# Geographic Candidate Checklist locations where model says landscape is much more similar to PV landscapes than the geographic candidate landscapes
points(
test_geo_lon[most_like_pv_index],
test_geo_lat[most_like_pv_index],
col=alpha("blue", 0.5),
pch = 19, cex = 0.45)
Are these landscapes really similar to the pop of checklist landscapes close to PV?
Do the blue points make sense for control site checklists? What about ones on the water?
These include environmental and observation process features
range(erd$longitude)
[1] -100.00000 -62.00005 range(erd$latitude) [1] 25 50
names(erd) [1] “checklist_id” “observer_id”
[3] “loc_id” “longitude”
[5] “latitude” “year”
[7] “day_of_year” “hours_of_day”
[9] “solar_noon_diff” “is_stationary”
[11] “effort_hrs” “effort_distance_km”
[13] “effort_speed_kmph” “num_observers”
[15] “cci” “moon_fraction”
[17] “moon_altitude” “cds_u10”
[19] “cds_v10” “cds_d2m”
[21] “cds_t2m” “cds_hcc”
[23] “cds_i10fg” “cds_mcc”
[25] “cds_lcc” “cds_sf”
[27] “cds_rf” “cds_slc”
[29] “cds_msl” “eastness_1km_median”
[31] “eastness_1km_sd” “eastness_90m_median”
[33] “eastness_90m_sd” “northness_1km_median”
[35] “northness_1km_sd” “northness_90m_median”
[37] “northness_90m_sd” “bathymetry_elevation_median”
[39] “bathymetry_elevation_sd” “elevation_30m_median”
[41] “elevation_30m_sd” “island”
[43] “mountain” “astwbd_c1_ed”
[45] “astwbd_c1_pland” “astwbd_c2_ed”
[47] “astwbd_c2_pland” “astwbd_c3_ed”
[49] “astwbd_c3_pland” “gsw_c2_pland”
[51] “gsw_c2_ed” “ntl_mean”
[53] “ntl_sd” “road_density_c1”
[55] “road_density_c2” “road_density_c3”
[57] “road_density_c4” “road_density_c5”
[59] “mcd12q1_lccs1_c1_ed” “mcd12q1_lccs1_c1_pland”
[61] “mcd12q1_lccs1_c2_ed” “mcd12q1_lccs1_c2_pland”
[63] “mcd12q1_lccs1_c11_ed” “mcd12q1_lccs1_c11_pland”
[65] “mcd12q1_lccs1_c12_ed” “mcd12q1_lccs1_c12_pland”
[67] “mcd12q1_lccs1_c13_ed” “mcd12q1_lccs1_c13_pland”
[69] “mcd12q1_lccs1_c14_ed” “mcd12q1_lccs1_c14_pland”
[71] “mcd12q1_lccs1_c15_ed” “mcd12q1_lccs1_c15_pland”
[73] “mcd12q1_lccs1_c16_ed” “mcd12q1_lccs1_c16_pland”
[75] “mcd12q1_lccs1_c21_ed” “mcd12q1_lccs1_c21_pland”
[77] “mcd12q1_lccs1_c22_ed” “mcd12q1_lccs1_c22_pland”
[79] “mcd12q1_lccs1_c31_ed” “mcd12q1_lccs1_c31_pland”
[81] “mcd12q1_lccs1_c32_ed” “mcd12q1_lccs1_c32_pland”
[83] “mcd12q1_lccs1_c41_ed” “mcd12q1_lccs1_c41_pland”
[85] “mcd12q1_lccs1_c42_ed” “mcd12q1_lccs1_c42_pland”
[87] “mcd12q1_lccs1_c43_ed” “mcd12q1_lccs1_c43_pland”
[89] “mcd12q1_lccs1_c255_ed” “mcd12q1_lccs1_c255_pland”
[91] “mcd12q1_lccs2_c9_ed” “mcd12q1_lccs2_c9_pland”
[93] “mcd12q1_lccs2_c25_ed” “mcd12q1_lccs2_c25_pland”
[95] “mcd12q1_lccs2_c35_ed” “mcd12q1_lccs2_c35_pland”
[97] “mcd12q1_lccs2_c36_ed” “mcd12q1_lccs2_c36_pland”
[99] “mcd12q1_lccs3_c27_ed” “mcd12q1_lccs3_c27_pland”
[101] “mcd12q1_lccs3_c50_ed” “mcd12q1_lccs3_c50_pland”
[103] “mcd12q1_lccs3_c51_ed” “mcd12q1_lccs3_c51_pland”
[105] “has_shoreline” “shoreline_waveheight_mean”
[107] “shoreline_waveheight_sd” “shoreline_tidal_range_mean”
[109] “shoreline_tidal_range_sd” “shoreline_chlorophyll_mean”
[111] “shoreline_chlorophyll_sd” “shoreline_turbidity_mean”
[113] “shoreline_turbidity_sd” “shoreline_sinuosity_mean”
[115] “shoreline_sinuosity_sd” “shoreline_slope_mean”
[117] “shoreline_slope_sd” “shoreline_outflow_density_mean”
[119] “shoreline_outflow_density_sd” “shoreline_erodibility_n”
[121] “shoreline_erodibility_c1_density”
“shoreline_erodibility_c2_density”
[123] “shoreline_erodibility_c3_density”
“shoreline_erodibility_c4_density”
[125] “shoreline_emu_physical_n” “shoreline_emu_physical_c1_density”
[127] “shoreline_emu_physical_c2_density”
“shoreline_emu_physical_c3_density” [129]
“shoreline_emu_physical_c4_density” “shoreline_emu_physical_c5_density”
[131] “shoreline_emu_physical_c6_density”
“shoreline_emu_physical_c7_density” [133]
“shoreline_emu_physical_c8_density” “shoreline_emu_physical_c9_density”
[135] “shoreline_emu_physical_c10_density”
“shoreline_emu_physical_c11_density” [137]
“shoreline_emu_physical_c12_density”
“shoreline_emu_physical_c13_density” [139]
“shoreline_emu_physical_c14_density”
“shoreline_emu_physical_c15_density” [141]
“shoreline_emu_physical_c16_density”
“shoreline_emu_physical_c17_density” [143]
“shoreline_emu_physical_c18_density”
“shoreline_emu_physical_c19_density” [145]
“shoreline_emu_physical_c20_density”
“shoreline_emu_physical_c21_density” [147]
“shoreline_emu_physical_c22_density”
“shoreline_emu_physical_c23_density” [149] “has_evi” “evi_median”
[151] “evi_sd”