Lets start by identifying which checklists are close to PV
Next, define geographic candidate checklists as those that are further away from PV, but not too far away.
Refining Control to have similar landscapes Idea is to use RF to distinguish between PV & control sites based on the features that describe the local landscape.
For control data, we want landscapes that are as similar / comparable to the PV landscapes as possible.
The data set used to train the RF model includes the checklists close to PV installations and a random selection of geographic candidate control sites.
solar = read.csv("pv-1000kW-updated_all_2025.03.27_small.csv")
# [DF's code; I didn't use] solar <- read.csv(solar_filename)
A <- data.frame(
lat = solar$latitude,
lon = solar$longitude )
A <- A[ complete.cases(A), ]
solar <- read.csv(solar_all_filename)
A_all <- data.frame(
lat = solar$latitude,
lon = solar$longitude )
A_all <- A_all[ complete.cases(A_all), ]
par(mfrow = c(1,1), cex = 0.5)
xxx <- erd$longitude
yyy <- erd$latitude
plot(xxx, yyy,
xlab = "Lon",
ylab = "Lat",
main = "Breeding Season ERD2023",
pch = 20,
cex = 0.15,
col = alpha("darkslateblue", 0.20 ))
map("state", add=T, col="black")
# Add points to plot
points(
A_all$lon, A_all$lat,
col=alpha("deeppink", 0.5),
pch = 19, cex = 2.5)
points(
A$lon, A$lat,
col=alpha("chartreuse1", 0.9),
pch = 19, cex = 2.5)
points(
erd$longitude[pv_index], erd$latitude[pv_index],
col=alpha("chartreuse", 0.25),
pch = 19, cex = 0.45)
# Geographic Candidate Checklist locations
points(
erd$longitude[geo_candidate_index],
erd$latitude[geo_candidate_index],
col=alpha("coral", 0.25),
pch = 19, cex = 0.45)
test_geo_lon <- erd$longitude[ test_geo_candidate_index ]
test_geo_lat <- erd$latitude[ test_geo_candidate_index ]
most_like_pv_index <- pred_pv[,2] > 0.97
sum(most_like_pv_index)
## [1] 8175
# Geographic Candidate Checklist locations where model says landscape is much more similar to PV landscapes than the geographic candidate landscapes
points(
test_geo_lon[most_like_pv_index],
test_geo_lat[most_like_pv_index],
col=alpha("blue", 0.5),
pch = 19, cex = 0.45)
Are these landscapes really similar to the pop of checklist landscapes close to PV?
Do the blue points make sense for control site checklists? What about ones on the water?
These include environmental and observation process features
range(erd$longitude)
[1] -100.00000 -62.00005 range(erd$latitude) [1] 25 50
names(erd) [1] “checklist_id” “observer_id”
[3] “loc_id” “longitude”
[5] “latitude” “year”
[7] “day_of_year” “hours_of_day”
[9] “solar_noon_diff” “is_stationary”
[11] “effort_hrs” “effort_distance_km”
[13] “effort_speed_kmph” “num_observers”
[15] “cci” “moon_fraction”
[17] “moon_altitude” “cds_u10”
[19] “cds_v10” “cds_d2m”
[21] “cds_t2m” “cds_hcc”
[23] “cds_i10fg” “cds_mcc”
[25] “cds_lcc” “cds_sf”
[27] “cds_rf” “cds_slc”
[29] “cds_msl” “eastness_1km_median”
[31] “eastness_1km_sd” “eastness_90m_median”
[33] “eastness_90m_sd” “northness_1km_median”
[35] “northness_1km_sd” “northness_90m_median”
[37] “northness_90m_sd” “bathymetry_elevation_median”
[39] “bathymetry_elevation_sd” “elevation_30m_median”
[41] “elevation_30m_sd” “island”
[43] “mountain” “astwbd_c1_ed”
[45] “astwbd_c1_pland” “astwbd_c2_ed”
[47] “astwbd_c2_pland” “astwbd_c3_ed”
[49] “astwbd_c3_pland” “gsw_c2_pland”
[51] “gsw_c2_ed” “ntl_mean”
[53] “ntl_sd” “road_density_c1”
[55] “road_density_c2” “road_density_c3”
[57] “road_density_c4” “road_density_c5”
[59] “mcd12q1_lccs1_c1_ed” “mcd12q1_lccs1_c1_pland”
[61] “mcd12q1_lccs1_c2_ed” “mcd12q1_lccs1_c2_pland”
[63] “mcd12q1_lccs1_c11_ed” “mcd12q1_lccs1_c11_pland”
[65] “mcd12q1_lccs1_c12_ed” “mcd12q1_lccs1_c12_pland”
[67] “mcd12q1_lccs1_c13_ed” “mcd12q1_lccs1_c13_pland”
[69] “mcd12q1_lccs1_c14_ed” “mcd12q1_lccs1_c14_pland”
[71] “mcd12q1_lccs1_c15_ed” “mcd12q1_lccs1_c15_pland”
[73] “mcd12q1_lccs1_c16_ed” “mcd12q1_lccs1_c16_pland”
[75] “mcd12q1_lccs1_c21_ed” “mcd12q1_lccs1_c21_pland”
[77] “mcd12q1_lccs1_c22_ed” “mcd12q1_lccs1_c22_pland”
[79] “mcd12q1_lccs1_c31_ed” “mcd12q1_lccs1_c31_pland”
[81] “mcd12q1_lccs1_c32_ed” “mcd12q1_lccs1_c32_pland”
[83] “mcd12q1_lccs1_c41_ed” “mcd12q1_lccs1_c41_pland”
[85] “mcd12q1_lccs1_c42_ed” “mcd12q1_lccs1_c42_pland”
[87] “mcd12q1_lccs1_c43_ed” “mcd12q1_lccs1_c43_pland”
[89] “mcd12q1_lccs1_c255_ed” “mcd12q1_lccs1_c255_pland”
[91] “mcd12q1_lccs2_c9_ed” “mcd12q1_lccs2_c9_pland”
[93] “mcd12q1_lccs2_c25_ed” “mcd12q1_lccs2_c25_pland”
[95] “mcd12q1_lccs2_c35_ed” “mcd12q1_lccs2_c35_pland”
[97] “mcd12q1_lccs2_c36_ed” “mcd12q1_lccs2_c36_pland”
[99] “mcd12q1_lccs3_c27_ed” “mcd12q1_lccs3_c27_pland”
[101] “mcd12q1_lccs3_c50_ed” “mcd12q1_lccs3_c50_pland”
[103] “mcd12q1_lccs3_c51_ed” “mcd12q1_lccs3_c51_pland”
[105] “has_shoreline” “shoreline_waveheight_mean”
[107] “shoreline_waveheight_sd” “shoreline_tidal_range_mean”
[109] “shoreline_tidal_range_sd” “shoreline_chlorophyll_mean”
[111] “shoreline_chlorophyll_sd” “shoreline_turbidity_mean”
[113] “shoreline_turbidity_sd” “shoreline_sinuosity_mean”
[115] “shoreline_sinuosity_sd” “shoreline_slope_mean”
[117] “shoreline_slope_sd” “shoreline_outflow_density_mean”
[119] “shoreline_outflow_density_sd” “shoreline_erodibility_n”
[121] “shoreline_erodibility_c1_density”
“shoreline_erodibility_c2_density”
[123] “shoreline_erodibility_c3_density”
“shoreline_erodibility_c4_density”
[125] “shoreline_emu_physical_n” “shoreline_emu_physical_c1_density”
[127] “shoreline_emu_physical_c2_density”
“shoreline_emu_physical_c3_density” [129]
“shoreline_emu_physical_c4_density” “shoreline_emu_physical_c5_density”
[131] “shoreline_emu_physical_c6_density”
“shoreline_emu_physical_c7_density” [133]
“shoreline_emu_physical_c8_density” “shoreline_emu_physical_c9_density”
[135] “shoreline_emu_physical_c10_density”
“shoreline_emu_physical_c11_density” [137]
“shoreline_emu_physical_c12_density”
“shoreline_emu_physical_c13_density” [139]
“shoreline_emu_physical_c14_density”
“shoreline_emu_physical_c15_density” [141]
“shoreline_emu_physical_c16_density”
“shoreline_emu_physical_c17_density” [143]
“shoreline_emu_physical_c18_density”
“shoreline_emu_physical_c19_density” [145]
“shoreline_emu_physical_c20_density”
“shoreline_emu_physical_c21_density” [147]
“shoreline_emu_physical_c22_density”
“shoreline_emu_physical_c23_density” [149] “has_evi” “evi_median”
[151] “evi_sd”
This data frame contains all the features in the erd data frame plus the feature “pred_pv” equal to the probability a given landscape sampled by an ebirder came from the population of landscapes sampled by ebirders that are within 10 km of the PV installations, compared to a set of candidate control locations.
Candidate controls are defined geographically as ebird checklists * 1) in the study region over years XXX - XXX * 2) between 20 and 50 km from any large PV installation in the state during the same time period
To select our set of control locations, we required pred_pv > 0.97
control_df <- as.data.frame(
erd[ test_geo_candidate_index, ])
control_df$pred_pv <- pred_pv[,2]
#dim(control_df)
#sum( control_df$pred_pv > 0.97)
# Determine which arrays are within 46 and 47 deg lat and -91.5 and -90.5 long
# which(solar_filtered$latitude > 46)
# This listed three arrays this far north: Hayward [31], Ashland [69], and ... Platteville [84]! The lat of platteville is 42!
# [1] 31 69 84
# It should have listed Superior [85]
Ashland is at 46.5924 and -90.88395. In the control file, pull out all controls between 46.0924 and 47.0924 deg lat and -91.38395 and -90.38395 long
control_df_Ashland = control_df %>% filter(latitude > 46.0924)
control_df_Ashland = control_df_Ashland %>% filter(latitude < 47.0924)
control_df_Ashland = control_df_Ashland %>% filter(longitude > -91.38395)
control_df_Ashland = control_df_Ashland %>% filter(longitude < -90.38395)
dim(control_df_Ashland)
## [1] 1483 518
GVD is at 44.77872 and -88.29199.
In the control file, pull out all controls between 44.27872 and 45.27872 deg lat and -88.79199. and -87.79199. long
[this creates a square, not the circle that DF’s code generates]
control_df_GVD = control_df %>% filter(latitude > 44.27872)
control_df_GVD = control_df_GVD %>% filter(latitude < 45.27872)
control_df_GVD = control_df_GVD %>% filter(longitude > -88.79199)
control_df_GVD = control_df_GVD %>% filter(longitude < -87.79199)
dim(control_df_GVD)
## [1] 1817 518
Define 10 km latitude boundaries around a given array [again, this creates a square, not a circle…]
Define 10 km longitude boundaries around a given array
checklists_GVD = erd %>% filter(latitude > solar$lat.less.10k[solar$Customer.ProjectName == "Green Valley Dairy"])
checklists_GVD = checklists_GVD %>% filter(latitude < solar$lat.plus.10k[solar$Customer.ProjectName == "Green Valley Dairy"])
checklists_GVD = checklists_GVD %>% filter(longitude > solar$long.less.10k[solar$Customer.ProjectName == "Green Valley Dairy"])
checklists_GVD = checklists_GVD %>% filter(longitude < solar$long.plus.10k[solar$Customer.ProjectName == "Green Valley Dairy"])
dim(checklists_GVD)
## [1] 109 517
black = control sites red = checklists within 10 km
ggplot() +
geom_point(data = control_df_GVD, aes(x = checklist_id, y = mcd12q1_lccs2_c25_pland), color = "black") +
geom_point(data = checklists_GVD, aes(x = checklist_id, y = mcd12q1_lccs2_c25_pland), color = "red")
It was 0 across the board
ggplot() +
geom_point(data = control_df_GVD, aes(x = checklist_id, y = mcd12q1_lccs2_c36_pland), color = "black") +
geom_point(data = checklists_GVD, aes(x = checklist_id, y = mcd12q1_lccs2_c36_pland), color = "red")
ggplot() +
geom_point(data = control_df_GVD, aes(x = checklist_id, y = northness_1km_median), color = "black") +
geom_point(data = checklists_GVD, aes(x = checklist_id, y = northness_1km_median), color = "red")
ggplot() +
geom_point(data = control_df_GVD, aes(x = checklist_id, y = ntl_mean), color = "black") +
geom_point(data = checklists_GVD, aes(x = checklist_id, y = ntl_mean), color = "red")
ggplot() +
geom_point(data = control_df_GVD, aes(x = checklist_id, y = road_density_c1), color = "black") +
geom_point(data = checklists_GVD, aes(x = checklist_id, y = road_density_c1), color = "red")
ggplot() +
geom_point(data = control_df_GVD, aes(x = checklist_id, y = road_density_c2), color = "black") +
geom_point(data = checklists_GVD, aes(x = checklist_id, y = road_density_c2), color = "red")
ggplot() +
geom_point(data = control_df_GVD, aes(x = checklist_id, y = road_density_c3), color = "black") +
geom_point(data = checklists_GVD, aes(x = checklist_id, y = road_density_c3), color = "red")
ggplot() +
geom_point(data = control_df_GVD, aes(x = checklist_id, y = road_density_c4), color = "black") +
geom_point(data = checklists_GVD, aes(x = checklist_id, y = road_density_c4), color = "red")
ggplot() +
geom_point(data = control_df_GVD, aes(x = checklist_id, y = road_density_c5), color = "black") +
geom_point(data = checklists_GVD, aes(x = checklist_id, y = road_density_c5), color = "red")
pv_index == True/False True means it’s a point close to a PV array
geo_candidate_index == True/False True means it’s a point that could be a candidate control, i.e., close but not too close
blue points are candidate points; these are lat longs of all blue points