Start Here:

Inits

Outputs

Define WI Region & Season of interest

Constructing a Control Data set

Lets start by identifying which checklists are close to PV

Next, define geographic candidate checklists as those that are further away from PV, but not too far away.

Refining Control to have similar landscapes Idea is to use RF to distinguish between PV & control sites based on the features that describe the local landscape.

For control data, we want landscapes that are as similar / comparable to the PV landscapes as possible.

The data set used to train the RF model includes the checklists close to PV installations and a random selection of geographic candidate control sites.

Discrimination model

Define set of point close to PV installation

Define Geographic Candidate Control Checklists (Close but not too close)

Select among geographic candidates those checklists with landscapes most similar / indistinguishable from those close to PV sites.

Select features used to describe landscapes

Define “response” as indicator of nearby PV or geographic candidate

Train Random Forest model to discriminate between locations

Features most important for model to discriminate

Use model to predict prob of being near PV vs Geo Candidate

Plot checklists

Solar locations included in study

solar = read.csv("pv-1000kW-updated_all_2025.03.27_small.csv")

    # [DF's code; I didn't use] solar <- read.csv(solar_filename)
A <- data.frame(
      lat = solar$latitude,
      lon = solar$longitude )
    A <- A[ complete.cases(A), ]

Locations of all large arrays in state

    solar <- read.csv(solar_all_filename)       
    A_all <- data.frame(
      lat = solar$latitude,
      lon = solar$longitude )
    A_all <- A_all[ complete.cases(A_all), ]
    par(mfrow = c(1,1), cex = 0.5)

    xxx <- erd$longitude
    yyy <- erd$latitude
    plot(xxx, yyy, 
        xlab = "Lon", 
        ylab = "Lat",
        main = "Breeding Season ERD2023",
        pch = 20,  
        cex = 0.15, 
        col = alpha("darkslateblue", 0.20 ))
    map("state", add=T, col="black") 
    
    # Add points to plot 
    points(
        A_all$lon, A_all$lat, 
        col=alpha("deeppink", 0.5), 
        pch = 19, cex = 2.5)

    points(
        A$lon, A$lat, 
        col=alpha("chartreuse1", 0.9), 
        pch = 19, cex = 2.5)

    points(
        erd$longitude[pv_index], erd$latitude[pv_index], 
        col=alpha("chartreuse", 0.25), 
        pch = 19, cex = 0.45)

# Geographic Candidate Checklist locations
    points(
        erd$longitude[geo_candidate_index], 
        erd$latitude[geo_candidate_index], 
        col=alpha("coral", 0.25), 
        pch = 19, cex = 0.45)

    test_geo_lon <- erd$longitude[ test_geo_candidate_index ]
    test_geo_lat <- erd$latitude[ test_geo_candidate_index ]
    most_like_pv_index <- pred_pv[,2] > 0.97
    sum(most_like_pv_index)
## [1] 8175
# Geographic Candidate Checklist locations where model says landscape is much more similar to PV landscapes than the geographic candidate landscapes 
    points(
        test_geo_lon[most_like_pv_index], 
        test_geo_lat[most_like_pv_index],
        col=alpha("blue", 0.5), 
        pch = 19, cex = 0.45)

Sanity Checking

Are these landscapes really similar to the pop of checklist landscapes close to PV?

Do the blue points make sense for control site checklists? What about ones on the water?

Checklist feature names

These include environmental and observation process features

range(erd$longitude)

[1] -100.00000 -62.00005 range(erd$latitude) [1] 25 50

names(erd) [1] “checklist_id” “observer_id”
[3] “loc_id” “longitude”
[5] “latitude” “year”
[7] “day_of_year” “hours_of_day”
[9] “solar_noon_diff” “is_stationary”
[11] “effort_hrs” “effort_distance_km”
[13] “effort_speed_kmph” “num_observers”
[15] “cci” “moon_fraction”
[17] “moon_altitude” “cds_u10”
[19] “cds_v10” “cds_d2m”
[21] “cds_t2m” “cds_hcc”
[23] “cds_i10fg” “cds_mcc”
[25] “cds_lcc” “cds_sf”
[27] “cds_rf” “cds_slc”
[29] “cds_msl” “eastness_1km_median”
[31] “eastness_1km_sd” “eastness_90m_median”
[33] “eastness_90m_sd” “northness_1km_median”
[35] “northness_1km_sd” “northness_90m_median”
[37] “northness_90m_sd” “bathymetry_elevation_median”
[39] “bathymetry_elevation_sd” “elevation_30m_median”
[41] “elevation_30m_sd” “island”
[43] “mountain” “astwbd_c1_ed”
[45] “astwbd_c1_pland” “astwbd_c2_ed”
[47] “astwbd_c2_pland” “astwbd_c3_ed”
[49] “astwbd_c3_pland” “gsw_c2_pland”
[51] “gsw_c2_ed” “ntl_mean”
[53] “ntl_sd” “road_density_c1”
[55] “road_density_c2” “road_density_c3”
[57] “road_density_c4” “road_density_c5”
[59] “mcd12q1_lccs1_c1_ed” “mcd12q1_lccs1_c1_pland”
[61] “mcd12q1_lccs1_c2_ed” “mcd12q1_lccs1_c2_pland”
[63] “mcd12q1_lccs1_c11_ed” “mcd12q1_lccs1_c11_pland”
[65] “mcd12q1_lccs1_c12_ed” “mcd12q1_lccs1_c12_pland”
[67] “mcd12q1_lccs1_c13_ed” “mcd12q1_lccs1_c13_pland”
[69] “mcd12q1_lccs1_c14_ed” “mcd12q1_lccs1_c14_pland”
[71] “mcd12q1_lccs1_c15_ed” “mcd12q1_lccs1_c15_pland”
[73] “mcd12q1_lccs1_c16_ed” “mcd12q1_lccs1_c16_pland”
[75] “mcd12q1_lccs1_c21_ed” “mcd12q1_lccs1_c21_pland”
[77] “mcd12q1_lccs1_c22_ed” “mcd12q1_lccs1_c22_pland”
[79] “mcd12q1_lccs1_c31_ed” “mcd12q1_lccs1_c31_pland”
[81] “mcd12q1_lccs1_c32_ed” “mcd12q1_lccs1_c32_pland”
[83] “mcd12q1_lccs1_c41_ed” “mcd12q1_lccs1_c41_pland”
[85] “mcd12q1_lccs1_c42_ed” “mcd12q1_lccs1_c42_pland”
[87] “mcd12q1_lccs1_c43_ed” “mcd12q1_lccs1_c43_pland”
[89] “mcd12q1_lccs1_c255_ed” “mcd12q1_lccs1_c255_pland”
[91] “mcd12q1_lccs2_c9_ed” “mcd12q1_lccs2_c9_pland”
[93] “mcd12q1_lccs2_c25_ed” “mcd12q1_lccs2_c25_pland”
[95] “mcd12q1_lccs2_c35_ed” “mcd12q1_lccs2_c35_pland”
[97] “mcd12q1_lccs2_c36_ed” “mcd12q1_lccs2_c36_pland”
[99] “mcd12q1_lccs3_c27_ed” “mcd12q1_lccs3_c27_pland”
[101] “mcd12q1_lccs3_c50_ed” “mcd12q1_lccs3_c50_pland”
[103] “mcd12q1_lccs3_c51_ed” “mcd12q1_lccs3_c51_pland”
[105] “has_shoreline” “shoreline_waveheight_mean”
[107] “shoreline_waveheight_sd” “shoreline_tidal_range_mean”
[109] “shoreline_tidal_range_sd” “shoreline_chlorophyll_mean”
[111] “shoreline_chlorophyll_sd” “shoreline_turbidity_mean”
[113] “shoreline_turbidity_sd” “shoreline_sinuosity_mean”
[115] “shoreline_sinuosity_sd” “shoreline_slope_mean”
[117] “shoreline_slope_sd” “shoreline_outflow_density_mean”
[119] “shoreline_outflow_density_sd” “shoreline_erodibility_n”
[121] “shoreline_erodibility_c1_density” “shoreline_erodibility_c2_density”
[123] “shoreline_erodibility_c3_density” “shoreline_erodibility_c4_density”
[125] “shoreline_emu_physical_n” “shoreline_emu_physical_c1_density” [127] “shoreline_emu_physical_c2_density” “shoreline_emu_physical_c3_density” [129] “shoreline_emu_physical_c4_density” “shoreline_emu_physical_c5_density” [131] “shoreline_emu_physical_c6_density” “shoreline_emu_physical_c7_density” [133] “shoreline_emu_physical_c8_density” “shoreline_emu_physical_c9_density” [135] “shoreline_emu_physical_c10_density” “shoreline_emu_physical_c11_density” [137] “shoreline_emu_physical_c12_density” “shoreline_emu_physical_c13_density” [139] “shoreline_emu_physical_c14_density” “shoreline_emu_physical_c15_density” [141] “shoreline_emu_physical_c16_density” “shoreline_emu_physical_c17_density” [143] “shoreline_emu_physical_c18_density” “shoreline_emu_physical_c19_density” [145] “shoreline_emu_physical_c20_density” “shoreline_emu_physical_c21_density” [147] “shoreline_emu_physical_c22_density” “shoreline_emu_physical_c23_density” [149] “has_evi” “evi_median”
[151] “evi_sd”

Control data fram

Assemble Data Frame for controls

This data frame contains all the features in the erd data frame plus the feature “pred_pv” equal to the probability a given landscape sampled by an ebirder came from the population of landscapes sampled by ebirders that are within 10 km of the PV installations, compared to a set of candidate control locations.

Candidate controls are defined geographically as ebird checklists * 1) in the study region over years XXX - XXX * 2) between 20 and 50 km from any large PV installation in the state during the same time period

To select our set of control locations, we required pred_pv > 0.97

    control_df <- as.data.frame( 
            erd[ test_geo_candidate_index,  ])
    control_df$pred_pv <- pred_pv[,2] 
    #dim(control_df)
    #sum( control_df$pred_pv > 0.97)

Misty’s fiddlings with the above

Isolate candidates around Ashland

# Determine which arrays are within 46 and 47 deg lat and -91.5 and -90.5 long

# which(solar_filtered$latitude > 46)
# This listed three arrays this far north:  Hayward [31], Ashland [69], and ... Platteville [84]!  The lat of platteville is 42!
# [1] 31 69 84
# It should have listed Superior [85]

Ashland is at 46.5924 and -90.88395. In the control file, pull out all controls between 46.0924 and 47.0924 deg lat and -91.38395 and -90.38395 long

control_df_Ashland = control_df %>% filter(latitude > 46.0924)
control_df_Ashland = control_df_Ashland %>% filter(latitude < 47.0924)
control_df_Ashland = control_df_Ashland %>% filter(longitude > -91.38395)
control_df_Ashland = control_df_Ashland %>% filter(longitude < -90.38395)
dim(control_df_Ashland)
## [1] 1483  518

Green Valley Dairy

GVD is at 44.77872 and -88.29199.

Isolate candidates around GVD

In the control file, pull out all controls between 44.27872 and 45.27872 deg lat and -88.79199. and -87.79199. long

[this creates a square, not the circle that DF’s code generates]

control_df_GVD = control_df %>% filter(latitude > 44.27872)
control_df_GVD = control_df_GVD %>% filter(latitude < 45.27872)
control_df_GVD = control_df_GVD %>% filter(longitude > -88.79199)
control_df_GVD = control_df_GVD %>% filter(longitude < -87.79199)
dim(control_df_GVD)
## [1] 1817  518

Checklists within 10 k of Green Valley Dairy

Define 10 km latitude boundaries around a given array [again, this creates a square, not a circle…]

Define 10 km longitude boundaries around a given array

checklists_GVD = erd %>% filter(latitude > solar$lat.less.10k[solar$Customer.ProjectName == "Green Valley Dairy"])
checklists_GVD = checklists_GVD %>% filter(latitude < solar$lat.plus.10k[solar$Customer.ProjectName == "Green Valley Dairy"])
checklists_GVD = checklists_GVD %>% filter(longitude > solar$long.less.10k[solar$Customer.ProjectName == "Green Valley Dairy"])
checklists_GVD = checklists_GVD %>% filter(longitude < solar$long.plus.10k[solar$Customer.ProjectName == "Green Valley Dairy"])

dim(checklists_GVD)
## [1] 109 517

plot envt’l features around Green Valley Dairy

forest / crop

black = control sites red = checklists within 10 km

ggplot() +
  geom_point(data = control_df_GVD, aes(x = checklist_id, y = mcd12q1_lccs2_c25_pland), color = "black") +
  geom_point(data = checklists_GVD, aes(x = checklist_id, y = mcd12q1_lccs2_c25_pland), color = "red") 

herbaceous / crop

It was 0 across the board

herbaceous crop

ggplot() +
  geom_point(data = control_df_GVD, aes(x = checklist_id, y = mcd12q1_lccs2_c36_pland), color = "black") +
  geom_point(data = checklists_GVD, aes(x = checklist_id, y = mcd12q1_lccs2_c36_pland), color = "red")

northness, 1 km

ggplot() +
  geom_point(data = control_df_GVD, aes(x = checklist_id, y = northness_1km_median), color = "black") +
  geom_point(data = checklists_GVD, aes(x = checklist_id, y = northness_1km_median), color = "red")

nighttime lights

ggplot() +
  geom_point(data = control_df_GVD, aes(x = checklist_id, y = ntl_mean), color = "black") +
  geom_point(data = checklists_GVD, aes(x = checklist_id, y = ntl_mean), color = "red")

road_density_c1

ggplot() +
  geom_point(data = control_df_GVD, aes(x = checklist_id, y = road_density_c1), color = "black") +
  geom_point(data = checklists_GVD, aes(x = checklist_id, y = road_density_c1), color = "red")

road_density_c2

ggplot() +
  geom_point(data = control_df_GVD, aes(x = checklist_id, y = road_density_c2), color = "black") +
  geom_point(data = checklists_GVD, aes(x = checklist_id, y = road_density_c2), color = "red")

road_density_c3

ggplot() +
  geom_point(data = control_df_GVD, aes(x = checklist_id, y = road_density_c3), color = "black") +
  geom_point(data = checklists_GVD, aes(x = checklist_id, y = road_density_c3), color = "red")

road_density_c4

ggplot() +
  geom_point(data = control_df_GVD, aes(x = checklist_id, y = road_density_c4), color = "black") +
  geom_point(data = checklists_GVD, aes(x = checklist_id, y = road_density_c4), color = "red")

road_density_c5

ggplot() +
  geom_point(data = control_df_GVD, aes(x = checklist_id, y = road_density_c5), color = "black") +
  geom_point(data = checklists_GVD, aes(x = checklist_id, y = road_density_c5), color = "red")

Misty notes

pv_index == True/False True means it’s a point close to a PV array

geo_candidate_index == True/False True means it’s a point that could be a candidate control, i.e., close but not too close

blue points are candidate points; these are lat longs of all blue points