Start Here:

Inits

# [don't need]  input_dir <- "/Users/df36/projects/2025_CLO_Whetten/data_2023_NE_ERD/"

# [don't understand; the file isn't a WI file]  erd_filename <- "ebird_checklists_northeast_2023.parquet"
# [don't understand; the file isn't a WI file]  erd_obs_filename <- "ebird_observations_northeast_2023.parquet"
# [don't understand; the file isn't a WI file]  srd_filename <- "prediction-grid_year_northeast_2023.parquet"

solar_filename <- "pv-1000kW-updated_2025.03.27_small.csv"
        # pv-1000kW-updated_2025.03.07.csv")
solar_all_filename <- "pv-1000kW-updated_all_2025.03.27_small.csv"

Outputs

    erd2023_filename <- "erd_2023.WI.data.parquet"
    srd2023_filename <- "srd_2023.WI.data.parquet"
    erd2023_solar_filename <- "erd.WI.PV_data_2025.03.28.parquet"

Define WI Region & Season of interest

    max_lat <- 47
    max_lon <- -86.5
    min_lat <- 42.4
    min_lon <- -93
    day_max <- 210
    day_min <- 150

Constructing a Control Data set

Lets start by identifying which checklists are close to PV

Next, define geographic candidate checklists as those that are further away from PV, but not too far away.

Refining Control to have similar landscapes Idea is to use RF to distinguish between PV & control sites based on the features that describe the local landscape.

For control data, we want landscapes that are as similar / comparable to the PV landscapes as possible.

The data set used to train the RF model includes the checklists close to PV installations and a random selection of geographic candidate control sites.

Discrimination model

# [these were #'d in DF's code]
#    write_parquet(erd, erd2023_solar_filename )
#       erd2023_solar_filename <- "erd.WI.PV_data_2025.03.28.parquet"
    
erd <- read_parquet(erd2023_solar_filename )

Define set of point close to PV installation

    distance_threshold <- 10000  # 10k meters
    
pv_index <- erd$PV_min_dist <= distance_threshold
    
    sum(pv_index)
## [1] 13962
    dim(unique(cbind(
        erd$latitude[pv_index],erd$longitude[pv_index])))
## [1] 6786    2
# [these were #'d in DF's code]
        # >     dim(B_pv_index)
    # [1] 13110     7
    # >     dim(unique(B_pv_index[,c(1:2)]))
    # [1] 6543    2

Define Geographic Candidate Control Checklists (Close but not too close)

    geo_candidate_index <- 
            erd$PV_min_dist_all > 20000 & 
            erd$PV_min_dist_all < 50000 & 
            erd$PV_min_dist > 20000 & 
            erd$PV_min_dist < 50000  
    sum(geo_candidate_index)
## [1] 41799
    training_index <- as.logical( runif(length(pv_index)) > 0.5 ) 
    train_geo_candidate_index <- geo_candidate_index & training_index
    sum(train_geo_candidate_index)
## [1] 20894
    test_geo_candidate_index <- geo_candidate_index & !training_index
    sum(test_geo_candidate_index)
## [1] 20905

Select among geographic candidates those checklists with landscapes most similar / indistinguishable from those close to PV sites.

Select features used to describe landscapes

column_nindex <- 
            c(
            30:31,      # eastness 
            34:35,      # northness
            40:41,      # elev
            45,47,49,   #water
            54:55,      # road density
            2*(30:52) ) # PLANDS
            #59:104)    # landcovers
    rf_data_frame <- as.data.frame( erd[ pv_index, column_nindex ] ) 

Define “response” as indicator of nearby PV or geographic candidate

    response <- c( 
            rep(1, sum(pv_index)), 
            rep(0, sum(train_geo_candidate_index)) )
    rf_data_frame <- rbind(rf_data_frame, 
            as.data.frame( erd[ train_geo_candidate_index, column_nindex ] ) ) 
    rf_data_frame$response <- response
    #str(rf_data_frame)

Train Random Forest model to discriminate between locations

    control_ranger <- ranger( 
        response ~ ., 
        data = rf_data_frame,
        num.trees = 500,
        probability = TRUE, 
        importance = "permutation")

Features most important for model to discriminate

    vi <- data.frame(
        feature=names(control_ranger$variable.importance), 
        vi=as.numeric(control_ranger$variable.importance))
    xxx_order <- order(vi$vi, decreasing=FALSE)
    vi <- vi[xxx_order, ]

    par(mfrow= c(1,1), mar = c(5,15,5,5), las=1, cex=0.5)
    barplot(t(vi$vi), main = "Control",
          xlab = "Relative VI", 
          ylab = " ",
          #xlim = c(0,1), 
          col = "steelblue", 
          horiz=TRUE, 
          names.arg=vi$feature)
    box()

Use model to predict prob of being near PV vs Geo Candidate

    control_test_df <- as.data.frame( 
            erd[ test_geo_candidate_index, column_nindex ])
    pred_pv <- predict( control_ranger, 
        data =  control_test_df 
        )$predictions
    hist(pred_pv[,2])

Plot checklists

Solar locations included in study

solar = read.csv("pv-1000kW-updated_all_2025.03.27_small.csv")

    # [DF's code; I didn't use] solar <- read.csv(solar_filename)
A <- data.frame(
      lat = solar$latitude,
      lon = solar$longitude )
    A <- A[ complete.cases(A), ]

Loocations of all large arrays in state

    solar <- read.csv(solar_all_filename)       
    A_all <- data.frame(
      lat = solar$latitude,
      lon = solar$longitude )
    A_all <- A_all[ complete.cases(A_all), ]
    par(mfrow = c(1,1), cex = 0.5)

    xxx <- erd$longitude
    yyy <- erd$latitude
    plot(xxx, yyy, 
        xlab = "Lon", 
        ylab = "Lat",
        main = "Breeding Season ERD2023",
        pch = 20,  
        cex = 0.15, 
        col = alpha("darkslateblue", 0.20 ))
    map("state", add=T, col="black") 
    
    # Add points to plot 
    points(
        A_all$lon, A_all$lat, 
        col=alpha("deeppink", 0.5), 
        pch = 19, cex = 2.5)

    points(
        A$lon, A$lat, 
        col=alpha("chartreuse1", 0.9), 
        pch = 19, cex = 2.5)

    points(
        erd$longitude[pv_index], erd$latitude[pv_index], 
        col=alpha("chartreuse", 0.25), 
        pch = 19, cex = 0.45)

# Geographic Candidate Checklist locations
    points(
        erd$longitude[geo_candidate_index], 
        erd$latitude[geo_candidate_index], 
        col=alpha("coral", 0.25), 
        pch = 19, cex = 0.45)

    test_geo_lon <- erd$longitude[ test_geo_candidate_index ]
    test_geo_lat <- erd$latitude[ test_geo_candidate_index ]
    most_like_pv_index <- pred_pv[,2] > 0.97
    sum(most_like_pv_index)
## [1] 8029
# Geographic Candidate Checklist locations where model says landscape is much more similar to PV landscapes than the geographic candidate landscapes 
    points(
        test_geo_lon[most_like_pv_index], 
        test_geo_lat[most_like_pv_index],
        col=alpha("blue", 0.5), 
        pch = 19, cex = 0.45)

Sanity Checking

Are these landscapes really similar to the pop of checklist landscapes close to PV?

Do the blue points make sense for control site checklists? What about ones on the water?

Checklist feature names

These include environmental and observation process features

range(erd$longitude)

[1] -100.00000 -62.00005 range(erd$latitude) [1] 25 50

names(erd) [1] “checklist_id” “observer_id”
[3] “loc_id” “longitude”
[5] “latitude” “year”
[7] “day_of_year” “hours_of_day”
[9] “solar_noon_diff” “is_stationary”
[11] “effort_hrs” “effort_distance_km”
[13] “effort_speed_kmph” “num_observers”
[15] “cci” “moon_fraction”
[17] “moon_altitude” “cds_u10”
[19] “cds_v10” “cds_d2m”
[21] “cds_t2m” “cds_hcc”
[23] “cds_i10fg” “cds_mcc”
[25] “cds_lcc” “cds_sf”
[27] “cds_rf” “cds_slc”
[29] “cds_msl” “eastness_1km_median”
[31] “eastness_1km_sd” “eastness_90m_median”
[33] “eastness_90m_sd” “northness_1km_median”
[35] “northness_1km_sd” “northness_90m_median”
[37] “northness_90m_sd” “bathymetry_elevation_median”
[39] “bathymetry_elevation_sd” “elevation_30m_median”
[41] “elevation_30m_sd” “island”
[43] “mountain” “astwbd_c1_ed”
[45] “astwbd_c1_pland” “astwbd_c2_ed”
[47] “astwbd_c2_pland” “astwbd_c3_ed”
[49] “astwbd_c3_pland” “gsw_c2_pland”
[51] “gsw_c2_ed” “ntl_mean”
[53] “ntl_sd” “road_density_c1”
[55] “road_density_c2” “road_density_c3”
[57] “road_density_c4” “road_density_c5”
[59] “mcd12q1_lccs1_c1_ed” “mcd12q1_lccs1_c1_pland”
[61] “mcd12q1_lccs1_c2_ed” “mcd12q1_lccs1_c2_pland”
[63] “mcd12q1_lccs1_c11_ed” “mcd12q1_lccs1_c11_pland”
[65] “mcd12q1_lccs1_c12_ed” “mcd12q1_lccs1_c12_pland”
[67] “mcd12q1_lccs1_c13_ed” “mcd12q1_lccs1_c13_pland”
[69] “mcd12q1_lccs1_c14_ed” “mcd12q1_lccs1_c14_pland”
[71] “mcd12q1_lccs1_c15_ed” “mcd12q1_lccs1_c15_pland”
[73] “mcd12q1_lccs1_c16_ed” “mcd12q1_lccs1_c16_pland”
[75] “mcd12q1_lccs1_c21_ed” “mcd12q1_lccs1_c21_pland”
[77] “mcd12q1_lccs1_c22_ed” “mcd12q1_lccs1_c22_pland”
[79] “mcd12q1_lccs1_c31_ed” “mcd12q1_lccs1_c31_pland”
[81] “mcd12q1_lccs1_c32_ed” “mcd12q1_lccs1_c32_pland”
[83] “mcd12q1_lccs1_c41_ed” “mcd12q1_lccs1_c41_pland”
[85] “mcd12q1_lccs1_c42_ed” “mcd12q1_lccs1_c42_pland”
[87] “mcd12q1_lccs1_c43_ed” “mcd12q1_lccs1_c43_pland”
[89] “mcd12q1_lccs1_c255_ed” “mcd12q1_lccs1_c255_pland”
[91] “mcd12q1_lccs2_c9_ed” “mcd12q1_lccs2_c9_pland”
[93] “mcd12q1_lccs2_c25_ed” “mcd12q1_lccs2_c25_pland”
[95] “mcd12q1_lccs2_c35_ed” “mcd12q1_lccs2_c35_pland”
[97] “mcd12q1_lccs2_c36_ed” “mcd12q1_lccs2_c36_pland”
[99] “mcd12q1_lccs3_c27_ed” “mcd12q1_lccs3_c27_pland”
[101] “mcd12q1_lccs3_c50_ed” “mcd12q1_lccs3_c50_pland”
[103] “mcd12q1_lccs3_c51_ed” “mcd12q1_lccs3_c51_pland”
[105] “has_shoreline” “shoreline_waveheight_mean”
[107] “shoreline_waveheight_sd” “shoreline_tidal_range_mean”
[109] “shoreline_tidal_range_sd” “shoreline_chlorophyll_mean”
[111] “shoreline_chlorophyll_sd” “shoreline_turbidity_mean”
[113] “shoreline_turbidity_sd” “shoreline_sinuosity_mean”
[115] “shoreline_sinuosity_sd” “shoreline_slope_mean”
[117] “shoreline_slope_sd” “shoreline_outflow_density_mean”
[119] “shoreline_outflow_density_sd” “shoreline_erodibility_n”
[121] “shoreline_erodibility_c1_density” “shoreline_erodibility_c2_density”
[123] “shoreline_erodibility_c3_density” “shoreline_erodibility_c4_density”
[125] “shoreline_emu_physical_n” “shoreline_emu_physical_c1_density” [127] “shoreline_emu_physical_c2_density” “shoreline_emu_physical_c3_density” [129] “shoreline_emu_physical_c4_density” “shoreline_emu_physical_c5_density” [131] “shoreline_emu_physical_c6_density” “shoreline_emu_physical_c7_density” [133] “shoreline_emu_physical_c8_density” “shoreline_emu_physical_c9_density” [135] “shoreline_emu_physical_c10_density” “shoreline_emu_physical_c11_density” [137] “shoreline_emu_physical_c12_density” “shoreline_emu_physical_c13_density” [139] “shoreline_emu_physical_c14_density” “shoreline_emu_physical_c15_density” [141] “shoreline_emu_physical_c16_density” “shoreline_emu_physical_c17_density” [143] “shoreline_emu_physical_c18_density” “shoreline_emu_physical_c19_density” [145] “shoreline_emu_physical_c20_density” “shoreline_emu_physical_c21_density” [147] “shoreline_emu_physical_c22_density” “shoreline_emu_physical_c23_density” [149] “has_evi” “evi_median”
[151] “evi_sd”