Use historical harvests, number of hunters, and weather data to predict the harvest for the upcoming hunting seasons.
NOTICE that I am only looking at the general rifle hunting seasons on public land. There are also hunters for Archery, Muzzleloader, Private Land, Ranching for Wildlife, etc.
Load required libraries for wrangling data, charting, and mapping
library(plyr,quietly = T) # data wrangling
library(dplyr,quietly = T) # data wrangling
library(ggplot2, quietly = T) # charting
library(ggthemes,quietly = T) # so I can add the highcharts theme and palette
library(scales,quietly = T) # to load the percent function when labeling plots
library(caret,quietly = T) # classification and regression training
library(foreach,quietly = T) # parallel processing to speed up the model training
library(doMC,quietly = T) # parallel processing to speed up the model training
library(lubridate,quietly = T) # for timing models
Set our preferred charting theme
theme_set(theme_minimal()+theme_hc()+theme(legend.key.width = unit(1.5, "cm")))
Run script to get hunter data
source('~/_code/colorado-dow/datasets/Colorado Elk Harvest Data.R', echo=F)
[35m[1mtoOrdinal 1.0-0.0 (7-18-2017). For help: >help("toOrdinal") or visit https://centerforassessment.github.io/toOrdinal[22m[39m
The working directory was changed to /Users/psarnow/_code/colorado-dow/datasets inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
Table of the harvest data
head(COElkRifleAll)
Unit Harvest.Antlered Hunters.Antlered Success.Antlered Season HuntCode
1 1 0 0 NA 1 EM001O1R
2 2 0 0 NA 1 EM002O1R
3 201 0 0 NA 1 EM201O1R
4 3 0 0 NA 1 EM003O1R
5 301 0 0 NA 1 EM301O1R
6 4 0 0 NA 1 EM004O1R
Harvest.Antlerless Hunters.Antlerless Success.Antlerless Hunters.Either
1 NA NA NA NA
2 NA NA NA NA
3 NA NA NA NA
4 NA NA NA NA
5 NA NA NA NA
6 NA NA NA NA
Success.Either Year
1 NA 2006
2 NA 2006
3 NA 2006
4 NA 2006
5 NA 2006
6 NA 2006
Load the weather data
load("weatherdata5.RData")
head(weatherdata5)
source geodata
source('~/_code/colorado-dow/datasets/Colorado GMUnit and Road data.R', echo=F)
rgdal: version: 1.2-20, (SVN revision 725)
Geospatial Data Abstraction Library extensions to R successfully loaded
Loaded GDAL runtime: GDAL 2.1.3, released 2017/20/01
Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.5/Resources/library/rgdal/gdal
GDAL binary built with GEOS: FALSE
Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.5/Resources/library/rgdal/proj
Linking to sp version: 1.2-7
rgeos version: 0.3-26, (SVN revision 560)
GEOS runtime version: 3.6.1-CAPI-1.10.1 r0
Linking to sp version: 1.2-7
Polygon checking: TRUE
OGR data source with driver: ESRI Shapefile
Source: "/Users/psarnow/_code/colorado-dow/datasets/CPW_GMUBoundaries/BigGameGMUBoundaries03172015.shp", layer: "BigGameGMUBoundaries03172015"
with 185 features
It has 12 fields
Integer64 fields read as strings: GMUID
OGR data source with driver: ESRI Shapefile
Source: "/Users/psarnow/_code/colorado-dow/datasets/ne_10m_roads/ne_10m_roads.shp", layer: "ne_10m_roads"
with 56601 features
It has 29 fields
Integer64 fields read as strings: scalerank question
Take a peak at the boundary data
head(Unitboundaries2)
id long lat order hole piece group COUNTY DEERDAU ELKDAU ANTDAU MOOSEDAU
1 1 -109.0486 40.83476 1 FALSE 1 1.1 MOFFAT D-1 E-47 A-11 M-99
2 1 -109.0472 40.83357 2 FALSE 1 1.1 MOFFAT D-1 E-47 A-11 M-99
3 1 -109.0460 40.83295 3 FALSE 1 1.1 MOFFAT D-1 E-47 A-11 M-99
4 1 -109.0449 40.83228 4 FALSE 1 1.1 MOFFAT D-1 E-47 A-11 M-99
5 1 -109.0438 40.83204 5 FALSE 1 1.1 MOFFAT D-1 E-47 A-11 M-99
6 1 -109.0423 40.83181 6 FALSE 1 1.1 MOFFAT D-1 E-47 A-11 M-99
BEARDAU LIONDAU SQ_MILES ACRES SHAPE_area SHAPE_len Unit
1 B-15 L-1 127.2227 81422.53 329506619 100751.7 1
2 B-15 L-1 127.2227 81422.53 329506619 100751.7 1
3 B-15 L-1 127.2227 81422.53 329506619 100751.7 1
4 B-15 L-1 127.2227 81422.53 329506619 100751.7 1
5 B-15 L-1 127.2227 81422.53 329506619 100751.7 1
6 B-15 L-1 127.2227 81422.53 329506619 100751.7 1
Set to predictive analytics directory
setwd("~/_code/colorado-dow/phase III - predictive analytics")
Group weather data by year, and season
UnitWeather_Season <- summarise(group_by(weatherdata5,Year,Unit,Season),
daily.temperatureHigh = max(daily.temperatureHigh,na.rm = T),
daily.temperatureLow = min(daily.temperatureLow,na.rm = T),
daily.temperatureMean = mean(daily.temperatureMean,na.rm = T),
daily.precipAccumulation = mean(daily.precipAccumulation,na.rm = T),
daily.precipType = mean(daily.precipType,na.rm = T),
daily.windSpeed = mean(daily.windSpeed,na.rm = T),
daily.FullmoonPhase = mean(daily.FullmoonPhase,na.rm = T))
Appropriate field classes for model training
UnitWeather_Season$Year <- as.numeric(UnitWeather_Season$Year)
Group Hunter data by Year, Season, and Unit
COElkHarvest_Season <- summarise(group_by(COElkRifleAll,Year,Unit,Season),
Harvest = sum(c(Harvest.Antlered,Harvest.Antlerless),na.rm = T),
Hunters = sum(c(Hunters.Antlered,Hunters.Antlerless,Hunters.Either),na.rm = T))
COElkHarvest_Season$Year <- as.numeric(COElkHarvest_Season$Year)
Join Harvest, Hunter, and Weather data
COElkHarvest_Season <- full_join(COElkHarvest_Season, UnitWeather_Season, by = c("Year","Unit","Season"))
Remove rows with missing data
COElkHarvest_Season <- filter(COElkHarvest_Season, !is.na(Harvest) & !is.na(daily.temperatureMean) & Year != 2018)
Split into train and test sets. Will use 75% of the data to train on.
COElkHarvest_Season <- mutate(group_by(COElkHarvest_Season,Unit),
numentries = n())
COElkHarvest_Season <- filter(COElkHarvest_Season, numentries >= 3)
COElkHarvest_Season$UnitYearSeason <- paste(COElkHarvest_Season$Unit, COElkHarvest_Season$Year,COElkHarvest_Season$Season)
traindata2 <- COElkHarvest_Season %>% group_by(Unit) %>% sample_frac(size = .75, replace = F)
testdata2 <- COElkHarvest_Season[!COElkHarvest_Season$UnitYearSeason %in% traindata2$UnitYearSeason,]
COElkHarvest_Season <- select(COElkHarvest_Season, -UnitYearSeason, -numentries)
traindata2 <- select(traindata2, -UnitYearSeason, -numentries)
testdata2 <- select(testdata2, -UnitYearSeason, -numentries)
Save off for importing into AzureML
write.csv(COElkHarvest_Season,file = "~/_code/colorado-dow/datasets/COElkHarvest_Season.csv",row.names = F)
Loop through possible methods, utilizing the quicker ‘adaptive_cv’ parameter search from caret. Consider scripting this into AzureML to make it run much faster, though there is more setup and errors to control for
quickmethods <- c("lm",'svmLinear',"svmRadial","knn","cubist","kknn")
step1_Season_all <- NULL
for (imethod in quickmethods) {
step1_Season <- NULL
start <- now()
if (imethod == "lm" | imethod == "svmLinear") {
controlmethod <- "repeatedcv"
} else {controlmethod <- "adaptive_cv"}
fitControl <- trainControl(
method = controlmethod,
# search = 'random',
number = 4,
repeats = 4,
allowParallel = TRUE,
summaryFunction = defaultSummary)
registerDoSEQ()
registerDoMC(cores = 6)
HarvestModel_1_Season = train(Harvest ~ ., data = traindata2,
method = imethod,
#preProc = c("center","scale"),
tuneLength = 15,
trControl = fitControl)
HarvestModel_1_Season
# measure performance
predictdata <- predict(HarvestModel_1_Season, testdata2)
step1_Season$method <- imethod
step1_Season$RMSE <- postResample(pred = predictdata, obs = testdata2$Harvest)[1]
step1_Season$duration <- now() - start
step1_Season <- as.data.frame(step1_Season)
step1_Season_all <- rbind(step1_Season_all,step1_Season)
}
prediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleading
Attaching package: ‘kernlab’
The following object is masked from ‘package:scales’:
alpha
The following object is masked from ‘package:ggplot2’:
alpha
row names were found from a short variable and have been discardedrow names were found from a short variable and have been discardedrow names were found from a short variable and have been discardedrow names were found from a short variable and have been discardedrow names were found from a short variable and have been discardedrow names were found from a short variable and have been discardedrow names were found from a short variable and have been discardedrow names were found from a short variable and have been discarded
Attaching package: ‘kknn’
The following object is masked from ‘package:caret’:
contr.dummy
View Results, and compare to previous first models that did not expand to Seasons
step1_all
method RMSE duration
RMSE lm 52.23203 1.742155 secs
RMSE1 svmLinear 56.67347 1.994666 secs
RMSE2 svmRadial 55.10650 7.456390 secs
RMSE3 knn 105.23611 4.174521 secs
RMSE4 cubist 60.09588 9.766908 secs
RMSE5 kknn 59.56523 15.774304 secs
step1_Season_all
method RMSE duration
RMSE lm 26.60899 4.378424 secs
RMSE1 svmLinear 27.43676 8.249770 secs
RMSE2 svmRadial 25.41266 35.865861 secs
RMSE3 knn 35.31770 10.047640 secs
RMSE4 cubist 22.66765 30.146231 secs
RMSE5 kknn 26.01812 28.380214 secs
The performance RMSE metric is certainly improved when including the Seasonal grouping. Lets take the best method, and see if it is visually reasonable while also charting without Seasons.
Run the best model with all of the data
fitControl <- trainControl(
method = "adaptive_cv",
# search = 'random',
number = 4,
repeats = 4,
allowParallel = TRUE,
summaryFunction = defaultSummary)
registerDoSEQ()
registerDoMC(cores = 6)
HarvestModel_1_Season = train(Harvest ~ ., data = COElkHarvest_Season,
method = 'cubist',
#preProc = c("center","scale"),
tuneLength = 15,
trControl = fitControl)
row names were found from a short variable and have been discardedrow names were found from a short variable and have been discardedrow names were found from a short variable and have been discarded
HarvestModel_1_Season
Cubist
2775 samples
11 predictor
No pre-processing
Resampling: Adaptively Cross-Validated (4 fold, repeated 4 times)
Summary of sample sizes: 2081, 2080, 2082, 2082, 2080, 2082, ...
Resampling results across tuning parameters:
committees neighbors RMSE Rsquared MAE Resamples
1 0 30.01973 0.7616367 18.36972 5
1 5 26.40694 0.8138750 16.06972 5
1 9 26.49602 0.8129865 16.00035 5
10 0 27.61127 0.8028791 16.89908 5
10 5 25.05156 0.8361794 15.33611 7
10 9 24.70932 0.8396775 15.18101 13
20 0 27.27628 0.8084462 16.74196 5
20 5 24.97369 0.8369100 15.27278 8
20 9 24.87747 0.8386215 15.26527 22
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were committees = 20 and neighbors = 9.
COElkHunters_Season2018Predicted <- summarise(group_by(COElkHunters2018_Season, Year, Unit,Season),
Hunters = sum(Hunters,na.rm = T))
# Get list of Units and Seasons that will have data
COElkHarvest2018_Season <- COElkHarvest_Season
COElkHarvest2018_Season$Unit_Season <- paste(COElkHarvest2018_Season$Unit,COElkHarvest2018_Season$Season)
COElkHarvest2018_Season <- as.data.frame(unique(COElkHarvest2018_Season$Unit_Season))
colnames(COElkHarvest2018_Season) <- "Unit_Season"
# Fill in missing Units and Seasons per unique Unit_Seasons
COElkHarvest2018_Season$Unit <- str_extract(COElkHarvest2018_Season$Unit_Season,"[:alnum:]+(?=[:blank:])")
COElkHarvest2018_Season$Season <- str_extract(COElkHarvest2018_Season$Unit_Season,"(?<=[:blank:])[:alnum:]+")
COElkHarvest2018_Season <- select(COElkHarvest2018_Season, -Unit_Season)
COElkHarvest2018_Season$Year <- 2018
# Weather data for 2018
UnitWeather_Season2018 <- filter(UnitWeather_Season,Year==2018)
# A left join will autofill missing draw data with NAs, but will retain the full list of Unit Seasons
COElkHarvest2018_Season <- left_join(COElkHarvest2018_Season,UnitWeather_Season2018)
Joining, by = c("Unit", "Season", "Year")
# Join in forecasted Hunters
COElkHarvest2018_Season <- left_join(COElkHarvest2018_Season,COElkHunters_Season2018Predicted)
Joining, by = c("Unit", "Season", "Year")
# Only use the fields that were included in the model
COElkHarvest2018_Season <- COElkHarvest2018_Season[, colnames(COElkHarvest2018_Season) %in% c("Unit","Season",HarvestModel_1_Season$coefnames)]
# Use trained model to predict Harvest
COElkHarvest2018_Season$Harvest <- round(predict(HarvestModel_1_Season, COElkHarvest2018_Season))
COElkHarvest2018_Season$Harvest[COElkHarvest2018_Season$Harvest<0] <- 0
Save off so we don’t have to recreate the model everytime we want the results
Label and Join models together for comparisons
# Group Units
HarvestcompareStatewide <- summarise(group_by(Harvestcompare,Year,modeldata),
Harvest = sum(Harvest))
ggplot(HarvestcompareStatewide, aes(Year,Harvest,group=modeldata,fill=modeldata)) +
geom_bar(position="dodge",stat="identity") +
# coord_cartesian(ylim = c(130000,155000)) +
scale_fill_hc() +
labs(title="Statewide Elk Harvest", caption="source: cpw.state.co.us")
Harvest_Season <- rbind.fill(COElkHarvest_Season,COElkHarvest2018_Season)
ggplot(Harvest_Season, aes(Year,Harvest,group=Season,fill=Season)) +
geom_bar(position="dodge",stat="identity") +
# coord_cartesian(ylim = c(130000,155000)) +
scale_fill_hc() +
labs(title="Statewide Elk Harvest", caption="source: cpw.state.co.us")
Harvest_Season_77 <- filter(Harvest_Season, Unit == "77")
ggplot(Harvest_Season_77, aes(Year,Harvest,group=Season,fill=Season)) +
geom_bar(position="dodge",stat="identity") +
# coord_cartesian(ylim = c(130000,155000)) +
scale_fill_hc() +
labs(title="Unit 77 Elk Harvest", caption="source: cpw.state.co.us")