Description

Colorado Parks and Wildlife (CPW) aka Colorado Department of Wildlife (CDOW) provides historical statistics on Big Game hunts. The tables are organized by year and available for download in pdf format. In the case of the project we are solely interested in Elk Rifle Hunting on public land. The tables are organized by hunting seasons (First-Fourth), as well as by hunting regions (Units) The hunting regions vary very slightly from year to year but for the most part have been consistent for many years.

setwd("~/_code/colorado-dow/datasets")

Load required libraries for acquiring data from pdf

library(pdftools,quietly = T)
library(stringr,quietly = T)

# Identify the years that CDOW will provide tables for in this pdf format
years <- c(2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017)

Loop through years

COElkRifleAll <- NULL # Initialize
for (iyear in years) {
  
  # RUN ONCE to download
  # if (iyear >= 2014) {
  #   download.file(paste("http://cpw.state.co.us/Documents/Hunting/BigGame/Statistics/Elk/",
  #                       iyear,"StatewideElkHarvest.pdf",sep=""),
  #                 paste(iyear,"COElkHarvest",sep=""))
  # } else {
  #   download.file(paste("http://cpw.state.co.us/Documents/Hunting/BigGame/Statistics/Elk/",
  #                       iyear,"ElkHarvestSurvey.pdf",sep=""),
  #                 paste(iyear,"COElkHarvest",sep=""))
  # }
  
  # This function will directly export the raw text in a character vector with spaces to show 
  # the white space and \n to show the line breaks.
  COElk <- pdf_text(paste(iyear,"COElkHarvest",sep=""))
  
  # Having a full page in one element of a vector is not the most practical. Using strsplit 
  # will help separate lines from each other
  COElka <- strsplit(COElk, "\n")
  
  # years starting in 2014 have a cover page or table of contents
  if (iyear >= 2014) {
    COElka <- COElka[-1] # remove cover page (map, or table of contents)
  }
  
  # The document holds more information than we are after.
  # In our case we are looking for 
  tableheadings <- c(paste(iyear,"Elk Harvest, Hunters and Recreation Days for First Rifle Seasons"),
                     paste(iyear,"Elk Harvest, Hunters and Recreation Days for Second Rifle Seasons"),
                     paste(iyear,"Elk Harvest, Hunters and Recreation Days for Third Rifle Seasons"),
                     paste(iyear,"Elk Harvest, Hunters and Recreation Days for Fourth Rifle Seasons"))
  
  # Notice that the tables run across pages, there are some pages that will have
  # info we want and info we want to ignore
  
  # Identify pages with the table headings we identified
  rifle1 <- grep(tableheadings[1], COElka)
  rifle2 <- grep(tableheadings[2], COElka)
  rifle3 <- grep(tableheadings[3], COElka)
  rifle4 <- grep(tableheadings[4], COElka)
  
  rifleseasons <- unique(c(rifle1,rifle2,rifle3,rifle4))
  COElkb <- COElka[rifleseasons]
  
  # the first page has the end of a previous table, remove it so we can have consistent columns
  firsttable <- COElkb[[1]]
  # which row has the heading of our table?
  firsttablestart <- grep(tableheadings[1], firsttable)
  # drop all rows before that table
  firstpage <- firsttable[-(1:firsttablestart-1)]
  
  # the last page might have the beginning of a new table, remove it so we can have consistent columns
  lasttablepage <- length(COElkb) # what is the last page?
  lasttable <- COElkb[[lasttablepage]]
  # which rows have table headings? they all start with 'iyear Elk Harvest'
  lasttableend <- grep(paste(iyear,"Elk Harvest"), lasttable)
  lasttableend <- max(lasttableend) # the second entry is the start of the table to ignore
  if (lasttableend >1) {
    # drop all rows after our table
    lastpage <- lasttable[-(lasttableend:length(lasttable))]
  } else {lastpage <- lasttable}
  
  # replace updated first and last pages
  COElkc <- COElkb
  COElkc[[1]] <- firstpage
  COElkc[[lasttablepage]] <- lastpage
  
  # unlist page elements
  COElkd <- unlist(COElkc)
  
  ######## SEASON ONE ######## (could make a season loop as well)
  # identify season 1 data
  seasonONEstart <- grep(tableheadings[1], COElkd)[1]
  seasonONEend <- grep(tableheadings[2], COElkd)[1]
  seasonONE <- COElkd[((seasonONEstart+1):(seasonONEend-1))]
  
  # remove rows with the table headers
  removeheaderrows <- grep(paste(iyear,"Elk Harvest"), seasonONE)
  seasonONE <- seasonONE[-removeheaderrows]
  
  # determine column names 
  columnnames <- grep("([:alpha:])", seasonONE)
  seasonONE1 <- seasonONE[-columnnames]
  columnnames1 <- seasonONE[columnnames]
  columnnames1 <- columnnames1[-length(columnnames1)] # we know the last row is a 'Total' summary, not a name
  columnnames2 <- columnnames1[length(columnnames1)]
  columnnames2 <- str_trim(columnnames2) # remove any extra whitespace
  columnnames3 <- unlist(strsplit(columnnames2,split = "\\s+"))
  colremove <- grep("([.])",columnnames3)
  columnnames3 <- columnnames3[-colremove]
  
  seasonONE1 <- str_trim(seasonONE1) # remove extra whitespace
  pagenumbers <- grep("\\s+", seasonONE1) # remove page numbers
  seasonONE2 <- seasonONE1[pagenumbers]
  
  # now that it is cleaned up, use the white space to separate into columns
  seasonONE3 <- str_split_fixed(seasonONE2,pattern = "\\s+", n=length(columnnames3))
  seasonONE3 <- as.data.frame(seasonONE3) # and convert into a dataframe
  colnames(seasonONE3) <- columnnames3 # apply our column names
  
  # add the season column
  seasonONE3$Season <- 1
  
  ######## SEASON TWO ######## 
  # identify season 2 data
  seasonTWOstart <- grep(tableheadings[2], COElkd)[1]
  seasonTWOend <- grep(tableheadings[3], COElkd)[1]
  seasonTWO <- COElkd[((seasonTWOstart+1):(seasonTWOend-1))]
  
  # remove rows with the table headers
  removeheaderrows <- grep(" Elk Harvest", seasonTWO)
  seasonTWO <- seasonTWO[-removeheaderrows]
  
  # determine column names 
  columnnames <- grep("([:alpha:])", seasonTWO)
  seasonTWO1 <- seasonTWO[-columnnames]
  columnnames1 <- seasonTWO[columnnames]
  columnnames1 <- columnnames1[-length(columnnames1)] # we know the last row is a 'Total' summary, not a name
  columnnames2 <- columnnames1[length(columnnames1)]
  columnnames2 <- str_trim(columnnames2) # remove any extra whitespace
  columnnames3 <- unlist(strsplit(columnnames2,split = "\\s+"))
  colremove <- grep("([.])",columnnames3)
  columnnames3 <- columnnames3[-colremove]
  
  seasonTWO1 <- str_trim(seasonTWO1) # remove extra whitespace
  pagenumbers <- grep("\\s+", seasonTWO1) # remove page numbers
  seasonTWO2 <- seasonTWO1[pagenumbers]
  
  # now that it is cleaned up, use the white space to separate into columns
  seasonTWO3 <- str_split_fixed(seasonTWO2,pattern = "\\s+", n=length(columnnames3))
  seasonTWO3 <- as.data.frame(seasonTWO3) # and convert into a dataframe
  colnames(seasonTWO3) <- columnnames3 # apply our column names
  
  # add the season column
  seasonTWO3$Season <- 2
  
  ######## SEASON THREE ######## 
  # identify season 3 data
  seasonTHREEstart <- grep(tableheadings[3], COElkd)[1]
  seasonTHREEend <- grep(tableheadings[4], COElkd)[1]
  seasonTHREE <- COElkd[((seasonTHREEstart+1):(seasonTHREEend-1))]
  
  # remove rows with the table headers
  removeheaderrows <- grep(" Elk Harvest", seasonTHREE)
  seasonTHREE <- seasonTHREE[-removeheaderrows]
  
  # determine column names
  columnnames <- grep("([:alpha:])", seasonTHREE)
  seasonTHREE1 <- seasonTHREE[-columnnames]
  columnnames1 <- seasonTHREE[columnnames]
  columnnames1 <- columnnames1[-length(columnnames1)] # we know the last row is a 'Total' summary, not a name
  columnnames2 <- columnnames1[length(columnnames1)]
  columnnames2 <- str_trim(columnnames2) # remove any extra whitespace
  columnnames3 <- unlist(strsplit(columnnames2,split = "\\s+"))
  colremove <- grep("([.])",columnnames3)
  columnnames3 <- columnnames3[-colremove]
  
  seasonTHREE1 <- str_trim(seasonTHREE1) # remove extra whitespace
  pagenumbers <- grep("\\s+", seasonTHREE1) # remove page numbers
  seasonTHREE2 <- seasonTHREE1[pagenumbers]
  
  # now that it is cleaned up, use the white space to separate into columns
  seasonTHREE3 <- str_split_fixed(seasonTHREE2,pattern = "\\s+", n=length(columnnames3))
  seasonTHREE3 <- as.data.frame(seasonTHREE3) # and convert into a dataframe
  colnames(seasonTHREE3) <- columnnames3 # apply our column names
  
  # add the season column
  seasonTHREE3$Season <- 3
  
  
  ######## SEASON FOUR ######## 
  # identify season 4 data
  seasonFOURstart <- grep(tableheadings[4], COElkd)[1]
  seasonFOUR <- COElkd[((seasonFOURstart+1):(length(COElkd)))] # to the end
  
  # remove rows with the table headers
  removeheaderrows <- grep(" Elk Harvest", seasonFOUR)
  seasonFOUR <- seasonFOUR[-removeheaderrows]
  
  # determine column names 
  columnnames <- grep("([:alpha:])", seasonFOUR)
  seasonFOUR1 <- seasonFOUR[-columnnames]
  columnnames1 <- seasonFOUR[columnnames]
  columnnames1 <- columnnames1[-length(columnnames1)] # we know the last row is a 'Total' summary, not a name
  columnnames2 <- columnnames1[length(columnnames1)]
  columnnames2 <- str_trim(columnnames2) # remove any extra whitespace
  columnnames3 <- unlist(strsplit(columnnames2,split = "\\s+"))
  colremove <- grep("([.])",columnnames3)
  columnnames3 <- columnnames3[-colremove]
  
  seasonFOUR1 <- str_trim(seasonFOUR1) # remove extra whitespace
  pagenumbers <- grep("\\s+", seasonFOUR1) # remove page numbers
  seasonFOUR2 <- seasonFOUR1[pagenumbers]
  
  # now that it is cleaned up, use the white space to separate into columns
  seasonFOUR3 <- str_split_fixed(seasonFOUR2,pattern = "\\s+", n=length(columnnames3))
  seasonFOUR3 <- as.data.frame(seasonFOUR3) # and convert into a dataframe
  colnames(seasonFOUR3) <- columnnames3 # apply our column names
  
  # add the season column
  seasonFOUR3$Season <- 4
  
  #Combine
  COElkRifle <- rbind(seasonONE3,seasonTWO3,seasonTHREE3,seasonFOUR3)
  COElkRifle$Year <- as.character(iyear)
  COElkRifleAll <- rbind(COElkRifleAll,COElkRifle)
}

Clean up dataframe fields It may happen that when reading numeric data into R (usually, when reading in a file), they come in as factors. If f is such a factor object, you can use

# as.numeric(as.character(f)) to get the numbers back. 

More efficient, but harder to remember, is

# as.numeric(levels(f))[as.integer(f)]

However, there are still implications in our data as there are commas present for values in the thousands. As-is converting to numeric using the factor levels will coerce them to NAs So we will need to convert to character, remove the commas, then convert to numeric

COElkRifleAll$Hunters <- as.numeric(gsub(",", "", COElkRifleAll$Hunters))
COElkRifleAll$Days <- as.numeric(gsub(",", "", COElkRifleAll$Days))
# TODO make the gsub search each column

COElkRifleAll$Success <- as.numeric(levels(COElkRifleAll$Success))[as.integer(COElkRifleAll$Success)]
COElkRifleAll$Harvest <- as.numeric(levels(COElkRifleAll$Harvest))[as.integer(COElkRifleAll$Harvest)]
COElkRifleAll$Season <- as.character(COElkRifleAll$Season)
COElkRifleAll$Unit <- as.character(COElkRifleAll$Unit)

Create new statistics based on investigation

# How much effort it takes to have a successful result
COElkRifleAll$Harvest_Effort <- COElkRifleAll$Days / COElkRifleAll$Harvest # From Phase I investigation
COElkRifleAll$Harvest_Effort[is.infinite(COElkRifleAll$Harvest_Effort)] <- NA # we get an inf where there was no harvest

Peek at the dataframe

head(COElkRifleAll)
##   Unit Bulls Cows Calves Harvest Hunters Success Days Season Year
## 1  411     0    0      0       0      26       0   79      1 2006
## 2   42    34    3      0      37     125      30  353      1 2006
## 3  421    44   38      0      82     351      23 1236      1 2006
## 4   43    56   39      9     104     472      22 1699      1 2006
## 5   44    53   37     10     100     474      21 1485      1 2006
## 6  444    17   39      5      61     285      21  892      1 2006
##   Harvest_Effort
## 1             NA
## 2       9.540541
## 3      15.073171
## 4      16.336538
## 5      14.850000
## 6      14.622951