NMFS Trawl Data Cleaning

trawl.data<-survdat %>%
  select(ID, EST_YEAR,SEASON, STRATUM, DECDEG_BEGLAT,DECDEG_BEGLON,
         SVSPP, COMNAME, CATCHSEX,BIOMASS, AVGDEPTH, ABUNDANCE, LENGTH, NUMLEN) %>%
  filter(STRATUM >= 01010 & STRATUM <= 01760) %>%
  filter(STRATUM!=1310 & STRATUM!=1320 & STRATUM!=1330 & STRATUM!=1350 &
           STRATUM!=1410 & STRATUM!=1420 & STRATUM!=1490) %>%
  filter(SEASON == "SPRING" | SEASON == "FALL") %>%
  filter(EST_YEAR >= 1970 & EST_YEAR < 2018) %>%
  filter(!is.na(BIOMASS))

Generate template of all sampling occasions by selecting distinct cases by ID, EST_YEAR, SEASON, STRATUM, and AVGDEPTH.

template<-trawl.data %>%
  distinct(ID, EST_YEAR, SEASON, STRATUM, AVGDEPTH,.keep_all = TRUE) %>%
  select(ID, EST_YEAR, SEASON, STRATUM,DECDEG_BEGLAT,DECDEG_BEGLON,AVGDEPTH)

Create a dataframe to add common names by SVSPP code.

COMNAMEdata<-trawl.data %>% 
  select(SVSPP, COMNAME) %>% 
  distinct(SVSPP, .keep_all = TRUE)

Sum total biomass by species in each year.

biomass.spp.year<-trawl.data %>% 
  group_by(SVSPP, EST_YEAR) %>% 
  summarize(ANNUAL_SPECIES_BIOMASS = sum(BIOMASS)) 
biomass.spp<-trawl.data %>% 
  group_by(ID, SVSPP) %>% 
  summarize(SPECIES_BIOMASS = sum(BIOMASS)) %>% 
  right_join(template, by = "ID") %>% 
  left_join(biomass.spp.year, by = c("EST_YEAR", "SVSPP"))

Calculate total biomass per tow, not grouped by species. Will use this later to calculate proportion of species biomass per by tow.

biomass.tow<-trawl.data %>% 
  group_by(ID) %>% 
  summarize(TOW_BIOMASS = sum(BIOMASS)) %>% 
  right_join(template, by = "ID")

Calculate the number of species in each tow. This is a little indirect, but it does the job.

spp.tow<-trawl.data %>% 
  group_by(ID) %>% 
  count(SVSPP) %>% 
  mutate(count = n/n) %>% 
  group_by(ID) %>% 
  summarise(NSPECIES = sum(count))

Calculate the mean length of each fish.

*Join with the mean length dataframe for possible future analysis of fish lengths/sizes.

mean.lengths<-trawl.data %>% 
  group_by(SVSPP,ID) %>% 
  mutate(MEAN_LENGTH = weighted.mean(LENGTH, NUMLEN,na.rm = TRUE), n = sum(NUMLEN)) %>% 
  select(ID, SVSPP, MEAN_LENGTH, n) %>% 
  distinct(ID, SVSPP, .keep_all = TRUE)

length.spp<-trawl.data %>% 
  group_by(ID, SVSPP) %>% 
  nest(LENGTH:NUMLEN, key = "LENGTH") %>% 
  left_join(mean.lengths, by = c("ID", "SVSPP"))

Create “clean dataset” in a long form by tow ID and species

Long data with biomass by species by tow, mean length, and tow metadata (sum biomass, avg depth, lat/long, etc).

clean_trawl<-biomass.spp %>% 
  left_join(COMNAMEdata, by = "SVSPP") %>% 
  select(ID, SVSPP, COMNAME, SPECIES_BIOMASS) %>% 
  left_join(mean.lengths, by = c("ID", "SVSPP")) %>% 
  left_join(biomass.tow, by = "ID")

Create “tidy dataset” where all species level information is nested

Tidy dataset by tow with fully nested species data.

tidy_trawl<-mean.lengths %>% 
  left_join(biomass.spp, by = c("ID", "SVSPP")) %>% 
  left_join(COMNAMEdata, by = "SVSPP") %>% 
  left_join(biomass.tow, by = "ID") %>% 
  select(ID, SVSPP, COMNAME, SPECIES_BIOMASS, MEAN_LENGTH, n, TOW_BIOMASS) %>% 
  mutate(BIOMASS_PROP = SPECIES_BIOMASS/TOW_BIOMASS) %>% 
  select(-TOW_BIOMASS) %>% 
  group_by(ID) %>% 
  nest() %>% 
  left_join(biomass.tow, by = "ID") %>% 
  left_join(spp.tow, by = "ID")

Center of biomass

Calculate center of species biomass for each year.

centerofbiomass<-biomass.spp %>% 
  filter(!is.na(SPECIES_BIOMASS), 
                !is.na(ANNUAL_SPECIES_BIOMASS)) %>% 
  mutate(weightedLAT = (SPECIES_BIOMASS/ANNUAL_SPECIES_BIOMASS)*DECDEG_BEGLAT) %>% 
  mutate(weightedLON = (SPECIES_BIOMASS/ANNUAL_SPECIES_BIOMASS)*DECDEG_BEGLON) %>% 
  group_by(SVSPP,EST_YEAR) %>% 
  summarise(CENTER_LAT = sum(weightedLAT), CENTER_LAT = sum(weightedLON)) %>% 
  filter(!is.na(CENTER_LAT), 
         !is.na(CENTER_LAT)) %>% 
  left_join(COMNAMEdata, by = "SVSPP")

Select only species that occur in all years of the time series.

center_fullTS<-centerofbiomass %>% 
  group_by(SVSPP, COMNAME) %>% 
  count() %>%                                        #calculates number of years each spp occurs
  filter(n == (max(trawl.data$EST_YEAR) - min(trawl.data$EST_YEAR) + 1)) %>%   #filter missing yr species
  select(SVSPP)

center_fullTS<-centerofbiomass %>% 
  semi_join(center_fullTS, by = "SVSPP")        

Plot center of biomass.

loopbydf<-COMNAMEdata %>% 
  semi_join(center_fullTS, by = "SVSPP") %>% 
  arrange(SVSPP) %>% 
  mutate(loopby = seq(1,58,1))

plotlist<-list()

for(i in loopbydf$loopby){
tempdf<-center_fullTS %>% 
  filter(SVSPP == loopbydf$SVSPP[loopbydf$loopby == i]) %>% 
  mutate(smoothed = c(NA,NA,rollmean(CENTER_LAT,5),NA,NA))

plotlist[[i]]<-ggplot() + geom_point(data = tempdf,aes(x = EST_YEAR, y = CENTER_LAT)) + geom_line(data = tempdf,aes(x = EST_YEAR, y = CENTER_LAT), size = .5) + geom_line(data = tempdf,aes(x = EST_YEAR, y = smoothed), size = 0.9, color = "blue") + labs(x = "Year", y = "Center of latitude", title = paste('Species:', unique(tempdf$COMNAME))) + theme(panel.grid = element_blank())
}

#just plots the first 8 for example
list1 = plotlist[c(1:8)]
do.call(grid.arrange,c(list1, ncol = 4))

Bring in species traits data and pull out “functional groups.”

speciestraits<-read_csv(file = paste(shared.path, "Mills Lab/Projects/Pew_project/Metadata/species names and characteristics.csv", sep = "")) %>% 
  select(SVSPP, COMNAME, HARE_ET_AL_2016_Functional_Group) %>% 
  rename("functgroup" = "HARE_ET_AL_2016_Functional_Group") %>% 
  replace(.,is.na(.), "Unclassified")