###read latest data ######
library(readxl)
## Warning: package 'readxl' was built under R version 3.1.3
mtm <- suppressMessages(
  suppressWarnings(read_excel('LumpedAggedInfPCMet2017-01-19_BI.xlsx',na='empty')))
setwd("P:/MTM2 Oranga Taiao/EMP Data/EMP Data/Biotic Index calculator/BQI")
suppressMessages(
  suppressWarnings(
    library(tidyverse)))

####Calculate sensitivity values ####
###get only spp variables
sp <- mtm %>% 
  select(dorvilleidae:`lunella smaragda`) 

sp <- ceiling(sp) #to round-up decimal point in abundance of samples with abundance corrected for core diameter

f <- data.frame(taxa=colnames(sp),
                freq=colSums(sp!=0), 
                row.names = 1:ncol(sp))#get the number of cores were each taxon occur #

####Sensi10: Sensitivity scores (ES50_0.05) for taxa that occur in 10% of samples
f10 <- f %>% 
  dplyr::filter(freq>nrow(sp)*.1) ##list of taxa taxa that occur in >10% samples

suppressMessages(suppressWarnings(library(vegan)))#for rarefy function

sp1 <- sp %>% 
  dplyr::filter(rowSums(.)>=50) %>% ##remove core with < 50 individuals
  mutate(ES50 = rarefy(.,50)) %>%  #get the ES50 rarefied richness based on Hurlbert's (1971) formulation.
  select(match(f10$taxa,names(sp)),ES50) %>% ##select taxa that occur in >10% samples
  gather(sp,abund,-ES50) ##convert into long format

sensi10 <- 
  sp1 %>% 
  group_by(sp) %>% 
  dplyr::filter(abund>0) %>% 
  arrange(ES50) %>% 
  mutate(p=percent_rank(cumsum(abund))) %>% 
  slice(which.min(abs(p - .05))) %>% 
  data.frame(.) %>% 
  select(ES50,sp) %>% 
  rename(sensitivity = ES50) %>% 
  arrange(sensitivity) %>% 
  print()
##    sensitivity                      sp
## 1     3.513288            potamopyrgus
## 2     3.892110               amphipoda
## 3     3.944333     austrohelice crassa
## 4     4.089359       hemiplax hirtipes
## 5     4.121021           spionidae all
## 6     4.150406              nereididae
## 7     4.362826      arthritica bifurca
## 8     4.366324                 isopoda
## 9     4.533346             oligochaeta
## 10    4.599141      cossura consimilis
## 11    4.651881       paphies australis
## 12    4.724596            halicarcinus
## 13    4.727273            capitellidae
## 14    5.234194              nephtyidae
## 15    5.236645 austrovenus stutchburyi
## 16    5.255044 heteromastus filiformis
## 17    5.443262                nemertea
## 18    5.498591          orbiniidae all
## 19    5.502914              paraonidae
## 20    5.524700  cominella glandiformis
## 21    5.608283              glyceridae
## 22    6.000000               edwardsia
## 23    6.026442        macomona liliana
## 24    6.051962                 cumacea
## 25    6.342854               notoacmea
## 26    6.465517              actiniidae
## 27    6.629071                  diloma
## 28    6.641215                magelona
## 29    6.699195                syllidae
## 30    7.058335              maldanidae
## 31    7.336834             zeacumantus
## 32    7.363923               nuculidae
write.csv(sensi10,'sensitivity_10%.csv',row.names = F)


###Sensi5: Calculate sensitivity for taxa that ocurr in >5% of the samples##### 
f5 <- f %>% 
  dplyr::filter(freq>nrow(sp)*.05) ##list of taxa taxa that occur in >20 cores

sp2 <- sp %>% 
  dplyr::filter(rowSums(.)>=50) %>% ##remove core with < 50 individuals
  mutate(ES50 = rarefy(.,50)) %>%  #get the ES50 rarefied richness based on Hurlbert's (1971) formulation.
  select(match(f5$taxa,names(sp)),ES50) %>% ##select taxa that occur in >5% of samples
  gather(sp,abund,-ES50) ##convert into long format

sensi5 <- 
  sp2 %>% 
  group_by(sp) %>% 
  dplyr::filter(abund>0) %>% 
  arrange(ES50) %>% 
  mutate(p=percent_rank(cumsum(abund))) %>% 
  slice(which.min(abs(p - .05))) %>% 
  data.frame(.) %>% 
  select(ES50,sp) %>% 
  rename(sensitivity = ES50) %>% 
  arrange(sensitivity) %>% 
  print()
##    sensitivity                      sp
## 1     3.513288            potamopyrgus
## 2     3.743073       amphibola crenata
## 3     3.892110               amphipoda
## 4     3.944333     austrohelice crassa
## 5     4.089359       hemiplax hirtipes
## 6     4.121021           spionidae all
## 7     4.150406              nereididae
## 8     4.362826      arthritica bifurca
## 9     4.366324                 isopoda
## 10    4.533346             oligochaeta
## 11    4.599141      cossura consimilis
## 12    4.651881       paphies australis
## 13    4.724596            halicarcinus
## 14    4.727273            capitellidae
## 15    4.814693                nematoda
## 16    4.946660                  mysida
## 17    5.234194              nephtyidae
## 18    5.236645 austrovenus stutchburyi
## 19    5.255044 heteromastus filiformis
## 20    5.443262                nemertea
## 21    5.498591          orbiniidae all
## 22    5.502914              paraonidae
## 23    5.524700  cominella glandiformis
## 24    5.608283              glyceridae
## 25    6.000000               edwardsia
## 26    6.026442        macomona liliana
## 27    6.051962                 cumacea
## 28    6.342854               notoacmea
## 29    6.465517              actiniidae
## 30    6.629071                  diloma
## 31    6.641215                magelona
## 32    6.699195                syllidae
## 33    6.882395              goniadidae
## 34    7.058335              maldanidae
## 35    7.336834             zeacumantus
## 36    7.363923               nuculidae
## 37    8.604653                 hiatula
write.csv(sensi5,'sensitivity_5%.csv',row.names = F)###write to csv file

###Sensi5_all:dont remove samples with <50 and use taxa in 5% of samples#####
f5_all <- f %>% 
  dplyr::filter(freq>nrow(sp)*.05) ##list of taxa taxa that occur in >5% cores

sp3<- sp %>% 
  dplyr::filter(rowSums(.)>0) %>% ##remove core with < 0 individuals
  mutate(ES50 = rarefy(.,50)) %>%  #get the ES50 rarefied richness based on Hurlbert's (1971) formulation.
  select(match(f5_all$taxa,names(sp)),ES50) %>% ##select taxa that occur in >5% of samples
  gather(sp,abund,-ES50) ##convert into long format
## Warning in rarefy(., 50): Requested 'sample' was larger than smallest site
## maximum (1)
sensi5_all <- 
  sp3 %>% 
  group_by(sp) %>% 
  dplyr::filter(abund>0) %>% 
  arrange(ES50) %>% 
  mutate(p=percent_rank(cumsum(abund))) %>% 
  slice(which.min(abs(p - .05))) %>% 
  data.frame(.) %>% 
  select(ES50,sp) %>% 
  rename(sensitivity = ES50) %>% 
  arrange(sensitivity) %>% 
  print()
##    sensitivity                      sp
## 1     2.976488     austrohelice crassa
## 2     3.419637            potamopyrgus
## 3     3.660779              nereididae
## 4     3.769189           spionidae all
## 5     3.805448       amphibola crenata
## 6     3.880534               amphipoda
## 7     3.974684       hemiplax hirtipes
## 8     4.000000      arthritica bifurca
## 9     4.000000 austrovenus stutchburyi
## 10    4.000000             oligochaeta
## 11    4.000000       paphies australis
## 12    4.139791                 isopoda
## 13    4.401274      cossura consimilis
## 14    4.629846            capitellidae
## 15    4.649613            halicarcinus
## 16    4.651881                  mysida
## 17    4.874798               edwardsia
## 18    4.959281              paraonidae
## 19    5.000000  cominella glandiformis
## 20    5.000000                  diloma
## 21    5.000000 heteromastus filiformis
## 22    5.000000        macomona liliana
## 23    5.000000                nemertea
## 24    5.000000              nephtyidae
## 25    5.112769                nematoda
## 26    5.203340          orbiniidae all
## 27    5.396116              glyceridae
## 28    5.777435               notoacmea
## 29    6.000000                 cumacea
## 30    6.000000              maldanidae
## 31    6.000000             zeacumantus
## 32    6.136017                syllidae
## 33    6.302414              actiniidae
## 34    6.333013                magelona
## 35    6.833064                 hiatula
## 36    6.866387              goniadidae
## 37    7.000000               nuculidae
write.csv(sensi5_all,'sensitivity_5%_all.csv',row.names = F)##write to csv file

###Calculate BQI and compare results to Nigel's calculator####

###Get data on species abundance only and cesymr for Tasman disctrict only
dat <- mtm %>% select(cesymr,council,dorvilleidae:`lunella smaragda`)  %>% 
  dplyr::filter(council=='tasmandistrictcouncil') %>% 
  select(-council)

####Calculate richness and abundace
dat1 <- dat %>%
  mutate(S = rowSums(.[,-1] > 0),##richness
         N = rowSums(.[,-1])) %>% ###abundace
  select(N,S,cesymr)###collate cesymr data 

###Convert data into long format and select taxa with sensitivity values only
BQI_data_tas <- dat %>% 
  dplyr::select(match(f10$taxa,names(dat)),cesymr) %>% 
  gather(sp,abund,-cesymr) %>% 
  left_join(sensi10,by='sp') %>% 
  left_join(dat1,by='cesymr')

BQI <- 
  BQI_data_tas %>%
  group_by(cesymr) %>% 
  mutate(v1 = (abund/ #abundace of a given taxa
                 sum(abund))*##total abundance of taxa with sensitivity values
           sensitivity) %>%
  summarise(BQI = sum(v1)* ##sum of part 1 at the replicate level
              dplyr::first(log10(S+1))*#log of total ichness at replicate level
              (dplyr::first(N) / (dplyr::first(N)+5)),##abundance correction factor
            N=mean(N))


write.csv(BQI,'bqi_tasman.csv',row.names = F)

##Add BQI index to original dataset
mtm1 <- left_join(dat,BQI,by = 'cesymr')

theme_set(theme_bw(base_size = 12))

###Read Anna's results from Nigel's calculator
library(readxl)
tas=read_excel('tas.xlsx') %>% 
  bind_cols(BQI)

ggplot(tas,aes(x=`BQI-new`,y=BQI))+
  geom_point()+
  xlab('BQI-R calculator')+
  ylab('BQI-Excel calculator')+
  geom_abline(intercept = 0,slope = 1,col=2)

##there is a small discrepancy between the two indices calculator, maybe a mismatch in the number of dp?

ggplot(tas,aes(x=N,y=`Total abundance (N)`))+
  geom_point()+
  xlab('N-R calculator')+
  ylab('N-Excel calculator')+
  geom_abline(intercept = 0,slope = 1,col=2)

##Total abundance match exactly


####Calculate BQI for the whole dataset and using different sensitivity values: sensi5_all, sensi10 and sensi5####
##BQI sensi10####
dat_all <- mtm %>% select(cesymr,dorvilleidae:`lunella smaragda`)

####Calculate richness and abundace
dat_div_all<- dat_all %>%
  mutate(S = rowSums(.[,-1] > 0),##richness
         N = rowSums(.[,-1])) %>% ###abundace
  select(N,S,cesymr)###collate cesymr data 

###Convert data into long format and select taxa with sensitivity values only
BQI_dat_10 <- dat_all %>% 
  dplyr::select(match(f10$taxa,names(dat_all)),cesymr) %>% 
  gather(sp,abund,-cesymr) %>% 
  left_join(sensi10,by='sp') %>%
  left_join(dat_div_all,by='cesymr')

BQI_all_10 <- 
  BQI_dat_10 %>%
  group_by(cesymr) %>% 
  mutate(v1 = (abund/ #abundace of a given taxa
                 sum(abund))*##total abundance of taxa with sensitivity values
           sensitivity) %>%
  summarise(BQI_10 = sum(v1)* ##sum of part 1 at the replicate level
              dplyr::first(log10(S+1))*#log of total ichness at replicate level
              (dplyr::first(N) / (dplyr::first(N)+5)))

####Sensi5#####
###Convert data into long format and select taxa with sensitivity values only
BQI_dat_5 <- dat_all %>% 
  dplyr::select(match(f5$taxa,names(dat_all)),cesymr) %>% 
  gather(sp,abund,-cesymr) %>% 
  left_join(sensi5,by='sp') %>%
  left_join(dat_div_all,by='cesymr')

BQI_all_5<- 
  BQI_dat_5 %>%
  group_by(cesymr) %>% 
  mutate(v1 = (abund/ #abundace of a given taxa
                 sum(abund))*##total abundance of taxa with sensitivity values
           sensitivity) %>%
  summarise(BQI_5 = sum(v1)* ##sum of part 1 at the replicate level
              dplyr::first(log10(S+1))*#log of total ichness at replicate level
              (dplyr::first(N) / (dplyr::first(N)+5)))

####Sensi5_all#####
###Convert data into long format and select taxa with sensitivity values only
BQI_dat_5all <- dat_all %>% 
  dplyr::select(match(f5_all$taxa,names(dat_all)),cesymr) %>% 
  gather(sp,abund,-cesymr) %>% 
  left_join(sensi5_all,by='sp') %>%
  left_join(dat_div_all,by='cesymr')

BQI_all_5all<- 
  BQI_dat_5all %>%
  group_by(cesymr) %>% 
  mutate(v1 = (abund/ #abundace of a given taxa
                 sum(abund))*##total abundance of taxa with sensitivity values
           sensitivity) %>%
  summarise(BQI_5all = sum(v1)* ##sum of part 1 at the replicate level
              dplyr::first(log10(S+1))*#log of total ichness at replicate level
              (dplyr::first(N) / (dplyr::first(N)+5)))


BQI <- full_join(BQI_all_10,BQI_all_5,by='cesymr') %>% 
  full_join(BQI_all_5all,by='cesymr')
  
names(BQI)
## [1] "cesymr"   "BQI_10"   "BQI_5"    "BQI_5all"
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y,use='complete.obs'))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * r)
}

pairs(BQI[,-1],upper.panel = panel.cor)

mtm_BQI <- full_join(mtm,BQI,by='cesymr') %>% 
  select(sedlt63:As,BQI_10:BQI_5all) %>% 
  gather(var,value,-c(BQI_10:BQI_5all))

theme_set(theme_bw(base_size = 8))
ggplot(mtm_BQI,aes(x=value,y=BQI_10))+
  geom_point(alpha=.3)+
  geom_smooth()+
  facet_wrap(~var,scales = 'free')
## Warning: Removed 60627 rows containing non-finite values (stat_smooth).
## Warning: Removed 60627 rows containing missing values (geom_point).

ggplot(mtm_BQI,aes(x=value,y=BQI_5))+
  geom_point(alpha=.3)+
  geom_smooth()+
  facet_wrap(~var,scales = 'free')
## Warning: Removed 60627 rows containing non-finite values (stat_smooth).

## Warning: Removed 60627 rows containing missing values (geom_point).

ggplot(mtm_BQI,aes(x=value,y=BQI_5all))+
  geom_point(alpha=.3)+
  geom_smooth()+
  facet_wrap(~var,scales = 'free')
## Warning: Removed 60627 rows containing non-finite values (stat_smooth).

## Warning: Removed 60627 rows containing missing values (geom_point).

library(broom)
## Warning: package 'broom' was built under R version 3.1.3
BQI_fit <- full_join(mtm,BQI,by='cesymr') %>% 
  select(sedlt63,BQI_10:BQI_5all) %>% 
  gather(var,value,-sedlt63) %>% 
  group_by(var) %>% 
  do(fit = lm(value ~ sedlt63, data = .)) 

tidy(BQI_fit, fit)
## Source: local data frame [6 x 6]
## Groups: var [3]
## 
##        var        term    estimate    std.error statistic       p.value
##      (chr)       (chr)       (dbl)        (dbl)     (dbl)         (dbl)
## 1   BQI_10 (Intercept)  5.11950517 0.0217597008 235.27461  0.000000e+00
## 2   BQI_10     sedlt63 -0.02856288 0.0007024158 -40.66378  0.000000e+00
## 3    BQI_5 (Intercept)  5.13555946 0.0218196437 235.36404  0.000000e+00
## 4    BQI_5     sedlt63 -0.02885671 0.0007043508 -40.96923  0.000000e+00
## 5 BQI_5all (Intercept)  4.63914925 0.0202129928 229.51323  0.000000e+00
## 6 BQI_5all     sedlt63 -0.02555916 0.0006524872 -39.17190 9.417516e-305
glance(BQI_fit, fit) 
## Source: local data frame [3 x 12]
## Groups: var [3]
## 
##        var r.squared adj.r.squared    sigma statistic       p.value    df
##      (chr)     (dbl)         (dbl)    (dbl)     (dbl)         (dbl) (int)
## 1   BQI_10 0.1849488     0.1848370 1.360661  1653.543  0.000000e+00     2
## 2    BQI_5 0.1872156     0.1871041 1.364409  1678.478  0.000000e+00     2
## 3 BQI_5all 0.1739442     0.1738308 1.263944  1534.438 9.417516e-305     2
## Variables not shown: logLik (dbl), AIC (dbl), BIC (dbl), deviance (dbl),
##   df.residual (int)
BQI5_mud <- full_join(mtm,BQI,by='cesymr') %>% 
  select(sedlt63,BQI_5,council)

library(viridis)
## Warning: package 'viridis' was built under R version 3.1.3
ggplot(BQI5_mud,aes(sedlt63,BQI_5)) +
  geom_point(alpha=.5,aes(color=council)) +
geom_smooth(method='lm')+
  scale_color_viridis(discrete=T,name = 'Council') +
  xlab(~ "Percentage mud (<63 "*mu * "m)") +
  ylab('BQI')
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).