Load pacakges and reading Tropicos data

Data is downloaded as tab-separated UTF-8.

require(tidyverse)
require(knitr)

dat <- read_delim("Downloads/TropicosQuery-2.dat",delim = "\t", escape_double = FALSE, trim_ws = TRUE)

Checking Tropicos data

A quick check of data messiness can be done by seeing whether we have many divergent identifications (i.e multiple names associated with same collection number). In this case it doesn’t look too bad so we’ll proceed.

dat %>%
  group_by(CollectorString,CollectionNumber) %>%
  summarize(number_of_names=length(unique(`*FullName`)),
            names=paste(unique(`*FullName`),collapse=", ")
            ) %>%
  filter(number_of_names>1) %>%
  arrange(desc(number_of_names)) %>%
  kable()
CollectorString CollectionNumber number_of_names names
Service Forestier (SF) 13804 2 NA, Diospyros L.
Service Forestier (SF) 5262 2 NA, Vernonia pectoralis Baker var. pectoralis
Tabita Randrianarivony 1184 2 Conyza aff. bonariensis (L.) Cronquist, Conyza bonariensis (L.) Cronquist
Tabita Randrianarivony 1192 2 Conyza aff. attenuata DC., Conyza attenuata DC.
Tabita Randrianarivony, Miantohitsy, Tompoinarivo, Rehary, Tsiazokatia, Miantohitsy & Rebesa 449 2 Dombeya decaryi Hochr., Dombeya Cav.

Basic map

Basic maps: with geom_point()

ggplot(dat,aes(LongitudeDecimal,LatitudeDecimal))+
  geom_point()

This is very “overplotted” because many points share the same location, so we can use transparency (alpha) and geom_jitter() which give this points a little bit of artificial scatter along the x and y axes to space them out and make ‘hotspots’ a little more visible. We’ll also make the background white (theme_bw()) for clarity.

ggplot(dat,aes(LongitudeDecimal,LatitudeDecimal))+
  geom_jitter(width=0.001,height=0.001,alpha=I(1/10))+
  theme_bw()

We may also use the ggmap package to plot this on a basemap (in this case from google). Here, we’ll also make the points red so they stand out more. Note you’ll have to separately register a google maps API key to run this - if you want I can help set this up.

require(ggmap)

map <- get_map(location =c(left=44.1,right=44.25,top=-22.60,bottom=-22.725),zoom=12,maptype="satellite")
ggmap(map)+geom_jitter(data=dat,aes(LongitudeDecimal,LatitudeDecimal),width=0.001,height=0.001,alpha=I(1/10),color="red")+theme_bw()+scale_x_continuous(limits=c(44.11,44.23))+scale_y_continuous(limits=c(-22.73,-22.60))

Heatmap

This may also be expressed as a heatmap, by dividing the data into a grid, and counting up how many collections are in each grid cell.

map <- get_map(location =c(left=44.1,right=44.25,top=-22.60,bottom=-22.725),zoom=12,maptype="satellite")
ggmap(map)+
  geom_bin2d(data=dat,aes(LongitudeDecimal,LatitudeDecimal))+
  stat_bin2d(data=dat,geom = "text", aes(LongitudeDecimal,LatitudeDecimal,label = after_stat(count)),color="white",size=2) +
  theme_bw()+
  scale_x_continuous(limits=c(44.11,44.23))+
  scale_y_continuous(limits=c(-22.73,-22.60))

If we construct our bins with a custom function, we may also count how many species fall into each bin, and how many are unique to it.

#create bins
dat <- dat %>% mutate(
  xcell=as.numeric(cut(LongitudeDecimal,15,labels=1:15)), #binned values of lon
  ycell=as.numeric(cut(LatitudeDecimal,15,labels=1:15)), #binned values of lat
  cellid=paste(xcell,ycell)) #cell "id" as text string of the two binned values

#for each bin, compute number of collections, number of species,
#turnover (beta-diversity) as number of species in a cell not shared by neighbor cells
#and unique speices (contribution to gamma-diversity) as number of species in a cell n0t share by any other cell

#empty vectors that will build up in the loop below
cellid<-c()
n_collections<-c()
n_species<-c()
turnover<-c()
unique_species<-c()
X<-c()
Y<-c()
for(i in unique(dat$cellid)){ #for each cell...
  celldat<-dat[dat$cellid==i,] #subset our data to this cell
  cell_species<-unique(celldat$`*FullName`) #extract a list of unique species in cell
  x<-unique(celldat$xcell) 
  y<-unique(celldat$ycell)
  
  #define the 8 cells surrounding a cell as its neighborhood, and get those species
  l<-x-1 #left
  r<-x+1 #right
  u<-y+1 #upper
  w<-y-1 #lower
  neighborhood<-c() 
  for(j in l:r){neighborhood<-c(neighborhood,paste(j,u:w))} #all combinations of these
    neighborhood<-setdiff(neighborhood,i) #remove the cell in question itself
  celldat_n=dat[dat$cellid %in% neighborhood,]
  neighbor_species=unique(celldat_n$`*FullName`)
  
  #get all other cells (except this one) and species occuring there
  celldat_o<-dat[dat$cellid!=i,]
  o_species<-unique(celldat_o$`*FullName`)
  
  #get measurements for this cell
  n_collections_i<-dim(celldat)[1] #just the number of rows falling in the cell
  n_species_i<-length(cell_species) 
  turnover_i<-length(setdiff(cell_species,neighbor_species)) #setdiff is the difference between two lists
  unique_species_i<-length(setdiff(cell_species,o_species))
  cellid_i<-i
  
  #concatenate these measurements onto our empty vectors one by one
  n_collections<-c(n_collections,n_collections_i)
  n_species<-c(n_species,n_species_i)
  turnover<-c(turnover,turnover_i)
  cellid<-c(cellid,cellid_i)
  unique_species<-c(unique_species,unique_species_i)
  X<-c(X,x)
  Y<-c(Y,y)
  
}

values<-data.frame(cellid,X,Y,n_collections,n_species,turnover,unique_species)  

Heatmap showing collections in each cell:

ggplot(values,aes(X,Y))+
  geom_raster(aes(fill=n_collections))+
  geom_text(label=n_collections,color="white")

Showing number of species in each cell:

ggplot(values,aes(X,Y))+
  geom_raster(aes(fill=n_species))+
  geom_text(label=n_species,color="white")

And showing number of unique species in each cell:

ggplot(values,aes(X,Y))+
  geom_raster(aes(fill=unique_species))+
  geom_text(label=unique_species,color="white")

Predicting data for not-yet-collected cells

Finally, for cells with a certain amount of actual data recorded in neighboring cells, we can use this to interpolate expected number of new species for each new cell.

again, here’s the base heatmap by number of collections

#custom interpolate
#find all cellID's not in the data
all_cells<-data.frame(X=rep(1:15,15),Y=rep(1:15,each=15))

#add them to the data
all_cells <- merge(all_cells,values,all=T)

all_cells$interpolated<-NA
all_cells$cellid<-paste(all_cells$X,all_cells$Y)

ggplot(all_cells,aes(X,Y))+
  geom_tile(aes(fill=as.numeric(n_collections)),color="light grey")+
  geom_text(aes(label=n_collections),color="white")+
  scale_fill_gradient2(low="white",midpoint=-40,high="black",na.value="white")+
  scale_x_continuous("longitude",breaks=c(5,10,15),labels=c(44.16,44.19,44.22))+
  scale_y_continuous("latitude",breaks=c(5,10,15),labels=c(-22.69,-22.65,-22.61))+
  theme_bw()+theme(legend.position = "none")+ggtitle("number of collections / cell")

and here’s the heatmap by number of species:

ggplot(all_cells,aes(X,Y))+
  geom_tile(aes(fill=n_species),color="light grey")+
  geom_text(aes(label=n_species),color="white")+
  scale_fill_gradient2(low="white",midpoint=-5,high="black",na.value="white")+
  scale_x_continuous("longitude",breaks=c(5,10,15),labels=c(44.16,44.19,44.22))+
  scale_y_continuous("latitude",breaks=c(5,10,15),labels=c(-22.69,-22.65,-22.61))+
  theme_bw()+theme(legend.position = "none")+ggtitle("number of species / cell")

Below, we’ll interpolating cells with more than 4 neighbor cells that have recorded data, by averaging the neighbors.

k<-4

  for(i in unique(all_cells[is.na(all_cells$n_species),"cellid"])){ # for each cell without data
  celldat<-all_cells[all_cells$cellid==i,]
  #look at its neighborhood
  x<-unique(celldat$X)
  y<-unique(celldat$Y)
  #define the 8 cells surrounding a cell as its neighborhood
  l<-x-1 #left
  r<-x+1 #right
  u<-y+1 #upper
  w<-y-1 #lower
  neighborhood<-c() 
  for(j in l:r){neighborhood<-c(neighborhood,paste(j,u:w))} #all combinations of these
  neighborhood<-setdiff(neighborhood,i) #remove the cell in question itself
  celldat_n<-all_cells[all_cells$cellid %in% neighborhood,]
  celldat_nonzero<-celldat_n[!is.na(celldat_n$unique_species),]
  number_of_neighbors<-dim(celldat_nonzero)[1]
  if(number_of_neighbors>k){
  celldat$n_species=mean(celldat_nonzero$n_species) %>% round(0)
  celldat$turnover=mean(celldat_nonzero$turnover) %>% round(0)
  celldat$unique_species=mean(celldat_nonzero$unique_species) %>% round(0)
  celldat$interpolated="new"
  #add data
  all_cells[all_cells$cellid==i,]<-celldat #write over
  }
  }

ggplot(all_cells,aes(X,Y))+
  geom_tile(aes(fill=n_species),color="light grey")+
  geom_text(aes(label=n_species,color=interpolated))+
  scale_color_manual(values=c("red"),na.value="white")+
  scale_fill_gradient2(low="white",midpoint=-5,high="black",na.value="white")+
  scale_x_continuous("longitude",breaks=c(5,10,15),labels=c(44.16,44.19,44.22))+
  scale_y_continuous("latitude",breaks=c(5,10,15),labels=c(-22.69,-22.65,-22.61))+
  theme_bw()+theme(legend.position = "none")+ggtitle("number of species / cell")

Then we can iterate this several more times (code not shown)

This method suggests that each cell in, for example, the difficult-to-collect area in the SW corner of Analavelona, might have 10-28 species.

How would this contribute to the overall number of species for the forest? We can estimate this with an accumulation curve, seeing how each current cell contributes to the total number of species. Here, we’ll randomize the order in which we sample each cell, and run these randomized orders 100 times (transparent lines), and then run the average (heavy line).

It looks like the curve has not really started to level off = so what would it look like if we continued to sample the remaining empty cells? Perhaps as many as 1000.