Data is downloaded as tab-separated UTF-8.
require(tidyverse)
require(knitr)
dat <- read_delim("Downloads/TropicosQuery-2.dat",delim = "\t", escape_double = FALSE, trim_ws = TRUE)
A quick check of data messiness can be done by seeing whether we have many divergent identifications (i.e multiple names associated with same collection number). In this case it doesn’t look too bad so we’ll proceed.
dat %>%
group_by(CollectorString,CollectionNumber) %>%
summarize(number_of_names=length(unique(`*FullName`)),
names=paste(unique(`*FullName`),collapse=", ")
) %>%
filter(number_of_names>1) %>%
arrange(desc(number_of_names)) %>%
kable()
| CollectorString | CollectionNumber | number_of_names | names |
|---|---|---|---|
| Service Forestier (SF) | 13804 | 2 | NA, Diospyros L. |
| Service Forestier (SF) | 5262 | 2 | NA, Vernonia pectoralis Baker var. pectoralis |
| Tabita Randrianarivony | 1184 | 2 | Conyza aff. bonariensis (L.) Cronquist, Conyza bonariensis (L.) Cronquist |
| Tabita Randrianarivony | 1192 | 2 | Conyza aff. attenuata DC., Conyza attenuata DC. |
| Tabita Randrianarivony, Miantohitsy, Tompoinarivo, Rehary, Tsiazokatia, Miantohitsy & Rebesa | 449 | 2 | Dombeya decaryi Hochr., Dombeya Cav. |
Basic maps: with geom_point()
ggplot(dat,aes(LongitudeDecimal,LatitudeDecimal))+
geom_point()
This is very “overplotted” because many points share the same location, so we can use transparency (alpha) and geom_jitter() which give this points a little bit of artificial scatter along the x and y axes to space them out and make ‘hotspots’ a little more visible. We’ll also make the background white (theme_bw()) for clarity.
ggplot(dat,aes(LongitudeDecimal,LatitudeDecimal))+
geom_jitter(width=0.001,height=0.001,alpha=I(1/10))+
theme_bw()
We may also use the ggmap package to plot this on a basemap (in this case from google). Here, we’ll also make the points red so they stand out more. Note you’ll have to separately register a google maps API key to run this - if you want I can help set this up.
require(ggmap)
map <- get_map(location =c(left=44.1,right=44.25,top=-22.60,bottom=-22.725),zoom=12,maptype="satellite")
ggmap(map)+geom_jitter(data=dat,aes(LongitudeDecimal,LatitudeDecimal),width=0.001,height=0.001,alpha=I(1/10),color="red")+theme_bw()+scale_x_continuous(limits=c(44.11,44.23))+scale_y_continuous(limits=c(-22.73,-22.60))
This may also be expressed as a heatmap, by dividing the data into a grid, and counting up how many collections are in each grid cell.
map <- get_map(location =c(left=44.1,right=44.25,top=-22.60,bottom=-22.725),zoom=12,maptype="satellite")
ggmap(map)+
geom_bin2d(data=dat,aes(LongitudeDecimal,LatitudeDecimal))+
stat_bin2d(data=dat,geom = "text", aes(LongitudeDecimal,LatitudeDecimal,label = after_stat(count)),color="white",size=2) +
theme_bw()+
scale_x_continuous(limits=c(44.11,44.23))+
scale_y_continuous(limits=c(-22.73,-22.60))
If we construct our bins with a custom function, we may also count how many species fall into each bin, and how many are unique to it.
#create bins
dat <- dat %>% mutate(
xcell=as.numeric(cut(LongitudeDecimal,15,labels=1:15)), #binned values of lon
ycell=as.numeric(cut(LatitudeDecimal,15,labels=1:15)), #binned values of lat
cellid=paste(xcell,ycell)) #cell "id" as text string of the two binned values
#for each bin, compute number of collections, number of species,
#turnover (beta-diversity) as number of species in a cell not shared by neighbor cells
#and unique speices (contribution to gamma-diversity) as number of species in a cell n0t share by any other cell
#empty vectors that will build up in the loop below
cellid<-c()
n_collections<-c()
n_species<-c()
turnover<-c()
unique_species<-c()
X<-c()
Y<-c()
for(i in unique(dat$cellid)){ #for each cell...
celldat<-dat[dat$cellid==i,] #subset our data to this cell
cell_species<-unique(celldat$`*FullName`) #extract a list of unique species in cell
x<-unique(celldat$xcell)
y<-unique(celldat$ycell)
#define the 8 cells surrounding a cell as its neighborhood, and get those species
l<-x-1 #left
r<-x+1 #right
u<-y+1 #upper
w<-y-1 #lower
neighborhood<-c()
for(j in l:r){neighborhood<-c(neighborhood,paste(j,u:w))} #all combinations of these
neighborhood<-setdiff(neighborhood,i) #remove the cell in question itself
celldat_n=dat[dat$cellid %in% neighborhood,]
neighbor_species=unique(celldat_n$`*FullName`)
#get all other cells (except this one) and species occuring there
celldat_o<-dat[dat$cellid!=i,]
o_species<-unique(celldat_o$`*FullName`)
#get measurements for this cell
n_collections_i<-dim(celldat)[1] #just the number of rows falling in the cell
n_species_i<-length(cell_species)
turnover_i<-length(setdiff(cell_species,neighbor_species)) #setdiff is the difference between two lists
unique_species_i<-length(setdiff(cell_species,o_species))
cellid_i<-i
#concatenate these measurements onto our empty vectors one by one
n_collections<-c(n_collections,n_collections_i)
n_species<-c(n_species,n_species_i)
turnover<-c(turnover,turnover_i)
cellid<-c(cellid,cellid_i)
unique_species<-c(unique_species,unique_species_i)
X<-c(X,x)
Y<-c(Y,y)
}
values<-data.frame(cellid,X,Y,n_collections,n_species,turnover,unique_species)
Heatmap showing collections in each cell:
ggplot(values,aes(X,Y))+
geom_raster(aes(fill=n_collections))+
geom_text(label=n_collections,color="white")
Showing number of species in each cell:
ggplot(values,aes(X,Y))+
geom_raster(aes(fill=n_species))+
geom_text(label=n_species,color="white")
And showing number of unique species in each cell:
ggplot(values,aes(X,Y))+
geom_raster(aes(fill=unique_species))+
geom_text(label=unique_species,color="white")
Finally, for cells with a certain amount of actual data recorded in neighboring cells, we can use this to interpolate expected number of new species for each new cell.
again, here’s the base heatmap by number of collections
#custom interpolate
#find all cellID's not in the data
all_cells<-data.frame(X=rep(1:15,15),Y=rep(1:15,each=15))
#add them to the data
all_cells <- merge(all_cells,values,all=T)
all_cells$interpolated<-NA
all_cells$cellid<-paste(all_cells$X,all_cells$Y)
ggplot(all_cells,aes(X,Y))+
geom_tile(aes(fill=as.numeric(n_collections)),color="light grey")+
geom_text(aes(label=n_collections),color="white")+
scale_fill_gradient2(low="white",midpoint=-40,high="black",na.value="white")+
scale_x_continuous("longitude",breaks=c(5,10,15),labels=c(44.16,44.19,44.22))+
scale_y_continuous("latitude",breaks=c(5,10,15),labels=c(-22.69,-22.65,-22.61))+
theme_bw()+theme(legend.position = "none")+ggtitle("number of collections / cell")
and here’s the heatmap by number of species:
ggplot(all_cells,aes(X,Y))+
geom_tile(aes(fill=n_species),color="light grey")+
geom_text(aes(label=n_species),color="white")+
scale_fill_gradient2(low="white",midpoint=-5,high="black",na.value="white")+
scale_x_continuous("longitude",breaks=c(5,10,15),labels=c(44.16,44.19,44.22))+
scale_y_continuous("latitude",breaks=c(5,10,15),labels=c(-22.69,-22.65,-22.61))+
theme_bw()+theme(legend.position = "none")+ggtitle("number of species / cell")
Below, we’ll interpolating cells with more than 4 neighbor cells that have recorded data, by averaging the neighbors.
k<-4
for(i in unique(all_cells[is.na(all_cells$n_species),"cellid"])){ # for each cell without data
celldat<-all_cells[all_cells$cellid==i,]
#look at its neighborhood
x<-unique(celldat$X)
y<-unique(celldat$Y)
#define the 8 cells surrounding a cell as its neighborhood
l<-x-1 #left
r<-x+1 #right
u<-y+1 #upper
w<-y-1 #lower
neighborhood<-c()
for(j in l:r){neighborhood<-c(neighborhood,paste(j,u:w))} #all combinations of these
neighborhood<-setdiff(neighborhood,i) #remove the cell in question itself
celldat_n<-all_cells[all_cells$cellid %in% neighborhood,]
celldat_nonzero<-celldat_n[!is.na(celldat_n$unique_species),]
number_of_neighbors<-dim(celldat_nonzero)[1]
if(number_of_neighbors>k){
celldat$n_species=mean(celldat_nonzero$n_species) %>% round(0)
celldat$turnover=mean(celldat_nonzero$turnover) %>% round(0)
celldat$unique_species=mean(celldat_nonzero$unique_species) %>% round(0)
celldat$interpolated="new"
#add data
all_cells[all_cells$cellid==i,]<-celldat #write over
}
}
ggplot(all_cells,aes(X,Y))+
geom_tile(aes(fill=n_species),color="light grey")+
geom_text(aes(label=n_species,color=interpolated))+
scale_color_manual(values=c("red"),na.value="white")+
scale_fill_gradient2(low="white",midpoint=-5,high="black",na.value="white")+
scale_x_continuous("longitude",breaks=c(5,10,15),labels=c(44.16,44.19,44.22))+
scale_y_continuous("latitude",breaks=c(5,10,15),labels=c(-22.69,-22.65,-22.61))+
theme_bw()+theme(legend.position = "none")+ggtitle("number of species / cell")
Then we can iterate this several more times (code not shown)
This method suggests that each cell in, for example, the difficult-to-collect area in the SW corner of Analavelona, might have 10-28 species.
How would this contribute to the overall number of species for the forest? We can estimate this with an accumulation curve, seeing how each current cell contributes to the total number of species. Here, we’ll randomize the order in which we sample each cell, and run these randomized orders 100 times (transparent lines), and then run the average (heavy line).
It looks like the curve has not really started to level off = so what would it look like if we continued to sample the remaining empty cells? Perhaps as many as 1000.