Rationale
The project involves Exploratory Data Analysis of species occurrence data of large branchiopod crustaceans from the arid regions of Maharashtra state, India.
The field work has been carried out by Dr. Avinash Vanjare’s lab from Ahmednagar College, Maharashtra, India while I helped in the identification of the animals and conducted the Exploratory data analysis
A scientific poster in this regard has been presented at the Aquatic Ecosystems: Sustainability and Conservation conference held at IISER, Pune,2019 by Dr. Vanjare’s students
# Getting the data from the main dataset
locality_data<-read_excel(data_path,
sheet=3)%>%
dplyr::select(Locality,
Type,
lat,
long)
# Color palette for the map
pal <- colorFactor(
palette = c('steelblue',
'forestgreen',
'black',
'purple',
'orange',
'grey60'),
domain = locality_data$Locality)
# Map using leaflet package
lbranchi_arid_map<-leaflet(locality_data) %>%
addProviderTiles("Stamen.Terrain") %>%
addCircleMarkers(
color = "black",
opacity = 1,
stroke = TRUE,
lng = ~long,
lat = ~lat,
label = ~as.character(Locality),
radius = 4)
# Adding scale bar
addScaleBar(lbranchi_arid_map,
position = 'topright')%>%
addProviderTiles(providers$Stamen.Terrain) %>% ## adding a minimap for overview
addMiniMap(
tiles = providers$Esri.WorldImagery,
toggleDisplay = TRUE)data_summ_lbranchi<-psych::describe(lbranchi_data%>%dplyr::select(Altitude,
pH,
Temperature,
Salinity,
Total_species),
na.rm = T)
# Visualizing the table using DT package
require(DT)
DT::datatable(round(data_summ_lbranchi,2),
rownames = TRUE,
width = '100%',
height = '100%')The pH ranged from mild acidic to moderately alkaline while the altitude did not vary much.
# Converting from wide to long form
lbranchi_long<-lbranchi_data%>%dplyr::select(Altitude,
pH,
Temperature,
Salinity)%>%
tidyr::gather(env_var,values,
Altitude:Salinity)
# plot for env var
lbranchi_long%>%
ggplot(aes(x=env_var,
y=values))+
geom_boxplot(col='black',
fill='grey60',
lwd=1)+
theme_bw(base_size = 19)+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
facet_wrap(~env_var,
scales = 'free')+
labs(x="Environmental variables",
y="Value")Almost all the species occurred in these two habitat types
# Converting from wide to long form
lbranchi_long_type<-lbranchi_data%>%dplyr::select(Type,Altitude,
pH,
Temperature,
Salinity,
Total_species)%>%
tidyr::gather(env_var,values,
Altitude:Salinity)
# plot for env var
lbranchi_long_type%>%
ggplot(aes(x=env_var,
y=values,
fill=Type))+
geom_boxplot(lwd=0.8)+
theme_bw(base_size = 19)+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
facet_wrap(~env_var,
scales = 'free')+
labs(x="Environmental variables",
y="Value")+
scale_fill_manual(values = c('forestgreen','orange'))It was evident that range of the environmental variables measured overlapped between the habitat types
lbranchi_sp<-lbranchi_data%>%
dplyr::select(Type,S_dichotomus:C_hislopi)%>%
tidyr::gather(sp_name,pre_abs,S_dichotomus:C_hislopi)%>%
group_by(sp_name)%>%
summarise(occurrence=sum(pre_abs))%>%
dplyr::arrange(desc(occurrence))
head(lbranchi_sp,5)| sp_name | occurrence |
|---|---|
| S_dichotomus | 9 |
| C_annandalei | 4 |
| L_denticulatus | 4 |
| S_simplex | 4 |
| E_indocylindrova | 3 |
lbranchi_sp$sp_name<-fct_reorder(lbranchi_sp$sp_name,
lbranchi_sp$occurrence)
# Plot
lbranchi_sp%>%
plot_ly(x = ~occurrence,
y=~forcats::fct_reorder(sp_name,
occurrence),
type = "bar",
color = I("orange"),
orientation = 'h')%>%
layout(xaxis = list(title = "Species"),
yaxis = list(title = "Occurrence"))# OR plot in ggplot (optional)
# lbranchi_sp%>%
# ggplot(aes(x=sp_name,
# y=occurrence))+
# geom_bar(stat = 'identity',
# position = 'dodge',
# fill='orange',
# color = 'black')+
# xlab('Species')+
# theme_bw(base_size = 18)+
# theme(panel.grid.major = element_blank(),
# panel.grid.minor = element_blank())+
# scale_y_discrete(name="Occurrence",
# limits=1:9,
# breaks=c(0,5,9))+
# coord_flip()Fairy shrimp species were the most commonly occurring elements while Cyclestheria hislopi (C_hislopi) was rarely observed
# Obtaining the data
lbranchi_habitat_sp<-lbranchi_data%>%
dplyr::select(Type,S_dichotomus:C_hislopi)%>%
dplyr::filter(Type != 'River')%>%
tidyr::gather(sp_name,pre_abs,S_dichotomus:C_annandalei)%>%
dplyr::mutate(Order=rep(c("Anostraca","Laevicaudata","Spinicaudata"),
each = c(28,14,42)))%>%
group_by(Order,Type)%>%
summarise(occurrence=sum(pre_abs))%>%
dplyr::arrange(desc(occurrence))
head(lbranchi_habitat_sp,3)| Order | Type | occurrence |
|---|---|---|
| Anostraca | Pond | 11 |
| Laevicaudata | Pond | 4 |
| Spinicaudata | Pond | 4 |
lbranchi_habitat_sp$sp_name<-fct_reorder(lbranchi_habitat_sp$Order,
lbranchi_habitat_sp$Order)
lbranchi_habitat_sp%>%
ggplot(aes(x=Order,
y=occurrence,
fill=Type))+
geom_bar(stat = 'identity',
position = 'dodge',
color = 'black')+
xlab('Species')+
theme_bw(base_size = 18)+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 90))+
scale_fill_manual(values = c("#999999", "#E69F00"))+
ylab("Occurrence")+xlab("Order")From the data, it could be seen that more species occurred in ponds than in pools
# Pivoting the table to explore the species assemblages
lbranchi_data_pivot<-lbranchi_data%>%
dplyr::select(S_dichotomus:C_hislopi)%>%
dplyr::mutate(sites=sprintf("site_%d",seq=(1:15)),
site_id=seq(1:15))%>%
column_to_rownames(.,'sites')%>%
rownames_to_column(.)%>%
gather(var,
value,
S_dichotomus:C_hislopi)
# Visualization of the data
lbranchi_data_pivot%>%
plot_ly(x=~var,
y=~forcats::fct_reorder(rowname,site_id),
z=~value,
type = "heatmap",
colors = "Set3")%>%
layout(title = 'Heatmap showing species assemblages',
xaxis = list(title = "Species"),
yaxis = list(title = "Sites"))Value of 1 indicates presence and 0 indicates absence in the site
A maximum of 3 species were observed in a single site with a combination of fairy shrimp species with 2 clam shrimps. Cyclestheria hislopi being found in a river was never seen with any other species
Given the species co-occurrences, Fager’s index of co-occurrence for the large branchiopod species was then calculated The index calculates how frequently do the species pairs occur given the data. These crustaceans are known to occur in characteristic assemblages and hence it becomes interesting to check which species co-occur commonly
# The combn function is used to obtain all the species pairs combinations
# Data for obtaining the combinations
lbranchi_coocc<-lbranchi_data%>%
dplyr::select(S_dichotomus:C_hislopi)
sp_combo<-combn(colnames(lbranchi_coocc),
2,simplify = T)
head(sp_combo)## [,1] [,2] [,3] [,4]
## [1,] "S_dichotomus" "S_dichotomus" "S_dichotomus" "S_dichotomus"
## [2,] "S_simplex" "L_denticulatus" "E_indocylindrova" "E_michaeli"
## [,5] [,6] [,7] [,8]
## [1,] "S_dichotomus" "S_dichotomus" "S_simplex" "S_simplex"
## [2,] "C_annandalei" "C_hislopi" "L_denticulatus" "E_indocylindrova"
## [,9] [,10] [,11] [,12]
## [1,] "S_simplex" "S_simplex" "S_simplex" "L_denticulatus"
## [2,] "E_michaeli" "C_annandalei" "C_hislopi" "E_indocylindrova"
## [,13] [,14] [,15] [,16]
## [1,] "L_denticulatus" "L_denticulatus" "L_denticulatus" "E_indocylindrova"
## [2,] "E_michaeli" "C_annandalei" "C_hislopi" "E_michaeli"
## [,17] [,18] [,19] [,20]
## [1,] "E_indocylindrova" "E_indocylindrova" "E_michaeli" "E_michaeli"
## [2,] "C_annandalei" "C_hislopi" "C_annandalei" "C_hislopi"
## [,21]
## [1,] "C_annandalei"
## [2,] "C_hislopi"
After obtaining the species pairs, species co-occurrence values were obtained. This was done by writing a function to calculate the values
# Species combinations using 'combn' function are used to get the Fager's indices for the dataset
fg_index<-function(x){
joint_occ_1<-rowSums(lbranchi_coocc[,x])
joint_occ_2<-ifelse(joint_occ_1>=2,1,0)%>%
sum(.)
joint_occ_3<-2*joint_occ_2
tot_occ_1<-sum(lbranchi_coocc[,x])
joint_occ_3/tot_occ_1
}
# Observing the result obtained by using the above function
Fager_values<-apply(combn(colnames(lbranchi_coocc),2),
2,
fg_index)
Fager_values## [1] 0.4615385 0.3076923 0.0000000 0.0000000 0.6153846 0.0000000 0.0000000
## [8] 0.0000000 0.0000000 0.2500000 0.0000000 0.5714286 0.0000000 0.5000000
## [15] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
As observed above, the obtained dataset is a vector with no names,therefore, species combinations generated above (sp_names) are combined with the vector
Fager_data<-cbind(Values=round(Fager_values,4),
t(sp_combo))%>%
data.frame(.)%>%
dplyr::rename('Species_1'='V2',
'Species_2'='V3')%>%
dplyr::mutate_at(vars(contains('Value')),
as.character)%>%
dplyr::mutate_if(is.character,
as.numeric)%>%
arrange(desc(Values))
# Visualizing the Top 5 co-occuring species pairs using DT package
require(DT)
DT::datatable(Fager_data,
rownames = TRUE,
width = '100%',
height = '100%')The results obtained above were then visualized using a heatmap
To observe a clear trend in the visualization, species names were given order based on the occurrence (counts) in the Fager dataset (Please note that these counts are purely for a better visualization purpose)
# Species counts dataset
sp_counts<-Fager_data%>%
group_by(Species_1)%>%
summarise(sp_freq=n())%>%
arrange(desc(sp_freq))
# Joining the sp_counts with the Fager data
Fager_data2<-Fager_data%>%
inner_join(sp_counts,by="Species_1")
# Visualization using heatmap in plotly
Fager_data2%>%
plot_ly(x=~forcats::fct_reorder(Species_1,
desc(sp_freq)),
y=~forcats::fct_reorder(Species_2,
desc(sp_freq)),
z=~Values,
type = "heatmap",
hoverinfo = 'text',
text= ~paste("Species_1: ",
Species_1,
"<br>",
"Species_2: ",
Species_2,
"<br>",
"Fager_value",
Values))%>%
layout(title = 'Heatmap of Fager values',
xaxis = list(title = "Species"),
yaxis = list(title = "Species"))Both species of Streptocephalus co-occurred commonly as compared with other species. Eulimnadia michaeli did not co-occur with any other species while highest co-occurrence was observed between the Cyzicus annandalei and Streptocephalus dichotomus