Rationale

The project involves Exploratory Data Analysis of species occurrence data of large branchiopod crustaceans from the arid regions of Maharashtra state, India.

The field work has been carried out by Dr. Avinash Vanjare’s lab from Ahmednagar College, Maharashtra, India while I helped in the identification of the animals and conducted the Exploratory data analysis

A scientific poster in this regard has been presented at the Aquatic Ecosystems: Sustainability and Conservation conference held at IISER, Pune,2019 by Dr. Vanjare’s students

Data Import

# data file path

data_path<-"C:/Research_Data/Research data/Large Branchiopoda/Large_branchiopoda_faunistics_arid_region/Arid_area_collections.xlsx"

# Large branchiopod data

lbranchi_data<-read_excel(data_path,
                            sheet=3)

Map of the sampled localities where large branchiopods were found

# Getting the data from the main dataset

locality_data<-read_excel(data_path,
                          sheet=3)%>%
    dplyr::select(Locality,
                  Type,
                  lat,
                  long)


# Color palette for the map


pal <- colorFactor(
    palette = c('steelblue', 
                'forestgreen', 
                'black',
                'purple', 
                'orange',
                'grey60'),
    domain = locality_data$Locality)


# Map using leaflet package


lbranchi_arid_map<-leaflet(locality_data) %>%
     addProviderTiles("Stamen.Terrain") %>%
    addCircleMarkers(
        color = "black",
        opacity = 1,
        stroke = TRUE,
        lng = ~long, 
        lat = ~lat,
        label = ~as.character(Locality),
        radius = 4)
   
# Adding scale bar

addScaleBar(lbranchi_arid_map,
            position = 'topright')%>%
    addProviderTiles(providers$Stamen.Terrain) %>% ## adding a minimap for overview
    addMiniMap(
        tiles = providers$Esri.WorldImagery,
        toggleDisplay = TRUE)

Data summary of the environmental dataset

data_summ_lbranchi<-psych::describe(lbranchi_data%>%dplyr::select(Altitude,
                  pH,
                  Temperature,
                  Salinity,
                  Total_species),
                  na.rm = T)


# Visualizing the table using DT package

require(DT)

DT::datatable(round(data_summ_lbranchi,2),
                     rownames = TRUE,
                     width = '100%', 
                     height = '100%')
#write.csv(data_summ_lbranchi,'data_summ_lbranchi.csv')

The pH ranged from mild acidic to moderately alkaline while the altitude did not vary much.

Data visualization of some environmental variables

# Converting from wide to long form 

lbranchi_long<-lbranchi_data%>%dplyr::select(Altitude,
                  pH,
                  Temperature,
                  Salinity)%>%
       tidyr::gather(env_var,values, 
                  Altitude:Salinity)

# plot for env var

lbranchi_long%>%
    ggplot(aes(x=env_var,
               y=values))+
    geom_boxplot(col='black',
                 fill='grey60',
                 lwd=1)+
        theme_bw(base_size = 19)+
     theme(axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank())+
    facet_wrap(~env_var,
               scales = 'free')+
    labs(x="Environmental variables",
         y="Value")

Data visualization of some environmental variables based on habitat types (pools and ponds)

Almost all the species occurred in these two habitat types

# Converting from wide to long form 

lbranchi_long_type<-lbranchi_data%>%dplyr::select(Type,Altitude,
                  pH,
                  Temperature,
                  Salinity,
                  Total_species)%>%
       tidyr::gather(env_var,values, 
                  Altitude:Salinity)

# plot for env var

lbranchi_long_type%>%
    ggplot(aes(x=env_var,
               y=values,
               fill=Type))+
    geom_boxplot(lwd=0.8)+
    theme_bw(base_size = 19)+
    theme(axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank())+
    facet_wrap(~env_var,
               scales = 'free')+
    labs(x="Environmental variables",
         y="Value")+
    scale_fill_manual(values = c('forestgreen','orange'))

It was evident that range of the environmental variables measured overlapped between the habitat types

Faunistics summary and visualization

lbranchi_sp<-lbranchi_data%>%
    dplyr::select(Type,S_dichotomus:C_hislopi)%>%
    tidyr::gather(sp_name,pre_abs,S_dichotomus:C_hislopi)%>%
    group_by(sp_name)%>%
    summarise(occurrence=sum(pre_abs))%>%
    dplyr::arrange(desc(occurrence))

head(lbranchi_sp,5)
sp_name occurrence
S_dichotomus 9
C_annandalei 4
L_denticulatus 4
S_simplex 4
E_indocylindrova 3
lbranchi_sp$sp_name<-fct_reorder(lbranchi_sp$sp_name, 
                                 lbranchi_sp$occurrence)

# Plot

lbranchi_sp%>%
    plot_ly(x = ~occurrence,
            y=~forcats::fct_reorder(sp_name,
                                    occurrence),
            type = "bar",
            color = I("orange"),
            orientation = 'h')%>%
    layout(xaxis = list(title = "Species"),
           yaxis = list(title = "Occurrence"))
# OR plot in ggplot (optional)

# lbranchi_sp%>%
#     ggplot(aes(x=sp_name,
#                y=occurrence))+
#     geom_bar(stat = 'identity',
#              position = 'dodge',
#              fill='orange',
#              color = 'black')+
#     xlab('Species')+
#     theme_bw(base_size = 18)+
#     theme(panel.grid.major = element_blank(), 
#           panel.grid.minor = element_blank())+
#     scale_y_discrete(name="Occurrence", 
#                      limits=1:9,
#                      breaks=c(0,5,9))+
#     coord_flip()

Fairy shrimp species were the most commonly occurring elements while Cyclestheria hislopi (C_hislopi) was rarely observed

Number of occurrences with respect to Large branchiopod orders and habitat type (pools vs ponds)

# Obtaining the data

lbranchi_habitat_sp<-lbranchi_data%>%
    dplyr::select(Type,S_dichotomus:C_hislopi)%>%
    dplyr::filter(Type != 'River')%>%
    tidyr::gather(sp_name,pre_abs,S_dichotomus:C_annandalei)%>%
    dplyr::mutate(Order=rep(c("Anostraca","Laevicaudata","Spinicaudata"),
                             each = c(28,14,42)))%>%
    group_by(Order,Type)%>%
    summarise(occurrence=sum(pre_abs))%>%
    dplyr::arrange(desc(occurrence))


head(lbranchi_habitat_sp,3)
Order Type occurrence
Anostraca Pond 11
Laevicaudata Pond 4
Spinicaudata Pond 4
lbranchi_habitat_sp$sp_name<-fct_reorder(lbranchi_habitat_sp$Order, 
                                 lbranchi_habitat_sp$Order)


lbranchi_habitat_sp%>%
    ggplot(aes(x=Order,
               y=occurrence,
               fill=Type))+
    geom_bar(stat = 'identity',
             position = 'dodge',
             color = 'black')+
    xlab('Species')+
    theme_bw(base_size = 18)+
    theme(panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          axis.text.x = element_text(angle = 90))+
    scale_fill_manual(values = c("#999999", "#E69F00"))+
    ylab("Occurrence")+xlab("Order")

From the data, it could be seen that more species occurred in ponds than in pools

Species assemblages visualization for the collected data

# Pivoting the table to explore the species assemblages

lbranchi_data_pivot<-lbranchi_data%>%
    dplyr::select(S_dichotomus:C_hislopi)%>%
    dplyr::mutate(sites=sprintf("site_%d",seq=(1:15)),
                  site_id=seq(1:15))%>%
    column_to_rownames(.,'sites')%>%
    rownames_to_column(.)%>%
    gather(var, 
           value,
           S_dichotomus:C_hislopi)

# Visualization of the data

lbranchi_data_pivot%>%
    plot_ly(x=~var,
            y=~forcats::fct_reorder(rowname,site_id),
            z=~value,
            type = "heatmap",
            colors = "Set3")%>%
    layout(title = 'Heatmap showing species assemblages',
           xaxis = list(title = "Species"),
           yaxis = list(title = "Sites"))

Value of 1 indicates presence and 0 indicates absence in the site

A maximum of 3 species were observed in a single site with a combination of fairy shrimp species with 2 clam shrimps. Cyclestheria hislopi being found in a river was never seen with any other species

Given the species co-occurrences, Fager’s index of co-occurrence for the large branchiopod species was then calculated The index calculates how frequently do the species pairs occur given the data. These crustaceans are known to occur in characteristic assemblages and hence it becomes interesting to check which species co-occur commonly

Fager’s index calculation

Obtaining the data

# The combn function is used to obtain all the species pairs combinations

# Data for obtaining the combinations


lbranchi_coocc<-lbranchi_data%>%
    dplyr::select(S_dichotomus:C_hislopi)


sp_combo<-combn(colnames(lbranchi_coocc),
                2,simplify = T)

head(sp_combo)
##      [,1]           [,2]             [,3]               [,4]          
## [1,] "S_dichotomus" "S_dichotomus"   "S_dichotomus"     "S_dichotomus"
## [2,] "S_simplex"    "L_denticulatus" "E_indocylindrova" "E_michaeli"  
##      [,5]           [,6]           [,7]             [,8]              
## [1,] "S_dichotomus" "S_dichotomus" "S_simplex"      "S_simplex"       
## [2,] "C_annandalei" "C_hislopi"    "L_denticulatus" "E_indocylindrova"
##      [,9]         [,10]          [,11]       [,12]             
## [1,] "S_simplex"  "S_simplex"    "S_simplex" "L_denticulatus"  
## [2,] "E_michaeli" "C_annandalei" "C_hislopi" "E_indocylindrova"
##      [,13]            [,14]            [,15]            [,16]             
## [1,] "L_denticulatus" "L_denticulatus" "L_denticulatus" "E_indocylindrova"
## [2,] "E_michaeli"     "C_annandalei"   "C_hislopi"      "E_michaeli"      
##      [,17]              [,18]              [,19]          [,20]       
## [1,] "E_indocylindrova" "E_indocylindrova" "E_michaeli"   "E_michaeli"
## [2,] "C_annandalei"     "C_hislopi"        "C_annandalei" "C_hislopi" 
##      [,21]         
## [1,] "C_annandalei"
## [2,] "C_hislopi"

After obtaining the species pairs, species co-occurrence values were obtained. This was done by writing a function to calculate the values

Obtaining species co-occurrence values

# Species combinations using 'combn' function are used to get the Fager's indices for the dataset


fg_index<-function(x){
    
    joint_occ_1<-rowSums(lbranchi_coocc[,x])
    
    joint_occ_2<-ifelse(joint_occ_1>=2,1,0)%>%
        sum(.)
    
    joint_occ_3<-2*joint_occ_2
    
    tot_occ_1<-sum(lbranchi_coocc[,x])
    
    joint_occ_3/tot_occ_1
    
}


# Observing the result obtained by using the above function


Fager_values<-apply(combn(colnames(lbranchi_coocc),2),
                    2,
                    fg_index)

Fager_values
##  [1] 0.4615385 0.3076923 0.0000000 0.0000000 0.6153846 0.0000000 0.0000000
##  [8] 0.0000000 0.0000000 0.2500000 0.0000000 0.5714286 0.0000000 0.5000000
## [15] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000

As observed above, the obtained dataset is a vector with no names,therefore, species combinations generated above (sp_names) are combined with the vector

Obtaining Fager’s values for respective species pairs

Fager_data<-cbind(Values=round(Fager_values,4),
                  t(sp_combo))%>%
    data.frame(.)%>%
    dplyr::rename('Species_1'='V2',
                  'Species_2'='V3')%>%
    dplyr::mutate_at(vars(contains('Value')),
                     as.character)%>%
    dplyr::mutate_if(is.character,
                     as.numeric)%>%
    arrange(desc(Values))


# Visualizing the Top 5 co-occuring species pairs using DT package

require(DT)

DT::datatable(Fager_data,
                     rownames = TRUE,
                     width = '100%', 
                     height = '100%')

The results obtained above were then visualized using a heatmap

To observe a clear trend in the visualization, species names were given order based on the occurrence (counts) in the Fager dataset (Please note that these counts are purely for a better visualization purpose)

Data Visualization of Fager’s index

# Species counts dataset 

sp_counts<-Fager_data%>%
    group_by(Species_1)%>%
    summarise(sp_freq=n())%>%
    arrange(desc(sp_freq))


# Joining the sp_counts with the Fager data


Fager_data2<-Fager_data%>%
    inner_join(sp_counts,by="Species_1")


# Visualization using heatmap in plotly

Fager_data2%>%
    plot_ly(x=~forcats::fct_reorder(Species_1,
                                    desc(sp_freq)),
            y=~forcats::fct_reorder(Species_2,
                                    desc(sp_freq)),
            z=~Values,
            type = "heatmap",
            hoverinfo = 'text',
            text= ~paste("Species_1: ",
                         Species_1, 
                         "<br>",
                         "Species_2: ",
                         Species_2,
                         "<br>",
                         "Fager_value",
                         Values))%>%
    layout(title = 'Heatmap of Fager values',
           xaxis = list(title = "Species"),
           yaxis = list(title = "Species"))

Both species of Streptocephalus co-occurred commonly as compared with other species. Eulimnadia michaeli did not co-occur with any other species while highest co-occurrence was observed between the Cyzicus annandalei and Streptocephalus dichotomus