In this note, we introduce a method for working with morphological trait data whose values are intervals (a minimum and a maximum for each species).

PART I: WORKING WITH CONTINUOUS VALUE

Applicable to the measurement characteristics. Ex: Length of the petiole, stipule, spike….

Idea general

  • We get the minimum and maximum value of each trait.
  • We cut the total interval [min;max] into n intervals (classes); n depends on the trait.
  • We test MERGEMODE for each pair of intervals: two intervals can be merged if the list of species selected by one of them is entirely contained in the list of species selected by the other.
  • We look for the largest clique among the pairs of intervals that can be merged, i.e. the largest group of intervals that are all pairwise mergeable and can therefore be grouped into a single interval.

FUNCTION

b.check.a.in.b

This function returns TRUE if all elements of list a are present in list b, or the reverse. Note: if a or b is empty, it also returns TRUE.

#' Test whether two species selections are nested.
#'
#' @param a,b Data frames (or lists) with a `sp` element holding species names.
#' @return `TRUE` when every species of one selection also occurs in the other
#'   one, or when at least one selection is empty; `FALSE` otherwise.
b.check.a.in.b <- function(a, b) {
    # Compare sets, not raw vectors: a repeated species name must not decide
    # which selection is treated as "the smaller one".
    sp_a <- unique(a$sp)
    sp_b <- unique(b$sp)
    # An empty selection never prevents a merge.
    if (length(sp_a) == 0 || length(sp_b) == 0) {
        return(TRUE)
    }
    all(sp_a %in% sp_b) || all(sp_b %in% sp_a)
}

b.mergemode_interval

This function compares the two lists of species corresponding to two intervals [a1;a2) and [b1;b2). It returns TRUE if the two intervals can be merged, i.e. if all the species selected by one interval are also selected by the other.

#' Check whether two value intervals select nested lists of species.
#'
#' @param df Data frame with columns `sp`, `min` and `max`.
#' @param a1,a2 Lower and upper bound of the first interval.
#' @param b1,b2 Lower and upper bound of the second interval.
#' @return The result of `b.check.a.in.b()` applied to the two selections.
b.mergemode_interval <- function(df, a1, a2, b1, b2) {
    lower <- df[, "min"]
    upper <- df[, "max"]
    # A row is selected when min < upper bound and max >= lower bound.
    # Plain indexing is used instead of subset(): subset() evaluates its
    # condition inside the data frame, so a column named like an argument
    # would shadow it. which() drops rows whose comparison is NA, as subset()
    # does.
    in_a <- which(lower < a2 & upper >= a1)
    in_b <- which(lower < b2 & upper >= b1)
    b.check.a.in.b(df[in_a, , drop = FALSE], df[in_b, , drop = FALSE])
}

b.mergemode.one.trait.interval_con

This function compares every pair of intervals obtained by cutting the range of the trait into `class` classes, and suggests which intervals can be merged together. The input df is a data frame with 3 columns: sp, min and max.

#' Suggest which intervals of a continuous trait can be merged.
#'
#' The range from the smallest `min` to the largest `max` is cut into `class`
#' equal-width intervals and every pair of intervals is tested with
#' `b.mergemode_interval()`.
#'
#' @param df Data frame with columns `sp`, `min` and `max`.
#' @param class Number of intervals to cut the range into (default 10).
#' @return Character vector with one label "a1-a2 & b1-b2" per mergeable pair,
#'   or `NULL` when no pair can be merged.
b.mergemode.one.trait.interval_con <- function(df, class = 10) {
    stopifnot(is.numeric(class), length(class) == 1, class > 0)

    # Local names that do not shadow base::min / base::max.
    lower <- min(df$min)
    upper <- max(df$max)
    step <- (upper - lower) / class
    mod <- seq(lower, upper, step)      # break points, one more than intervals
    n_interval <- length(mod) - 1
    res <- c()

    # Fewer than two intervals: there is no pair to compare.
    if (n_interval < 2) {
        return(res)
    }

    # Explicit bounds for "every pair i < j". The previous `i+1:n` parsed as
    # `i + (1:n)` and only stayed in range thanks to a `break`.
    for (i in seq_len(n_interval - 1)) {
        for (j in seq(i + 1, n_interval)) {
            can_merge <- b.mergemode_interval(df, mod[i], mod[i + 1],
                                              mod[j], mod[j + 1])
            if (isTRUE(can_merge)) {
                res <- append(res, paste0(mod[i], "-", mod[i + 1], " & ",
                                          mod[j], "-", mod[j + 1]))
            }
        }
    }
    return(res)
}

b.plot_min_max

Function to plot the min–max range of each species from a data frame with 3 columns: "sp", "min" and "max".

#' Plot the min-max range of every species as a horizontal segment.
#'
#' @param data Data frame with columns `sp`, `min` and `max`.
#' @return A ggplot object: one blue line per species, a red point at each end.
b.plot_min_max <- function(data) {
    # Both bounds must be numeric before they are mapped to the value axis.
    data$min <- as.numeric(data$min)
    data$max <- as.numeric(data$max)

    # Layers are added in order: the range line first, then its two end points.
    range_plot <- ggplot(data, aes(sp))
    range_plot <- range_plot +
        geom_linerange(aes(ymin = min, ymax = max), linetype = 1, color = "blue")
    range_plot <- range_plot +
        geom_point(aes(y = min), size = 3, color = "red") +
        geom_point(aes(y = max), size = 3, color = "red")

    # coord_flip() puts the species on the vertical axis.
    range_plot +
        theme_bw() +
        coord_flip() +
        ylab("min-max") +
        xlab("species")
}

RUN

Load package

list_pkg <- c(
            "stringr",
            "tidyverse",
            "igraph",
            "visNetwork")
sapply(list_pkg, library, character.only = TRUE)

Load data

The data file min_max.csv can be downloaded here.

# Read the raw table and strip the square brackets from every column, so that
# each range is left as a "min;max" string.
cecropia_min_max <- as.data.frame(
    lapply(
        read.csv("min_max.csv", header = TRUE),
        function(column) gsub("\\]", "", gsub("\\[", "", column))
    )
)
head(cecropia_min_max, n = 10)
##                            sp   segment   ratio      vein stamin_spike
## 1       01. Cecropia albicans 11.0;13.0 4.0;8.0 10.0;16.0     7.0;24.0
## 2         02. Cecropia andina   7.0;9.0 4.0;7.0 10.0;15.0     6.0;16.0
## 3       03. Cecropia angulata  8.0;11.0 7.0;9.0 15.0;20.0     8.0;20.0
## 4   04. Cecropia angustifolia  8.0;14.0 5.0;9.0 12.0;43.0    10.0;50.0
## 5       05. Cecropia annulata  8.0;10.0 6.0;9.0 11.0;20.0    10.0;25.0
## 6        06. Cecropia bullata   7.0;9.0 5.0;6.0 10.0;20.0     4.0;10.0
## 7  07. Cecropia chlorostachya  7.0;10.0 0.0;6.0 15.0;20.0      7.0;8.0
## 8       08. Cecropia concolor  7.0;11.0 9.0;9.0 16.0;30.0     8.0;20.0
## 9      09. Cecropia distachya  5.0;12.0 6.0;9.0 12.0;25.0     8.0;50.0
## 10      10. Cecropia elongata  8.0;16.0 7.0;9.0  8.0;40.0    13.0;25.0
##    pistil_spike    pet_len
## 1       4.0;9.0  30.0;75.0
## 2       3.0;4.0  30.0;85.0
## 3       4.0;6.0 50.0;110.0
## 4      3.0;12.0  20.0;70.0
## 5       4.0;6.0  25.0;70.0
## 6       1.0;2.0  40.0;55.0
## 7       4.0;4.0  25.0;55.0
## 8       4.0;4.0  25.0;50.0
## 9       2.0;4.0 15.0;100.0
## 10      4.0;9.0 20.0;105.0

In this data frame, pet_len (length of the petiole) is continuous data.

We build a data frame for the length of the petiole (pet_len) with 3 columns, sp, min and max, from cecropia_min_max.

# Keep the species name and the petiole length, then split the "min;max"
# string into two columns.
pet_len <- separate(
    select(cecropia_min_max, sp, pet_len),
    pet_len,
    into = c("min", "max"),
    sep = ";"
)

# Convert every column except the first one (sp) to numeric
pet_len[, -1] <- sapply(pet_len[, -1], as.numeric)

head(pet_len, n = 10)
##                            sp min max
## 1       01. Cecropia albicans  30  75
## 2         02. Cecropia andina  30  85
## 3       03. Cecropia angulata  50 110
## 4   04. Cecropia angustifolia  20  70
## 5       05. Cecropia annulata  25  70
## 6        06. Cecropia bullata  40  55
## 7  07. Cecropia chlorostachya  25  55
## 8       08. Cecropia concolor  25  50
## 9      09. Cecropia distachya  15 100
## 10      10. Cecropia elongata  20 105

MERGEMODE

Before testing MERGEMODE, we plot our data frame with its min and max values and cut the total range into 10 classes. We have:

# Break points that cut the observed range into `class` equal-width intervals.
# `min`, `max` and `class` are plain variables here; calls such as min(...)
# still reach the base functions because R skips non-function objects when it
# looks up a function name.
class <- 10
min <- min(pet_len$min)
max <- max(pet_len$max)
distance <- (max - min) / class
class_interval <- seq(min, max, distance)

# Range plot with one orange line (and one axis tick) per break point.
b.plot_min_max(pet_len) +
    geom_hline(yintercept = class_interval, col = "orange") +
    scale_y_continuous(breaks = class_interval)

Here we can see that the total interval [10;120] is cut into 10 intervals whose break points are:

# Show the break points stored in class_interval.
class_interval
##  [1]  10  21  32  43  54  65  76  87  98 109 120

Our goal is to compare the 2 lists of species selected by [10,21) and [21,32), and then to do the same for [10,21) & [32,43), [10,21) & [43,54) … [98,109) & [109,120]. To do this we call the function b.mergemode.one.trait.interval_con

# List every mergeable pair of intervals, then split each "a-b & c-d" label
# into its two intervals (columns V1 and V2).
pet_len_merg <- separate(
    data.frame(interval = b.mergemode.one.trait.interval_con(pet_len, class = 10)),
    interval,
    into = c("V1", "V2"),
    sep = " & "
)
pet_len_merg
##        V1      V2
## 1   10-21   21-32
## 2   10-21   32-43
## 3   21-32   32-43
## 4   43-54   54-65
## 5   65-76   76-87
## 6   65-76   87-98
## 7   65-76  98-109
## 8   65-76 109-120
## 9   76-87   87-98
## 10  76-87  98-109
## 11  76-87 109-120
## 12  87-98  98-109
## 13  87-98 109-120
## 14 98-109 109-120

We now have 14 pairs of intervals that can be merged together. But our task does not end here: we need to find the cliques, and in particular the largest clique, among these 14 pairs.

Find clique

In this step, we use package igraph to find clique and largest clique. And package visNetwork to make interactive graph which would be easier to observe.

We make a function to plot a graph

#' Draw an interactive network in which one clique is highlighted.
#'
#' @param g2 List with `nodes` and `edges` data frames, passed to `visNetwork()`.
#' @param largest List of cliques; each element is used to index the rows of
#'   `g2$nodes`.
#' @param number Position in `largest` of the clique to highlight.
#' @return The value of the final `visLegend()` call.
b.visNetwork <- function(g2, largest, number) {
    # Every node starts in group "Other"; members of the chosen clique are
    # moved to group "Clique_largest".
    nodes <- g2$nodes
    nodes$group <- "Other"
    nodes$group[largest[[number]]] <- "Clique_largest"

    # Build the widget step by step
    network <- visNetwork(nodes, g2$edges)
    network <- visIgraphLayout(network)
    network <- visNodes(network, size = 25, shape = "circle")
    network <- visOptions(network,
                          selectedBy = "group",
                          highlightNearest = TRUE,
                          nodesIdSelection = TRUE)
    network <- visGroups(network, groupname = "Clique_largest", color = "orange")
    network <- visInteraction(network, keyboard = TRUE)
    visLegend(network)
}
D <- pet_len_merg
#' Build an undirected graph with one edge per mergeable pair.
#' graph_from_data_frame() replaces the deprecated graph.data.frame().
g1 <- graph_from_data_frame(D, directed = FALSE)

#' Find the largest cliques (largest_cliques() replaces largest.cliques())
largest <- largest_cliques(g1)

#' Convert g1 to visNetwork data
g2 <- toVisNetworkData(g1)

# Plot, highlighting the first element of `largest`
b.visNetwork(g2, largest, 1)

RESULT

From the graph, we can see that the 10 original intervals of the length of the petiole can be merged into 3 intervals:

  1. [10-43)
  2. [43-65)
  3. [65-120]

PART II: WORKING WITH DISCRETE VALUE

Applicable to the counted characteristics. Ex: Number of lobes in laminar? pairs of secondary vein? staminate spike?, pistilate spike?

IDEA

  • Convert the data frame into a new data frame in which each line [min;max] becomes n lines holding the values from min to max
  • Convert the interval min:max of the trait into n classes, with n = max - min + 1
  • Test MERGEMODE to suggest the pairs of classes that can be merged together
  • Find the cliques among the list of pairs of classes

FUNCTION

b.convert.sp.to.specimens

This function expands the single line of each species into n lines, with n = max - min + 1.

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Convert a range written as "[min;max]" (brackets optional) into the
# numeric vector c(min, max)
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#' @param range Character vector of ranges such as "[7.0;9.0]" or "7.0;9.0".
#' @return Numeric vector with the bounds of all ranges, in input order.
b.convert.range.to.vector <- function(range) {
    # gsub() removes every square bracket, not only the first one of each
    # kind, and needs nothing beyond base R.
    bounds <- gsub("\\[|\\]", "", range)
    bounds <- unlist(strsplit(bounds, ";", fixed = TRUE))
    return(as.numeric(bounds))
}
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Expand each species line into n lines, one per unit step from the min to
# the max of the chosen trait (n = max - min + 1)
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#' @param data Data frame with a `sp` column and a trait column whose cells
#'   are ranges accepted by `b.convert.range.to.vector()`.
#' @param charac Name of the trait column to expand.
#' @return Data frame with columns `sp` and `value`; it has zero rows when
#'   `data` has zero rows.
b.convert.sp.to.specimens <- function(data, charac) {
    # Convert both columns once, outside the loop.
    ranges <- as.character(data[[charac]])
    species <- as.character(data$sp)

    # One list slot per species, instead of growing vectors with append().
    sp_parts <- vector("list", length(ranges))
    value_parts <- vector("list", length(ranges))

    # seq_along() makes the loop a no-op on empty input (1:0 would run twice).
    for (i in seq_along(ranges)) {
        min_max <- b.convert.range.to.vector(ranges[i])
        min_value <- min(min_max)
        max_value <- max(min_max)
        # d is the distance between the two bounds
        d <- max_value - min_value
        values <- min_value + 0:d
        sp_parts[[i]] <- rep(species[i], length(values))
        value_parts[[i]] <- values
    }

    # as.character()/as.numeric() keep both columns present when the lists
    # are empty (unlist() would return NULL).
    data.frame(sp = as.character(unlist(sp_parts)),
               value = as.numeric(unlist(value_parts)))
}

b.mergemode.one.trait.interval_discrete

This function compares the lists of species selected by each pair of values, and suggests which pairs of values can be merged together.

#...2. b.mergemode_value
#' Check the two lists of species corresponding to two values m1 and m2.
#'
#' @param df Data frame with columns `sp` and `value`.
#' @param m1,m2 The two trait values to compare.
#' @return The result of `b.check.a.in.b()` applied to the two selections.
b.mergemode_value <- function(df, m1, m2) {
    values <- df[, "value"]
    # Plain indexing is used instead of subset(): subset() evaluates its
    # condition inside the data frame, so a column named like an argument
    # would shadow it. which() drops NA comparisons, as subset() does.
    at_m1 <- which(values == m1)
    at_m2 <- which(values == m2)
    b.check.a.in.b(df[at_m1, , drop = FALSE], df[at_m2, , drop = FALSE])
}

#' Suggest which values of a discrete trait can be merged.
#'
#' Every pair of values between the smallest and the largest `value` is
#' tested with `b.mergemode_value()`.
#'
#' @param df Data frame with columns `sp` and `value`.
#' @return Character vector with one label "v1-v2" per mergeable pair, or
#'   `NULL` when no pair can be merged or `df` has no values.
b.mergemode.one.trait.interval_discrete <- function(df) {
    # Nothing to compare without values (min() of an empty vector is Inf).
    if (length(df$value) == 0) {
        return(NULL)
    }

    # Local names that do not shadow base::min / base::max.
    lowest <- min(df$value)
    highest <- max(df$value)
    mod <- lowest:highest
    # Count the classes from `mod` itself so the loops never index past it,
    # even when the values are not whole numbers.
    n <- length(mod)
    res <- c()

    # Every pair i < k, in the same order as before.
    for (i in seq_len(n - 1)) {
        for (k in (i + 1):n) {
            if (isTRUE(b.mergemode_value(df, mod[i], mod[k]))) {
                res <- append(res, paste(mod[i], mod[k], sep = "-"))
            }
        }
    }
    return(res)
}

RUN

Load package

list_pkg <- c(
            "stringr",
            "tidyverse",
            "igraph",
            "visNetwork")
sapply(list_pkg, library, character.only = TRUE)

Load data

We use dataframe cecropia_min_max that we have already load in R

# Preview the first 10 rows of cecropia_min_max.
head(cecropia_min_max,10)
##                            sp   segment   ratio      vein stamin_spike
## 1       01. Cecropia albicans 11.0;13.0 4.0;8.0 10.0;16.0     7.0;24.0
## 2         02. Cecropia andina   7.0;9.0 4.0;7.0 10.0;15.0     6.0;16.0
## 3       03. Cecropia angulata  8.0;11.0 7.0;9.0 15.0;20.0     8.0;20.0
## 4   04. Cecropia angustifolia  8.0;14.0 5.0;9.0 12.0;43.0    10.0;50.0
## 5       05. Cecropia annulata  8.0;10.0 6.0;9.0 11.0;20.0    10.0;25.0
## 6        06. Cecropia bullata   7.0;9.0 5.0;6.0 10.0;20.0     4.0;10.0
## 7  07. Cecropia chlorostachya  7.0;10.0 0.0;6.0 15.0;20.0      7.0;8.0
## 8       08. Cecropia concolor  7.0;11.0 9.0;9.0 16.0;30.0     8.0;20.0
## 9      09. Cecropia distachya  5.0;12.0 6.0;9.0 12.0;25.0     8.0;50.0
## 10      10. Cecropia elongata  8.0;16.0 7.0;9.0  8.0;40.0    13.0;25.0
##    pistil_spike    pet_len
## 1       4.0;9.0  30.0;75.0
## 2       3.0;4.0  30.0;85.0
## 3       4.0;6.0 50.0;110.0
## 4      3.0;12.0  20.0;70.0
## 5       4.0;6.0  25.0;70.0
## 6       1.0;2.0  40.0;55.0
## 7       4.0;4.0  25.0;55.0
## 8       4.0;4.0  25.0;50.0
## 9       2.0;4.0 15.0;100.0
## 10      4.0;9.0 20.0;105.0

We make an example with the number of segments.

Make dataframe for trait: number of segment with 3 columns ‘sp’, ‘min’, and ‘max’.

# Keep the species name and the number of segments, then split the "min;max"
# string into two columns.
segment <- separate(
    select(cecropia_min_max, sp, segment),
    segment,
    into = c("min", "max"),
    sep = ";"
)
# Convert every column except the first one (sp) to numeric
segment[, -1] <- sapply(segment[, -1], as.numeric)
head(segment, n = 10)
##                            sp min max
## 1       01. Cecropia albicans  11  13
## 2         02. Cecropia andina   7   9
## 3       03. Cecropia angulata   8  11
## 4   04. Cecropia angustifolia   8  14
## 5       05. Cecropia annulata   8  10
## 6        06. Cecropia bullata   7   9
## 7  07. Cecropia chlorostachya   7  10
## 8       08. Cecropia concolor   7  11
## 9      09. Cecropia distachya   5  12
## 10      10. Cecropia elongata   8  16

We plot our dataframe with value min, max and cut it by max-min+1 class. We have:

# Range plot with one orange line per value between the smallest minimum and
# the largest maximum.
b.plot_min_max(segment) +
    geom_hline(
        yintercept = seq(min(segment$min), max(segment$max)),
        col = "orange"
    )

MERGEMODE

Convert dataframe to n lines with b.convert.sp.to.specimens

# One line per species and per value of the "segment" trait.
segment_indi <- b.convert.sp.to.specimens(
    data = cecropia_min_max,
    charac = "segment"
)

Similar to the above part, we compare 2 list species is selected by pairs of value with function b.mergemode.one.trait.interval_discrete

# List every mergeable pair of values, then split each "v1-v2" label into its
# two values (columns V1 and V2).
segment_merg <- separate(
    data.frame(value = b.mergemode.one.trait.interval_discrete(segment_indi)),
    value,
    into = c("V1", "V2"),
    sep = "-"
)

head(segment_merg, n = 10)
##    V1 V2
## 1   5  6
## 2   5  7
## 3   6  7
## 4  12 13
## 5  14 15
## 6  14 16
## 7  14 17
## 8  14 18
## 9  14 19
## 10 14 20
# Number of rows (one per mergeable pair) and columns of segment_merg.
dim(segment_merg)
## [1] 59  2

We have 59 pairs that can be merged. Note: the values 8, 9, 10 and 11 do not appear here. This means that these classes cannot be merged, so we need to keep them as separate classes.

Find Clique

D <- segment_merg
#' Build an undirected graph with one edge per mergeable pair.
#' graph_from_data_frame() replaces the deprecated graph.data.frame().
g1 <- graph_from_data_frame(D, directed = FALSE)
#plot(g1)

#' Find the largest cliques (largest_cliques() replaces largest.cliques())
largest <- largest_cliques(g1)

#' Convert g1 to visNetwork data
g2 <- toVisNetworkData(g1)

# Plot, highlighting the first element of `largest`
b.visNetwork(g2, largest, 1)

Result

With the results in the graph, we can see that:
The number of segments, from the interval [5;24], can be merged into 7 classes:

  1. 5-6-7
  2. 8
  3. 9
  4. 10
  5. 11
  6. 12-13
  7. 14->24

  1. nlxbach@gmail.com

  2. patrick.heuret@ecofog.gf