1 linked lists 链表

Linear linked list 线性链表

A singly linked list is a sequence of nodes, where each node stores an element and a link to the next node

相关函数的定义:

## 创建一个空环境
create_emptyenv <- function() {
  emptyenv()
}

# 判断列表是不是空的
isEmpty <- function(llist) {
  if (class(llist) != "linkList")
    warning("Not linkList class") # 判断是不是linear linked list
  identical(llist, create_emptyenv()) # identical 函数是用来检测两个对象是不是完全相等
}

# 创建链表节点
linkListNode <- function(val, node = NULL) {
  llist <-  new.env(parent = create_emptyenv())
  llist$element <- val
  llist$nextnode <- node
  class(llist) <- "linkList"
  llist
}

# 例子
LList <- linkListNode(5, linkListNode(2, create_emptyenv()))

# 获取下一个节点
setNextNode <- function(llist) {
  llist$nextnode
}

# 获取元素
setNextElement <- function(llist) {
  llist$element
}

# 获取节点的大小,这里使用了递归的思想

sizeLinkList <- function(llist, size = 0) {
  if (isEmpty(llist))
  {
    return(size)
  } else
  {
    size <- size + 1L
    sizeLinkList(llist$nextnode, size)
  }
}

# 添加item

addElement <- function(new, llist)
{
  if (isEmpty(llist)) {
    llist <- linkedlist(new)
  } else
  {
    llist <- linkListNode(llist, new)
  }
  llist
}

# 删除item
delElement <- function(llist, pos = NULL) {
  if (is.null(pos))
    warning("Nothing to delete")
  listsize <- sizeLinkList(llist)
  if (pos > listsize)
    stop("Position greater than size of list")
  if (isEmpty(llist)) {
    warning("Empty List")
  } else if (pos == 1) {
    PreviousNode <- llist$nextnode
  } else
  {
    PreviousNode <- linkListNode(llist$element)
    for (i in 1:(listsize - 1)) {
      if (pos == (i + 1)) {
        PreviousNode$nextnode <- setNextNode(llist$nextnode)
      } else
      {
        PreviousNode$nextnode <- llist$nextnode
        llist <- llist$nextnode
      }
    }
  }
  return(PreviousNode)
}

# 查找某一个item
findItem <- function(llist,
                     item,
                     pos = 0,
                     itemFound = FALSE) {
  if (itemFound == TRUE)
  {
    return(itemFound)
  } else if (isEmpty(llist)) {
    return(FALSE)
  } else
  {
    pos <- pos + 1L
    if (llist$element == item)
      itemFound <- TRUE
    findItem(llist$nextnode, item, size, itemFound)
  }
}

Doubly linked list 双向链表

双向链表是单向链表的衍生,可以查找前面的元素

dlinkListNode <- function(val,
                          prevnode = NULL,
                          node = NULL) {
  llist <- new.env(parent = create_emptyenv())
  llist$prevnode <- prevnode
  llist$element <- val # 上一个元素
  llist$nextnode <- node # 下一个元素
  class(llist) <- "dlinkList"
  llist
}

Circular linked list 循环链表

cicularLinkList <- function(llist, val) {
  if (isEmpty(llist)) {
    llist <- linkListNode(val)
    head <- llist
  } else
  {
    llistNew <- linkListNode(val)
    llistNew$nextnode <- head
    llist <- linkListNode(llist, llistNew)
  }
  llist
}

Array based list

数组列表

ALinkList <- setRefClass(
  Class = "ALinkList",
  fields = list(
    Alist = "array",
    listsize = "integer",
    arraySize = "integer",
    maxSize = "integer"
  ),
  methods = list(
    initialize = function(...) {
      listsize <<- 0L
      arraySize <<- 100L
      Alist <<- array(dim = arraySize)
      maxSize <<- arraySize
    }
  )
)
listlen = function()
{
  return(listsize)
}

updateArrayList = function() {
  Alist <<- c(Alist, array(dim = arraySize))
  maxSize <<- maxSize + arraySize
}

addItem = function(item) {
  if (maxSize <= listsize) {
    updateArrayList()
  }
  listsize <<- listsize + 1L
  Alist[listsize] <- item
  return(listsize)
}

removeItem = function(i)
{
  Alist[i] <<- NULL
  listsize <<- listsize - 1L
}

searchItem = function(val) {
  pointer <- 1L
  while (pointer != listsize) {
    if (Alist[pointer] == val) {
      break
    }
    pointer <- pointer + 1L
  }
  return(pointer)
}

Analysis of list operations

2 stack and queues 堆和队列

stack

先进后出

相关函数

Array-based stacks 数组堆

Astack <- setRefClass(
  Class = "Astack",
  fields = list(
    Maxsize = "integer",
    topPos = "integer",
    ArrayStack = "array"
  ),
  methods = list(
    # Initialization function
    initialize = function(defaultSize = 100L, ...)
    {
      topPos <<- 0L
      Maxsize <<- defaultSize # 100L
      ArrayStack <<- array(dim = Maxsize)
    },
    
    # Check if stack is empty
    isEmpty = function() {
    },
    
    # push value to stack
    push = function(pushval) {
    },
    
    # Pop value from stack
    pop = function() {
    },
    
    # Function to get size of stack
    stacksize = function() {
    },
    
    # Function to get top value of stack
    top = function() {
    }
  )
)


isEmpty = function() {
  if (topPos == 0) {
    cat("Empty Stack!")
    return(TRUE)
  } else
  {
    return(FALSE)
  }
}


push = function(pushval) {
  if ((topPos + 1L) > Maxsize)
    stop("Stack is OUT OF MEMORY!")
  topPos <<- topPos + 1L
  ArrayStack[topPos] <<- pushval
}

pop = function() {
  isEmpty() # Check if stack is empty
  popval <- ArrayStack[topPos]
  ArrayStack[topPos] <<- NA
  topPos <<- topPos - 1L
  return(popval)
}

stacksize = function() {
  stackIsEmpty <- isEmpty()
  ifelse(stackIsEmpty, return(0), return(topPos))
}

top = function() {
  stackIsEmpty <- isEmpty()
  if (stackIsEmpty) {
    cat("Empty Stack")
  } else
  {
    return(ArrayStack[topPos])
  }
}


# array_stack_ex<- Astack$new()
# array_stack_ex$push(1)
# array_stack_ex$push(2)
# array_stack_ex$push(3)
# array_stack_ex$pop()
# array_stack_ex$push(5)
# array_stack_ex$pop()
# array_stack_ex$pop()
# array_stack_ex$top()
# array_stack_ex

Linked stacks 链表堆

Linkstack <- setRefClass(
  Class = "Linkstack",
  fields = list(Lsize = "integer",
                Lstacktop = "environment"),
  methods = list(
    # Initialization function
    initialize = function(...) {
      Lsize <<- 0L
    },
    
    # Check if stack is empty
    isEmpty = function() {
    },
    
    # Function to create empty R environment
    create_emptyenv = function() {
    },
    
    # Function to create node
    Node = function(val, node = NULL) {
    },
    
    # push value to stack
    push = function(pushval) {
    },
    
    # Pop value from stack
    pop = function() {
    },
    
    # Function to get top value of stack
    top = function() {
    }
  )
)


isEmpty = function() {
  if (Lsize == 0) {
    cat("Empty Stack!")
    return(TRUE)
  } else
  {
    return(FALSE)
  }
}

create_emptyenv = function() {
  emptyenv()
}

Node = function(val, node = NULL) {
  llist <- new.env(parent = create_emptyenv())
  llist$element <- val
  llist$nextnode <- node
  llist
}

push = function(val) {
  stackIsEmpty <- isEmpty()
  if (stackIsEmpty) {
    Lstacktop <<- Node(val)
    Lsize <<- Lsize + 1L
  } else
  {
    Lstacktop <<- Node(val, Lstacktop)
    Lsize <<- Lsize + 1L
  }
}

pop = function() {
  stackIsEmpty <- isEmpty()
  if (stackIsEmpty) {
    cat("Empty Stack")
  } else
  {
    Lstacktop <<- Lstacktop$nextnode
    Lsize <<- Lsize - 1L
  }
}

topVal = function() {
  stackIsEmpty <- isEmpty()
  if (stackIsEmpty) {
    cat("Empty Stack")
  } else
  {
    return(Lstacktop$element)
  }
}

# link_stack_ex<-Linkstack$new()
# link_stack_ex $push(1)
# link_stack_ex $push(2)
# link_stack_ex $push(3)
# link_stack_ex $pop()
# link_stack_ex $push(5)
# link_stack_ex $pop()
# link_stack_ex $pop()
# a$topVal()
# link_stack_ex

Queues

队列是先进先出

Array-based queues 基于数组的队列

使用的是RC类

aqueue <- setRefClass(
  Class = "aqueue",
  fields = list(
    Alist = "array",
    queuesize = "integer",
    maxSize = "integer",
    rear = "integer",
    top = "integer"
  ),
  methods = list(
    initialize = function(qSize, ...) {
      queuesize <<- 0L
      rear <<- 1L
      top <<- 0L
      maxSize <<- as.integer(qSize)
      Alist <<- array(dim = maxSize)
    },
    # Queue is empty
    isEmpty = function() {
      return(queuesize == 0L)
    },
    # Add element to the queue
    enqueue = function(val) {
      if (queuesize < maxSize) {
        if (top == maxSize)
          top <<- 0L
        top <<- top + 1L
        Alist[top] <<- val
        queuesize <<- queuesize + 1L
      } else{
        cat("Queue Full!")
      }
    },
    
    # remove element from queue
    dequeue = function() {
      if (queuesize > 0L) {
        Alist[rear] <<- NA
        ifelse(rear == maxSize, rear <<-
                 1L, rear <<- rear + 1L)
        queuesize <<- queuesize - 1L
      } else{
        cat("Empty Queue!")
      }
    },
    
    # size of queue
    size = function() {
      Lsize
    }
  )
)

Linked queues 基于链表的队列

ListQueue <- setRefClass(
  Class = "ListQueue",
  fields = list(
    Lsize = "integer",
    front = "environment",
    rear = "environment",
    Lqueue = "environment"
  ),
  
  methods = list(
    initialize = function(...) {
      Lsize <<- 0L
    },
    
    # Check if list is empty
    isEmpty = function() {
      if (Lsize == 0) {
        cat("Empty Stack!")
        return(TRUE)
      } else
      {
        return(FALSE)
      }
    },
    
    # create empty environment
    create_emptyenv = function() {
      emptyenv()
    } ,
    
    # Create node
    Node = function(val, node = NULL) {
      llist <- new.env(parent = create_emptyenv())
      llist$element <- val
      llist$nextnode <- node
      llist
    },
    
    # Function to add value to link list
    enqueue = function(val) {
      ListIsEmpty <- isEmpty()
      if (ListIsEmpty) {
        Lqueue <<- Node(val)
        Lsize <<- Lsize + 1L
        rear <<- Lqueue
      } else
      {
        newNode <- Node(val)
        assign("nextnode", newNode, envir = rear)
        rear <<- newNode
        Lsize <<- Lsize + 1L
      }
    },
    
    # Function to remove node from link list
    dequeue = function() {
      stackIsEmpty <- isEmpty()
      if (stackIsEmpty) {
        cat("Empty Queue")
      } else
      {
        Lqueue <<- Lqueue$nextnode
        Lsize <<- Lsize - 1L
      }
    },
    
    # Function to get link list size
    size = function() {
      Lsize
    }
  )
)

dictionaries 字典

Adict <- setRefClass(Class = "Adict",
  fields = list(
    Alist = "list",
    listsize = "integer",
    key = "integer"
  ),
  methods = list(
    # Re-initialize dictionary
    initialize = function(...) {
      listsize <<- 0L
      Alist <<- list()
    },
    
    # Check length of value
    size = function() {
      return(listsize)
    },
    
    # Add following key value pair in Array
    addElement = function(key, val) {
      Alist[[key]] <<- val
      listsize <<- listsize + 1L
    },
    
    # remove value with defined
    removeElement = function(key) {
      Alist[[key]] <<- NULL
      listsize <<- listsize - 1L
    },
    
    # remove value with following
    findElement = function(key) {
      return(key %in% names(Alist))
    }
  )
)

 

 
 


# dictvar <- Adict$new()
# dictvar$addElement("key1", 1)
# dictvar$addElement("key2", 1)
# dictvar
# dictvar$Size()
# dictvar$findElement("key1")
# dictvar$removeElement("key1")

3 Sorting Algorithms 排序算法

Insertion Sort

Insertion_Sort <- function(V,n)
{
  if(n==0) stop("No elements to sort")
  for(i in 2:(length(V)))
  {
    val <- V[i]
    j <- i - 1
    while (j >= 1 && val <= V[j])
    {
      V[j+1] <- V[j]
      j <- j-1
    }
    V[j+1] <- val
  }
  return(V)
}

Insertion_Sort(V=c(20,12,65,8,10,16,43,35),n=8)
## [1]  8 10 12 16 20 35 43 65

Bubble Sort

Bubble_Sort <- function(V,n) {
  if(n==0) stop("No elements to sort")
  for(i in 1:length(V)) {
    flag <- 0
    for(j in 1:(length(V)-i)) {
      if ( V[j] > V[j+1] ) {
        val <- V[j]
        V[j] <- V[j+1]
        V[j+1] <- val
        flag <- 1
      } 
    }
    if(!flag) break
  }
  return(V)
}

Bubble_Sort(c(20,12,65,8,10,16,43,35),n=8)
## [1]  8 10 12 16 20 35 43 65

Selection Sort

Selection_Sort_loop <- function(V,n) {
  if(n==0) stop("No elements to sort")
  keys <- seq_along(V)
  for(i in keys) {
    small_pos <- (i - 1) + which.min(V[i:length(V)])
    temp <- V[i]
    V[i] <- V[small_pos]
    V[small_pos] <-temp
  }
  return(V)
}

Selection_Sort_loop(c(20,12,65,8,10,16,43,35),n=8)
## [1]  8 10 12 16 20 35 43 65

shell sort

Shell_Sort <- function(V,n) {
  if(n==0) stop("No elements to sort")
  increment=round(n/2)  ## as.integer 
  while(increment>0) {
    for(i in (increment+1):n) {
      temp <- V[i]
      j=i
      while(j >= (increment+1)  && V[j-increment] > temp) {
        V[j] <- V[j-increment]
        j <- j-increment
      }
      V[j] <- temp
    }
    if(increment==2) {
      increment <- 1} else{
        increment <- round(increment/2.2)
      }
    }
  return(V)
}

Shell_Sort(c(20,12,65,8,10,16,43,35,23,88,2,56,41,27,67,56),n=16)
##  [1]  2  8 10 12 16 20 23 27 35 41 43 56 56 65 67 88

Merge sort

Merge_Sort <- function(V) {
  if(length(V) == 0) stop("Not enough elements to sort")
  
  ## Merge function to sort two halves or sub-vectors
  merge_fn <- function(first_half, second_half) {
    result <- c()
    while(length(first_half) > 0 && length(second_half) > 0) {
      if(first_half[1] <= second_half[1]) {
        result <- c(result, first_half[1])
        first_half <- first_half[-1]
      } else {
        result <- c(result, second_half[1])
        second_half <- second_half[-1]
      }         
    }
    if(length(first_half) > 0) result <- c(result, first_half)
    if(length(second_half) > 0) result <- c(result, second_half)
    return(result)
  }
  
  ## Recursively split the parent vector into two halves (sub-vectors)
  if(length(V) <= 1) V else {
    middle <- length(V) / 2
    first_half <- V[1:floor(middle)]
    second_half <- V[floor(middle+1):length(V)]
    first_half <- Merge_Sort(first_half)
    second_half <- Merge_Sort(second_half)
    if(first_half[length(first_half)] <= second_half[1]) {
      c(first_half, second_half)
    } else {
      merge_fn(first_half, second_half)
    } 
  }
}

Merge_Sort(c(20,12,65,8,10,16,43,35,23,88,2,56,41,27,67,56))
##  [1]  2  8 10 12 16 20 23 27 35 41 43 56 56 65 67 88

Quich sort

Quick_Sort <- function(V,n) {  
  if (n <= 1) return(V)
  left <- 0 ##start from left prior first element
  right <- n  ##start from rightmost element
  v <- V[n] ## initialize last element as pivot element
  
  ## Partition implementation
  repeat {
    while (left < n && V[left+1]  < v) left <- left+1
    while (right > 1 && V[right-1] >= v) right <- right-1
    if (left >= right-1) break
    ## Swap elements to put pivot in place
    temp <- V[left+1]
    V[left+1] <- V[right-1]
    V[right-1] <- temp
  }
  
  ## Recursive implementation of Quick sort
  if (left == 0) return(c(V[n], Quick_Sort(V[1:(n-1)],n=(n-1))))
  if (right == n) return(c(Quick_Sort(V[1:(n-1)],n=(n-1)), V[n]))
  return( c(Quick_Sort(V[1:left],n=left), V[n], Quick_Sort(V[(left+1):(n-1)],n=(n-left-1))))
}

Quick_Sort(V= c(20,12,65,8,10,16,43,35,23,88,2,56,41,27,67,55),n=16)
##  [1]  2  8 10 12 16 20 23 27 35 41 43 55 56 65 67 88

Heap sort

V <- c(20,12,65,8,10,16,43,35,23,88,2,56,41,27,67,56)
Heap_Sort <- function(V)
{
  heapsize <- length(V)
  for (i in floor(length(V)/2):1)
    V <- max_heap(V, i,heapsize)
  for (i in length(V):2) {
    temp <- V[i]
    V[i] <- V[1]
    V[1] <- temp
    heapsize <- heapsize -1
    V <- max_heap(V, 1,heapsize)
  }
  return(V)
}

max_heap <- function(V, i,heapsize) {
  left <- 2*i
  right <- 2*i+1
  if (left<=heapsize && V[left]>V[i]){
    largest <- left}else{
    largest <- i
    }
  
  if (right<=heapsize && V[right]>V[largest])
    largest <- right
  
  if (largest != i) {
    temp2 <- V[largest]
    V[largest] <- V[i]
    V[i] <- temp2
    V <- max_heap(V, largest,heapsize)
  }
  return(V)
}

Heap_Sort(c(20,12,65,8,10,16,43,35,23,88,2))
##  [1]  2  8 10 12 16 20 23 35 43 65 88

Bin sort

# add item to bin
addItem=function(V,bin,maxValue,n){
  for(i in 1:n){
    val<-V[i]
    ix<-ceiling((val*n)/maxValue)
    if(is.na(bin[["binValues"]][[ix]][1])){
      bin[["binValues"]][[ix]][1]<-val
      bin[["nElement"]][ix]<-1
    } else
    {
      bin <- insertItem(val=val, ix=ix,bin=bin)
    }
  }
  return(bin)
}

# insert a item into a bin ensuring sorting
insertItem=function(val, ix,bin){
  nElement<-bin[["nElement"]][ix]
  pos<-NULL
  for(i in 1:nElement){
    if(val<bin[["binValues"]][[ix]][i]){
      pos<-i
    }
  }
  if(is.null(pos)){
    bin[["binValues"]][[ix]][nElement+1]<-val
  } else if(pos==1) {
    bin[["binValues"]][[ix]]<-c(val, bin[["binValues"]][[ix]][1])
  } else
  {
    bin[["binValues"]][[ix]]<-c(bin[["binValues"]][[ix]][1:(pos-1)], val, bin[["binValues"]][[ix]][pos:nElement])
  }
  bin[["nElement"]][ix]<-nElement+1
  return(bin)
}

# bind the list into a sorted vector
bindSorted_vec=function(bin,n){
  output <- c()
  currentIx<-1
  for(i in 1:n){
    if(!is.na(bin[["binValues"]][[i]][1])){
      nElement<-bin[["nElement"]][i]
      for(m in 1:nElement){
        output[currentIx]<-bin[["binValues"]][[i]][m]
        currentIx<-currentIx+1
      }
    }
  }
  return(output)
}

# binsort Algorithm
Bin_Sort=function(V,n,maxValue){
  bin <-list("binValues"=list(), "nElement"=NA)
  # create empty bins
  for(i in 1:n){
    bin[["binValues"]][[i]]<-NA
    bin[["nElement"]][i]<-0
  }
  ## Add elements into suitable bins
  bin <- addItem(V=V,bin=bin,maxValue=maxValue,n=n)
  ## Bind all bins inot a single sorted vector
  output <- bindSorted_vec(bin=bin,n=n)
  return(output)
}


## Example of bin sorting
V<-c(20,12,65,8,10,16,43,35,23,88,2,56,41,27,67,55)
n<-16
maxValue<-88
Bin_Sort(V=V,n=n,maxValue=maxValue)
##  [1]  2  8 10 12 16 20 23 27 35 41 43 55 56 65 67 88

radix sort

# add item to bin
addItem=function(V,bin,digLength,n){
  for(i in 1:n){
    val<-V[i]
    ## Extract the required digit from the number
    ix<-floor((val/digLength) %% 10)+1
    ## Assign element to each bin
    bin[["binValues"]][[ix]][bin[["nElement"]][ix]+1]<-val
    ## Track count of elements in each bin
    bin[["nElement"]][ix]<-bin[["nElement"]][ix] + 1
  }
  return(bin)
}

# bind the list into a sorted vector
bindSorted_vec=function(bin){
  output <- c()
  currentIx<-1
  for(i in 1:10){
    if(!is.na(bin[["binValues"]][[i]][1])){
      nElement<-bin[["nElement"]][i]
      for(m in 1:nElement){
        output[currentIx]<-bin[["binValues"]][[i]][m]
        currentIx<-currentIx+1
      }
    }
  }
  return(output)
}

# radixsort Algorithm
radix_Sort=function(V,n,maxValue,digLength){
  for(digLength in c(10^(0:digLength)))
  {
  bin <-list("binValues"=list(), "nElement"=NA)
  # create empty bins
  for(i in 1:10){
    bin[["binValues"]][[i]]<-NA
    bin[["nElement"]][i]<-0
  }
  bin <- addItem(V=V,bin=bin,digLength=digLength,n=n)
  V <- bindSorted_vec(bin=bin)
  }
  return(V)
}


## Example of radix sorting
V<-c(67,54,10,988,15,5,16,43,35,23,88,2,103,83)
n<-14
maxValue<-988
digLength <- 2
radix_Sort(V=V,n=n,maxValue=maxValue,digLength=digLength)
##  [1]   2   5  10  15  16  23  35  43  54  67  83  88 103 988

4 Exploring Search Options 搜索算法

搜索是计算机应用程序中广泛使用的过程,主要是确定具有特定值的元素是否存在于向量或元素列表中。它在删除的情况下充当替代,因为如果不搜索特定值的元素,删除操作就不能发生。搜索可以是在一组给定的元素中查找一个元素(精确匹配),或者查找一组元素(范围匹配),这些元素属于某个值范围。在搜索操作中,元素的位置也被确定。该位置可以在稍后的删除操作中使用。如果在给定的vector(或list)中找到特定键值的元素,则说搜索成功;如果在给定的vector(或list)中找不到特定键值的元素,则说搜索不成功。

Searching unsorted and sorted vectors

Self-organizing lists

Count based SOL

V <- c(1,2,3,4,5,6,7,8)
S <- c(6,4,6,7,5,7,6,1,4,6,7,5)
n_search = length(S)
n=length(V)

SOL_count <- function(V,S,n_search,n)
{
  if(is.null(V)) stop("NO elements in input vector")
  if(is.null(S)) stop("NO elemens to search")
  i=1
  count <- as.list(sapply(1:n,function(x) 0))
  names(count) <- V
  while(i<=n_search)
  {
    if(Sequential_search(V,S[i],n)$present){
      key <- Sequential_search(V,S[i],n)$key
      count[key][[1]] <- count[key][[1]] + 1
      count <- count[order(-unlist(count))]
      V <- as.numeric(names(count))
  }
  i=i+1
  }
  return(V)
}

SOL_count(V,S,n_search,n)
## [1] 6 7 4 5 1 2 3 8

Move To Front Self Organizind List

SOL_move <- function(V,S,n_search,n)
{
  if(is.null(V)) stop("NO elements in input vector")
  if(is.null(S)) stop("NO elemens to search")
  i=1
  while(i<=n_search)
  {
    if(Sequential_search(V,S[i],n)$present){
      if(Sequential_search(V,S[i],n)$key !=1){
        key <- Sequential_search(V,S[i],n)$key
        temp <- V[key]
        V <- V[-key] 
        V <- c(temp,V)
      }
    }
    i <- i+1
  }
  return(V)
}

SOL_move(V,S,n_search,n)
## [1] 5 7 6 4 1 2 3 8

Transpose Self Organizind List

SOL_transpose <- function(V,S,n_search,n)
{
  if(is.null(V)) stop("NO elements in input vector")
  if(is.null(S)) stop("NO elemens to search")
  i=1
  while(i<=n_search)
  {
    if(Sequential_search(V,S[i],n)$present){
      if(Sequential_search(V,S[i],n)$key !=1){
        key <- Sequential_search(V,S[i],n)$key
        temp <- V[key-1]
        V[key-1] <- V[key]
        V[key] <- temp
      }
    }
    i <- i+1
  }
  return(V)
}

SOL_transpose(V,S,n_search,n)
## [1] 1 2 6 4 7 5 3 8

Hashing

hash_int <- function(K)
{
  return (K %% 18)
}

hash_string <- function(K,n)
{
  hashValue <- 0
  for(i in 1:n){
    hashValue <- hashValue+as.numeric(charToRaw(substr(K,i,i)))
  }
  return(hashValue)
}

5 Indexing 索引

我们将介绍索引概念,这是必不可少的文件结构组织磁盘上的大数据。索引还有助于提高数据访问、数据搜索和内存分配的效率。

Tree-based indexing

Tree based Indexing : BST node structure

bstnode <- function(key, value) {
  node <- new.env(hash = FALSE, parent = emptyenv())
  node$key <- key   # Node key
  node$value <- value       # Node Value
  node$left <- NULL     # left children key
  node$right <- NULL        # Right children key
  class(node) <- "bstnode"
  return(node)
}

2-3 Tree

tttnode <- function(lkey=NULL, lvalue=NULL, rkey=NULL, rvalue=NULL) {
  node <- new.env(hash = FALSE, parent = emptyenv())
  node$lkey <- lkey     # left Node key
  node$lvalue <- lvalue     # Node Value
  node$rkey <- rkey     # right Node key
  node$rvalue <- rvalue     # right Node Value
  node$left <- NULL     # left children key
  node$center <- NULL       # left children key
  node$right <- NULL        # Right children key
  class(node) <- "tttnode"
  return(node)
}

extttree <- tttnode(70, 70)

Function to check if node is empty

check_empty<-function(node){
  ifelse((is.null(node$lkey) & is.null(node$rkey)), T, F)
}

Function to insert if the node has empty space

leaf_insert<-function(node, key, val){
  if(check_empty(node)) return(tttnode(lkey=key, lvalue=val))
  if(is.null(node$rkey)){
    if(key>node$lkey){
      node$rkey<-key
      node$rvalue<-val
    } else
    {
      node$rkey<-node$lkey
      node$rvalue<-node$lvalue
      node$lkey<-key
      node$lvalue<-val
    }
  } else
  {
    node$left<-tttnode(key, val)
  }
  return(node)
} 

The generalized pseudocode for element insertion in a 2-3 tree is as follows(在2-3树中插入元素的广义伪代码如下)

ttinsert<-function(node=NULL, key, val){
  if(check_empty(node)) return(tttnode(lkey=key, lvalue=val))
  if(is.null(node$left)) node<-leaf_insert(node, key, val)   
  ## Add element to internal nodes
  if(key<node$lkey){
    subtree = ttinsert(node$left, key, val)
    if (identical(subtree, node$left))
    {
      return(node);
    } else
    {
      assign("left", subtree, envir = node)
      return(node)
    }
  } else if(ifelse(is.null(node$rkey), T,  key<node$rkey)){
    subtree = ttinsert(node$center, key, val)
    if(identical(subtree, node$center)) 
    {
      return(node)
    } else
    {
      assign("center", subtree, envir = node)
      return(node)
    }
  } else
  {
    subtree = ttinsert(node$right, key, val)
    if(identical(subtree, node$right)) {
      return(node)
    } else
    {
      assign("right", subtree, envir = node)
      return(node)
    } 
  } 
}

搜索2-3树中的伪代码如下

search_keys<-function(node, key){
  if (is.null(node)) return(NULL) # empty node
  if (node$lkey== key) return(node$lvalue)
  if(!is.null(node$rkey) & node$rkey==key) return(node$rvalue)
  if(key<node$lkey) {
    sort_keys(node$left, key)
  } else if(is.null(node$rkey)){
    sort_keys(node$center, key)
  } else if(key<node$rkey) {
    sort_keys(node$center, key)
  } else
  {
    sort_keys(node$right, key)
  }
}

B+ Trees

内存效率是设计数据结构和算法时必须考虑的一个重要方面。记忆可以大致分为两种类型:

1.主存储器(RAM) 2.外部存储器,如硬盘、CD ROM、磁带等

存储在主存(RAM)中的数据具有最小的访问时间,因此大多数算法首选,然而,如果数据存储在外部驱动器中,那么访问时间就变得至关重要,因为从外部存储器访问数据通常需要更长的时间。此外,随着数据大小的增加,检索成为一个问题。为了处理这个问题,数据以页面、块或外部存储设备中的分配单元的形式存储,并使用索引来高效地检索这些块。b -树是用于从外部存储设备访问数据的常用数据结构之一。b树是由R. Bayer和E. M. McCreight在1972年提出的,它比二叉搜索树更好,特别是当数据存储在外部存储器时。

bplusnode<-function(node=NULL, key, val){
  node <- new.env(hash = FALSE, parent = emptyenv())  
  node$keys<-keys
  node$child<-NULL
  node$isleaf<-NULL 
  node$d<-NULL
  class(node) <- "bplustree"
  return(node)
}

dlinkchildNode <- function(val, prevnode=NULL, node=NULL) {
  llist <- new.env(parent=create_emptyenv())
  llist$prevnode <- prevnode
  llist$element <- val
  llist$nextnode <- node
  class(llist) <- " dlinkchildNode"
  llist
}

Range Query

querry_search<-function(node, key1, Key2){
  ## Function to get values within leaf node using link list
  search_range<-function(child, key1, key2, val=NULL){
    if(child$element>key1 & child$key2){
      val<-c(val, child$element)
      search_range(child$nextnode, key1, key2, val)
    } else
    {
      return(val)
    }
  }
  
  if(key1>key2){
    temp<-Key2
    key2<-key1
    key1<-temp
  }
  child<-search_lower_key(node, key1)   # search lower leaf
  rangeVal<-search_range(child, key1, key2)   # Return Range 
  return(rangeVal)
}

6 Graphs 图

图是一种能够处理网络的数据结构类型。图被广泛地应用于不同的领域,例如:

  1. Transportation 交通: To find the shortest routes to travel between two places
  2. Communication-signaling networks 通信信号网络: To optimize the network of inter-connected computers and systems
  3. Understanding relationships 理解关系: To build relationship trees across families or organizations
  4. Hydrology 水文学: To perform flow regime simulation analysis of various fluids

图(G)是由一组边(E)相互连接的顶点(V)组成的网络。图的基本概念,补充以下。

考虑一个有n个顶点的图(G),它可以用两种形式表示。这些形式可以直接用于执行数学计算:

  1. Adjacency matrix(邻接矩阵):邻接矩阵是一个n×n数组,行表示从顶点,列表示到顶点。矩阵中的数字既可以表示两个顶点之间有向连接的存在,也可以表示连接两个顶点的边的权值(或距离)。因为邻接矩阵中的每个位置都可以接受一个数值,所以它需要一个比特的内存。
  2. 邻接表:顾名思义,邻接表是一个长度为n的链表数组。数组中的每个位置存储连接相邻连接顶点的链表的指针,每个链表存储其连接边的值。

Graph Abstract Data type

Graph_ADT <- setRefClass(
  Class = "adjacency_Matrix",
  fields = list(n = "integer"),
  methods = list(
    ## Initialise a graph of n vertices
    Initialize = function(n) {
    },
    
    ## Return number of vertices and edges
    num_vert = function() {
    },
    num_edges = function() {
    },
    
    ## Return weight of an edge for a pair of connecting vertices v1 and v2
    weightEdge = function(v1, v2) {
    },
    
    ## Assign weight(wt) of an edge for a pair of connecting vertices v1 and v2
    assignEdge = function(v1, v2, wt) {
    },
    
    ## Delete weight of an edge for a pair of connecting vertices v1 and v2
    deleteEdge = function(v1, v2) {
    },
    
    ## Return first connecting vertex for a given vertex v
    firstVertex = function(v) {
    },
    
    ## Return next connecting vertex for a given v and its neighbour w
    nextVertex = function(v, w) {
    },
    
    ## Check for presence of an edge connection for a pair of vertices v1 and v2
    isEdge = function(v1, v2) {
    }
  )
)

Adjacency matrix

adjacencyMatrix <-
  setRefClass(
    Class = "adjacencyMatrix",
    fields = list(n = "integer"),
    methods = list(
      ## Initialise the graph of n vertices
      Initialize <- function(n) {
        numVertices <<- as.integer(n)  ## with n vertices
        numEdges <<- 0L     ## with no connected edges
        mark <<- list()       ## initialise mark list
        
        ## initialse the mark of all vertices to 0 (unvisited)
        for (i in 1:numVertices)
          mark[[i]] <<- 0L
        
        ## generate a new nxn matrix with initial weights as 0
        mat <- matrix()
        for (i in 1:numVertices)
          for (j in 1:numVertices)
            mat[i, j] <<- 0L
      },
      
      ## get number of vertices
      num_vert <- function()
        return(numVertices),
      
      ## get number of edges
      num_edges <- function()
        return(numEdges),
      
      ## return the first adjacent neighbout of vertex index v
      firstVertex <- function(v) {
        for (i in 1:numVertices)
          if (mat[v, i] != 0)
            return(i)
        return(numVertices + 1)
      },
      
      ## return next adjacent vertices of index v after
      ## getting index w using firstVertex
      nextVertex <- function(v, w) {
        for (i in (w + 1):numVertices)
          if (mat[v, i] != 0)
            return(i)
        return(numVertices + 1)
      },
      
      ## Assign weight (wt) to each connected edge of indices v1 and v2
      assignEdge <- function(v1, v2, wt) {
        if (wt < 0)
          stop("Weight should be positive")
        ## increase the count of edges as the weights are assigned
        if (mat[v1, v2] == 0)
          numEdges <<- numEdges + 1L
        ## replace 0 with the wt
        mat[v1, v2] <<- wt
      },
      
      ## Delete a connected edge between indices v1 and v2
      deleteEdge <- function(v1, v2) {
        if (mat[v1, v2] != 0)
          numEdges <<- numEdges - 1L
        mat[v1, v2] <<- 0
      },
      
      ## Check whether an edge exists between indices v1 and v2
      isEdge <- function(v1, v2) {
        return(mat[v1, v2] != 0)
      },
      
      ## Get weight of the connected edge between indices v1 and v2
      weightEdge <- function(v1, v2) {
        return(mat[v1, v2])
      },
      
      ## Get the mark of a vertex of index v1
      getMark <- function(v1) {
        return(mark[[v1]])
      },
      
      ## initialise the mark of a vertex of index v1 with 1
      initMark <- function(v1, val) {
        mark[[v]] <<- val
      }
    )
  )

Adjacency list

adjacencyList <-
  setRefClass(
    Class = "adjacencyList",
    fields = list(n = "integer"),
    methods = list(
      ## Initialise the graph of n vertices
      Initialize <- function(n) {
        ## with n vertices
        numVertices <<- n
        ## with no connected edges
        numEdges <<- 0L
        ## initialise mark list
        mark <<- list()
        
        ## initialse the mark of all vertices to 0 (unvisited)
        for (i in 1:numVertices)
          mark[[i]] <<- 0L
        
        ## generate a linked list of edges each for
        ## each vertex in the list
        vertex <- list()
        for (i in 1:numVertices)
          vertex[[i]] <<- llistofEdges()
      },
      
      ## get number of vertices
      num_vert <- function()
        return(numVertices),
      
      ## get number of edges
      num_edges <- function()
        return(numEdges),
      
      ## return the first adjacent neighbout of vertex index v
      firstVertex <- function(v) {
        if (length(vertex[[v]]) == 0)
          ## indicates no adjacent neighbour
          return(numVertices + 1)
        ## Move to the first adjacent vertex
        adjVert <<- firstAdjVert(vertex[[v]])
        ## get the current position of AdjVert
        pos <<- currentPos(vertex[[v]], adjVert)
        ## get value of connecting edge
        adjEdge <<- getValue(vertex[[v]], adjVert)
        return(adjVert)
      },
      
      ## return next adjacent vertices of index v after
      ## getting index w using firstVertex
      nextVertex <- function(v, w) {
        if (isEdge(v, w)) {
          if (pos + 1 > length(vertex[[v]])) {
            ## move the next adjcent vertex of w
            adjVert <<- nextAdjVertex(vertex[[v]], w)
            ## get the current position of adjcent vertex
            pos <<- currentPos(vertex[[v]], adjVert)
            ## get value of connecting edge
            adjEdge <<- getValue(vertex[[v]], adjVert)
            return(adjVert)
          }
          ## no connecting edge
        } else
          return(numVertices + 1)
      },
      
      ## Assign weight (wt) to each connected edge of indices v1 and v2
      assignEdge <- function(v1, v2, wt) {
        if (wt < 0)
          stop("Weight should be positive")
        ##check whether edge exists between v1 and v2
        if (isEdge(v1, v2)) {
          ## insert vertex v2 along with edge weight wt
          insertVertex(vertex[[v1]], v2, wt)
        }
      },
      
      ## Delete a connected edge between indices v1 and v2
      deleteEdge <- function(v1, v2) {
        if (isEdge(v1, v2)) {
          removeEdge(v1, v2)
          numEdges <<- numEdges - 1L
        }
      },
      
      ## Check whether an edge exists between indices v1 and v2
      isEdge <- function(v1, v2) {
        pos <- currentPos(vertex[[v1]], firstAdjVert(vertex[[v1]]))
        while (pos < length(vertex[[v1]])) {
          adjVert <- nextAdjVertex(vertex[[v1]], vertex[[v1]][pos])
          if (adjVert == v2) {
            return(TRUE)
          } else {
            pos = pos + 1
          }
        }
      },
      
      ## Get weight of the connected edge between indices v1 and v2
      weightEdge <- function(v1, v2) {
        if (isEdge(v1, v2)) {
          adjEdge <- getValue(vertex[[v1]], v2)
          return(adjEdge)
        } else {
          return (0)
        }
      },
      
      ## Get the mark of a vertex of index v1
      getMark <- function(v1) {
        return(mark[[v1]])
      },
      
      ## initialise the mark of a vertex of index v1 with 1
      initMark <- function(v1, val) {
        mark[[v]] <<- val
      }
    )
  )

Graph Traversing 图的遍历

graph_Traverse <- function(Graph_ADT, n, vertices)
{
  ## Initialise marks to zero
  verticesMarks <- list()
  for (i in 1:n)
    verticesMarks[[i]] <-
      Graph_ADT$initMark(vertices[i], 0) ## 0 means not visited
  
  ## Inititate traversing upon checking for unmarked nodes
  for (i in 1:n)
    if (Graph_ADT$getMark(vertices[i]) == 0)
      initTraverse(verticesMarks, vertices[i])
}

Depth First Search (DFS) 深度优先搜索

DFS是图遍历算法的递归实现,适用于有向图和无向图。在遍历的每个步骤中,DFS递归地检查和访问所有未访问的节点,这些节点直接与考虑中的节点连接。同时,沿着路径访问的所有节点将按照遍历的顺序推入堆栈。在遍历过程中,如果遇到的节点中没有直接连接的未访问节点,则将这些节点从堆栈中弹出,留下那些直接连接的未访问节点。这将跟踪那些确定向前遍历路径的节点,以便在顶点之前访问所有节点。下面的R代码用三个输入实现了DFS算法。输入为Graph_ADT, n(图中节点数),v(考虑执行DFS的节点数):

DepthFirstSearch <- function(Graph_ADT, n, v)
{
  ## Ensure all nodes are visited and processed prior node v
  preVisit(v)
  
  ## mark the node v under consideration as 1 (i.e. visited)
  VerticesMarks <- list()
  VerticesMarks[[v]] <- Graph_ADT$initMark(v, 1)
  
  ## Recursively visit all connected nodes of v till all are marked as 1
  ## get the first vertex
  node <- Graph_ADT$firstVertex(v)
  ## check node belongs to neighbouring nodes using conVert function
  while (node %in% conVert(v)) {
    ## check if the node is unvisited
    if (Graph_ADT$getMark(VerticesMarks[[node]] == 0))
      ## recursively run DFS
      DepthFirstSearch(Graph_ADT, n, node)
    ## assign next neighbouring vertex
    node <- Graph_ADT$nextVertex(v, node)
  }
  
  ## Run post processing remaining un-visited nodes
  postVisit(v)
}

Breadth First Search (BFS) 宽度优先搜索

BFS的工作原理与DFS算法相似,但有以下几点: 1. 与DFS不同,BFS不是一个递归实现 2. 为了跟踪标记的节点,BFS使用队列来对抗DFS使用的堆栈 3. 在移动到下一个节点之前,BFS确保访问所有直接未标记连接的节点 节点不同于DFS,在DFS中,每次迭代只访问一个未标记的连接节点 下面的R代码使用四个输入实现BFS算法。输入是Graph_ADT, startVertex(开始遍历的图的起始节点),queue(一个空队列,用于跟踪按访问顺序连接的节点)和n(图中的节点总数):

BreadthFirstSearch <- function(Graph_ADT, startVertex, queue, n)
{
  ## initialise an empty queue with a start vertex
  queue <- initQueue(startVertex)
  
  ## Initialise first vertex by marking it as 1 (visited)
  VerticesMarks <- list()
  VerticesMarks[[v]] <- Graph_ADT$initMark(v, 1)
  
  ## Subsequently start processing in queues
  while (length(queue) != 0) {
    ## extract first element in the queue
    v <- extQueue(queue)
    
    ## Pre-Process all directly connected nodes of v
    preVisit(v)
    
    ## Mark visited nodes with 1 and accordingly queue the nodes
    node <- firstVertex(v)
    while (node %in% conVert(v)) {
      if (getMark(graph[node] == 0)) {
        graph <- Graph_ADT$initMark(node, 1)
        queue <- initQueue(node)
      }
      node <- Graph_ADT$nextVertex(startVertex, node)
    }
  }
}

Topological Sort 拓扑排序

Main function to perform topological sort

Topological_DFS_sort <- function(Graph_ADT, n, vertices)
{
  ## initialise all nodes with 0 (unvisited)
  verticesMarks <- list()
  for(i in 1:n)
    verticesMarks[[i]] <<- Graph_ADT$initMark(vertices[i],0)
  
  ## Process all nodes by recursive traversing
  for(i in 1:n)
    if(Graph_ADT$getMark(vertices[i]) == 0)
      topological_secondary(Graph_ADT,i)
}

recursive secondary function to help main function

topological_secondary <- function(Graph_ADT,i)
{
  ## Mark the node as 1 (visited)
  verticesMarks[[i]] <<- Graph_ADT$initMark(vertices[i], 1)
  
  ## Perform traversing across connected nodes
  v <- Graph_ADT$firstVertex(vertices[i])
  while(v %in% conVert(vertices[i])){
    if(Graph_ADT$getMark(vertices[i] == 0))
      topological_secondary(vertices,v)
    v <- Graph_ADT$nextVertex(vertices[i],v)
  }
  return(v)
}

Topological sort using Breadth First Search (BFS)

Topological_BFS_sort <- function(Graph_ADT, queue, n, vertices)
{
  ## Initialise a list to track count of inwards edges for each node
  countEdge <- list()
  
  ## initialise count of each node to 0
  for (i in vertices)
    countEdge[[i]] <- 0
  
  ## Assign count (inward nodes) prerequisite to each node
  for(i in vertices){
    v <- Graph_ADT$firstVertex(vertices[i])
    while(v %in% conVert(vertices[i])){
      countEdge[[v]] <- countEdge[[v]] + 1
      v <- Graph_ADT$nextVertex(vertices[[i]],v)
    }
  }
  
  ## Initialize queue with nodes which have zero count of inward edges
  for(i in vertices)
    if(countEdge[[i]] == 0)
      queue <-  Graph_ADT$initQueue(i)
  
  ## Process the nodes which are in the queue
  while(length(queue) != 0){
    v <- extQueue(queue)
    print(v)
    w <- Graph_ADT$firstVertex(v)
    while(w %in% conVert(vertices[v])){
      ## Decrease the count prerequisite by 1
      countEdge[[w]] <- countEdge[[w]] - 1
      if(countEdge[[w]] == 0) ## no prerequisites
        queue <- initQueue(w)
      w <- Graph_ADT$nextVertex(vertices[v],w)
    }
  }
}

Shortest path problems 最短路径问题

PriorityQueueInit <- 
  setRefClass("PriorityQueueInit",
              fields = list(keys = "integer", values = "integer"),
              methods = list(
                push = function(key,value) {
                  keys <<- append(keys, key)
                  values <<- append(values, value)
                },
                extractMinVertex = function() {
                  minPos <- which(values==min(values))
                  key <- keys[[minPos]]
                  value <- values[[minPos]]
                  return(list(key=key,value=value))
                }
              ))
  

DijkstraShortestPath <- function(Graph_ADT, sourceVertex, vertices, n)
{
  library(hashmap)  ## To create new hashmap instances
  
  ## Initiate a new priority queue
  ## It can perform push, entractMinVertex
  ## push : add new vertices along with their distances from source node
  ## entractMinVertex : extract the vertex with minimum value
  ## key represents all the vertices of the graph
  ## value represents the vertex value or distance from the source vertex
  priorityQueue <- PriorityQueueInit$new()
  
  ## Initiate a hashmap to store shortest distance from source vertex to every vertex
  ## keys are all the vertices of the graph other than source vertex
  ## values are the corresponding distances from the source node
  ## Dimension of hashmap is n, where n is total number of vertices in graph G
  ## Initialize all vertices with distance as 0 which will later be updated 
  distanceMap <- hashmap(keys=vertices,
                         values = rep(0,n))
  
  ## Initiate another hashmap to store the parent vertex to keep a track of shortest 
  ## path from source vertex to every vertex
  ## key represents the child (to) vertex
  ## value represents the parent (from) vertex
  ## initialize key with source vertex and value with NULL
  parentMap <- hashmap(keys = sourceVertex,
                       values = "NULL")
  
  ## initialize priority queue with value of all vertices to infinity
  for( i in vertices)
    priorityQueue$push(vertices[i],Inf)
  
  ## Set the distance of sourceVertex as zero
  priorityQueue$values[which(priorityQueue$keys==sourceVertex)] <- 0
  
  ## Begin iteration till the all the vertices from priorityQueue becomes empty
  while(length(priorityQueue$keys) != 0){
    
    ## Extract vertex with minimum value from priority queue along with its value
    headVertex <- priorityQueue$extractMinVertex()
    
    ## Assign the key of the head vertex as current vertex
    currentVertex <- headVertex$key
    
    ## Append distancemap with current key and its value
    distanceMap[[currentVertex]] <- headVertex$value
    
    ## Check for all directly connected vertices for the current vertex
    for(conVert in getConVertex(graph,currentvertex)){
      ## get all the corresponding edge value
      edgeValue <- getEdgeValue(graph,currentvertex,conVert)
      
      ## Check priority queue contains the adjacent connected vertex (conVert) or not
      ## If yes, then proceed ahead 
      ## if no, conVert vertex already has shortest distance from source vertex
      if(!priorityQueue$keys %in% conVert){
        next
      }
      
      ## Now evaluate the distance of the adjacent vertex (conVert) with source vertex
      ## via current vertex. Add the distance of current vertex with edge value of 
      ## adjacent vertex (conVert)
      updDistance <- distanceMap[[currentVertex]] + edgeValue
      
      ## Check whether the value of the adjacent vertex in priorityQueue is greater than
      ## the updated distance or not. If yes, then decrease the value in the priorityQueue
      ## to the updated distance and also update the parent map of the adjacent vertex 
      ## current vertex
      if(priorityQueue$values[which(priorityQueue$keys==conVert)] > updDistance){
        priorityQueue$values[which(priorityQueue$keys==conVert)] <- updDistance
        parentmap[[conVert]]  <- currentVertex
      }
    }
  }
}

Prim’s algorithm

Priority queue which supports push, entractMinVertex, containsVertex,getWeight for Prim’s algorithm

PriorityQueueInit <- 
  setRefClass("PriorityQueueInit",
              fields = list(keys = "integer", values = "integer"),
              methods = list(
                push = function(key,value) {
                  keys <<- append(keys, key)
                  values <<- append(values, value)
                },
                extractMinVertex = function() {
                  minPos <- which(values==min(values))
                  key <- keys[[minPos]]
                  value <- values[[minPos]]
                  return(list(key=key,value=value))
                }
              ))

Prim’s algorithm

Implement Prim’s algorithm

primMST <- function(Graph_ADT,vertices,n)
{
  library(hashmap)  ## To create new hashmap instances
  
  ## Initiate a new priority queue
  ## It can perform push, entractMinVertex
  ## push : add new vertices along with their distances from source node
  ## entractMinVertex : extract the vertex with minimum value
  ## key represents all the vertices of the graph
  ## value represents the vertex value or distance from the source vertex
  priorityQueue <- PriorityQueueInit$new()
  
  ## Initiate a hashmap to store shortest distance from source vertex to every vertex
  ## keys are all the vertices of the graph other than source vertex
  ## values are the corresponding distances from the source node
  ## Dimension of hashmap is n, where n is total number of vertices in graph G
  ## Initialize all vertices with distance as 0 which will later be updated 
  distanceMap <- hashmap(keys=vertices,
                         values = rep(0,n))
  
  ## Initialise a list to store final MST result
  MSTResult <- list()
  
  ## initialize priority queue with value of all vertices to infinity
  for( i in vertices)
    priorityQueue$push(vertices[i],Inf)
  
  ## begin with a random vertex
  startVertex <<- vertices[sample(1:n, 1)]
  
  ## Set the distance of startVertex as zero
  priorityQueue$values[which(priorityQueue$keys==startVertex)] <- 0
  
  ## Begin iteration till the all the vertices from priorityQueue becomes empty
  while(length(priorityQueue$keys) != 0){
    
    ## Extract vertex with minimum value from priority queue along with its value
    headVertex <- priorityQueue$extractMinVertex()
    
    ## Assign the key of the head vertex as current vertex
    currentVertex <- headVertex$key
    
    ## Append distancemap with current key and its value
    distanceMap[[currentVertex]] <- headVertex$value
    
    ## Check for all directly connected vertices for the current vertex
    for(conVert in getConVertex(graph,currentvertex)){
      ## get all the corresponding edge value
      edgeValue <- getEdgeValue(graph,currentvertex,conVert)
      
      ## Check priority queue contains the adjacent connected vertex (conVert) or not
      ## If yes, then proceed ahead 
      ## if no, conVert vertex already has shortest distance from current vertex
      if(!priorityQueue$keys %in% conVert){
        next
      }
      
      ## Now evaluate the distance of the adjacent vertex (conVert) with current vertex. 
      ## Update the distance with the edge value
      updDistance <- edgeValue
      
      ## Check whether the value of the adjacent vertex in priorityQueue is greater than
      ## the updated distance or not. If yes, then decrease the value in the priorityQueue
      ## to the updated distance.
      if(priorityQueue$values[which(priorityQueue$keys==conVert)] > updDistance){
        priorityQueue$values[which(priorityQueue$keys==conVert)] <- updDistance
        MSTResult[[currentVertex]] <- conVert
      }
    }
  }
}

Kruskal’s algorithm

Define a reference class which can perform disjoint set operations

The operations are UNION, DIFFER and FIND

Union: to merge two sets together

Differ: to check whether vertices are in different sets

Find: to find whether a vertex is in a set or not

disjoinSetPointer <- setRefClass("disjoinSetPointer",
                                 fields = list(vertex = "vector",
                                               set1 = "vector",
                                               set2 = "vector",
                                               currentVertex = "integer"),
                                 methods = list(
                                   ## merge two sets
                                   union = function(set1,set2){
                                     return(c(set1,set2))
                                   },
                                   ## check whether set1 and set 2 are disjoint
                                   ## return TRUE if they are disjoint
                                   differ = function(set1,set2){
                                     if(sum(set1 %in% set1) ==0){
                                       return(TRUE)} else(return(FALSE))
                                   },
                                   ## Find whether a vertex is in a set or not
                                   ## returns root of the currentVertex
                                   ## function ROOT returns root of the vector
                                   find = function(currentVertex){
                                     return(ROOT(vertex[currentvertex]))
                                   }
                                 ))

Define a reference class to store the edges along with from and to vertices on min-heap

kruskalArray <- setRefClass("kruskalArray",
                            fields = list(fromVertex = "numeric",
                                          toVertex = "numeric",
                                          weight = "numeric"),
                            methods = list(
                              ## insert new from and to vertices along with edge
                              push = function(f, t, w){
                                fromVertex <<- append(fromVertex,f)
                                toVertex <<- append(toVertex,t)
                                weight <<- append(weight,w)
                              },
                              ## extract from and to vertices having minimum edge value
                              ## alos remove from, to and edge value from the array
                              extractMinEdge = function() {
                                minPos <- which(weight==min(weight))
                                from <- fromVertex[[minPos]]
                                to <- toVertex[[minPos]]
                                fromVertex <<- fromVertex[[-minPos]]
                                toVertex <<- toVertex[[-minPos]]
                                weight <<- weight[[-minPos]]
                                return(list(from=from,to=to))
                              }
                            ))

Implement Kruskals algorithm

kruskalMST <- function(Graph_ADT, n, e)
{
  ## initialse reference classes disjoinSetPointer and kruskalArray
  vertexArray <- disjoinSetPointer$new()
  edgeArray <- kruskalArray$new()
  
  ## Initialise a list to store final MST result
  MSTResult <- list()
  
  ##  Put all the edges in the edgeArray
  for(i in 1:n){
    j <- firstVertex(i)
    while(i <= n){
      edgeArray$push(i,j,Graph_ADT$weightEdge(i,j))
    }
  }
  
  ## Initialise n equivalent sets
  numMST <- n
  
  ## Iteratively combine equivalent sets based on edge weights
  ## edges are extracted based on their value. Smallest edges are extracted first
  for(i in 1:e){
    while(numMST >= 1){
      # get the from and to vertices having minimum edge value
      temp <- edgeArray$extractMinEdge()
      fromVertex <- temp$from
      toVertex <- temp$to
      
      ## Check whether two vertices are in different sets
      if(vertexArray$differ(fromvertex,toVertex)){
        ## if yes, then combine from and to vertices into one set
        vertexArray$union(fromvertex,toVertex)
        ## add this edge to MST
        MSTResult[[i]] <- c(fromVertex,toVertex)
        ## decrease the sets by 1
        numMST <- numMST - 1
      }
    }
  }
  return(MSTResult)  
}

7 Programming and Randomized Algorithms 随机化算法

动态规划可以定义为一种使用循环公式(将一个问题分解为一个子问题)来解决任何复杂问题的方法。利用先前的状态解重构任意问题的子解。基于动态规划的方法能够达到多项式复杂度来解决问题,并确保比其他经典方法(如蛮力算法)更快的计算速度。

使用递归计算狄波拉契数列

nfib<-function(n){
  assertthat::assert_that(n>0) & assertthat::assert_that(n<50)
  if(n==1 || n==2) return(1)
  val<- nfib(n-1) + nfib(n-2)
  return(val)
}

使用动态规划计算狄波拉契数列

nfib_DP<-function(n){
  assertthat::assert_that(n>0) & assertthat::assert_that(n<50)
  if(n<=2) return(1)
  lag2_val<-0
  lag1_val<-1
  nfibval<-1
  
  # Loop to compute Fibonacci value
  for(i in 3:n){
    lag2_val<-lag1_val
    lag1_val<-nfibval
    nfibval<-lag2_val+lag1_val
  }
  return(nfibval)
}

The knapsack problem 背包问题

背包问题是一个组合优化问题,它要求在不超过容量约束的情况下,选择某一给定项目的一个子集使利润最大化。根据物品数量和可用背包的数量,文献中报告了不同类型的背包问题:0-1背包问题,其中每个物品最多被选择一次;有界背包问题对每个项目的选择都有一个约束;多选题背包问题,其中多个背包是礼物,从多个集合中选择物品;在多约束背包问题中,背包的体积和重量都有约束。

0-1背包问题是,给你一个可装载重量为 W 的背包和 n 个物品,每个物品有重量和价值两个属性。其中第 i 个物品的重量为 S[i],价值为 C[i],现在让你用这个背包装物品,最多能装的价值是多少?

公示可以表示为:

使用动态规划计算0-1背包问题:

knapsack_DP<-function(W, S, C, n){
  require(pracma)
  K<-zeros(n+1, W+1)
  for(i in 1:(n+1)){
    for(j in 1:(W+1)){
      if(i==1 | j==1){
        K[i,j]=0
      } else if(S[i-1] <= j){
        K[i, j] = max(C[i-1] + K[i-1, (j-S[i-1])],  K[i-1, j])
      } else
      {
        K[i, j] = K[i-1, j]
} }
}
  return(K[n+1, W+1])
}

All pairs shortest paths 最短路径问题

所有对最短路径(APSP)问题的重点是寻找所有对顶点之间的最短路径。

解决APSP问题的方法之一是使用Floyd-Warshall算法,该算法使用动态规划。该方法基于这样一种观察,即连接两个顶点u和v的任何路径之间可能有零个或多个顶点,从而定义了一条路径。Floyd-Warshall算法的R实现如下:

floydWarshall<-function(graph){
  nodes<-names(graph)
  dist<-graph  
  for (n in nodes){
    for(ni in nodes){
      for(nj in nodes){
        if((dist[[ni]][n]+dist[[n]][nj])<dist[[ni]][nj]){
          dist[[ni]][nj]<-dist[[ni]][n]+dist[[n]][nj]
        }
      }
    }
  }
  return(dist)
}

Example script for Floyd-Warshall algorithm

# Defining graph structure
graph<-list()
graph[["A"]]=c("A"=0, "B"=8, "C"=Inf, "D"=Inf, "E"=1, "F"=Inf)
graph[["B"]]=c("A"=Inf, "B"=0, "C"=7, "D"=6, "E"=Inf, "F"=Inf)
graph[["C"]]=c("A"=Inf, "B"=Inf, "C"=0, "D"=6, "E"=Inf, "F"=Inf)
graph[["D"]]=c("A"=Inf, "B"=Inf, "C"=Inf, "D"=0, "E"=Inf, "F"=4)
graph[["E"]]=c("A"=Inf, "B"=3, "C"=Inf, "D"=Inf, "E"=0, "F"=9)
graph[["F"]]=c("A"=Inf, "B"=3, "C"=Inf, "D"=4, "E"=9, "F"=0)

# get shortest pair distance
APSP_Dist<-floydWarshall(graph) 

图形存储为R中的列表,可以使用边名称调用。Inf表示没有链接

在计算非常昂贵的情况下,引入随机性可以帮助减少计算工作量,但会牺牲准确性。

Randomized algorithms for finding large values

在未排序的列表中查找最大值的计算成本为O(n)。任何确定性算法都需要O(n)努力来确定最大值。然而,在时间至关重要且n非常大的场景中,使用近似算法,而不是寻找实际解,而是确定更接近实际解的解。随机算法可根据其目标分为以下几类:

1.Las Vegas algorithm(拉斯维加斯算法):拉斯维加斯算法以一定的概率失败,因此我们可以继续该算法,直到我们得到合适的结果,但它对时间有影响,时间变得无界。因此,拉斯维加斯算法通常在规定的时间内使用合理的结果(如果算法失败,则会引发错误)。 2.Monte Carlo algorithm(蒙特卡罗算法):使用蒙特卡罗算法,我们无法测试算法何时可能失败;然而,可以通过增加迭代次数和获得预期结果来降低失效概率。

通常,拉斯维加斯算法是首选。然而,在某些情况下,蒙特卡罗算法可能是有用的,尤其是如果我们能够降低失效概率,并且数据本身涉及不确定性的话。例如,假设我们想使用样本数据确定一座城市的平均高度。我们必须使用样本运行蒙特卡罗算法来估计平均值,以获得预期值和涉及的波动系数。如果样本不能很好地代表总体,总是有可能得到错误的答案,但除非我们有总体数据集,否则我们将无法检测到它。

skip list列表是Bill Pugh在1990年开发的概率数据结构。它是为了解决链接列表和数组中的搜索限制而开发的。skip list提供了二叉搜索树(BST)和类似基于树的数据结构的替代方案,这些数据结构往往会变得不平衡。Skip list比基于树的数据结构更容易实现。然而,它们不能保证最佳性能,因为它们使用随机化来排列条目,以便搜索和更新时间平均为O(logn),其中n是字典中的条目数。skip 列表中的平均时间与键和输入的分布无关。相反,它只取决于在插入等操作期间选择的随机化种子。这是实现复杂性和算法性能之间折衷的一个很好的例子。

skip list可以被视为已排序链接列表的扩展。 skip list

其中有两个链:S0连接每个节点,S1连接排序链表的每个备用节点。

skip list 可以是n阶的,以下是一个2阶skp list。

Skip node data structure

skListNode<-function(val, height=1){
  
  # function to create empty environment
  create_emptyenv = function() {
    emptyenv()
  }
  
  # Create skiplist node
  skiplist <- new.env(parent=create_emptyenv())
  skiplist$element <- val
  skiplist$nextnode<-rep(list(NULL), height)
  class(skiplist) <- "skiplist"
  skiplist
}

Function to find a value in skip list

findvalue<-function(skiplist, searchkey){
  for (i in level:1){
    # Assign head values
    skiplist<-skiplist$nextnode[[i]]
    while(!is.null(node) && node$element>searchkey){
      skiplist<-skiplist$nextnode[[i]]
    } 
    skiplist = skiplist$nextnode[0]
    if(!is.null(skiplist) && searchkey==skiplist$element){
      # Return element from Skip node
      return(skiplist$element) 
    } else
    {
      # Element not found return NULL
      return(NULL) 
    }
  }
}

8 Functional Data Structures

数据结构是任何算法的组成部分。堆、队列、堆栈、树和哈希表是广泛跨编程语言使用的各种形式的数据结构。其中一些主要用于查找,如树和哈希表,而另一些,如堆、队列和堆栈,则用于更新修改,如插入和删除。

rstackdeque提供了fully persistent stack data structure,在这个包中,堆栈是使用无序链表实现的,其中每个节点(列表)由数据元素和对下一个节点的引用组成。这些堆栈是S3对象,可以通过head堆栈节点访问。

Wiki of fully persistent stack data structure:在计算中,持久数据结构或非临时数据结构是一种在修改时始终保留其先前版本的数据结构

library(rstackdeque)
## Fully Persistent stacks 
a <- as.rstack(c("p", "q", "r","s","t"))
b <- insert_top(a, c("o"))
c <- without_top(a)
d <- peek_top(a)

## Slowly Pesistent Queues and Dequeues 
a <- as.rdeque(c("p", "q", "r","s","t","u","v"))
b <- insert_front(a, c("o"))
c <- insert_back(a, c("w"))
d <- without_front(a)
e <- without_back(a)

## Fast Fully Persistent Queues ##
a <- as.rpqueue(c("p","q","r","s")) 
b <- insert_back(a, "t")
c <- without_front(b)
d <- insert_back(c,"v")
e <- insert_back(d,"w")
f <- without_front(e)