Friedman and Rafsky

Generate samples of size \(m = 10\), \(n = 15\) from two independent bivariate normal distributions \(X_i \sim N_2(0,I)\) for \(i = 1,2,...,10\) and \(Y_j \sim N_2(\mu,I)\) for \(j = 1,2,3...,15\)

library(MASS)
mu1 <- c(0,0)
mu2 <- c(1,1)
sigma <- rbind(c(1,0),c(0,1))
sample1 <- mvrnorm(10,mu1,sigma)
sample2 <- mvrnorm(15,mu2,sigma)
sample1

##             [,1]        [,2]
##  [1,]  0.6235516 -0.47831046
##  [2,]  0.9745707 -1.89306312
##  [3,]  0.2549640  1.66950804
##  [4,] -0.1065072  0.08680403
##  [5,] -1.5508926 -0.42867709
##  [6,] -2.1714976  0.12837085
##  [7,]  0.7273024  0.06326545
##  [8,] -0.3028141 -0.13910381
##  [9,]  0.6002698  1.04173359
## [10,]  0.8667309  1.83140101

sample2

##              [,1]        [,2]
##  [1,]  1.28709980  0.76834844
##  [2,]  1.17641713  0.08805407
##  [3,] -0.12639674  0.65074280
##  [4,]  0.32803602 -0.41459550
##  [5,]  2.10764812  0.81605899
##  [6,]  2.98174663  0.60368563
##  [7,]  2.42483224  0.91246811
##  [8,]  1.99098708 -0.29783460
##  [9,] -0.10564584  0.57982759
## [10,]  0.29302290  1.62512005
## [11,]  0.58609873  0.61694629
## [12,]  1.51339316  0.57646009
## [13,]  2.08391336  0.40648645
## [14,]  1.76232916  0.85905776
## [15,] -0.01458107 -0.19590506

Build a minimal spanning tree from the combined samples, and calculate R and the Wald-Wolfowitz statistic.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(utils)
m = 10
n = 15
N = m + n
# generate all edges
edge <- combn(m+n,2)
tot <- ncol(edge)
# initialize the adjacency matrix
A <- matrix(0,25,25)

# calculate distances
dist <- c()
for(i in 1:tot){
  if(edge[1,i] <= m){
    a1 = edge[1,i]
    x1 = sample1[a1,1]
    y1 = sample1[a1,2]
  }else{
    a1 = edge[1,i] - m
    x1 = sample2[a1,1]
    y1 = sample2[a1,2]
  }
  if(edge[2,i] <= m){
    a2 = edge[2,i]
    x2 = sample1[a2,1]
    y2 = sample1[a2,2]
  }else{
    a2 = edge[2,i] - m
    x2 = sample2[a2,1]
    y2 = sample2[a2,2]
  }
  dist[i] = sqrt((x1-x2)^2 + (y1-y2)^2)
}
# combine edges and the corresponding distance
edge_list <- data.frame(t(edge),dist)
# sort it by distances
sorted_edge <- arrange(edge_list,dist)

# build mst
# keep a list of visited nodes
visited <- c()
for(i in 1:tot){
  # fetch an edge
  v1 <- sorted_edge[i,1]
  v2 <- sorted_edge[i,2]
  # if both v1 and v2 are visited, then a cycle is formed, we should do nothing and consider the next edge
  # if one is in visited, put the other in
  # if both are not in visited, put both in
  if((v1 %in% visited) & (v2 %in% visited)){
    next
  }else if((v1 %in% visited) & (v2 %in% visited == F)){
    visited <- append(visited,v2)
  }else if((v1 %in% visited == F) & (v2 %in% visited)){
    visited <- append(visited,v1)
  }else{
    visited <- append(visited,v1)
    visited <- append(visited,v2)
  }
  # if get trrough, no cycle, update adjacency matrix
  A[v1,v2] = 1
}

# calculate R
# for i,j where A[i,j] = 1, if i and j are from different samples, R = R + 1
R = 1
for(i in 1:N){
  for(j in 1:N){
    if(A[i,j] == 1){
      if(((i<=m)&(j>m)) | ((j<=m)&(i>m))){
        R = R + 1
      }
    }
  }
}
R

## [1] 8

# perform the Wald-Wolfowitz test at 0.05 significant level
W = (R - 2*m*n/N - 1)/sqrt((2*m*n*(2*m*n - N))/(N^2*(N - 1)))
if(abs(W) > 1.96){
  print(1)
}else{
  print(0)
}

## [1] 1

Repeat the above experiment 100 times for \(\mu = (0,0),(1,1)(2,2),(3,3)\)

# first rearrange the above code to make it a function
experiment <- function(x){
  mu2 <- x
  sample1 <- mvrnorm(m ,mu1,sigma)
  sample2 <- mvrnorm(n,mu2,sigma)
  # generate all edges
  edge <- combn(N,2)
  tot <- ncol(edge)
  # initialize the adjacency matrix
  A <- matrix(0,N,N)
  
  # calculate distances
  dist <- c()
  for(i in 1:tot){
    #print(i)
    if(edge[1,i] <= m){
      a1 = edge[1,i]
      x1 = sample1[a1,1]
      y1 = sample1[a1,2]
    }else{
      a1 = edge[1,i] - m
      x1 = sample2[a1,1]
      y1 = sample2[a1,2]
    }
    if(edge[2,i] <= m){
      a2 = edge[2,i]
      x2 = sample1[a2,1]
      y2 = sample1[a2,2]
    }else{
      a2 = edge[2,i] - m
      x2 = sample2[a2,1]
      y2 = sample2[a2,2]
    }
    dist[i] = sqrt((x1-x2)^2 + (y1-y2)^2)
  }
  # combine edges and the corresponding distance
  edge_list <- data.frame(t(edge),dist)
  # sort it by distances
  sorted_edge <- arrange(edge_list,dist)
  
  # build mst
  # keep a list of visited nodes
  visited <- c()
  for(i in 1:tot){
    # fetch an edge
    v1 <- sorted_edge[i,1]
    v2 <- sorted_edge[i,2]
    # if both v1 and v2 are visited, then a cycle is formed, we should do nothing and consider the next edge
    # if one is in visited, put the other in
    # if both are not in visited, put both in
    if((v1 %in% visited) & (v2 %in% visited)){
      next
    }else if((v1 %in% visited) & (v2 %in% visited == F)){
      visited <- append(visited,v2)
    }else if((v1 %in% visited == F) & (v2 %in% visited)){
      visited <- append(visited,v1)
    }else{
      visited <- append(visited,v1)
      visited <- append(visited,v2)
    }
    # if get trrough, no cycle, update adjacency matrix
    A[v1,v2] = 1
  }
  
  # calculate R
  # for i,j where A[i,j] = 1, if i and j are from different samples, R = R + 1
  R = 1
  for(i in 1:N){
    for(j in 1:N){
      if(A[i,j] == 1){
        if(((i<=m)&(j>m)) | ((j<=m)&(i>m))){
          R = R + 1
        }
      }
    }
  }
  R
  # perform the Wald-Wolfowitz test at 0.05 significant level
  W = (R - 2*m*n/N - 1)/sqrt((2*m*n*(2*m*n - N))/(N^2*(N - 1)))
  if(abs(W) > 1.96){
    significant = 1
  }else{
    significant = 0
  }
  return(c(R, W, significant))

}

# (0,0)
mu <- c(0,0)
M1 <- c()
for(i in 1:100){
  M1 <- rbind(M1, experiment(mu))
}
colnames(M1) <- c("R","W","significant")
head(M1)

##       R          W significant
## [1,] 10 -1.2792043           0
## [2,]  7 -2.5584086           1
## [3,]  8 -2.1320072           1
## [4,]  6 -2.9848100           1
## [5,] 11 -0.8528029           0
## [6,]  7 -2.5584086           1

# (1,1)
mu <- c(1,1)
M2 <- c()
for(i in 1:100){
  M2 <- rbind(M2, experiment(mu))
}
colnames(M2) <- c("R","W","significant")
head(M2)

##      R         W significant
## [1,] 5 -3.411211           1
## [2,] 2 -4.690416           1
## [3,] 6 -2.984810           1
## [4,] 6 -2.984810           1
## [5,] 6 -2.984810           1
## [6,] 9 -1.705606           0

# (2,2)
mu <- c(2,2)
M3 <- c()
for(i in 1:100){
  M3 <- rbind(M3, experiment(mu))
}
colnames(M3) <- c("R","W","significant")
head(M3)

##      R         W significant
## [1,] 7 -2.558409           1
## [2,] 1 -5.116817           1
## [3,] 3 -4.264014           1
## [4,] 4 -3.837613           1
## [5,] 4 -3.837613           1
## [6,] 3 -4.264014           1

# (3,3)
mu <- c(3,3)
M4 <- c()

for(i in 1:100){
  M4 <- rbind(M4, experiment(mu))
}
colnames(M4) <- c("R","W","significant")
head(M4)

##      R         W significant
## [1,] 2 -4.690416           1
## [2,] 1 -5.116817           1
## [3,] 1 -5.116817           1
## [4,] 1 -5.116817           1
## [5,] 3 -4.264014           1
## [6,] 1 -5.116817           1

Summarize the results.

summary(M1)

##        R               W            significant  
##  Min.   : 5.00   Min.   :-3.4112   Min.   :0.00  
##  1st Qu.: 8.00   1st Qu.:-2.1320   1st Qu.:0.00  
##  Median :10.00   Median :-1.2792   Median :0.00  
##  Mean   : 9.67   Mean   :-1.4199   Mean   :0.26  
##  3rd Qu.:11.00   3rd Qu.:-0.8528   3rd Qu.:1.00  
##  Max.   :14.00   Max.   : 0.4264   Max.   :1.00

summary(M2)

##        R               W            significant  
##  Min.   : 2.00   Min.   :-4.6904   Min.   :0.00  
##  1st Qu.: 6.00   1st Qu.:-2.9848   1st Qu.:0.00  
##  Median : 7.00   Median :-2.5584   Median :1.00  
##  Mean   : 7.28   Mean   :-2.4390   Mean   :0.73  
##  3rd Qu.: 9.00   3rd Qu.:-1.7056   3rd Qu.:1.00  
##  Max.   :12.00   Max.   :-0.4264   Max.   :1.00

summary(M3)

##        R              W           significant  
##  Min.   :1.00   Min.   :-5.117   Min.   :0.00  
##  1st Qu.:2.00   1st Qu.:-4.690   1st Qu.:1.00  
##  Median :3.00   Median :-4.264   Median :1.00  
##  Mean   :3.29   Mean   :-4.140   Mean   :0.99  
##  3rd Qu.:4.00   3rd Qu.:-3.838   3rd Qu.:1.00  
##  Max.   :9.00   Max.   :-1.706   Max.   :1.00

summary(M4)

##        R              W           significant
##  Min.   :1.00   Min.   :-5.117   Min.   :1   
##  1st Qu.:1.00   1st Qu.:-5.117   1st Qu.:1   
##  Median :1.00   Median :-5.117   Median :1   
##  Mean   :1.51   Mean   :-4.899   Mean   :1   
##  3rd Qu.:2.00   3rd Qu.:-4.690   3rd Qu.:1   
##  Max.   :3.00   Max.   :-4.264   Max.   :1

Friedman and Rafsky

Peter Yan

February 14, 2018