- Generate samples of size \(m = 10\), \(n = 15\) from two independent bivariate normal distributions \(X_i \sim N_2(0,I)\) for \(i = 1,2,...,10\) and \(Y_j \sim N_2(\mu,I)\) for \(j = 1,2,3...,15\)
library(MASS)
mu1 <- c(0,0)
mu2 <- c(1,1)
sigma <- rbind(c(1,0),c(0,1))
sample1 <- mvrnorm(10,mu1,sigma)
sample2 <- mvrnorm(15,mu2,sigma)
sample1
## [,1] [,2]
## [1,] 0.6235516 -0.47831046
## [2,] 0.9745707 -1.89306312
## [3,] 0.2549640 1.66950804
## [4,] -0.1065072 0.08680403
## [5,] -1.5508926 -0.42867709
## [6,] -2.1714976 0.12837085
## [7,] 0.7273024 0.06326545
## [8,] -0.3028141 -0.13910381
## [9,] 0.6002698 1.04173359
## [10,] 0.8667309 1.83140101
sample2
## [,1] [,2]
## [1,] 1.28709980 0.76834844
## [2,] 1.17641713 0.08805407
## [3,] -0.12639674 0.65074280
## [4,] 0.32803602 -0.41459550
## [5,] 2.10764812 0.81605899
## [6,] 2.98174663 0.60368563
## [7,] 2.42483224 0.91246811
## [8,] 1.99098708 -0.29783460
## [9,] -0.10564584 0.57982759
## [10,] 0.29302290 1.62512005
## [11,] 0.58609873 0.61694629
## [12,] 1.51339316 0.57646009
## [13,] 2.08391336 0.40648645
## [14,] 1.76232916 0.85905776
## [15,] -0.01458107 -0.19590506
- Build a minimal spanning tree from the combined samples, and calculate R and the Wald-Wolfowitz statistic.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(utils)
m = 10
n = 15
N = m + n
# generate all edges
edge <- combn(m+n,2)
tot <- ncol(edge)
# initialize the adjacency matrix
A <- matrix(0,25,25)
# calculate distances
dist <- c()
for(i in 1:tot){
if(edge[1,i] <= m){
a1 = edge[1,i]
x1 = sample1[a1,1]
y1 = sample1[a1,2]
}else{
a1 = edge[1,i] - m
x1 = sample2[a1,1]
y1 = sample2[a1,2]
}
if(edge[2,i] <= m){
a2 = edge[2,i]
x2 = sample1[a2,1]
y2 = sample1[a2,2]
}else{
a2 = edge[2,i] - m
x2 = sample2[a2,1]
y2 = sample2[a2,2]
}
dist[i] = sqrt((x1-x2)^2 + (y1-y2)^2)
}
# combine edges and the corresponding distance
edge_list <- data.frame(t(edge),dist)
# sort it by distances
sorted_edge <- arrange(edge_list,dist)
# build mst
# keep a list of visited nodes
visited <- c()
for(i in 1:tot){
# fetch an edge
v1 <- sorted_edge[i,1]
v2 <- sorted_edge[i,2]
# if both v1 and v2 are visited, then a cycle is formed, we should do nothing and consider the next edge
# if one is in visited, put the other in
# if both are not in visited, put both in
if((v1 %in% visited) & (v2 %in% visited)){
next
}else if((v1 %in% visited) & (v2 %in% visited == F)){
visited <- append(visited,v2)
}else if((v1 %in% visited == F) & (v2 %in% visited)){
visited <- append(visited,v1)
}else{
visited <- append(visited,v1)
visited <- append(visited,v2)
}
# if get trrough, no cycle, update adjacency matrix
A[v1,v2] = 1
}
# calculate R
# for i,j where A[i,j] = 1, if i and j are from different samples, R = R + 1
R = 1
for(i in 1:N){
for(j in 1:N){
if(A[i,j] == 1){
if(((i<=m)&(j>m)) | ((j<=m)&(i>m))){
R = R + 1
}
}
}
}
R
## [1] 8
# perform the Wald-Wolfowitz test at 0.05 significant level
W = (R - 2*m*n/N - 1)/sqrt((2*m*n*(2*m*n - N))/(N^2*(N - 1)))
if(abs(W) > 1.96){
print(1)
}else{
print(0)
}
## [1] 1
- Repeat the above experiment 100 times for \(\mu = (0,0),(1,1)(2,2),(3,3)\)
# first rearrange the above code to make it a function
experiment <- function(x){
mu2 <- x
sample1 <- mvrnorm(m ,mu1,sigma)
sample2 <- mvrnorm(n,mu2,sigma)
# generate all edges
edge <- combn(N,2)
tot <- ncol(edge)
# initialize the adjacency matrix
A <- matrix(0,N,N)
# calculate distances
dist <- c()
for(i in 1:tot){
#print(i)
if(edge[1,i] <= m){
a1 = edge[1,i]
x1 = sample1[a1,1]
y1 = sample1[a1,2]
}else{
a1 = edge[1,i] - m
x1 = sample2[a1,1]
y1 = sample2[a1,2]
}
if(edge[2,i] <= m){
a2 = edge[2,i]
x2 = sample1[a2,1]
y2 = sample1[a2,2]
}else{
a2 = edge[2,i] - m
x2 = sample2[a2,1]
y2 = sample2[a2,2]
}
dist[i] = sqrt((x1-x2)^2 + (y1-y2)^2)
}
# combine edges and the corresponding distance
edge_list <- data.frame(t(edge),dist)
# sort it by distances
sorted_edge <- arrange(edge_list,dist)
# build mst
# keep a list of visited nodes
visited <- c()
for(i in 1:tot){
# fetch an edge
v1 <- sorted_edge[i,1]
v2 <- sorted_edge[i,2]
# if both v1 and v2 are visited, then a cycle is formed, we should do nothing and consider the next edge
# if one is in visited, put the other in
# if both are not in visited, put both in
if((v1 %in% visited) & (v2 %in% visited)){
next
}else if((v1 %in% visited) & (v2 %in% visited == F)){
visited <- append(visited,v2)
}else if((v1 %in% visited == F) & (v2 %in% visited)){
visited <- append(visited,v1)
}else{
visited <- append(visited,v1)
visited <- append(visited,v2)
}
# if get trrough, no cycle, update adjacency matrix
A[v1,v2] = 1
}
# calculate R
# for i,j where A[i,j] = 1, if i and j are from different samples, R = R + 1
R = 1
for(i in 1:N){
for(j in 1:N){
if(A[i,j] == 1){
if(((i<=m)&(j>m)) | ((j<=m)&(i>m))){
R = R + 1
}
}
}
}
R
# perform the Wald-Wolfowitz test at 0.05 significant level
W = (R - 2*m*n/N - 1)/sqrt((2*m*n*(2*m*n - N))/(N^2*(N - 1)))
if(abs(W) > 1.96){
significant = 1
}else{
significant = 0
}
return(c(R, W, significant))
}
# (0,0)
mu <- c(0,0)
M1 <- c()
for(i in 1:100){
M1 <- rbind(M1, experiment(mu))
}
colnames(M1) <- c("R","W","significant")
head(M1)
## R W significant
## [1,] 10 -1.2792043 0
## [2,] 7 -2.5584086 1
## [3,] 8 -2.1320072 1
## [4,] 6 -2.9848100 1
## [5,] 11 -0.8528029 0
## [6,] 7 -2.5584086 1
# (1,1)
mu <- c(1,1)
M2 <- c()
for(i in 1:100){
M2 <- rbind(M2, experiment(mu))
}
colnames(M2) <- c("R","W","significant")
head(M2)
## R W significant
## [1,] 5 -3.411211 1
## [2,] 2 -4.690416 1
## [3,] 6 -2.984810 1
## [4,] 6 -2.984810 1
## [5,] 6 -2.984810 1
## [6,] 9 -1.705606 0
# (2,2)
mu <- c(2,2)
M3 <- c()
for(i in 1:100){
M3 <- rbind(M3, experiment(mu))
}
colnames(M3) <- c("R","W","significant")
head(M3)
## R W significant
## [1,] 7 -2.558409 1
## [2,] 1 -5.116817 1
## [3,] 3 -4.264014 1
## [4,] 4 -3.837613 1
## [5,] 4 -3.837613 1
## [6,] 3 -4.264014 1
# (3,3)
mu <- c(3,3)
M4 <- c()
for(i in 1:100){
M4 <- rbind(M4, experiment(mu))
}
colnames(M4) <- c("R","W","significant")
head(M4)
## R W significant
## [1,] 2 -4.690416 1
## [2,] 1 -5.116817 1
## [3,] 1 -5.116817 1
## [4,] 1 -5.116817 1
## [5,] 3 -4.264014 1
## [6,] 1 -5.116817 1
- Summarize the results.
summary(M1)
## R W significant
## Min. : 5.00 Min. :-3.4112 Min. :0.00
## 1st Qu.: 8.00 1st Qu.:-2.1320 1st Qu.:0.00
## Median :10.00 Median :-1.2792 Median :0.00
## Mean : 9.67 Mean :-1.4199 Mean :0.26
## 3rd Qu.:11.00 3rd Qu.:-0.8528 3rd Qu.:1.00
## Max. :14.00 Max. : 0.4264 Max. :1.00
summary(M2)
## R W significant
## Min. : 2.00 Min. :-4.6904 Min. :0.00
## 1st Qu.: 6.00 1st Qu.:-2.9848 1st Qu.:0.00
## Median : 7.00 Median :-2.5584 Median :1.00
## Mean : 7.28 Mean :-2.4390 Mean :0.73
## 3rd Qu.: 9.00 3rd Qu.:-1.7056 3rd Qu.:1.00
## Max. :12.00 Max. :-0.4264 Max. :1.00
summary(M3)
## R W significant
## Min. :1.00 Min. :-5.117 Min. :0.00
## 1st Qu.:2.00 1st Qu.:-4.690 1st Qu.:1.00
## Median :3.00 Median :-4.264 Median :1.00
## Mean :3.29 Mean :-4.140 Mean :0.99
## 3rd Qu.:4.00 3rd Qu.:-3.838 3rd Qu.:1.00
## Max. :9.00 Max. :-1.706 Max. :1.00
summary(M4)
## R W significant
## Min. :1.00 Min. :-5.117 Min. :1
## 1st Qu.:1.00 1st Qu.:-5.117 1st Qu.:1
## Median :1.00 Median :-5.117 Median :1
## Mean :1.51 Mean :-4.899 Mean :1
## 3rd Qu.:2.00 3rd Qu.:-4.690 3rd Qu.:1
## Max. :3.00 Max. :-4.264 Max. :1