SOMbrero validation

Read in salmon data

salmon<-read_csv(paste(shared.path,"/Mills Lab/Projects/Atlantic salmon/Penobscot River Growth/Data/all.csv", sep = ""))

# select origin=H and seaage=2SW then remove years 2010-2011
subdat<-salmon%>%filter(origin=="H"&seaage=="2SW")
subdat<-subdat%>%filter(year%in%c(1978,1979,1980,1986,1987,1988,1998,1999,2000,2012,2013,2014))

Pre-processing to create square data

# select circuli columns
circdat<-as.data.frame(subdat[,55:187])

# remove all smolt circuli
len<-c()
for(i in 1:nrow(circdat)){
  vct<-circdat[i,-c(1:subdat$smolt.circ[i]-1)]
  vct<-vct[!is.na(vct)]
  len<-c(len,length(vct))
}

# add extra NA columns for each so that all are max 79
df<-data.frame(matrix(NaN,nrow=nrow(circdat),ncol=max(len)))
for(i in 1:nrow(circdat)){
  xtra<-max(len)-len[i]
  vct<-circdat[i,-c(1:subdat$smolt.circ[i]-1)]
  vct<-vct[!is.na(vct)]
  df[i,]<-c(vct,rep(NA,xtra))
}

# min length is 40, max length is 79, stretch/squish all to 60
circdf<-data.frame(matrix(NaN,nrow=nrow(df),ncol=60))
for(i in 1:nrow(df)){
  vct<-df[i,]
  vct<-vct[!is.na(vct)]
  vct<-diff(vct)
  newvct<-approx(x=1:length(vct),y=vct,method="constant",n=60)$y
  circdf[i,]<-newvct
}

# unscaled data
salm.sc<-as.matrix(circdf)

x<-c()
for(i in 1:nrow(df)){
x<-c(x,sum(!is.na(df[i,])))
}
hist(x)

Use dtwclust to create distance matrix

#dtwclust 

res<-tsclust(series=salm.sc,k=4,trace=TRUE,args = tsclust_args(dist = list(window.size = 6)))

## 
##  Precomputing distance matrix...
## 
## Iteration 1: Changes / Distsum = 615 / 298.7499
## Iteration 2: Changes / Distsum = 328 / 266.9173
## Iteration 3: Changes / Distsum = 0 / 266.9173
## 
##  Elapsed time is 0.73 seconds.

#res@clusinfo

Insert distance matrix into SOM function, 100 model runs

#sombrero 

dislist<-list()

#set for loop to number of models you want to test iteratively
for(i in 1:100){
dis.som<-trainSOM(x.data=res@distmat,type="relational",dimension=c(2,2),nb.save=5)
dislist[[i]]<-dis.som
}

#create a matrix with all the possible unique combinations of models (excluding repeats i.e. order doesn't matter)
uniquepairsmatrix<-gtools::combinations(100,2,seq(1,100,1),repeats=F)

#create a design matrix to loop through (removes the need for a nested for loop)
design.matrix<-tibble(left = uniquepairsmatrix[,1], right = uniquepairsmatrix[,2])

Compare each model against 99 others (4950 possible combinations for 100 models)

dflist<-list()
consistencylist<-list()

for(row.i in 1:nrow(design.matrix)) {

# Get factor values for current row
  left.i <- design.matrix$left[row.i]
  right.i <- design.matrix$right[row.i]

# Merge cluster classifications with salmon ID data
dflist[[row.i]]<-subdat %>% 
  dplyr::select(ID) %>% 
  mutate(l = dislist[[left.i]]$clustering) %>% 
  mutate(r = dislist[[right.i]]$clustering)

# Use table function to decide which group # on LHS = group # on RHS
t<-table(dflist[[row.i]]$l, dflist[[row.i]]$r)
row1<-max(t[1,])
row2<-max(t[2,])
row3<-max(t[3,])
row4<-max(t[4,])

mat<-matrix(data = table(dflist[[row.i]]$l, dflist[[row.i]]$r), nrow = 4, ncol = 4)

# Get matrix location (in row and col) of LHS=RHS
p1<-as.data.frame(which(mat == row1, arr.ind = T))
p2<-as.data.frame(which(mat == row2, arr.ind = T))
p3<-as.data.frame(which(mat == row3, arr.ind = T))
p4<-as.data.frame(which(mat == row4, arr.ind = T))

# If the two models DO NOT assign an individual to the same cluster, write "different assignment"
dflist[[row.i]]$cluster = "different assignment"

# If the two models DO assign an individual to the same cluster, write cluster letter
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p1$row & dflist[[row.i]]$r == p1$col] = "A"
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p2$row & dflist[[row.i]]$r == p2$col] = "B"
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p3$row & dflist[[row.i]]$r == p3$col] = "C"
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p4$row & dflist[[row.i]]$r == p4$col] = "D"
mat
check(dflist[[row.i]]$cluster)

# Calculate the percentage of individuals that both models assigned to the same cluster
dflist[[row.i]]$clusterconsistency = 1-(sum(dflist[[row.i]]$cluster == "different assignment")/length(dflist[[row.i]]$cluster))
# Add unique identifier
dflist[[row.i]]$row.i = row.i

# Create another list that contains only the "consistency of assignment" between the two tested models
consistencylist[[row.i]]<-tibble(row.i=row.i, l=left.i, r=right.i, 
                                 consistency = unique(dflist[[row.i]]$clusterconsistency))

}

Assess “consistency” of clustering

consistency_of_assignment<-do.call(rbind,consistencylist)

ggplot(consistency_of_assignment) + geom_histogram(aes(x=consistency)) + xlim(0,1) +
  labs(x = "Proportion of individuals that both models assigned to the same cluster")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

Are the individuals being clustered differently by the models random, or are the same few individuals being “misclustered?”

cluster_df<-do.call(rbind,dflist) 

cluster_df %>% 
  filter(cluster == "different assignment") %>% 
  count(ID) %>% 
  arrange(desc(n)) %>% 
  rowid_to_column(var = "IDnum") %>% 
  mutate(commonlymisclassified = ifelse(n>500,"y","n")) %>% 
  ggplot() + geom_bar(aes(x=IDnum, y = n, fill = commonlymisclassified),stat = "identity") + 
    labs(y="Number of times clustered differently by two models", x="Individual #", 
         fill="Commonly 'misclustered'")

SOMbrero validation

LGCarlson

02/04/2020

Read in salmon data

Pre-processing to create square data

Use dtwclust to create distance matrix

Insert distance matrix into SOM function, 100 model runs

Compare each model against 99 others (4950 possible combinations for 100 models)

Assess “consistency” of clustering

Are the individuals being clustered differently by the models random, or are the same few individuals being “misclustered?”