Read in salmon data

salmon<-read_csv(paste(shared.path,"/Mills Lab/Projects/Atlantic salmon/Penobscot River Growth/Data/all.csv", sep = ""))

# select origin=H and seaage=2SW then remove years 2010-2011
subdat<-salmon%>%filter(origin=="H"&seaage=="2SW")
subdat<-subdat%>%filter(year%in%c(1978,1979,1980,1986,1987,1988,1998,1999,2000,2012,2013,2014))

Pre-processing to create square data

# select circuli columns
circdat<-as.data.frame(subdat[,55:187])

# remove all smolt circuli
len<-c()
for(i in 1:nrow(circdat)){
  vct<-circdat[i,-c(1:subdat$smolt.circ[i]-1)]
  vct<-vct[!is.na(vct)]
  len<-c(len,length(vct))
}

# add extra NA columns for each so that all are max 79
df<-data.frame(matrix(NaN,nrow=nrow(circdat),ncol=max(len)))
for(i in 1:nrow(circdat)){
  xtra<-max(len)-len[i]
  vct<-circdat[i,-c(1:subdat$smolt.circ[i]-1)]
  vct<-vct[!is.na(vct)]
  df[i,]<-c(vct,rep(NA,xtra))
}

# min length is 40, max length is 79, stretch/squish all to 60
circdf<-data.frame(matrix(NaN,nrow=nrow(df),ncol=60))
for(i in 1:nrow(df)){
  vct<-df[i,]
  vct<-vct[!is.na(vct)]
  vct<-diff(vct)
  newvct<-approx(x=1:length(vct),y=vct,method="constant",n=60)$y
  circdf[i,]<-newvct
}

# unscaled data
salm.sc<-as.matrix(circdf)

x<-c()
for(i in 1:nrow(df)){
x<-c(x,sum(!is.na(df[i,])))
}
hist(x)

Use dtwclust to create distance matrix

#dtwclust 

res<-tsclust(series=salm.sc,k=4,trace=TRUE,args = tsclust_args(dist = list(window.size = 6)))
## 
##  Precomputing distance matrix...
## 
## Iteration 1: Changes / Distsum = 615 / 298.7499
## Iteration 2: Changes / Distsum = 328 / 266.9173
## Iteration 3: Changes / Distsum = 0 / 266.9173
## 
##  Elapsed time is 0.73 seconds.
#res@clusinfo

Insert distance matrix into SOM function, 100 model runs

#sombrero 

dislist<-list()

#set for loop to number of models you want to test iteratively
for(i in 1:100){
dis.som<-trainSOM(x.data=res@distmat,type="relational",dimension=c(2,2),nb.save=5)
dislist[[i]]<-dis.som
}
#create a matrix with all the possible unique combinations of models (excluding repeats i.e. order doesn't matter)
uniquepairsmatrix<-gtools::combinations(100,2,seq(1,100,1),repeats=F)

#create a design matrix to loop through (removes the need for a nested for loop)
design.matrix<-tibble(left = uniquepairsmatrix[,1], right = uniquepairsmatrix[,2])

Compare each model against 99 others (4950 possible combinations for 100 models)

dflist<-list()
consistencylist<-list()

for(row.i in 1:nrow(design.matrix)) {

# Get factor values for current row
  left.i <- design.matrix$left[row.i]
  right.i <- design.matrix$right[row.i]

# Merge cluster classifications with salmon ID data
dflist[[row.i]]<-subdat %>% 
  dplyr::select(ID) %>% 
  mutate(l = dislist[[left.i]]$clustering) %>% 
  mutate(r = dislist[[right.i]]$clustering)

# Use table function to decide which group # on LHS = group # on RHS
t<-table(dflist[[row.i]]$l, dflist[[row.i]]$r)
row1<-max(t[1,])
row2<-max(t[2,])
row3<-max(t[3,])
row4<-max(t[4,])

mat<-matrix(data = table(dflist[[row.i]]$l, dflist[[row.i]]$r), nrow = 4, ncol = 4)

# Get matrix location (in row and col) of LHS=RHS
p1<-as.data.frame(which(mat == row1, arr.ind = T))
p2<-as.data.frame(which(mat == row2, arr.ind = T))
p3<-as.data.frame(which(mat == row3, arr.ind = T))
p4<-as.data.frame(which(mat == row4, arr.ind = T))

# If the two models DO NOT assign an individual to the same cluster, write "different assignment"
dflist[[row.i]]$cluster = "different assignment"

# If the two models DO assign an individual to the same cluster, write cluster letter
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p1$row & dflist[[row.i]]$r == p1$col] = "A"
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p2$row & dflist[[row.i]]$r == p2$col] = "B"
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p3$row & dflist[[row.i]]$r == p3$col] = "C"
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p4$row & dflist[[row.i]]$r == p4$col] = "D"
mat
check(dflist[[row.i]]$cluster)

# Calculate the percentage of individuals that both models assigned to the same cluster
dflist[[row.i]]$clusterconsistency = 1-(sum(dflist[[row.i]]$cluster == "different assignment")/length(dflist[[row.i]]$cluster))
# Add unique identifier
dflist[[row.i]]$row.i = row.i

# Create another list that contains only the "consistency of assignment" between the two tested models
consistencylist[[row.i]]<-tibble(row.i=row.i, l=left.i, r=right.i, 
                                 consistency = unique(dflist[[row.i]]$clusterconsistency))

}

Assess “consistency” of clustering

consistency_of_assignment<-do.call(rbind,consistencylist)

ggplot(consistency_of_assignment) + geom_histogram(aes(x=consistency)) + xlim(0,1) +
  labs(x = "Proportion of individuals that both models assigned to the same cluster")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

Are the individuals being clustered differently by the models random, or are the same few individuals being “misclustered?”

cluster_df<-do.call(rbind,dflist) 

cluster_df %>% 
  filter(cluster == "different assignment") %>% 
  count(ID) %>% 
  arrange(desc(n)) %>% 
  rowid_to_column(var = "IDnum") %>% 
  mutate(commonlymisclassified = ifelse(n>500,"y","n")) %>% 
  ggplot() + geom_bar(aes(x=IDnum, y = n, fill = commonlymisclassified),stat = "identity") + 
    labs(y="Number of times clustered differently by two models", x="Individual #", 
         fill="Commonly 'misclustered'")