Read in salmon data
salmon<-read_csv(paste(shared.path,"/Mills Lab/Projects/Atlantic salmon/Penobscot River Growth/Data/all.csv", sep = ""))
# select origin=H and seaage=2SW then remove years 2010-2011
subdat<-salmon%>%filter(origin=="H"&seaage=="2SW")
subdat<-subdat%>%filter(year%in%c(1978,1979,1980,1986,1987,1988,1998,1999,2000,2012,2013,2014))
Pre-processing to create square data
# select circuli columns
circdat<-as.data.frame(subdat[,55:187])
# remove all smolt circuli
len<-c()
for(i in 1:nrow(circdat)){
vct<-circdat[i,-c(1:subdat$smolt.circ[i]-1)]
vct<-vct[!is.na(vct)]
len<-c(len,length(vct))
}
# add extra NA columns for each so that all are max 79
df<-data.frame(matrix(NaN,nrow=nrow(circdat),ncol=max(len)))
for(i in 1:nrow(circdat)){
xtra<-max(len)-len[i]
vct<-circdat[i,-c(1:subdat$smolt.circ[i]-1)]
vct<-vct[!is.na(vct)]
df[i,]<-c(vct,rep(NA,xtra))
}
# min length is 40, max length is 79, stretch/squish all to 60
circdf<-data.frame(matrix(NaN,nrow=nrow(df),ncol=60))
for(i in 1:nrow(df)){
vct<-df[i,]
vct<-vct[!is.na(vct)]
vct<-diff(vct)
newvct<-approx(x=1:length(vct),y=vct,method="constant",n=60)$y
circdf[i,]<-newvct
}
# unscaled data
salm.sc<-as.matrix(circdf)
x<-c()
for(i in 1:nrow(df)){
x<-c(x,sum(!is.na(df[i,])))
}
hist(x)

Use dtwclust to create distance matrix
#dtwclust
res<-tsclust(series=salm.sc,k=4,trace=TRUE,args = tsclust_args(dist = list(window.size = 6)))
##
## Precomputing distance matrix...
##
## Iteration 1: Changes / Distsum = 615 / 298.7499
## Iteration 2: Changes / Distsum = 328 / 266.9173
## Iteration 3: Changes / Distsum = 0 / 266.9173
##
## Elapsed time is 0.73 seconds.
#res@clusinfo
Insert distance matrix into SOM function, 100 model runs
#sombrero
dislist<-list()
#set for loop to number of models you want to test iteratively
for(i in 1:100){
dis.som<-trainSOM(x.data=res@distmat,type="relational",dimension=c(2,2),nb.save=5)
dislist[[i]]<-dis.som
}
#create a matrix with all the possible unique combinations of models (excluding repeats i.e. order doesn't matter)
uniquepairsmatrix<-gtools::combinations(100,2,seq(1,100,1),repeats=F)
#create a design matrix to loop through (removes the need for a nested for loop)
design.matrix<-tibble(left = uniquepairsmatrix[,1], right = uniquepairsmatrix[,2])
Compare each model against 99 others (4950 possible combinations for 100 models)
dflist<-list()
consistencylist<-list()
for(row.i in 1:nrow(design.matrix)) {
# Get factor values for current row
left.i <- design.matrix$left[row.i]
right.i <- design.matrix$right[row.i]
# Merge cluster classifications with salmon ID data
dflist[[row.i]]<-subdat %>%
dplyr::select(ID) %>%
mutate(l = dislist[[left.i]]$clustering) %>%
mutate(r = dislist[[right.i]]$clustering)
# Use table function to decide which group # on LHS = group # on RHS
t<-table(dflist[[row.i]]$l, dflist[[row.i]]$r)
row1<-max(t[1,])
row2<-max(t[2,])
row3<-max(t[3,])
row4<-max(t[4,])
mat<-matrix(data = table(dflist[[row.i]]$l, dflist[[row.i]]$r), nrow = 4, ncol = 4)
# Get matrix location (in row and col) of LHS=RHS
p1<-as.data.frame(which(mat == row1, arr.ind = T))
p2<-as.data.frame(which(mat == row2, arr.ind = T))
p3<-as.data.frame(which(mat == row3, arr.ind = T))
p4<-as.data.frame(which(mat == row4, arr.ind = T))
# If the two models DO NOT assign an individual to the same cluster, write "different assignment"
dflist[[row.i]]$cluster = "different assignment"
# If the two models DO assign an individual to the same cluster, write cluster letter
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p1$row & dflist[[row.i]]$r == p1$col] = "A"
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p2$row & dflist[[row.i]]$r == p2$col] = "B"
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p3$row & dflist[[row.i]]$r == p3$col] = "C"
dflist[[row.i]]$cluster[dflist[[row.i]]$l == p4$row & dflist[[row.i]]$r == p4$col] = "D"
mat
check(dflist[[row.i]]$cluster)
# Calculate the percentage of individuals that both models assigned to the same cluster
dflist[[row.i]]$clusterconsistency = 1-(sum(dflist[[row.i]]$cluster == "different assignment")/length(dflist[[row.i]]$cluster))
# Add unique identifier
dflist[[row.i]]$row.i = row.i
# Create another list that contains only the "consistency of assignment" between the two tested models
consistencylist[[row.i]]<-tibble(row.i=row.i, l=left.i, r=right.i,
consistency = unique(dflist[[row.i]]$clusterconsistency))
}
Assess “consistency” of clustering
consistency_of_assignment<-do.call(rbind,consistencylist)
ggplot(consistency_of_assignment) + geom_histogram(aes(x=consistency)) + xlim(0,1) +
labs(x = "Proportion of individuals that both models assigned to the same cluster")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

Are the individuals being clustered differently by the models random, or are the same few individuals being “misclustered?”
cluster_df<-do.call(rbind,dflist)
cluster_df %>%
filter(cluster == "different assignment") %>%
count(ID) %>%
arrange(desc(n)) %>%
rowid_to_column(var = "IDnum") %>%
mutate(commonlymisclassified = ifelse(n>500,"y","n")) %>%
ggplot() + geom_bar(aes(x=IDnum, y = n, fill = commonlymisclassified),stat = "identity") +
labs(y="Number of times clustered differently by two models", x="Individual #",
fill="Commonly 'misclustered'")
