Load packages.

library(factoextra)
library(cluster)
library(tidyverse)

Load data.

# Read the toolmark measurements from CSV into a data frame. The plots below
# use its columns tool, side, angle, mark, x and aligned.
# NOTE(review): absolute, machine-specific path -- confirm it exists locally.
toolmarks_dat <- read.csv(
  file = "/Users/mariacuellar/Desktop/NYU interview/Teaching materials/large_df.csv"
)

Visualize data

We will draw a single toolmark, the marks made by the two sides of the same screwdriver, eight replicate marks from one tool-side, and marks from different tools.

# Plot one toolmark signal: tool 1, side A, angle 80, mark 1.
toolmarks_dat %>%
  filter(
    tool == "1",
    side == "A",
    angle == "80",
    mark == "1"
  ) %>%
  ggplot(mapping = aes(x = x, y = aligned, color = mark)) +
  geom_line() +
  # Fixed y-range; values outside this range are not drawn.
  ylim(-0.002, 0.002) +
  labs(
    title = "Single toolmark",
    x = "Signal length (mm)",
    y = "Signal depth (mm)"
  ) +
  # Only one mark is shown, so the colour legend is suppressed.
  guides(color = "none") +
  theme_minimal()

# Compare mark 1 from sides A and B of tool 1 (angle 80), one panel per side.
toolmarks_dat %>%
  filter(
    tool == "1",
    side %in% c("A", "B"),
    angle == "80",
    mark == "1"
  ) %>%
  ggplot(mapping = aes(x = x, y = aligned, color = side)) +
  geom_line() +
  facet_wrap(vars(side), ncol = 1) +
  scale_color_brewer(palette = "Set1") +
  ylim(-0.004, 0.004) +
  labs(
    title = "Marks from different sides of screwdriver",
    x = "Signal length (mm)",
    y = "Signal depth (mm)",
    color = "Side"
  ) +
  # The facet strips already identify each side, so the legend is hidden.
  guides(color = "none") +
  theme_minimal()

# Overlay every replicate mark made with tool 1, side A, at angle 80.
toolmarks_dat %>%
  filter(
    tool == "1",
    side == "A",
    angle == "80"
  ) %>%
  # mark is converted to a factor so each replicate gets a discrete colour.
  ggplot(mapping = aes(x = x, y = aligned, colour = as.factor(mark))) +
  geom_line() +
  scale_color_brewer(palette = "Blues") +
  ylim(-0.002, 0.002) +
  labs(
    title = "Replicate marks made with same tool-side",
    x = "Signal length (mm)",
    y = "Signal depth (mm)",
    color = "Mark"
  ) +
  theme_minimal()

# Compare mark 1 (side A, angle 80) from tools 1 and 2, one panel per tool.
toolmarks_dat %>%
  filter(
    tool %in% c(1, 2),
    side == "A",
    angle == "80",
    mark == "1"
  ) %>%
  ggplot(mapping = aes(x = x, y = aligned, color = as.factor(tool))) +
  geom_line() +
  facet_wrap(vars(tool), ncol = 1) +
  scale_color_brewer(palette = "Set1") +
  ylim(-0.004, 0.004) +
  labs(
    title = "Two marks made by different tools",
    x = "Signal length (mm)",
    y = "Signal depth (mm)",
    color = "Tool"
  ) +
  # The facet strips already identify each tool, so the legend is hidden.
  guides(color = "none") +
  theme_minimal()

# Show all replicate marks (side A, angle 80) for tools 1 and 2, one panel per
# tool, coloured by replicate.
toolmarks_dat %>%
  filter(
    # %in% tests set membership. `tool == c(1, 2)` would instead recycle the
    # vector down the rows (row 1 vs 1, row 2 vs 2, row 3 vs 1, ...) and
    # silently drop roughly half of the rows belonging to each tool.
    tool %in% c(1, 2),
    side == "A",
    angle == "80"
  ) %>%
  # mark is converted to a factor so each replicate gets a discrete colour.
  ggplot(aes(x = x, y = aligned, colour = as.factor(mark))) +
  geom_line() +
  labs(
    x = "Signal length (mm)",
    y = "Signal depth (mm)",
    title = "Marks from different tools, with their corresponding replicates",
    color = "Mark"
  ) +
  facet_wrap(vars(tool), ncol = 1) +
  scale_color_brewer(palette = "Blues") +
  theme_minimal() +
  ylim(-0.004, 0.004)

Clustering

Load the similarity scores, which will be used for clustering.

# Load the precomputed similarity scores that the clustering steps operate on.
# NOTE(review): absolute, machine-specific path -- confirm it exists locally.
sim_score <- readRDS(
  file = "/Users/mariacuellar/Desktop/NYU interview/Teaching materials/sim_score.rds"
)

Find the optimal number of clusters.

# Estimate the optimal number of clusters for PAM.
# The original call relied on the default criterion; "silhouette" is that
# default, spelled out here for readability.
fviz_nbclust(x = sim_score, FUNcluster = pam, method = "silhouette")

# Alternative using k-means, kept for reference:
# fviz_nbclust(sim_score, kmeans) # oops, gives 8 when it should be 6

Draw a cluster plot using kmeans.

# Plot the k-means partition of sim_score into 6 clusters.
# kmeans() starts from randomly chosen centers, so without a seed the figure
# changes from run to run. The seed is fixed for reproducibility, and nstart
# tries several random starts and keeps the best one, so the result does not
# hinge on a single unlucky initialization.
set.seed(123)
fviz_cluster(
  kmeans(sim_score, centers = 6, nstart = 25),
  data = sim_score,
  # ellipse.type = "norm",
  ggtheme = theme_minimal()
)

Draw a cluster plot using pam.

# Partition sim_score into 6 clusters with PAM (Euclidean metric, no
# standardization of the input), then draw the resulting cluster plot.
pam.res <- pam(
  x = sim_score,
  k = 6,
  metric = "euclidean",
  stand = FALSE
)
fviz_cluster(
  object = pam.res,
  data = sim_score,
  ggtheme = theme_minimal()
)