Check if correlations between variables exist

# check correlations
library(caret)     # findCorrelation()
library(corrplot)  # corrplot()

corr.m <- round(cor(train[, -c(1, 13)]), 2)
highlyCorrelated <- findCorrelation(corr.m, cutoff = 0.5)  # indices of highly correlated variables
highlyCorrelated <- sort(highlyCorrelated)
colnames(corr.m)[highlyCorrelated]
## [1] "numDeadRelations"  "boolDeadRelations"

# correlation plot
corrplot(corr.m, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)

Only numDeadRelations and boolDeadRelations are flagged as highly correlated, and they essentially encode the same information, so there are no problematic correlations between the predictors. We still use PCA for visualisation and exploratory purposes.
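As a quick check (a minimal sketch reusing the corr.m matrix computed above), we can look up the correlation between the flagged pair directly:

# correlation between the only pair flagged by findCorrelation()
corr.m["numDeadRelations", "boolDeadRelations"]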

library(FactoMineR)  # PCA()
library(factoextra)  # fviz_* plotting helpers

dat.3 <- train[, -c(1, 13)]
res.pca <- PCA(dat.3, graph = FALSE, scale.unit = TRUE)

eigenvalues <- res.pca$eig
head(eigenvalues)
##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1  2.9045190              26.404718                          26.40472
## comp 2  1.8150569              16.500518                          42.90524
## comp 3  1.1715873              10.650793                          53.55603
## comp 4  1.0144185               9.221987                          62.77802
## comp 5  0.9380728               8.527935                          71.30595
## comp 6  0.7866610               7.151463                          78.45741
fviz_screeplot(res.pca, ncp = 10, main = "Scree plot",
               xlab = "Dimensions", ylab = "% of explained variance") +
  theme_gray() +
  theme(plot.title = element_text(size = 10), axis.title = element_text(size = 10))

Based on the scree plot, the optimal number of components is 3.
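To quantify what three components retain (a minimal sketch reading the result off the eigenvalue table above), we can extract the cumulative percentage directly:

# cumulative % of variance retained by the first three components (about 53.6%)
res.pca$eig[3, "cumulative percentage of variance"]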

fviz_pca_var(res.pca, col.var="contrib")

There are positive correlations between all variables. Popularity, the number of dead relations and the number of dead blood relations explain the majority of the variance in our data.
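To back this claim up with numbers, the per-variable contributions to the first two dimensions can be plotted with factoextra (a minimal sketch, assuming the package loaded above for the other fviz_* calls):

# bar plot of variable contributions to dimensions 1 and 2
fviz_contrib(res.pca, choice = "var", axes = 1:2)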

We can’t successfully visualise the individuals with PCA.

plot(res.pca, choix = "ind")
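One way to see why this plot is uninformative (a sketch, again using factoextra) is to colour each individual by its cos2, i.e. how well it is represented by the first two dimensions:

# individuals coloured by quality of representation (cos2) on dimensions 1-2
fviz_pca_ind(res.pca, col.ind = "cos2", geom = "point")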

We can use SOMs (Self-Organising Maps) for this.

data2 <- scale(dat.2[, -13])

k.max <- 15  # maximal number of clusters to try
set.seed(123)  # kmeans uses random starts
wss <- sapply(1:k.max, function(k){ kmeans(data2, k, nstart = 10)$tot.withinss })
plot(1:k.max, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K", ylab = "Total within-cluster sum of squares")

The recommended number of clusters for the data is 11.
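As a cross-check (a sketch assuming factoextra is still loaded), fviz_nbclust() produces the same elbow plot without the manual sapply() loop:

# elbow plot for k-means, equivalent to the manual wss computation above
fviz_nbclust(data2, kmeans, method = "wss", k.max = 15)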

# colours for the clusters and a blue-to-red palette for the SOM heatmap-style plots
pretty_palette <- c("#1f77b4", "#2ca02c", "#ff7f0e", "#d62728", "#9467bd", "#8c564b", "#e377c2")
coolBlueHotRed <- function(n, alpha = 1) {
  rainbow(n, end = 4/6, alpha = alpha)[n:1]
}


library(kohonen)  # som(), somgrid(), getCodes()

som.dat <- data2  # data2 was already scaled above, no need to scale it again
set.seed(123)
som_grid <- somgrid(xdim = 6, ydim = 7, topo = "hexagonal")
som_model <- som(som.dat, grid = som_grid, rlen = 200, alpha = c(0.05, 0.01), keep.data = TRUE)
plot(som_model, type = "changes", main = "Training process")

# hierarchical clustering of the SOM codebook vectors into 6 clusters
h.clust <- hclust(dist(getCodes(som_model)), method = "complete")
som_cluster <- cutree(h.clust, k = 6)

From the training-progress plot we can see that the number of iterations is adequate.

SOM Visualisations

plot(som_model, type="count", palette.name= coolBlueHotRed, main="Counts Plot")
add.cluster.boundaries(som_model, som_cluster, lwd = 3)

plot(som_model, type="dist.neighbours", palette.name= coolBlueHotRed, main="distance neighbours")
add.cluster.boundaries(som_model, som_cluster, lwd = 3)

plot(som_model, type = "property", property = getCodes(som_model)[,2], main=colnames(getCodes(som_model))[2], palette.name=coolBlueHotRed)
add.cluster.boundaries(som_model, som_cluster, lwd = 4)
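Finally, a mapping plot coloured by the hierarchical clusters ties the pieces together; this is a sketch that reuses the pretty_palette defined earlier, and obs_cluster is a hypothetical name for the per-observation cluster assignment derived from each observation's winning unit:

# map units coloured by their hierarchical cluster, with the same boundaries
plot(som_model, type = "mapping", bgcol = pretty_palette[som_cluster], main = "Clusters")
add.cluster.boundaries(som_model, som_cluster, lwd = 3)

# cluster membership per observation, via the winning unit of each observation
obs_cluster <- som_cluster[som_model$unit.classif]  # hypothetical helper variable
table(obs_cluster)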