Check if correlations between variables exist

# Check correlations between the predictors.
# Columns 1 and 13 of `train` are dropped before the correlation matrix is
# computed; the coefficients are rounded to two decimals.
corr.m <- round(cor(train[, -c(1, 13)]), 2)

# findCorrelation() returns column indices of `corr.m` for variables whose
# pairwise correlation exceeds the cutoff.
highlyCorrelated <- sort(findCorrelation(corr.m, cutoff = 0.5))

# Resolve the indices against `corr.m` itself: they are positions in the
# reduced matrix, so looking them up in any other data frame can report the
# wrong variable names when its column layout differs.
colnames(corr.m)[highlyCorrelated]
## [1] "numDeadRelations"  "boolDeadRelations"

# Correlation plot (upper triangle, variables ordered by hierarchical
# clustering).
corrplot(
  corr.m,
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)

No high correlations exist between the variables; we still use PCA for visualisation and exploratory purposes.

# PCA on the predictors only (columns 1 and 13 of `train` removed).
# Variables are scaled to unit variance; `graph = FALSE` is passed so the
# plots are produced explicitly further down.
dat.3 <- train[, -c(1, 13)]
res.pca <- PCA(dat.3, scale.unit = TRUE, graph = FALSE)

# Eigenvalue table of the PCA result; only the first rows are printed.
eigenvalues <- res.pca$eig
head(eigenvalues)
##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1  2.9045190              26.404718                          26.40472
## comp 2  1.8150569              16.500518                          42.90524
## comp 3  1.1715873              10.650793                          53.55603
## comp 4  1.0144185               9.221987                          62.77802
## comp 5  0.9380728               8.527935                          71.30595
## comp 6  0.7866610               7.151463                          78.45741
# Scree plot for the first 10 components, with smaller title and axis fonts.
fviz_screeplot(
  res.pca,
  ncp = 10,
  main = "Scree plot",
  x = "Dimensions",
  y = "% of explained variance"
) +
  theme_gray() +
  theme(
    plot.title = element_text(size = 10),
    axis.title = element_text(size = 10)
  )

Optimal number of components is 3.

# Variable plot of the PCA; `col.var = "contrib"` asks for the variables to be
# coloured by their contribution.
fviz_pca_var(
  res.pca,
  col.var = "contrib"
)

There are positive correlations between all variables. Popularity, the number of dead relations and the number of dead blood relations explain the majority of the variance in our data.

We can’t successfully visualise individuals with PCA.

# Plot the PCA result for the individuals (`choix = "ind"`).
plot(
  res.pca,
  choix = "ind"
)

We can use a SOM (Self-Organising Map) for this.

# Standardise the clustering inputs (column 13 of `dat.2` is dropped).
data2 <- scale(dat.2[, -13])

# Elbow method: total within-cluster sum of squares for k = 1..k.max.
k.max <- 15 # Maximal number of clusters

# kmeans() draws random starting centres, so fix the seed; otherwise the
# curve (and the elbow read from it) changes on every run.
set.seed(123)
wss <- vapply(
  seq_len(k.max),
  function(k) kmeans(data2, centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)

plot(
  seq_len(k.max), wss,
  type = "b", pch = 19, frame.plot = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)

The recommended number of clusters for the data is 11.

# Fixed palette of seven hex colours.
pretty_palette <- c(
  "#1f77b4", "#2ca02c", "#ff7f0e", "#d62728", "#9467bd", "#8c564b", "#e377c2"
)

#' Blue-to-red colour ramp
#'
#' Builds `n` rainbow colours over the hue range 0 to 4/6 and reverses them,
#' so the first colour is the blue end and the last colour is the red end.
#'
#' @param n Number of colours to generate.
#' @param alpha Opacity passed on to `rainbow()`.
#' @return A character vector of `n` colours; `character(0)` when `n` is 0.
coolBlueHotRed <- function(n, alpha = 1) {
  # rev() instead of `[n:1]`: for n = 0 the index `0:1` would return NA.
  rev(rainbow(n, end = 4 / 6, alpha = alpha))
}


# Input matrix for the SOM.
# NOTE(review): `data2` is already the result of scale() earlier in this
# file, so this second scale() looks redundant — confirm before removing.
som.dat <- scale(data2)

# Fix the seed so the SOM training is reproducible.
set.seed(123)
som_grid <- somgrid(xdim = 6, ydim = 7, topo = "hexagonal")
som_model <- som(
  som.dat,
  grid = som_grid,
  rlen = 200,
  alpha = c(0.05, 0.01),
  keep.data = TRUE
)

# Training progress plot, used to judge whether `rlen` iterations are enough.
plot(som_model, type = "changes", main = "training process")

# Hierarchical clustering (complete linkage) of the SOM codebook vectors.
# getCodes() is used instead of reaching into `som_model$codes`, matching the
# accessor used by the property plots later in this file.
h.clust <- hclust(dist(getCodes(som_model)), method = "complete")

# NOTE(review): the elbow-plot text above recommends 11 clusters, but the
# tree is cut into 6 here — confirm which value is intended.
som_cluster <- cutree(h.clust, k = 6)

The training-process plot shows that we have an adequate number of iterations.

SOM Visualisations

# Counts plot with the cluster boundaries drawn on top.
plot(
  som_model,
  type = "count",
  palette.name = coolBlueHotRed,
  main = "Counts Plot"
)
add.cluster.boundaries(som_model, som_cluster, lwd = 3)

# Neighbour-distance plot with the cluster boundaries drawn on top.
plot(
  som_model,
  type = "dist.neighbours",
  palette.name = coolBlueHotRed,
  main = "distance neighbours"
)
add.cluster.boundaries(som_model, som_cluster, lwd = 3)

# Property plots for codebook columns 2, 9, 10 and 11, each titled with the
# column name. The boundary line width differs per plot, as in the original
# one-call-per-plot version (4 for column 2, 5 for the others).
som_codes <- getCodes(som_model)
property_cols <- c(2, 9, 10, 11)
boundary_lwd <- c(4, 5, 5, 5)

for (i in seq_along(property_cols)) {
  col_idx <- property_cols[i]
  plot(
    som_model,
    type = "property",
    property = som_codes[, col_idx],
    main = colnames(som_codes)[col_idx],
    palette.name = coolBlueHotRed
  )
  add.cluster.boundaries(som_model, som_cluster, lwd = boundary_lwd[i])
}

Characters with the most dead relations fall into the most popular group.

How Well can we Predict Deaths

Using random forest

# Resampling scheme: 10-fold cross-validation repeated 3 times.
controlcv <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  allowParallel = TRUE
)

# Fit a random forest for `isAlive` on all remaining columns (column 1 of
# `train` is dropped). In `train(...)` R resolves the call to the function,
# while `data = train[, -1]` refers to the data frame of the same name.
set.seed(123)
mod <- train(
  isAlive ~ .,
  data = train[, -1],
  method = "rf",
  trControl = controlcv,
  importance = TRUE,
  ntree = 50
)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Resampling profile across the tuning values.
plot(mod, main = "Error rate of random forest")

# Print the resampling summary.
mod
## Random Forest 
## 
## 1557 samples
##   11 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 1401, 1402, 1401, 1401, 1402, 1401, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8009162  0.3368146
##    6    0.7944976  0.4147352
##   11    0.7833521  0.3938079
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.
# Variable importance of the final fitted forest.
varImpPlot(
  mod$finalModel,
  pch = 20,
  main = "Importance of Variables"
)

The training model has a cross-validated accuracy of about 80%.

Let’s not evaluate our model on the training data; instead, we test it on the held-out test set.

# Predict the class of every row in `test` with the fitted model.
pred.rf <- predict(mod, newdata = test)

# Compare the predictions with the observed `isAlive` values and print the
# confusion matrix with its summary statistics.
confu <- confusionMatrix(
  data = pred.rf,
  reference = test$isAlive
)
confu
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  23   8
##          1  76 282
##                                           
##                Accuracy : 0.7841          
##                  95% CI : (0.7398, 0.8239)
##     No Information Rate : 0.7455          
##     P-Value [Acc > NIR] : 0.0439          
##                                           
##                   Kappa : 0.2646          
##  Mcnemar's Test P-Value : 2.665e-13       
##                                           
##             Sensitivity : 0.23232         
##             Specificity : 0.97241         
##          Pos Pred Value : 0.74194         
##          Neg Pred Value : 0.78771         
##              Prevalence : 0.25450         
##          Detection Rate : 0.05913         
##    Detection Prevalence : 0.07969         
##       Balanced Accuracy : 0.60237         
##                                           
##        'Positive' Class : 0               
## 

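Our model can predict whether a character will die with approximately 78% accuracy on the test set (close to the 80% seen in cross-validation). Note, however, that the sensitivity for the "dead" class (0) is only about 23%, so most deaths are missed.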

Who is going to live or die based on the model

##       train$name mod$finalModel$predicted
## X1467 Arya Stark                        1
##        train$name mod$finalModel$predicted
## X1793 Sansa Stark                        0
##       train$name mod$finalModel$predicted
## X1475 Bran Stark                        1
##            train$name mod$finalModel$predicted
## X1742 Jaime Lannister                        1
##         train$name mod$finalModel$predicted
## X274 Samwell Tarly                        1
##             train$name mod$finalModel$predicted
## X1482 Cersei Lannister                        1
##              test$name pred.rf
## 361           Jon Snow       1
## 350 Daenerys Targaryen       0
## 378   Tyrion Lannister       1