Check whether correlations exist between the variables
#check correlations
corr.m <- round(cor(train[,-c(1,13)]), 2)                  # correlation matrix of the numeric predictors
highlyCorrelated <- findCorrelation(corr.m, cutoff = 0.5)  # flag highly correlated columns
highlyCorrelated <- sort(highlyCorrelated)
colnames(corr.m)[highlyCorrelated]                         # names of the flagged variables
## [1] "numDeadRelations" "boolDeadRelations"
#correlation plot
corrplot(corr.m, type="upper", order="hclust", tl.col="black", tl.srt=45)
Apart from the flagged pair (numDeadRelations and boolDeadRelations), no strong correlations exist between the variables; we still use PCA for visualisation and exploratory purposes.
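We keep all variables here, but if one did want to drop the flagged pair, a minimal sketch would look like the following (the name predictors.reduced is ours, and it assumes findCorrelation flagged at least one column, as it did above):
#optional: drop the columns flagged by findCorrelation
predictors <- train[,-c(1,13)]                 # same columns as corr.m
predictors.reduced <- predictors[, -highlyCorrelated]
dim(predictors.reduced)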
dat.3 <- train[,-c(1,13)]                                # numeric predictors only (drop name and outcome)
res.pca <- PCA(dat.3, graph = FALSE, scale.unit = TRUE)  # PCA on standardised predictors
eigenvalues <- res.pca$eig
head(eigenvalues)
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.9045190 26.404718 26.40472
## comp 2 1.8150569 16.500518 42.90524
## comp 3 1.1715873 10.650793 53.55603
## comp 4 1.0144185 9.221987 62.77802
## comp 5 0.9380728 8.527935 71.30595
## comp 6 0.7866610 7.151463 78.45741
fviz_screeplot(res.pca, ncp = 10, main = "Scree plot",
               xlab = "Dimensions", ylab = "% of explained variance") +
  theme_gray() +
  theme(plot.title = element_text(size = 10), axis.title = element_text(size = 10))
Based on the scree plot, the optimal number of components is 3.
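As a quick cross-check, we can count the components with eigenvalue above 1 (the Kaiser criterion); from the table above, component 4 is borderline at roughly 1.01, so this rule would keep one more component than the scree plot suggests.
#sanity check: how many components have eigenvalue > 1?
eig <- get_eigenvalue(res.pca)
sum(eig$eigenvalue > 1)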
fviz_pca_var(res.pca, col.var="contrib")
All variables are positively correlated. Popularity, numDeadRelations and boolDeadRelations explain the majority of the variance in our data.
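To back up that reading, we can also plot each variable's contribution to the first two components; this is a supplementary view, not part of the original set of plots.
#variable contributions to the first two principal components
fviz_contrib(res.pca, choice = "var", axes = 1:2)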
We can't successfully visualise individuals with PCA.
plot(res.pca, choix = "ind")
We can use self-organising maps (SOMs) for this instead.
data2 <- scale(dat.2[,-13])  # scale the numeric predictors
k.max <- 15                  # maximum number of clusters to try
wss <- sapply(1:k.max, function(k){ kmeans(data2, k, nstart = 10)$tot.withinss })
plot(1:k.max, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K", ylab = "Total within-clusters sum of squares")
The elbow plot suggests around 11 clusters for the data.
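The same elbow check can be reproduced with factoextra's fviz_nbclust, which wraps the loop above; shown here only as an alternative, not as part of the original pipeline.
#alternative elbow check (total within-cluster sum of squares)
fviz_nbclust(data2, kmeans, method = "wss", k.max = 15)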
pretty_palette <- c("#1f77b4", '#2ca02c', '#ff7f0e', '#d62728', '#9467bd', '#8c564b', '#e377c2')
coolBlueHotRed <- function(n, alpha = 1) {
rainbow(n, end=4/6, alpha=alpha)[n:1]
}
som.dat <- data2  # data2 is already scaled above
set.seed(123)
som_grid <- somgrid(xdim = 6, ydim=7, topo="hexagonal")
som_model <- som(som.dat,grid=som_grid, rlen=200, alpha=c(0.05,0.01), keep.data = TRUE)
plot(som_model, type="changes", main = "training process")
#cluster the SOM codebook vectors hierarchically and cut into 6 clusters
h.clust <- hclust(dist(as.data.frame(som_model$codes)), method = "complete")
som_cluster <- cutree(h.clust, k = 6)
The training progress plot shows that the number of iterations is adequate.
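It is also useful to see each node's cluster assignment directly on the map; the following sketch reuses the pretty_palette defined above (one colour per cluster) and is a supplementary view rather than one of the original plots.
#colour the SOM grid by hierarchical cluster membership
plot(som_model, type = "mapping", bgcol = pretty_palette[som_cluster], main = "Clusters")
add.cluster.boundaries(som_model, som_cluster, lwd = 3)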
plot(som_model, type="count", palette.name= coolBlueHotRed, main="Counts Plot")
add.cluster.boundaries(som_model, som_cluster, lwd = 3)
plot(som_model, type="dist.neighbours", palette.name= coolBlueHotRed, main="distance neighbours")
add.cluster.boundaries(som_model, som_cluster, lwd = 3)
plot(som_model, type = "property", property = getCodes(som_model)[,2], main=colnames(getCodes(som_model))[2], palette.name=coolBlueHotRed)
add.cluster.boundaries(som_model, som_cluster, lwd = 4)
plot(som_model, type = "property", property = getCodes(som_model)[,9], main=colnames(getCodes(som_model))[9], palette.name=coolBlueHotRed)
add.cluster.boundaries(som_model, som_cluster, lwd = 5)
plot(som_model, type = "property", property = getCodes(som_model)[,10], main=colnames(getCodes(som_model))[10], palette.name=coolBlueHotRed)
add.cluster.boundaries(som_model, som_cluster, lwd = 5)
plot(som_model, type = "property", property = getCodes(som_model)[,11], main=colnames(getCodes(som_model))[11], palette.name=coolBlueHotRed)
add.cluster.boundaries(som_model, som_cluster, lwd = 5)
Characters with the most dead relations fall into the most popular group.
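To relate the clusters back to individual characters, each observation can be mapped to the cluster of its winning node; a minimal sketch, assuming the rows of dat.2 (and therefore of the SOM training data) line up with train and that train has a name column, as the predictions further below suggest:
#cluster membership per character: unit.classif gives each row's winning node
char_cluster <- som_cluster[som_model$unit.classif]
head(data.frame(name = train$name, cluster = char_cluster))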
Next, we fit a random forest to predict whether a character is alive.
controlcv <- trainControl(method = "repeatedcv", number = 10, repeats = 3, allowParallel = TRUE)  # 10-fold CV, repeated 3 times
set.seed(123)
mod <- train(isAlive ~ ., data = train[,-1], method = "rf", importance = TRUE, trControl = controlcv, ntree = 50)
plot(mod, main = "Error rate of random forest")
mod
## Random Forest
##
## 1557 samples
## 11 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 1401, 1402, 1401, 1401, 1402, 1401, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8009162 0.3368146
## 6 0.7944976 0.4147352
## 11 0.7833521 0.3938079
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
varImpPlot(mod$finalModel, pch = 20, main = "Importance of Variables")
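The same importances can also be read off numerically via caret's varImp wrapper; this is just an inspection step, not a new modelling choice.
#numeric variable importance from the caret model
varImp(mod)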
The trained model achieves roughly 80% cross-validated accuracy.
Let's now test our model on the held-out test data.
pred.rf <- predict(mod, test)
confu <- confusionMatrix(data = pred.rf, reference = test$isAlive)
confu
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 23 8
## 1 76 282
##
## Accuracy : 0.7841
## 95% CI : (0.7398, 0.8239)
## No Information Rate : 0.7455
## P-Value [Acc > NIR] : 0.0439
##
## Kappa : 0.2646
## Mcnemar's Test P-Value : 2.665e-13
##
## Sensitivity : 0.23232
## Specificity : 0.97241
## Pos Pred Value : 0.74194
## Neg Pred Value : 0.78771
## Prevalence : 0.25450
## Detection Rate : 0.05913
## Detection Prevalence : 0.07969
## Balanced Accuracy : 0.60237
##
## 'Positive' Class : 0
##
Our model predicts whether a character survives with approximately 78% accuracy on the test set, although sensitivity for the 'dead' class (the positive class, 0) is only about 23%, so most of that accuracy comes from the dominant 'alive' class.
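Given that imbalance, accuracy alone is a thin summary. A minimal sketch of an ROC/AUC check with the pROC package (pROC is an assumption here; it is not loaded anywhere in the original analysis):
library(pROC)                                   # assumption: pROC is installed
probs <- predict(mod, test, type = "prob")      # class probabilities from the caret model
roc.rf <- roc(response = test$isAlive, predictor = probs[, "1"])
auc(roc.rf)
plot(roc.rf, main = "ROC curve - random forest")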
Who is going to live or die, according to the model?
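The per-character results below come from looking each name up against the model's fitted values (training characters) or the test-set predictions; a minimal sketch of such a lookup, where the helper predict_fate is our own hypothetical name and a name column is assumed in both train and test:
#hypothetical helper: look up a character's predicted outcome (isAlive coding, 1 = alive)
predict_fate <- function(char) {
  if (char %in% train$name) {
    #OOB fitted values for characters in the training set
    data.frame(name = char, predicted = mod$finalModel$predicted[train$name == char])
  } else {
    data.frame(name = char, predicted = pred.rf[test$name == char])
  }
}
predict_fate("Jon Snow")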
##        train$name           mod$finalModel$predicted
## X1467  Arya Stark           1
## X1793  Sansa Stark          0
## X1475  Bran Stark           1
## X1742  Jaime Lannister      1
## X274   Samwell Tarly        1
## X1482  Cersei Lannister     1
##        test$name            pred.rf
## 361    Jon Snow             1
## 350    Daenerys Targaryen   0
## 378    Tyrion Lannister     1