# Select the feature columns
# Keep only the six base-stat columns as clustering features.
# They are picked by name (the same columns as positions 6:11) so the
# selection does not silently change if the column order of `Pokemon` does.
Pokemon.data <- Pokemon[c("HP", "Attack", "Defense",
                          "Sp. Atk", "Sp. Def", "Speed")]
# Find the best k
# Candidate cluster counts; defined once so the model fits and the plot
# always cover the same range.
k.values <- 1:15
# Total within-cluster sum of squares for every candidate k.
# vapply() returns a correctly sized numeric vector, instead of growing
# `wss` one element at a time inside a loop.
wss <- vapply(k.values, function(k.try) {
  fit <- kmeans(Pokemon.data, centers = k.try, nstart = 20, iter.max = 50)
  fit$tot.withinss
}, numeric(1))
# Produce a scree plot
plot(k.values, wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
# Number of clusters used for the final model
k <- 3
# Fit k-means with k centers, 20 random starts and at most 50 iterations
km.out <- kmeans(x = Pokemon.data, centers = k, iter.max = 50, nstart = 20)
# Print the fitted model (sizes, centers, assignments, sums of squares)
print(km.out)
## K-means clustering with 3 clusters of sizes 175, 270, 355
##
## Cluster means:
## HP Attack Defense Sp. Atk Sp. Def Speed
## 1 79.30857 97.29714 108.93143 66.71429 87.04571 57.29143
## 2 81.90370 96.15926 77.65556 104.12222 86.87778 94.71111
## 3 54.68732 56.93239 53.64507 52.02254 53.04789 53.58873
##
## Clustering vector:
## [1] 3 3 2 2 3 3 2 2 2 3 3 1 2 3 3 3 3 3 3 2 3 3 2 2 3 3 3 2 3 2 3 2 3 1 3
## [36] 3 1 3 3 2 3 2 3 2 3 3 3 2 3 3 2 3 1 3 2 3 3 3 2 3 2 3 2 3 2 3 3 1 3 2
## [71] 2 2 3 1 1 3 3 2 3 2 3 1 1 3 2 3 1 1 3 2 3 3 2 3 1 3 1 3 1 3 2 2 2 1 3
## [106] 1 3 1 3 2 3 2 3 1 1 1 3 3 1 3 1 3 1 1 1 3 2 3 1 3 2 2 2 2 2 2 1 1 1 3
## [141] 1 1 1 3 3 2 2 2 3 3 1 3 1 2 2 1 2 2 2 3 3 2 2 2 2 2 3 3 1 3 3 2 3 3 1
## [176] 3 3 3 2 3 3 3 3 2 3 2 3 3 3 3 3 3 2 3 3 2 2 1 3 3 1 2 3 3 2 3 3 3 3 3
## [211] 1 2 1 3 1 2 3 3 2 3 1 3 1 1 1 3 1 3 1 1 1 1 1 3 3 1 3 1 3 1 3 3 2 3 2
## [246] 1 3 2 2 2 3 1 2 2 3 3 1 3 3 3 1 2 2 2 1 3 3 1 1 2 2 2 3 3 2 2 3 3 2 2
## [281] 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 2 3 3 3 1 3 3 2 2 3 3 3 1 3 3 2 3
## [316] 2 3 3 3 2 3 1 3 1 3 3 3 1 3 1 3 1 1 1 3 3 2 3 2 2 3 3 3 3 3 3 1 3 2 2
## [351] 3 2 3 2 1 1 3 2 3 3 3 2 3 2 3 1 2 2 2 2 1 3 1 3 1 3 1 3 1 3 1 3 2 3 1
## [386] 3 2 2 3 1 1 3 2 2 3 3 2 2 3 3 2 3 1 1 1 3 3 1 2 2 3 1 1 2 1 1 1 2 2 2
## [421] 2 2 2 1 2 2 2 2 2 2 1 2 3 1 1 3 3 2 3 3 2 3 3 2 3 3 3 3 3 3 2 3 2 3 1
## [456] 3 1 3 1 1 1 2 3 1 3 3 2 3 2 3 1 2 3 2 3 2 2 2 2 3 2 3 3 2 3 1 3 3 3 3
## [491] 1 3 3 2 2 3 3 2 2 3 1 3 1 3 2 1 3 2 3 3 2 1 2 2 1 1 1 2 2 2 2 1 2 1 2
## [526] 2 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 3 3 2 3 3 2
## [561] 3 3 2 3 3 3 3 1 3 2 3 2 3 2 3 2 3 1 3 3 2 3 2 3 1 1 3 2 3 2 1 1 3 1 1
## [596] 3 3 2 1 1 3 3 2 3 3 2 3 2 3 2 2 3 3 2 3 1 2 2 3 1 3 1 2 3 1 3 1 3 2 3
## [631] 1 3 2 3 2 3 3 1 3 3 2 3 2 3 3 2 3 2 2 3 1 3 1 3 2 1 3 2 3 1 3 1 1 3 3
## [666] 2 3 2 3 3 2 3 1 1 3 1 2 3 2 1 3 2 1 3 1 3 1 1 3 1 3 1 2 1 3 3 2 3 2 2
## [701] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 1 1 3 3 2 3 3 2 3 3 3 3 2 3 3 3
## [736] 3 2 3 3 2 3 2 3 1 2 3 2 2 3 1 2 1 3 1 3 2 3 1 3 1 3 1 3 2 3 2 3 1 3 2
## [771] 2 2 2 1 3 2 2 1 3 1 3 3 3 3 1 1 1 1 3 1 3 2 2 2 1 1 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 709020.5 1018348.0 812079.9
## (between_SS / total_SS = 40.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Scatter plot of Speed against Defense, coloured by k-means cluster
plot(x = Pokemon.data[["Defense"]],
     y = Pokemon.data[["Speed"]],
     xlab = "Defense", ylab = "Speed",
     main = paste("k-means clustering of Pokemon with", k, "clusters"),
     col = km.out$cluster)
# View column means
colMeans(Pokemon.data)
## HP Attack Defense Sp. Atk Sp. Def Speed
## 69.25875 79.00125 73.84250 72.82000 71.90250 68.27750
# View column standard deviations.
# vapply() walks the columns directly; apply() would first coerce the whole
# data frame to a matrix. The result is the same named numeric vector.
vapply(Pokemon.data, sd, numeric(1))
## HP Attack Defense Sp. Atk Sp. Def Speed
## 25.53467 32.45737 31.18350 32.72229 27.82892 29.06047
# Center and scale every feature column before computing distances
pokemon.scaled <- scale(Pokemon.data, center = TRUE, scale = TRUE)
# Complete-linkage hierarchical clustering on the pairwise row distances
hclust.pokemon <- hclust(d = dist(x = pokemon.scaled), method = "complete")
# Cut the dendrogram into three groups
cut.pokemon <- cutree(tree = hclust.pokemon, k = 3)
# Cross-tabulate k-means assignments (rows) against the tree cut (columns)
table(km.out$cluster, cut.pokemon)
## cut.pokemon
## 1 2 3
## 1 171 3 1
## 2 267 3 0
## 3 350 5 0
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Modelling table: total score, the six base stats and the Legendary label.
# Columns are chosen by name (the same set as positions 5:11 and 13) so the
# selection does not depend on the column order of `Pokemon`.
poke.ma <- select(
  Pokemon,
  Total, HP, Attack, Defense, `Sp. Atk`, `Sp. Def`, Speed, Legendary
)
library(caTools)
## Warning: package 'caTools' was built under R version 3.3.2
poke.ma
## # A tibble: 800 × 8
## Total HP Attack Defense `Sp. Atk` `Sp. Def` Speed Legendary
## <int> <int> <int> <int> <int> <int> <int> <chr>
## 1 318 45 49 49 65 65 45 False
## 2 405 60 62 63 80 80 60 False
## 3 525 80 82 83 100 100 80 False
## 4 625 80 100 123 122 120 80 False
## 5 309 39 52 43 60 50 65 False
## 6 405 58 64 58 80 65 80 False
## 7 534 78 84 78 109 85 100 False
## 8 634 78 130 111 130 85 100 False
## 9 634 78 104 78 159 115 100 False
## 10 314 44 48 65 50 64 43 False
## # ... with 790 more rows
# The label is stored as character; convert it to a factor for modelling
poke.ma$Legendary <- as.factor(poke.ma$Legendary)
# Fix the RNG seed so the split, and every metric computed from it, can be
# reproduced from run to run.
set.seed(123)
# Logical mask from caTools::sample.split(): TRUE marks roughly 70% of the
# rows (SplitRatio = share of rows with sample == TRUE).
# NOTE(review): the name `sample` shadows base::sample as a variable; it is
# kept unchanged in case later code refers to it.
sample <- sample.split(poke.ma$Legendary, SplitRatio = 0.70)
# Training Data: rows where the mask is TRUE
train <- poke.ma[sample, ]
# Testing Data: the remaining rows
test <- poke.ma[!sample, ]
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.2
# Logistic regression of Legendary on the six base stats.
# `Total` is left out of the formula: in the rows printed for `poke.ma` it
# equals the sum of the six stats, which made the original fit
# rank-deficient and triggered a "prediction from a rank-deficient fit"
# warning in predict().
glm_model <- glm(Legendary ~ . - Total, family = binomial(logit), data = train)
# Predicted probability of the second factor level of Legendary.
# NOTE(review): "Lengendary" is a misspelling of "Legendary"; the column
# names are kept unchanged because later code reads them.
test$predicted.Lengendary <- predict(glm_model, newdata = test,
                                     type = "response")
# Hard class label using a 0.5 probability cut-off
test$Lengendary.outcome <- ifelse(test$predicted.Lengendary > 0.5,
                                  "True", "False")
# confusionMatrix() expects factors with identical levels. The predictions
# are converted on the reference's levels, so the table stays square even
# when one class is never predicted.
# NOTE(review): with no `positive` argument the first level is reported as
# the positive class; pass positive = "True" if Legendary should be it.
glm_con <- confusionMatrix(
  data = factor(test$Lengendary.outcome, levels = levels(test$Legendary)),
  reference = test$Legendary
)
glm_con
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 215 10
## True 6 9
##
## Accuracy : 0.9333
## 95% CI : (0.894, 0.9614)
## No Information Rate : 0.9208
## P-Value [Acc > NIR] : 0.2824
##
## Kappa : 0.4941
## Mcnemar's Test P-Value : 0.4533
##
## Sensitivity : 0.9729
## Specificity : 0.4737
## Pos Pred Value : 0.9556
## Neg Pred Value : 0.6000
## Prevalence : 0.9208
## Detection Rate : 0.8958
## Detection Prevalence : 0.9375
## Balanced Accuracy : 0.7233
##
## 'Positive' Class : False
##
# ROC curve and AUC of the predicted probabilities against the true label.
# caTools is already attached earlier in this script, so it is not re-loaded
# here behind suppressWarnings().
colAUC(test$predicted.Lengendary, test$Legendary, plotROC = TRUE)
## [,1]
## False vs. True 0.9703501