Select the feature columns

# Keep columns 6 through 11 of Pokemon as the clustering features
Pokemon.data <- Pokemon[, 6:11]

Find the best k

# Largest number of clusters to try; shared by the loop and the plot below
max_k <- 15L

# Total within-cluster sum of squares for each candidate number of clusters.
# Preallocated so the vector is not grown element by element in the loop.
wss <- numeric(max_k)

# Look over 1 to max_k possible clusters
for (i in seq_len(max_k)) {
  # Fit the model: km.out (20 random starts, at most 50 iterations each)
  km.out <- kmeans(Pokemon.data, centers = i, nstart = 20, iter.max = 50)
  # Save the total within-cluster sum of squares
  wss[i] <- km.out$tot.withinss
}

# Produce a scree plot
plot(seq_len(max_k), wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Number of clusters used for the final model
k <- 3

# Fit the final k-means model with k clusters: km.out
km.out <- kmeans(
  x = Pokemon.data,
  centers = k,
  iter.max = 50,
  nstart = 20
)

# Print the fitted model
km.out
## K-means clustering with 3 clusters of sizes 175, 270, 355
## 
## Cluster means:
##         HP   Attack   Defense   Sp. Atk  Sp. Def    Speed
## 1 79.30857 97.29714 108.93143  66.71429 87.04571 57.29143
## 2 81.90370 96.15926  77.65556 104.12222 86.87778 94.71111
## 3 54.68732 56.93239  53.64507  52.02254 53.04789 53.58873
## 
## Clustering vector:
##   [1] 3 3 2 2 3 3 2 2 2 3 3 1 2 3 3 3 3 3 3 2 3 3 2 2 3 3 3 2 3 2 3 2 3 1 3
##  [36] 3 1 3 3 2 3 2 3 2 3 3 3 2 3 3 2 3 1 3 2 3 3 3 2 3 2 3 2 3 2 3 3 1 3 2
##  [71] 2 2 3 1 1 3 3 2 3 2 3 1 1 3 2 3 1 1 3 2 3 3 2 3 1 3 1 3 1 3 2 2 2 1 3
## [106] 1 3 1 3 2 3 2 3 1 1 1 3 3 1 3 1 3 1 1 1 3 2 3 1 3 2 2 2 2 2 2 1 1 1 3
## [141] 1 1 1 3 3 2 2 2 3 3 1 3 1 2 2 1 2 2 2 3 3 2 2 2 2 2 3 3 1 3 3 2 3 3 1
## [176] 3 3 3 2 3 3 3 3 2 3 2 3 3 3 3 3 3 2 3 3 2 2 1 3 3 1 2 3 3 2 3 3 3 3 3
## [211] 1 2 1 3 1 2 3 3 2 3 1 3 1 1 1 3 1 3 1 1 1 1 1 3 3 1 3 1 3 1 3 3 2 3 2
## [246] 1 3 2 2 2 3 1 2 2 3 3 1 3 3 3 1 2 2 2 1 3 3 1 1 2 2 2 3 3 2 2 3 3 2 2
## [281] 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 2 3 3 3 1 3 3 2 2 3 3 3 1 3 3 2 3
## [316] 2 3 3 3 2 3 1 3 1 3 3 3 1 3 1 3 1 1 1 3 3 2 3 2 2 3 3 3 3 3 3 1 3 2 2
## [351] 3 2 3 2 1 1 3 2 3 3 3 2 3 2 3 1 2 2 2 2 1 3 1 3 1 3 1 3 1 3 1 3 2 3 1
## [386] 3 2 2 3 1 1 3 2 2 3 3 2 2 3 3 2 3 1 1 1 3 3 1 2 2 3 1 1 2 1 1 1 2 2 2
## [421] 2 2 2 1 2 2 2 2 2 2 1 2 3 1 1 3 3 2 3 3 2 3 3 2 3 3 3 3 3 3 2 3 2 3 1
## [456] 3 1 3 1 1 1 2 3 1 3 3 2 3 2 3 1 2 3 2 3 2 2 2 2 3 2 3 3 2 3 1 3 3 3 3
## [491] 1 3 3 2 2 3 3 2 2 3 1 3 1 3 2 1 3 2 3 3 2 1 2 2 1 1 1 2 2 2 2 1 2 1 2
## [526] 2 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 3 3 2 3 3 2
## [561] 3 3 2 3 3 3 3 1 3 2 3 2 3 2 3 2 3 1 3 3 2 3 2 3 1 1 3 2 3 2 1 1 3 1 1
## [596] 3 3 2 1 1 3 3 2 3 3 2 3 2 3 2 2 3 3 2 3 1 2 2 3 1 3 1 2 3 1 3 1 3 2 3
## [631] 1 3 2 3 2 3 3 1 3 3 2 3 2 3 3 2 3 2 2 3 1 3 1 3 2 1 3 2 3 1 3 1 1 3 3
## [666] 2 3 2 3 3 2 3 1 1 3 1 2 3 2 1 3 2 1 3 1 3 1 1 3 1 3 1 2 1 3 3 2 3 2 2
## [701] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 1 1 3 3 2 3 3 2 3 3 3 3 2 3 3 3
## [736] 3 2 3 3 2 3 2 3 1 2 3 2 2 3 1 2 1 3 1 3 2 3 1 3 1 3 1 3 2 3 2 3 1 3 2
## [771] 2 2 2 1 3 2 2 1 3 1 3 3 3 3 1 1 1 1 3 1 3 2 2 2 1 1 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1]  709020.5 1018348.0  812079.9
##  (between_SS / total_SS =  40.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Scatter plot of Defense (x) and Speed (y), coloured by k-means cluster
plot(
  x = Pokemon.data[, c("Defense", "Speed")],
  xlab = "Defense",
  ylab = "Speed",
  main = paste("k-means clustering of Pokemon with", k, "clusters"),
  col = km.out$cluster
)

# View column means
colMeans(Pokemon.data)
##       HP   Attack  Defense  Sp. Atk  Sp. Def    Speed 
## 69.25875 79.00125 73.84250 72.82000 71.90250 68.27750
# View column standard deviations
# vapply() walks the columns directly instead of coercing the data frame to
# a matrix as apply() does, and guarantees one numeric value per column.
vapply(Pokemon.data, sd, numeric(1))
##       HP   Attack  Defense  Sp. Atk  Sp. Def    Speed 
## 25.53467 32.45737 31.18350 32.72229 27.82892 29.06047
# Scale the data (scale() centers each column and divides by its sd)
pokemon.scaled <- scale(Pokemon.data)

# Create hierarchical clustering model: hclust.pokemon
hclust.pokemon <- hclust(dist(pokemon.scaled), method = "complete")

# Cut the tree into the same number of clusters as the k-means model, so the
# comparison below stays consistent if k is changed
cut.pokemon <- cutree(hclust.pokemon, k = k)

# Cross-tabulate k-means clusters (rows) against hierarchical clusters (cols)
table(km.out$cluster, cut.pokemon)
##    cut.pokemon
##       1   2   3
##   1 171   3   1
##   2 267   3   0
##   3 350   5   0
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep columns 5-11 and 13 of Pokemon: Total, the six stats and Legendary
poke.ma <- Pokemon %>% select(5:11, 13)
library(caTools)
## Warning: package 'caTools' was built under R version 3.3.2
# Show the modelling table
poke.ma
## # A tibble: 800 × 8
##    Total    HP Attack Defense `Sp. Atk` `Sp. Def` Speed Legendary
##    <int> <int>  <int>   <int>     <int>     <int> <int>     <chr>
## 1    318    45     49      49        65        65    45     False
## 2    405    60     62      63        80        80    60     False
## 3    525    80     82      83       100       100    80     False
## 4    625    80    100     123       122       120    80     False
## 5    309    39     52      43        60        50    65     False
## 6    405    58     64      58        80        65    80     False
## 7    534    78     84      78       109        85   100     False
## 8    634    78    130     111       130        85   100     False
## 9    634    78    104      78       159       115   100     False
## 10   314    44     48      65        50        64    43     False
## # ... with 790 more rows
# Encode the response as a factor for the classification model
poke.ma$Legendary <- as.factor(poke.ma$Legendary)

# Fix the RNG seed so the random train/test split is reproducible
set.seed(123)

# Logical mask from sample.split(): TRUE marks the rows (SplitRatio = 0.70 of
# the data) that go to the training set. Named is_train rather than `sample`
# so it does not shadow base::sample().
is_train <- sample.split(poke.ma$Legendary, SplitRatio = 0.70)

# Training Data
train <- subset(poke.ma, is_train)

# Testing Data
test <- subset(poke.ma, !is_train)
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.2
# Logistic regression of Legendary on the six individual stats.
# Total is excluded: it is the sum of the other six stats, so keeping it made
# the fit rank-deficient and predict() warned that "prediction from a
# rank-deficient fit may be misleading".
glm_model <- glm(Legendary ~ . - Total, family = binomial(logit), data = train)

# Predicted probability of Legendary == "True" for each test row
test$predicted.Lengendary <- predict(glm_model, newdata = test,
                                     type = "response")

# Turn probabilities into class labels using a 0.5 cutoff
test$Lengendary.outcome <- ifelse(test$predicted.Lengendary > 0.5,
                                  "True", "False")

# confusionMatrix() needs factors that share the same levels, so the
# character labels are converted using the levels of the reference column.
glm_con <- confusionMatrix(
  data = factor(test$Lengendary.outcome, levels = levels(test$Legendary)),
  reference = test$Legendary
)
# NOTE(review): the positive class defaults to the first level ("False");
# pass positive = "True" if metrics for the Legendary class are wanted.
glm_con
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False   215   10
##      True      6    9
##                                          
##                Accuracy : 0.9333         
##                  95% CI : (0.894, 0.9614)
##     No Information Rate : 0.9208         
##     P-Value [Acc > NIR] : 0.2824         
##                                          
##                   Kappa : 0.4941         
##  Mcnemar's Test P-Value : 0.4533         
##                                          
##             Sensitivity : 0.9729         
##             Specificity : 0.4737         
##          Pos Pred Value : 0.9556         
##          Neg Pred Value : 0.6000         
##              Prevalence : 0.9208         
##          Detection Rate : 0.8958         
##    Detection Prevalence : 0.9375         
##       Balanced Accuracy : 0.7233         
##                                          
##        'Positive' Class : False          
## 
# Make sure caTools is attached (version warnings silenced)
suppressWarnings(library(caTools))
# Area under the ROC curve for the predicted probabilities; also draws the ROC
colAUC(
  X = test$predicted.Lengendary,
  y = test$Legendary,
  plotROC = TRUE
)

##                     [,1]
## False vs. True 0.9703501