This is part 2 of the notebook Ggplot ’Em All | Pokemon on R.
Here we'll clean the data, impute missing values, one-hot encode the categorical attributes, and train several classifiers (Naive Bayes, SVM, random forest, XGBoost, and a neural network) to predict whether a Pokémon is legendary.
Import the libraries.
library(dplyr)
library(ggplot2)
library(tidyr)
library(reshape2)
library(caret)
library(skimr)
library(psych)
library(e1071)
library(randomForest)
library(xgboost)
library(data.table)
library(Matrix)
library(keras)
Correct the spelling of the classification column. Note that read.csv (in R < 4.0) imports all non-numerical columns as factors instead of characters by default.
df = read.csv(file="/home/akshaj/projects_R/Pokemon/pokemon.csv", stringsAsFactors = TRUE) # be explicit: since R 4.0, strings are no longer read as factors by default
df = as_tibble(df) # tbl_df() is deprecated in favour of as_tibble()
colnames(df)[25] <- "classification"
head(df)
Select the required columns: take the subset of df as classify_legendary, convert is_legendary and generation to factors, and convert capture_rate to numeric.
classify_legendary = select(df, is_legendary, hp, weight_kg, height_m, speed, attack, defense, sp_attack, sp_defense, type1, type2, generation, capture_rate, experience_growth, percentage_male, base_happiness, base_egg_steps)
classify_legendary$is_legendary <- as.factor(classify_legendary$is_legendary)
classify_legendary$generation <- as.factor(classify_legendary$generation)
classify_legendary$capture_rate <- as.numeric(as.character(classify_legendary$capture_rate)) # go via character: as.numeric() on a factor returns the level codes, not the values
head(classify_legendary)
# Convert the `is_legendary` factor to "yes" and "no" from 1 and 0.
# classify_legendary$is_legendary <- factor(classify_legendary$is_legendary, labels=c("no", "yes"))
# To view the total number of NAs in each column
colSums(is.na(classify_legendary))
## is_legendary hp weight_kg height_m
## 0 0 20 20
## speed attack defense sp_attack
## 0 0 0 0
## sp_defense type1 type2 generation
## 0 0 0 0
## capture_rate experience_growth percentage_male base_happiness
## 0 0 98 0
## base_egg_steps
## 0
We observe that NA values are present only in the height_m, weight_kg, and percentage_male columns. We will replace the NAs in the height and weight columns with 0. This makes sense because those NAs exist for Pokémon without a discernible height or weight, such as gaseous Pokémon. For the percentage_male column, we'll predict the missing values from the other attributes using kNN.
classify_legendary$weight_kg[is.na(classify_legendary$weight_kg)] <- 0
classify_legendary$height_m[is.na(classify_legendary$height_m)] <- 0
colSums(is.na(classify_legendary))
## is_legendary hp weight_kg height_m
## 0 0 0 0
## speed attack defense sp_attack
## 0 0 0 0
## sp_defense type1 type2 generation
## 0 0 0 0
## capture_rate experience_growth percentage_male base_happiness
## 0 0 98 0
## base_egg_steps
## 0
For the percentage_male column, there are 98 missing values. There are multiple ways to handle missing values: dropping the affected rows, filling in the column mean or median, or model-based imputation.
percentage_male might turn out to be an important parameter during prediction, so we will predict the missing values using the k-Nearest Neighbours (kNN) algorithm. In simple terms, kNN imputation works as follows: for every observation to be imputed, it identifies the k closest observations based on Euclidean distance and computes the weighted average (weighted by distance) of these k observations.
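To make the idea concrete, here is a minimal sketch of distance-weighted kNN imputation on hypothetical inputs; the function and argument names are illustrative, not caret's actual implementation.
# Impute one missing value as the inverse-distance-weighted average of the
# corresponding values of the k nearest complete rows (all names hypothetical).
knn_impute_one <- function(target, neighbours, values, k = 5) {
  d <- sqrt(rowSums(sweep(neighbours, 2, target)^2)) # Euclidean distance to each complete row
  idx <- order(d)[1:k]                               # the k closest rows
  w <- 1 / (d[idx] + 1e-8)                           # closer rows get larger weights
  sum(w * values[idx]) / sum(w)                      # weighted average of their known values
}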
pre_process_missing_data <-preProcess(classify_legendary, method=c("knnImpute")) # You can also use the "bagImpute" algorithm.
pre_process_missing_data
## Created from 703 samples and 17 variables
##
## Pre-processing:
## - centered (13)
## - ignored (4)
## - 5 nearest neighbor imputation (13)
## - scaled (13)
Let’s now use this model to fill in the missing values in the classify_legendary data frame. Note from the printout above that knnImpute also centers and scales the 13 numeric columns.
classify_legendary <- predict(pre_process_missing_data, newdata = classify_legendary)
anyNA(classify_legendary) # Check for any NA values in the data frame
## [1] FALSE
head(classify_legendary)
Descriptive statistics about the data.
skimmed <- skim_to_wide(classify_legendary) # note: skimr >= 2.0 replaces skim_to_wide() with skim()
skimmed
Next up, we’ll one-hot encode the categorical attributes. We save the column to be predicted in a variable y, and after one-hot encoding we'll put it back into the data frame (dummyVars drops the outcome).
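For intuition, here's a toy sketch (hypothetical data) of what one-hot encoding will do: each level of a factor becomes its own 0/1 indicator column.
toy <- data.frame(type = factor(c("fire", "water", "fire")))
predict(dummyVars(~ type, data = toy), newdata = toy) # two columns: type.fire, type.water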
y <- classify_legendary$is_legendary
One-hot encode the columns.
dummies_model <- dummyVars(is_legendary~., data=classify_legendary)
data_mat <- predict(dummies_model, newdata = classify_legendary)
classify_legendary_ohe <- data.frame(data_mat)
head(classify_legendary_ohe)
At this point, we have our categorical data one-hot encoded and the missing data filled in. Next, we'll normalize the dataset so that all values lie between 0 and 1. caret's "range" method applies min-max scaling: x' = (x - min(x)) / (max(x) - min(x)).
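A one-column sketch of the same transformation, on hypothetical values:
x <- c(2, 5, 11)
(x - min(x)) / (max(x) - min(x)) # 0.0000000 0.3333333 1.0000000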
pre_process_normalize <- preProcess(classify_legendary, method="range")
pre_process_normalize_ohe <- preProcess(classify_legendary_ohe, method="range")
classify_legendary <- predict(pre_process_normalize, newdata = classify_legendary)
classify_legendary_ohe <- predict(pre_process_normalize_ohe, newdata = classify_legendary_ohe)
classify_legendary$is_legendary <- y # add the is_legendary column back into the df
classify_legendary_ohe$is_legendary <- y # add the is_legendary column back into the df
head(classify_legendary)
head(classify_legendary_ohe)
str(classify_legendary)
## Classes 'tbl_df', 'tbl' and 'data.frame': 801 obs. of 17 variables:
## $ is_legendary : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hp : num 0.173 0.232 0.311 0.15 0.224 ...
## $ weight_kg : num 0.0069 0.013 0.1 0.0085 0.019 ...
## $ height_m : num 0.0483 0.069 0.1379 0.0414 0.0759 ...
## $ speed : num 0.229 0.314 0.429 0.343 0.429 ...
## $ attack : num 0.244 0.317 0.528 0.261 0.328 ...
## $ defense : num 0.196 0.258 0.524 0.169 0.236 ...
## $ sp_attack : num 0.299 0.38 0.609 0.272 0.38 ...
## $ sp_defense : num 0.214 0.286 0.476 0.143 0.214 ...
## $ type1 : Factor w/ 18 levels "bug","dark","dragon",..: 10 10 10 7 7 7 18 18 18 1 ...
## $ type2 : Factor w/ 19 levels "","bug","dark",..: 15 15 15 1 1 9 1 1 1 1 ...
## $ generation : Factor w/ 7 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ capture_rate : num 0.758 0.758 0.758 0.758 0.758 ...
## $ experience_growth: num 0.442 0.442 0.442 0.442 0.442 ...
## $ percentage_male : num 0.881 0.881 0.881 0.881 0.881 0.881 0.881 0.881 0.881 0.5 ...
## $ base_happiness : num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ base_egg_steps : num 0.13 0.13 0.13 0.13 0.13 ...
Divide both datasets (normal and one-hot encoded) into train and test sets: 80% of the data goes to the train set and the remaining 20% to the test set.
set.seed(100) # fix the seed so the partition is reproducible (the seed value is arbitrary)
train_row_numbers <- createDataPartition(classify_legendary$is_legendary, p=0.8, list=FALSE)
train_classify_legendary <- classify_legendary[train_row_numbers, ]
test_classify_legendary <- classify_legendary[-train_row_numbers, ]
# Reuse the same row numbers so both data frames get identical splits.
train_classify_legendary_ohe <- classify_legendary_ohe[train_row_numbers, ]
test_classify_legendary_ohe <- classify_legendary_ohe[-train_row_numbers, ]
Naive Bayes assumes the features are conditionally independent given the class, so the independent variables should not be highly correlated. pairs.panels from psych plots the pairwise scatter plots and correlations so we can check.
pairs.panels(classify_legendary[-1])
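As a quick numeric alternative to the plot (not part of the original run), we can print the correlation matrix of the numeric columns directly:
num_cols <- sapply(classify_legendary, is.numeric) # pick out the numeric columns
round(cor(classify_legendary[, num_cols]), 2)      # pairwise Pearson correlations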
Train
nb_model <- naiveBayes(is_legendary ~., data = train_classify_legendary_ohe)
predict_train_nb <- predict(nb_model, train_classify_legendary_ohe)
Training Confusion matrix
confmat_train_nb <- table(predict_train_nb,train_classify_legendary_ohe$is_legendary)
confmat_train_nb
##
## predict_train_nb 0 1
## 0 147 0
## 1 438 56
Training Accuracy
(confmat_train_nb[1, 1] + confmat_train_nb[2, 2])/ sum(confmat_train_nb) * 100
## [1] 31.66927
Test
predict_test_nb <- predict(nb_model, test_classify_legendary_ohe)
Test Confusion matrix
confmat_test_nb <- table(predict_test_nb,test_classify_legendary_ohe$is_legendary)
confmat_test_nb
##
## predict_test_nb 0 1
## 0 40 0
## 1 106 14
Test Accuracy
(confmat_test_nb[1, 1] + confmat_test_nb[2, 2])/ sum(confmat_test_nb) * 100
## [1] 33.75
Naive Bayes performs poorly here: it classifies most Pokémon as legendary. This is not too surprising, since the one-hot encoded dummies and the strongly correlated base stats violate its conditional-independence assumption.
Next, train a Support Vector Machine (SVM); e1071's svm() defaults to C-classification with a radial-basis kernel here.
model_svm <- svm(is_legendary~., data = train_classify_legendary_ohe)
summary(model_svm)
##
## Call:
## svm(formula = is_legendary ~ ., data = train_classify_legendary_ohe)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01754386
##
## Number of Support Vectors: 187
##
## ( 137 50 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
predict_train_svm <- predict(model_svm, train_classify_legendary_ohe)
Train Confusion Matrix
confmat_train_svm <- table(Predicted = predict_train_svm, Actual = train_classify_legendary_ohe$is_legendary)
confmat_train_svm
## Actual
## Predicted 0 1
## 0 583 6
## 1 2 50
Train Accuracy
(confmat_train_svm[1, 1] + confmat_train_svm[2, 2]) / sum(confmat_train_svm) * 100
## [1] 98.75195
Test
predict_test_svm <- predict(model_svm, test_classify_legendary_ohe)
Test Confusion Matrix
confmat_test_svm <- table(Predicted = predict_test_svm, Actual = test_classify_legendary_ohe$is_legendary)
confmat_test_svm
## Actual
## Predicted 0 1
## 0 146 5
## 1 0 9
Test Accuracy
(confmat_test_svm[1, 1] + confmat_test_svm[2, 2]) / sum(confmat_test_svm) * 100
## [1] 96.875
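The cost and gamma above are e1071's defaults. As an optional follow-up (not part of the original run), tune.svm can grid-search them; the ranges below are arbitrary.
tuned_svm <- tune.svm(is_legendary ~ ., data = train_classify_legendary_ohe,
                      gamma = 10^(-3:-1), cost = 10^(0:2)) # 3x3 grid, 10-fold CV by default
summary(tuned_svm) # best parameters and cross-validation error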
Next, train a Random Forest classifier (randomForest defaults to 500 trees).
model_rf <- randomForest(is_legendary~., data = train_classify_legendary_ohe)
Printing the model shows the out-of-bag (OOB) error estimate and confusion matrix.
model_rf
##
## Call:
## randomForest(formula = is_legendary ~ ., data = train_classify_legendary_ohe)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 1.09%
## Confusion matrix:
## 0 1 class.error
## 0 584 1 0.001709402
## 1 6 50 0.107142857
predict_train_rf <- predict(model_rf, train_classify_legendary_ohe)
Train Accuracy
confusionMatrix(predict_train_rf, train_classify_legendary_ohe$is_legendary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 585 0
## 1 0 56
##
## Accuracy : 1
## 95% CI : (0.9943, 1)
## No Information Rate : 0.9126
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.9126
## Detection Rate : 0.9126
## Detection Prevalence : 0.9126
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
Test
predict_test_rf <- predict(model_rf, test_classify_legendary_ohe)
Test Accuracy
confusionMatrix(predict_test_rf, test_classify_legendary_ohe$is_legendary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 146 1
## 1 0 13
##
## Accuracy : 0.9938
## 95% CI : (0.9657, 0.9998)
## No Information Rate : 0.9125
## P-Value [Acc > NIR] : 7.089e-06
##
## Kappa : 0.9596
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9286
## Pos Pred Value : 0.9932
## Neg Pred Value : 1.0000
## Prevalence : 0.9125
## Detection Rate : 0.9125
## Detection Prevalence : 0.9187
## Balanced Accuracy : 0.9643
##
## 'Positive' Class : 0
##
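As an optional check (not part of the original run), the random forest also reports per-feature importance (mean decrease in Gini impurity):
imp <- importance(model_rf)                                   # one MeanDecreaseGini value per feature
head(imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE]) # the most important features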
Next up, XGBoost. It works on numeric matrices rather than data frames, so build sparse model matrices (which one-hot encode the factors on the fly) and wrap them in xgb.DMatrix objects.
trainm <- sparse.model.matrix(is_legendary~.-1, data = train_classify_legendary)
testm <- sparse.model.matrix(is_legendary~.-1, data = test_classify_legendary)
train_label <- as.numeric(as.character(train_classify_legendary$is_legendary)) # factor -> numeric 0/1
test_label <- as.numeric(as.character(test_classify_legendary$is_legendary))
train_matrix <- xgb.DMatrix(data = trainm, label = train_label) # keep the matrices sparse; no need to densify
test_matrix <- xgb.DMatrix(data = testm, label = test_label)
model_xgb <- xgboost(data = train_matrix,          # the training DMatrix
                     nrounds = 26,                 # number of boosting iterations ('nround' is a deprecated spelling)
                     objective = "binary:logistic") # logistic loss; predictions are probabilities
## [1] train-error:0.007800
## [2] train-error:0.007800
## [3] train-error:0.010920
## [4] train-error:0.007800
## [5] train-error:0.007800
## [6] train-error:0.007800
## [7] train-error:0.006240
## [8] train-error:0.006240
## [9] train-error:0.006240
## [10] train-error:0.004680
## [11] train-error:0.001560
## [12] train-error:0.001560
## [13] train-error:0.001560
## [14] train-error:0.001560
## [15] train-error:0.000000
## [16] train-error:0.000000
## [17] train-error:0.000000
## [18] train-error:0.000000
## [19] train-error:0.000000
## [20] train-error:0.000000
## [21] train-error:0.000000
## [22] train-error:0.000000
## [23] train-error:0.000000
## [24] train-error:0.000000
## [25] train-error:0.000000
## [26] train-error:0.000000
pred_xgb <- predict(model_xgb, test_matrix)
acc <- mean(as.numeric(pred_xgb > 0.5) == test_label)
print(paste("test-accuracy=", acc))
## [1] "test-accuracy= 0.99375"
Finally, build a small fully connected neural network with keras: four hidden ReLU layers and a single sigmoid output unit for the binary target.
model_nn <- keras_model_sequential()
model_nn %>%
  layer_dense(units = 10, activation = "relu", input_shape = c(57)) %>% # 57 = number of feature columns
  layer_dense(units = 20, activation = "relu") %>%
  layer_dense(units = 20, activation = "relu") %>%
  layer_dense(units = 20, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid") # outputs P(is_legendary)
model_nn %>% compile(
  loss = 'binary_crossentropy', # standard loss for binary classification
  optimizer = 'adam',
  metrics = c('accuracy')
)
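To verify the layer shapes and parameter counts before training, print the model summary (output omitted here):
summary(model_nn) # one row per layer: output shape and parameter count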
X_train <- as.matrix(train_classify_legendary_ohe[, 1:57]) # the 57 feature columns
X_test <- as.matrix(test_classify_legendary_ohe[, 1:57])
y_train <- as.numeric(as.character(train_classify_legendary_ohe$is_legendary)) # factor -> numeric 0/1 for keras
y_test <- as.numeric(as.character(test_classify_legendary_ohe$is_legendary))
history <- model_nn %>% fit(
  X_train,
  y_train,
  epochs = 20,
  batch_size = 4,
  validation_data = list(X_test, y_test) # track test loss/accuracy after each epoch
)
plot(history)
score <- model_nn %>% evaluate(X_test, y_test)
cat('Test loss:', score$loss, "\n")
## Test loss: 0.1386675
cat('Test accuracy:', score$acc, "\n")
## Test accuracy: 0.95625