# Global knitr chunk options: echo the source code of every chunk and keep
# warnings out of the rendered document.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
#library(tidyverse)
library(dplyr)
library(purrr)
library(ggplot2)
# Read the Pokémon CSV straight from its OSF download link into a data frame.
dataset <- read.csv(file = "https://osf.io/download/c3bfs/")
# Show the first rows as a quick check of the import.
head(dataset)
| dexnum | name | generation | type1 | type2 | species | height | weight | ability1 | ability2 | hidden_ability | hp | attack | defense | sp_atk | sp_def | speed | total | ev_yield | catch_rate | base_friendship | base_exp | growth_rate | egg_group1 | egg_group2 | percent_male | percent_female | egg_cycles | special_group |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Bulbasaur | 1 | Grass | Poison | Seed Pokémon | 0.7 | 6.9 | Overgrow | Chlorophyll | 45 | 49 | 49 | 65 | 65 | 45 | 318 | 1 Sp. Atk | 45 | 50 | 64 | Medium Slow | Grass | Monster | 87.5 | 12.5 | 20 | Ordinary | |
| 2 | Ivysaur | 1 | Grass | Poison | Seed Pokémon | 1.0 | 13.0 | Overgrow | Chlorophyll | 60 | 62 | 63 | 80 | 80 | 60 | 405 | 1 Sp. Atk, 1 Sp. Def | 45 | 50 | 142 | Medium Slow | Grass | Monster | 87.5 | 12.5 | 20 | Ordinary | |
| 3 | Venusaur | 1 | Grass | Poison | Seed Pokémon | 2.0 | 100.0 | Overgrow | Chlorophyll | 80 | 82 | 83 | 100 | 100 | 80 | 525 | 2 Sp. Atk, 1 Sp. Def | 45 | 50 | 236 | Medium Slow | Grass | Monster | 87.5 | 12.5 | 20 | Ordinary | |
| 4 | Charmander | 1 | Fire | Lizard Pokémon | 0.6 | 8.5 | Blaze | Solar Power | 39 | 52 | 43 | 60 | 50 | 65 | 309 | 1 Speed | 45 | 50 | 62 | Medium Slow | Dragon | Monster | 87.5 | 12.5 | 20 | Ordinary | ||
| 5 | Charmeleon | 1 | Fire | Flame Pokémon | 1.1 | 19.0 | Blaze | Solar Power | 58 | 64 | 58 | 80 | 65 | 80 | 405 | 1 Sp. Atk, 1 Speed | 45 | 50 | 142 | Medium Slow | Dragon | Monster | 87.5 | 12.5 | 20 | Ordinary | ||
| 6 | Charizard | 1 | Fire | Flying | Flame Pokémon | 1.7 | 90.5 | Blaze | Solar Power | 78 | 84 | 78 | 109 | 85 | 100 | 534 | 3 Sp. Atk | 45 | 50 | 267 | Medium Slow | Dragon | Monster | 87.5 | 12.5 | 20 | Ordinary |
# Per-column overview of the data: column types, missing/empty counts and
# summary statistics for the character and numeric variables.
skimr::skim(dataset)
| Name | dataset |
| Number of rows | 1025 |
| Number of columns | 29 |
| _______________________ | |
| Column type frequency: | |
| character | 15 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| name | 0 | 1 | 3 | 17 | 0 | 1025 | 0 |
| type1 | 0 | 1 | 3 | 8 | 0 | 18 | 0 |
| type2 | 0 | 1 | 0 | 8 | 499 | 19 | 0 |
| species | 0 | 1 | 1 | 21 | 0 | 715 | 0 |
| ability1 | 0 | 1 | 4 | 16 | 0 | 223 | 0 |
| ability2 | 0 | 1 | 0 | 16 | 167 | 170 | 0 |
| hidden_ability | 0 | 1 | 0 | 13 | 495 | 126 | 0 |
| ev_yield | 0 | 1 | 4 | 30 | 0 | 48 | 0 |
| base_friendship | 0 | 1 | 1 | 3 | 0 | 9 | 0 |
| base_exp | 0 | 1 | 1 | 3 | 0 | 181 | 0 |
| growth_rate | 0 | 1 | 4 | 11 | 0 | 6 | 0 |
| egg_group1 | 0 | 1 | 3 | 12 | 0 | 26 | 0 |
| egg_group2 | 0 | 1 | 0 | 10 | 746 | 12 | 0 |
| egg_cycles | 0 | 1 | 1 | 3 | 0 | 12 | 0 |
| special_group | 0 | 1 | 6 | 15 | 0 | 8 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| dexnum | 0 | 1.00 | 513.00 | 296.04 | 1.0 | 257.0 | 513 | 769.0 | 1025.0 | ▇▇▇▇▇ |
| generation | 0 | 1.00 | 4.74 | 2.63 | 1.0 | 3.0 | 5 | 7.0 | 9.0 | ▇▇▅▅▇ |
| height | 0 | 1.00 | 1.21 | 1.25 | 0.1 | 0.5 | 1 | 1.5 | 20.0 | ▇▁▁▁▁ |
| weight | 0 | 1.00 | 66.98 | 121.28 | 0.1 | 8.5 | 28 | 70.0 | 999.9 | ▇▁▁▁▁ |
| hp | 0 | 1.00 | 70.18 | 26.63 | 1.0 | 50.0 | 68 | 85.0 | 255.0 | ▃▇▁▁▁ |
| attack | 0 | 1.00 | 77.52 | 29.78 | 5.0 | 55.0 | 75 | 100.0 | 181.0 | ▂▇▆▃▁ |
| defense | 0 | 1.00 | 72.51 | 29.29 | 5.0 | 50.0 | 70 | 90.0 | 230.0 | ▃▇▂▁▁ |
| sp_atk | 0 | 1.00 | 70.08 | 29.66 | 10.0 | 47.0 | 65 | 90.0 | 173.0 | ▃▇▅▂▁ |
| sp_def | 0 | 1.00 | 70.21 | 26.64 | 20.0 | 50.0 | 67 | 86.0 | 230.0 | ▇▇▂▁▁ |
| speed | 0 | 1.00 | 67.19 | 28.72 | 5.0 | 45.0 | 65 | 88.0 | 200.0 | ▃▇▅▁▁ |
| total | 0 | 1.00 | 427.69 | 112.77 | 175.0 | 325.0 | 450 | 508.0 | 720.0 | ▂▆▇▆▁ |
| catch_rate | 0 | 1.00 | 94.93 | 76.11 | 3.0 | 45.0 | 60 | 140.0 | 255.0 | ▇▃▂▂▂ |
| percent_male | 155 | 0.85 | 54.93 | 20.31 | 0.0 | 50.0 | 50 | 50.0 | 100.0 | ▁▁▇▁▂ |
| percent_female | 155 | 0.85 | 45.07 | 20.31 | 0.0 | 50.0 | 50 | 50.0 | 100.0 | ▂▁▇▁▁ |
# sum(is.na(dataset))
#
# sum(is.null(dataset))
#
# sum(!complete.cases(dataset))
Let’s get familiar with our data. Create a table that shows the frequency of each type of Pokémon in the dataset (variable name: type1).
# Frequency table of primary types (type1), most common type first.
# count() groups, tallies and sorts in one step and returns an ungrouped
# table, so no leftover grouping is attached to tab1.
tab1 <- dataset %>%
  filter(!is.na(type1)) %>%
  count(type1, sort = TRUE)
tab1
| type1 | n |
|---|---|
| Water | 134 |
| Normal | 118 |
| Grass | 103 |
| Bug | 83 |
| Fire | 66 |
| Psychic | 60 |
| Electric | 59 |
| Rock | 58 |
| Dark | 45 |
| Poison | 42 |
| Fighting | 40 |
| Ground | 40 |
| Dragon | 37 |
| Steel | 36 |
| Ghost | 35 |
| Ice | 31 |
| Fairy | 29 |
| Flying | 9 |
Next, let’s look at the representation of each generation of Pokémon in the dataset. Create a table that shows the number of Pokémon in the dataset associated with each generation (variable name: generation).
# Number of Pokémon per generation, largest generation first.
# count() groups, tallies and sorts in one step and returns an ungrouped
# table, so no leftover grouping is attached to tab2.
tab2 <- dataset %>%
  filter(!is.na(generation)) %>%
  count(generation, sort = TRUE)
tab2
| generation | n |
|---|---|
| 5 | 156 |
| 1 | 151 |
| 3 | 135 |
| 9 | 120 |
| 4 | 107 |
| 2 | 100 |
| 8 | 96 |
| 7 | 88 |
| 6 | 72 |
Provide the mean, standard deviation, median, and range of values of attack stats (variable name: attack) for ‘Water’ type Pokémon (variable name: type1).
# One-column data frame holding the attack stat of every Pokémon whose
# primary type (type1) is recorded and equal to "Water".
water_attack <- dataset %>%
  filter(!is.na(type1) & type1 == "Water") %>%
  select(attack)
# Descriptive statistics for a numeric vector.
#   x: numeric vector; missing values are ignored.
# Returns a named numeric vector with the elements mean, standard_dev,
# median, range1 (minimum) and range2 (maximum).
desc_stat <- function(x) {
  # TRUE is spelled out: the shorthand T is an ordinary variable, and
  # reassigning it would silently switch NA removal off.
  c(
    mean = mean(x, na.rm = TRUE),
    standard_dev = sd(x, na.rm = TRUE),
    median = median(x, na.rm = TRUE),
    # range() yields two values, which c() names range1 and range2.
    range = range(x, na.rm = TRUE)
  )
}
# Attack of Water-type Pokémon:
#   mean               72.34
#   standard deviation 25.26
#   median             70
#   range              10 to 130
map(.x = water_attack, .f = desc_stat)
## $attack
## mean standard_dev median range1 range2
## 72.34328 25.25959 70.00000 10.00000 130.00000
- Provide the mean, standard deviation, median, and range of values of speed stats (variable name: speed) for ‘Dragon’ type Pokémon (variable name: type1).
# One-column data frame holding the speed stat of every Pokémon whose
# primary type (type1) is recorded and equal to "Dragon".
dragon_speed <- dataset %>%
  filter(!is.na(type1) & type1 == "Dragon") %>%
  select(speed)
# Speed of Dragon-type Pokémon:
#   mean               80.19
#   standard deviation 24.62
#   median             82
#   range              40 to 142
map(.x = dragon_speed, .f = desc_stat)
## $speed
## mean standard_dev median range1 range2
## 80.18919 24.62365 82.00000 40.00000 142.00000
Are there any outliers for the ‘total’ stat? What is the value(s), and which Pokémon(s) is/are associated with it?
library(ggplot2)
# Base-graphics boxplot of the total stat, to look for points beyond the
# whiskers.
boxplot(dataset$total)
# Name/total pairs ordered from the smallest total upwards ...
outlier_lower <- arrange(select(dataset, name, total), total)
# ... and from the largest total downwards.
outlier_upper <- arrange(select(dataset, name, total), desc(total))
# The six Pokémon with the lowest totals.
head(outlier_lower)
| name | total |
|---|---|
| Wishiwashi | 175 |
| Sunkern | 180 |
| Blipbug | 180 |
| Snom | 185 |
| Azurill | 190 |
| Kricketot | 194 |
# The six Pokémon with the highest totals.
head(outlier_upper, n = 6L)
| name | total |
|---|---|
| Arceus | 720 |
| Eternatus | 690 |
| Mewtwo | 680 |
| Lugia | 680 |
| Ho-oh | 680 |
| Rayquaza | 680 |
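#Wishiwashi with a total of 175 is the most extreme low value and Arceus with a total of 720 is the most extreme high value.
#Note: by the boxplot's 1.5 x IQR rule (Q1 = 325, Q3 = 508, so fences at roughly 50.5 and 782.5) neither value lies beyond the whiskers, so they are the minimum and maximum rather than formal outliers.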
## Level 4
Is there a correlation between a Pokémon’s ‘total’ stat and their base level of experience? (variable: base_exp)
# Analysis copy of the data for the total vs. base experience correlation.
# base_exp is read in as character, so it is converted to a number in
# base_exp2; entries that cannot be parsed become NA and are dropped.
# The three columns of interest are moved to the front.
dataset2 <- dataset %>%
  filter(!is.na(base_exp) & !is.na(total)) %>%
  mutate(base_exp2 = as.numeric(base_exp)) %>%
  filter(!(is.na(base_exp2) | is.na(total))) %>%
  select(base_exp, base_exp2, total, everything())
# Shapiro-Wilk normality test on the numeric base experience values.
# Result: W = 0.92187, p-value < 2.2e-16, so normality is rejected.
shapiro.test(dataset2$base_exp2)
##
## Shapiro-Wilk normality test
##
## data: dataset2$base_exp2
## W = 0.92187, p-value < 2.2e-16
# Histogram of the numeric base experience values, as a visual check of the
# distribution's shape alongside the normality test.
hist(dataset2$base_exp2)
# Kendall's rank correlation between total and base experience; a rank-based
# method is presumably chosen because base_exp2 is not normally distributed.
# Result: tau = 0.9132707, z = 42.937, p-value < 2.2e-16.
cor.test(dataset2$total,dataset2$base_exp2,method = 'kendall')
##
## Kendall's rank correlation tau
##
## data: dataset2$total and dataset2$base_exp2
## z = 42.937, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.9132707
- Create a visualization that shows the average ‘total’ stat based on the primary type of the Pokémon (type1).
# Mean total stat per primary type, rounded to two decimals and sorted from
# the strongest type down; column names are chosen for display.
# NOTE(review): this is built from dataset2, which only keeps rows whose
# base_exp parsed as a number, so any Pokémon dropped there are excluded from
# these averages too. Confirm that this is intended.
dataset3 <- dataset2 %>%
  group_by(`Pokemon Primary Type` = type1) %>%
  summarise(`Average of Total Stat` = round(mean(total), digits = 2)) %>%
  arrange(desc(`Average of Total Stat`))
dataset3
| Pokemon Primary Type | Average of Total Stat |
|---|---|
| Dragon | 490.16 |
| Steel | 468.03 |
| Dark | 454.73 |
| Psychic | 446.72 |
| Fire | 443.98 |
| Fighting | 441.55 |
| Rock | 438.54 |
| Ice | 436.39 |
| Flying | 436.11 |
| Fairy | 436.07 |
| Ground | 434.58 |
| Electric | 433.66 |
| Ghost | 431.17 |
| Water | 417.58 |
| Poison | 411.61 |
| Grass | 407.94 |
| Normal | 399.41 |
| Bug | 374.64 |
# Horizontal bar chart: one bar per primary type, with bar length and fill
# colour both showing the average total stat.
ggplot(
  data = dataset3,
  mapping = aes(
    x = `Average of Total Stat`,
    y = `Pokemon Primary Type`,
    fill = `Average of Total Stat`
  )
) +
  geom_col() +
  labs(
    title = "A Bar Chart Brawl of Pokemon Power",
    subtitle = "Examining Primary Types and their Average Total Stats",
    x = "Total Stat Average",
    y = "Primary Type",
    fill = " "
  ) +
  theme_bw() +
  # Centre the title and subtitle above the panel.
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )