モチベーション

今回は、仮説検定を用いて、ポケモンの能力値にタイプ(属性)ごとの有意な差があるかどうかを検証していこうと思います!

パッケージの準備

# Load packages -----------------------------------------------------------

# Packages attached for this analysis, in attach order. janitor is called
# through `::` further down, so it does not need to be attached here.
Packages <- c(
  "tidyverse",
  "stringr",
  "DT",
  "plotly"
)

# character.only = TRUE makes library() read the package name from the string.
lapply(Packages, function(pkg) library(pkg, character.only = TRUE))

データを用意

# Import and tidy data ----------------------------------------------------

# Raw CSV hosted in a GitHub gist. The path contains a revision hash, so it
# presumably points at one fixed revision of the file (TODO confirm).
url <- "https://gist.github.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv"

# Read the CSV, then normalise the column names with janitor::clean_names();
# the rest of the script refers to the cleaned names `type_1` and `attack`.
df_pokemon <- url %>%
  read_csv() %>%
  janitor::clean_names()


# Show the data as an interactive table, five rows per page, without row
# names. The first column is dropped with a negative index: unlike
# `2:length(df_pokemon)`, `-1` cannot turn into the descending sequence
# `2:1` when fewer than two columns are present.
datatable(
  df_pokemon[, -1],
  rownames = FALSE,
  options = list(pageLength = 5)
)

仮説検定

# Visualization & Modeling ------------------------------------------------

# Does attack differ across primary types (type_1)?
# One violin per type shows the distribution of attack; the individual
# observations are overlaid as points.
df_pokemon %>%
  ggplot(aes(x = type_1, y = attack)) +
  geom_violin() +
  # Jitter horizontally only. When `height` is omitted geom_jitter() also
  # applies its default vertical jitter, which would move the points away
  # from the recorded attack values.
  geom_jitter(width = 0.1, height = 0)

# Which primary types have the most observations?
# One row per type_1 with its row count `n`, largest group first.

df_pokemon %>%
  count(type_1, sort = TRUE)
## # A tibble: 18 x 2
##    type_1       n
##    <chr>    <int>
##  1 Water      112
##  2 Normal      98
##  3 Grass       70
##  4 Bug         69
##  5 Psychic     57
##  6 Fire        52
##  7 Electric    44
##  8 Rock        44
##  9 Dragon      32
## 10 Ghost       32
## 11 Ground      32
## 12 Dark        31
## 13 Poison      28
## 14 Fighting    27
## 15 Steel       27
## 16 Ice         24
## 17 Fairy       17
## 18 Flying       4
# Let's focus on Water and Normal, the two largest groups.

# pull() extracts `attack` as a plain numeric vector, which is the input
# type t.test() documents for `x` and `y`; select() would hand it a
# one-column tibble instead.
water <- df_pokemon %>%
  filter(type_1 == "Water") %>%
  pull(attack)

normal <- df_pokemon %>%
  filter(type_1 == "Normal") %>%
  pull(attack)


# Is there a difference in mean attack between Normal and Water?
# Two-sample t-test that assumes both groups have the same variance
# (var.equal = TRUE), reported with a 95% confidence interval.
t.test(x = normal, y = water, conf.level = 0.95, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  normal and water
## t = -0.16845, df = 208, p-value = 0.8664
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8.668848  7.304052
## sample estimates:
## mean of x mean of y 
##  73.46939  74.15179
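# From the result, the t statistic is close to zero (t = -0.168) and the
# p-value (0.866) is far above 0.05, so we cannot reject the null hypothesis
# (the true difference in mean attack is 0) at the 5% significance level;
# accordingly, the 95% confidence interval (-8.67 to 7.30) contains 0.
# Water and Normal Pokemon could therefore share the same mu, but this does
# not prove it, and the equal variance was assumed (var.equal = TRUE), not
# tested. The best way to put it is: the data give no evidence of a
# difference in mean attack between the two types.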

全コード

# Load packages -----------------------------------------------------------

# Packages attached for this analysis, in attach order. janitor and skimr
# are called through `::` further down, so they are not attached here.
Packages <- c(
  "tidyverse",
  "stringr",
  "DT",
  "plotly"
)

# character.only = TRUE makes library() read the package name from the string.
lapply(Packages, function(pkg) library(pkg, character.only = TRUE))

# Import and tidy data ----------------------------------------------------

# Raw CSV hosted in a GitHub gist. The path contains a revision hash, so it
# presumably points at one fixed revision of the file (TODO confirm).
url <- "https://gist.github.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv"

# Read the CSV, then normalise the column names with janitor::clean_names();
# the rest of the script refers to the cleaned names `type_1` and `attack`.
df_pokemon <- url %>%
  read_csv() %>%
  janitor::clean_names()


# Check out the data: summarise df_pokemon with skimr::skim() after grouping
# the rows by primary type (type_1).
# NOTE(review): skim() appears to label its summary with the expression it
# is given, so piping into it would change the printed data name - confirm
# before restyling this call.
skimr::skim(df_pokemon %>% group_by(type_1))

# Visualization & Modeling ------------------------------------------------

# Does attack differ across primary types (type_1)?
# One violin per type shows the distribution of attack; the individual
# observations are overlaid as points.
df_pokemon %>%
  ggplot(aes(x = type_1, y = attack)) +
  geom_violin() +
  # Jitter horizontally only. When `height` is omitted geom_jitter() also
  # applies its default vertical jitter, which would move the points away
  # from the recorded attack values.
  geom_jitter(width = 0.1, height = 0)

# Which primary types have the most observations?
# One row per type_1 with its row count `n`, largest group first.

df_pokemon %>%
  count(type_1, sort = TRUE)

# Let's focus on Water and Normal, the two largest groups.

# pull() extracts `attack` as a plain numeric vector, which is the input
# type t.test() documents for `x` and `y`; select() would hand it a
# one-column tibble instead.
water <- df_pokemon %>%
  filter(type_1 == "Water") %>%
  pull(attack)

normal <- df_pokemon %>%
  filter(type_1 == "Normal") %>%
  pull(attack)


# Is there a difference in mean attack between Normal and Water?
# Two-sample t-test that assumes both groups have the same variance
# (var.equal = TRUE), reported with a 95% confidence interval.
t.test(x = normal, y = water, conf.level = 0.95, var.equal = TRUE)

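# From the result, the t statistic is close to zero (t = -0.168) and the
# p-value (0.866) is far above 0.05, so we cannot reject the null hypothesis
# (the true difference in mean attack is 0) at the 5% significance level;
# accordingly, the 95% confidence interval (-8.67 to 7.30) contains 0.
# Water and Normal Pokemon could therefore share the same mu, but this does
# not prove it, and the equal variance was assumed (var.equal = TRUE), not
# tested. The best way to put it is: the data give no evidence of a
# difference in mean attack between the two types.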