今回は、仮説検定を用いて、ポケモンの能力値に属性(タイプ)別で有意な違いがあるかどうかを検証していこうと思います!
# Importing data ----------------------------------------------------------
# Raw-gist URL of the Pokemon CSV; read_csv() downloads and parses it.
# NOTE: read_csv() and datatable() are used unqualified, so the packages
# listed in the "Load package" section must already be attached.
url <- "https://gist.github.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv"
df_pokemon <- read_csv(url)
# Tidy data ---------------------------------------------------------------
# clean names (the code below refers to the cleaned names `type_1` and `attack`)
df_pokemon <- janitor::clean_names(df_pokemon)
# show data
# Display every column except the first, five rows per page. A negative index
# is used instead of `2:length(df_pokemon)`, which would silently become
# c(2, 1) if the table ever had a single column.
datatable(
  df_pokemon[, -1],
  rownames = FALSE,
  options = list(pageLength = 5)
)
# Visualization & Modeling -----------------------------------------------------------
# Is there difference in attack among type_1?
# One violin per type_1 value showing the attack distribution, with the
# individual observations jittered horizontally on top.
ggplot(data = df_pokemon, mapping = aes(x = type_1, y = attack)) +
  geom_violin() +
  geom_jitter(width = 0.1)
# which sub-data contains more info?
# Number of rows per type_1, largest group first.
df_pokemon %>%
  count(type_1, sort = TRUE)
## # A tibble: 18 x 2
## type_1 n
## <chr> <int>
## 1 Water 112
## 2 Normal 98
## 3 Grass 70
## 4 Bug 69
## 5 Psychic 57
## 6 Fire 52
## 7 Electric 44
## 8 Rock 44
## 9 Dragon 32
## 10 Ghost 32
## 11 Ground 32
## 12 Dark 31
## 13 Poison 28
## 14 Fighting 27
## 15 Steel 27
## 16 Ice 24
## 17 Fairy 17
## 18 Flying 4
# Let's focus on water and normal.
# pull() extracts `attack` as a plain vector, which is the input type that
# t.test() documents for `x` and `y`; select() would hand it a one-column
# tibble instead.
water <- df_pokemon %>%
  filter(type_1 == "Water") %>%
  pull(attack)
normal <- df_pokemon %>%
  filter(type_1 == "Normal") %>%
  pull(attack)
# Is there difference in attack between water and normal?
# Two-sample t-test that assumes both groups have the same variance
# (var.equal = TRUE).
t.test(x = normal, y = water, conf.level = 0.95, var.equal = TRUE)
##
## Two Sample t-test
##
## data: normal and water
## t = -0.16845, df = 208, p-value = 0.8664
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.668848 7.304052
## sample estimates:
## mean of x mean of y
## 73.46939 74.15179
# From the result, the t statistic is not exactly zero (t = -0.1684), but the
# p-value (0.8664) is far above 0.05 and the 95% confidence interval for the
# difference in means contains 0, so we cannot reject the null hypothesis
# (true difference in means = 0) at the 5% significance level.
# Water and normal pokemon could therefore share the same mean attack (equal
# variance was assumed, not tested). Strictly speaking, we have only failed to
# find evidence of a difference; we have not shown that there is none.
# Load package ------------------------------------------------------------
# Packages attached for the script. janitor and skimr are called with `::`
# further down, so they need to be installed but are not attached here.
Packages <- c("tidyverse", "stringr", "DT", "plotly")
# library() is applied to each name; invisible() keeps the list returned by
# lapply() from being printed at top level.
invisible(lapply(Packages, library, character.only = TRUE))
# Importing & tidying data -------------------------------------------------
# Raw-gist URL of the Pokemon CSV.
url <- "https://gist.github.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv"
# Read the CSV and clean its column names in a single pipeline.
df_pokemon <- read_csv(url) %>%
  janitor::clean_names()
# check out the data: skim summary computed per type_1 group
skimr::skim(df_pokemon %>% group_by(type_1))
# Visualization & Modeling -----------------------------------------------------------
# Is there difference in attack among type_1?
# One violin per type_1 value showing the attack distribution, with the
# individual observations jittered horizontally on top.
ggplot(data = df_pokemon, mapping = aes(x = type_1, y = attack)) +
  geom_violin() +
  geom_jitter(width = 0.1)
# which sub-data contains more info?
# Number of rows per type_1, largest group first.
df_pokemon %>%
  count(type_1, sort = TRUE)
# Let's focus on water and normal.
# pull() extracts `attack` as a plain vector, which is the input type that
# t.test() documents for `x` and `y`; select() would hand it a one-column
# tibble instead.
water <- df_pokemon %>%
  filter(type_1 == "Water") %>%
  pull(attack)
normal <- df_pokemon %>%
  filter(type_1 == "Normal") %>%
  pull(attack)
# Is there difference in attack between water and normal?
# Two-sample t-test that assumes both groups have the same variance
# (var.equal = TRUE).
t.test(x = normal, y = water, conf.level = 0.95, var.equal = TRUE)
# From the result, the t statistic is not exactly zero (t = -0.1684), but the
# p-value (0.8664) is far above 0.05 and the 95% confidence interval for the
# difference in means contains 0, so we cannot reject the null hypothesis
# (true difference in means = 0) at the 5% significance level.
# Water and normal pokemon could therefore share the same mean attack (equal
# variance was assumed, not tested). Strictly speaking, we have only failed to
# find evidence of a difference; we have not shown that there is none.