# Install 'pwr' only when it is not already available, so that re-running the
# script does not re-download the package (or need network access) every time.
if (!requireNamespace("pwr", quietly = TRUE)) {
  install.packages("pwr", repos = "https://cloud.r-project.org/")
}
##
## The downloaded binary packages are in
## /var/folders/p0/800mfk513715kyz1k35_86r80000gn/T//RtmpoPZicX/downloaded_packages
library(pwr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 2. Load the Pokémon stats table from the CSV file into a data frame
pokemon_data <- read.csv(file = "PokemonStats.csv")
# Devising Hypotheses:
#Hypothesis 1: Pokémon with a primary type of "Grass" have, on average, the same HP as Pokémon with a primary type of "Fire".
#Null Hypothesis - The mean HP of Pokémon with Type1 "Grass" is equal to the mean HP of Pokémon with Type1 "Fire".
#Alternative Hypothesis - The mean HP of Pokémon with Type1 "Grass" is not equal to the mean HP of Pokémon with Type1 "Fire".
#Hypothesis 2: The average speed of Pokémon weighing less than 10kg is the same as the average speed of Pokémon weighing more than 50kg.
#Null Hypothesis - The mean speed of Pokémon weighing <10kg is equal to the mean speed of Pokémon weighing >50kg.
#Alternative Hypothesis -The mean speed of Pokémon weighing <10kg is not equal to the mean speed of Pokémon weighing >50kg.
# Choosing alpha, power, and minimum effect size
#Alpha: 0.05. This is a commonly chosen value, indicating a 5% chance of a Type I error (rejecting the null hypothesis when it's true).
# Power: 0.8. This suggests an 80% chance of detecting an effect if one exists.
# Minimum Effect Size: A difference of 10 HP would be practically significant. This means that we would want to detect a difference of at least 10 HP between the two groups.
# Calculating the required sample size for the Neyman-Pearson hypothesis test using these parameters.
# Test design parameters (also reused for Hypothesis 2 further down).
alpha <- 0.05  # significance level (Type I error rate)
power <- 0.8   # target probability of detecting the minimum effect
# Smallest raw difference in mean HP that we want to be able to detect.
min_effect_hp <- 10
# Standardise the minimum effect using the standard deviation of the entire
# 'HP' column. na.rm = TRUE stops a single missing HP value from turning the
# effect size into NA, which would propagate into the power analysis.
effect_size_all_hp <- min_effect_hp / sd(pokemon_data$HP, na.rm = TRUE)
# Required sample size per group for a two-sided, two-sample t-test.
power_analysis_all_hp <- pwr.t.test(
  d = effect_size_all_hp,
  sig.level = alpha,
  power = power,
  alternative = "two.sided",
  type = "two.sample"
)
power_analysis_all_hp$n
## [1] 114.2357
# The calculated required sample size for the Neyman-Pearson hypothesis test is approximately 114.24 for each group (Grass and Fire types). This means we would need at least 115 Pokémon of each type (Grass and Fire) to achieve the desired power of 0.8, given our specified effect size and alpha level.
# Determining if we have enough data:
# Count the Pokémon available in each group. Summing the logical comparison
# with na.rm = TRUE counts only genuine matches; indexing the data frame with a
# condition that contains NA (as nrow(df[cond, ]) does) also returns an all-NA
# row for every missing Type1 value and would overstate the group size.
grass_count <- sum(pokemon_data$Type1 == "Grass", na.rm = TRUE)
fire_count <- sum(pokemon_data$Type1 == "Fire", na.rm = TRUE)
grass_count
## [1] 105
fire_count
## [1] 75
#Given our calculated required sample size of 115 for each group, we don't have enough Pokémon in either the "Grass" or "Fire" categories to achieve the desired power of 0.8 with our specified effect size and alpha level.
#Since we don't meet the required sample size for a Neyman-Pearson hypothesis test, we won't be able to perform this test with the desired power for Hypothesis 1.
# We'll perform a Fisher's style t-test to determine if there's a significant difference in the means of the two groups
# Pull out the HP values for each of the two primary types.
is_grass <- pokemon_data$Type1 == "Grass"
is_fire <- pokemon_data$Type1 == "Fire"
grass_hp <- pokemon_data$HP[is_grass]
fire_hp <- pokemon_data$HP[is_fire]
# Two-sample t-test of Grass HP against Fire HP without assuming equal
# variances; the statistic is based on mean(grass_hp) - mean(fire_hp).
test_result <- t.test(x = grass_hp, y = fire_hp, var.equal = FALSE)
test_result[["statistic"]]
## t
## -1.292622
test_result[["p.value"]]
## [1] 0.1979533
# t-statistic: −1.29 ; p-value: 0.198
# Given our chosen alpha level of 0.05, the p-value of 0.198 is greater than 0.05, so we fail to reject the null hypothesis. This means that, based on our sample, we do not have enough evidence to suggest a significant difference in the mean HP of Pokémon with Type1 "Grass" compared to those with Type1 "Fire".
# Insights:
# The result suggests that there might not be a significant difference in HP between Pokémon with primary types of "Grass" and "Fire". However, it's worth noting that our sample size was not large enough to achieve the desired power for a Neyman-Pearson test. It would be beneficial to have a larger dataset to draw more robust conclusions.
# Hypothesis 2
# We'll use the same parameters as before for alpha, power, and minimum effect.
# calculating the required sample size for the Neyman-Pearson hypothesis.
# Smallest raw difference in mean Speed that we want to be able to detect.
min_effect_speed <- 10
# Standardise the minimum effect using the standard deviation of the entire
# 'Speed' column. na.rm = TRUE stops a single missing Speed value from turning
# the effect size into NA, which would propagate into the power analysis.
effect_size_speed <- min_effect_speed / sd(pokemon_data$Speed, na.rm = TRUE)
# Required sample size per group for a two-sided, two-sample t-test, using the
# alpha and power values chosen earlier in the script.
power_analysis_speed <- pwr.t.test(
  d = effect_size_speed,
  sig.level = alpha,
  power = power,
  alternative = "two.sided",
  type = "two.sample"
)
power_analysis_speed$n
## [1] 144.144
#The calculated required sample size for the Neyman-Pearson hypothesis test related to speed is approximately 144.14 for each weight group (Pokémon weighing <10kg and Pokémon weighing >50kg). This means we would need at least 145 Pokémon in each weight category to achieve the desired power of 0.8, given our specified effect size and alpha level
# Determining if we have enough data - We need to check how many Pokémon we have that weigh less than 10kg and more than 50kg to see if we meet this sample size requirement.
# Count the Pokémon in each weight group. Summing the logical comparison with
# na.rm = TRUE counts only genuine matches; indexing the data frame with a
# condition that contains NA (as nrow(df[cond, ]) does) also returns an all-NA
# row for every missing Weight value and would overstate the group size.
light_pokemon_count <- sum(pokemon_data$Weight < 10, na.rm = TRUE)
heavy_pokemon_count <- sum(pokemon_data$Weight > 50, na.rm = TRUE)
light_pokemon_count
## [1] 321
heavy_pokemon_count
## [1] 425
# Given our calculated required sample size of 145 for each weight group, we have enough Pokémon in both weight categories to achieve the desired power of 0.8 with our specified effect size and alpha level
# Filtering data for light (<10kg) and heavy (>50kg) Pokémon
# Pull out the Speed values for the light (<10kg) and heavy (>50kg) groups.
is_light <- pokemon_data$Weight < 10
is_heavy <- pokemon_data$Weight > 50
light_pokemon_speed <- pokemon_data$Speed[is_light]
heavy_pokemon_speed <- pokemon_data$Speed[is_heavy]
# Two-sample t-test of light against heavy Speed without assuming equal
# variances; the statistic is based on mean(light) - mean(heavy).
test_result_speed <- t.test(
  x = light_pokemon_speed,
  y = heavy_pokemon_speed,
  var.equal = FALSE
)
test_result_speed[["statistic"]]
## t
## -7.290771
test_result_speed[["p.value"]]
## [1] 8.036404e-13
# Given our chosen alpha level of 0.05, the p-value is much less than 0.05, so we reject the null hypothesis. This means that, based on our sample, there is a statistically significant difference in the mean speed of Pokémon weighing less than 10kg compared to those weighing more than 50kg.
# So, in simple terms, we have strong evidence to believe that lighter Pokémon (less than 10kg) and heavier Pokémon (more than 50kg) have different average speeds.
# INSIGHTS:
#The result suggests that there is a significant difference in speed between lightweight Pokémon (less than 10kg) and heavyweight Pokémon (more than 50kg). Because the t-statistic was computed as light minus heavy and is negative (-7.29), lightweight Pokémon are on average slower than their heavyweight counterparts, a trend that can be explored further, especially in the context of Pokémon battles and strategies.
# VISUALIZATIONS
# Creating a visualization for the results from Hypothesis 1.
# Boxplot of HP for the two primary types compared in Hypothesis 1.
# Keep only the Grass and Fire rows, then draw one filled box per type.
is_grass_or_fire <- pokemon_data$Type1 %in% c("Grass", "Fire")
type_fill_colours <- c("Grass" = "#98A886", "Fire" = "#FFA69E")
ggplot(pokemon_data[is_grass_or_fire, ], aes(x = Type1, y = HP)) +
  geom_boxplot(aes(fill = Type1)) +
  scale_fill_manual(values = type_fill_colours) +
  labs(
    title = "Distribution of HP for Pokémon with Type1 as Grass and Fire",
    x = "Pokémon Type",
    y = "HP"
  ) +
  theme_minimal() +
  # The x axis already names each type, so the fill legend is redundant.
  theme(legend.position = "none")

#The boxplot visualizes the distribution of HP for Pokémon with Type1 as "Grass" and "Fire". The median HP for both types seems to be close, but there's a slightly wider spread for "Grass" Pokémon.
# Creating a visualization for the results from Hypothesis 2
# Creating a new column to classify Pokémon based on their weight
# Label every Pokémon with its weight group: "<10kg", ">50kg", or "Others" for
# anything in between. Rows with a missing Weight keep a missing category.
pokemon_data <- pokemon_data %>%
  mutate(
    Weight_Category = case_when(
      is.na(Weight) ~ NA_character_,
      Weight < 10 ~ "<10kg",
      Weight > 50 ~ ">50kg",
      TRUE ~ "Others"
    )
  )
# Boxplot of Speed for the two weight groups compared in Hypothesis 2.
in_compared_groups <- pokemon_data$Weight_Category %in% c("<10kg", ">50kg")
weight_fill_colours <- c("<10kg" = "#A9DEF9", ">50kg" = "#C4A69D")
ggplot(
  pokemon_data[in_compared_groups, ],
  aes(x = Weight_Category, y = Speed)
) +
  geom_boxplot(aes(fill = Weight_Category)) +
  scale_fill_manual(values = weight_fill_colours) +
  labs(
    title = "Distribution of Speed for Pokémon Weighing <10kg and >50kg",
    x = "Pokémon Weight Category",
    y = "Speed"
  ) +
  theme_minimal() +
  # The x axis already names each group, so the fill legend is redundant.
  theme(legend.position = "none")

#The boxplot visualizes the distribution of speed for Pokémon based on their weight categories. It should be read together with the test result: the t-statistic from t.test(light, heavy) is negative (-7.29), which means the mean speed of Pokémon weighing less than 10kg is lower than the mean speed of Pokémon weighing more than 50kg. In this dataset, heavier Pokémon are therefore faster on average than their lighter counterparts, not slower.
#Summary of Insights and Further Investigation:
#Hypothesis 1:
#We did not find a significant difference in the HP between Pokémon with primary types of "Grass" and "Fire".
#Further questions: Are there specific evolutionary stages where the difference becomes more pronounced? How do other attributes, such as Attack or Defense, compare between these two types?
#Hypothesis 2:
#There is a significant difference in speed between lightweight Pokémon (less than 10kg) and heavyweight Pokémon (more than 50kg), with heavier Pokémon tending to be faster (the t-statistic for light minus heavy is negative).
#Further questions: Are there specific lightweight Pokémon that are exceptions to this trend?