library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
# Read the Pokemon stats CSV (path is relative to the working directory)
pokemon_data <- read_csv(file = "Downloads/PokemonStats.csv")
## Rows: 1194 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, Type1, Type2
## dbl (10): ID, Total, HP, Attack, Defense, SpAtk, SpDef, Speed, Height, Weight
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Min / quartiles / mean / max (and NA count, where present) for the numeric
# stat columns; the ID column is deliberately left out
stat_columns <- c(
  "Total", "HP", "Attack", "Defense", "SpAtk",
  "SpDef", "Speed", "Height", "Weight"
)
summary(pokemon_data[stat_columns])
## Total HP Attack Defense
## Min. : 175.0 Min. : 1.00 Min. : 5.00 Min. : 5.00
## 1st Qu.: 330.0 1st Qu.: 52.00 1st Qu.: 56.00 1st Qu.: 51.25
## Median : 460.5 Median : 70.00 Median : 80.00 Median : 70.00
## Mean : 441.2 Mean : 70.88 Mean : 80.95 Mean : 74.59
## 3rd Qu.: 520.0 3rd Qu.: 85.00 3rd Qu.:100.00 3rd Qu.: 90.00
## Max. :1125.0 Max. :255.00 Max. :190.00 Max. :250.00
##
## SpAtk SpDef Speed Height
## Min. : 10.00 Min. : 20.00 Min. : 5.00 Min. : 0.100
## 1st Qu.: 50.00 1st Qu.: 50.00 1st Qu.: 45.00 1st Qu.: 0.600
## Median : 65.00 Median : 70.00 Median : 67.50 Median : 1.000
## Mean : 72.88 Mean : 72.12 Mean : 69.78 Mean : 1.377
## 3rd Qu.: 95.00 3rd Qu.: 90.00 3rd Qu.: 90.75 3rd Qu.: 1.600
## Max. :194.00 Max. :250.00 Max. :200.00 Max. :100.000
##
## Weight
## Min. : 0.10
## 1st Qu.: 9.00
## Median : 30.00
## Mean : 73.28
## 3rd Qu.: 75.50
## Max. :999.90
## NA's :1
# Categorical summaries: counts per primary type (Type1), then per secondary type (Type2)
table(pokemon_data[["Type1"]])
##
## Bug Dark Dragon Electric Fairy Fighting Fire Flying
## 91 56 49 73 31 50 75 10
## Ghost Grass Ground Ice Normal Poison Psychic Rock
## 47 105 46 43 131 45 82 67
## Steel Water
## 43 150
# Counts per secondary type. table() omits NA by default, so rows with a
# missing Type2 do not appear in these counts.
table(pokemon_data[["Type2"]])
##
## Bug Dark Dragon Electric Fairy Fighting Fire Flying
## 9 33 39 13 42 41 20 122
## Ghost Grass Ground Ice Normal Poison Psychic Rock
## 37 33 43 22 18 47 49 19
## Steel Water
## 40 25
## NOVEL QUESTIONS
# 1. What is the average Total stat for each Type1 category?
## Comparing the mean Total across primary types gives a rough picture of the
## relative power and balance of the different Pokémon types
summarise(
  group_by(pokemon_data, Type1),
  avg_total = mean(Total)
)
## # A tibble: 18 × 2
## Type1 avg_total
## <chr> <dbl>
## 1 Bug 381.
## 2 Dark 450.
## 3 Dragon 528.
## 4 Electric 447.
## 5 Fairy 449.
## 6 Fighting 458.
## 7 Fire 455.
## 8 Flying 450.
## 9 Ghost 438.
## 10 Grass 420.
## 11 Ground 440.
## 12 Ice 439.
## 13 Normal 409.
## 14 Poison 428.
## 15 Psychic 486.
## 16 Rock 447.
## 17 Steel 485
## 18 Water 437.
# 2. Which Pokemon types have the highest and lowest average HP (Hit Points)?
## Identifies the primary types whose members have the largest and smallest mean HP
### Mean HP per Type1, as a named array keyed by type
average_hp_by_type <- with(pokemon_data, tapply(HP, Type1, mean))
### which.max() / which.min() keep the element name, which is the type label
highest_hp_type <- names(which.max(average_hp_by_type))
lowest_hp_type <- names(which.min(average_hp_by_type))
highest_hp_type
## [1] "Dragon"
lowest_hp_type
## [1] "Bug"
# 3. Is there a correlation between a Pokemon's attack and defense stats?
## To understand the relationship between the attack and defense stats of Pokémon in the dataset.
### Pearson correlation (the cor() default) between the Attack and Defense columns
correlation_attack_defense <- with(pokemon_data, cor(Attack, Defense))
### Print the coefficient to the console
correlation_attack_defense
## [1] 0.4694961
# The correlation coefficient of approximately 0.4694961 indicates a moderately positive correlation between the "Attack" and "Defense" attributes in the dataset.
# A positive correlation means that as the "Attack" stat tends to increase, the "Defense" stat also tends to increase, and vice versa. In other words, Pokémon with higher attack stats tend to have higher defense stats, and Pokémon with lower attack stats tend to have lower defense stats.
# The value of approximately 0.4694961 indicates a moderate strength of correlation. It's not a perfect correlation (1.0) but still suggests that there is some relationship between these two attributes.
# It's important to note that correlation does not imply causation. Just because there's a correlation between two variables doesn't mean that one causes the other. In this case, it suggests that there's a tendency for Pokémon with higher attack stats to also have higher defense stats, but other factors may be at play as well
# 4. Which Type2 category has the highest average Attack stat?
## Rows with a missing Type2 (no secondary type recorded) would otherwise form
## their own NA group and could be reported as the winning "category", so they
## are dropped before grouping.
pokemon_data %>%
  filter(!is.na(Type2)) %>%
  group_by(Type2) %>%
  summarise(avg_attack = mean(Attack)) %>%
  arrange(desc(avg_attack)) %>% ### highest average Attack first
  head(1)
## # A tibble: 1 × 2
## Type2 avg_attack
## <chr> <dbl>
## 1 Fighting 113
# 5. Find the type that has the highest difference between average attack and average defense:
## For every primary type, take |mean Attack - mean Defense| and report the type where that gap is largest.
### Mean Attack and mean Defense per Type1
average_attack_by_type <- with(pokemon_data, tapply(Attack, Type1, mean))
average_defense_by_type <- with(pokemon_data, tapply(Defense, Type1, mean))
### Absolute gap, so a defense-heavy type can rank as high as an attack-heavy one
difference_attack_defense <- abs(average_attack_by_type - average_defense_by_type)
### Label of the type with the largest gap
highest_difference_type <- names(which.max(difference_attack_defense))
highest_difference_type
## [1] "Fighting"
# 6. Most common Pokemon type and its frequency
### Count how many Pokemon fall under each primary type
type_frequency <- table(pokemon_data[["Type1"]])
### Label belonging to the largest count
most_common_type <- names(type_frequency)[which.max(type_frequency)]
### Two-column frame (type label, count), sorted from most to least frequent
result_table <- data.frame(
  Type = names(type_frequency),
  Frequency = as.vector(type_frequency)
)
frequency_order <- order(result_table$Frequency, decreasing = TRUE)
result_table <- result_table[frequency_order, ]
# After sorting, the first row holds the most common type and its frequency
most_common_type_and_frequency <- result_table[1L, , drop = FALSE]
most_common_type_and_frequency
## Type Frequency
## 18 Water 150
# 7. Calculate the average Attack and Defense for each Type1 category
### Built step by step: group by primary type, then aggregate, then sort
type1_avg_stats <- group_by(pokemon_data, Type1)
type1_avg_stats <- summarise(
  type1_avg_stats,
  avg_attack = mean(Attack),
  avg_defense = mean(Defense),
  count = n()
)
### Types with the highest average Attack come first
type1_avg_stats <- arrange(type1_avg_stats, desc(avg_attack))
### Show Type1, average Attack, average Defense, and the number of Pokemon
type1_avg_stats
## # A tibble: 18 × 4
## Type1 avg_attack avg_defense count
## <chr> <dbl> <dbl> <int>
## 1 Fighting 105. 76.4 50
## 2 Dragon 104. 80.8 49
## 3 Ground 94.6 86.1 46
## 4 Steel 92.7 114. 43
## 5 Rock 90.2 97.6 67
## 6 Dark 85.5 71.0 56
## 7 Fire 84.1 68.6 75
## 8 Flying 81.9 67.4 10
## 9 Ice 80.0 75.3 43
## 10 Water 76.9 72.8 150
## 11 Grass 76.6 71.1 105
## 12 Normal 76.6 61.2 131
## 13 Psychic 75.6 71.5 82
## 14 Poison 75.4 74.5 45
## 15 Electric 73.2 65.5 73
## 16 Ghost 72 78.0 47
## 17 Fairy 71.1 73.6 31
## 18 Bug 70.9 71.2 91
# VISUALIZATIONS
# Histogram of HP distribution (each bin is 10 HP wide)
ggplot(pokemon_data, aes(HP)) +
  geom_histogram(binwidth = 10, colour = "black", fill = "blue") +
  labs(x = "HP", y = "Frequency", title = "Distribution of HP")

#Insight: The histogram reveals a relatively even distribution of HP values among Pokémon, characterized by a prominent peak at 70 HP. Beyond this threshold, as HP increases, the frequency gradually diminishes.
#Significance: Understanding the distribution of HP is essential because it helps us identify the common range of HP values among Pokemon.
#Further Questions: Are there any outlier Pokemon with exceptionally high or low HP values? Does HP correlate with other attributes like Attack or Defense?
## Correlation matrix for base stats (pairwise correlations of the six stat columns)
cor_matrix <- cor(pokemon_data[, c("HP", "Attack", "Defense", "SpAtk", "SpDef", "Speed")])
## Heatmap of the correlation matrix
## melt() reshapes the matrix to long form (Var1, Var2, value), one row per cell.
## The fill uses a diverging scale centred on 0 with fixed limits of [-1, 1], so
## colour encodes the sign of the correlation: blue = negative, white = none,
## red = positive. A plain two-colour gradient would only span the observed
## minimum..maximum, making "blue" mean "weakest correlation in this matrix"
## rather than "negative".
ggplot(data = melt(cor_matrix), aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0, limits = c(-1, 1)
  ) +
  labs(
    title = "Correlation Heatmap for Base Stats",
    x = "",
    y = ""
  )

#Insight: The heatmap visually represents correlations between various base stats. Positive correlations are shown in warmer colors (red), while negative correlations are in cooler colors (blue). Here, Speed and Defense are negatively related. In other words, when one of these stats tends to increase, the other tends to decrease: higher Speed is associated with lower Defense, and vice versa.
#Significance: This helps us identify which base stats tend to be positively or negatively related, aiding in understanding how a Pokemon's stats are interrelated.
#Further Questions: Are there specific combinations of base stats that indicate a particular role or strategy for a Pokemon?
# Bar chart of Type1 frequency (geom_bar() counts the rows in each type)
ggplot(pokemon_data, aes(Type1, fill = Type1)) +
  geom_bar() +
  labs(x = "Type", y = "Frequency", title = "Frequency of Pokemon Types") +
  # Tilt the type labels so neighbouring names do not overlap
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

#Insight: The bar chart illustrates the frequency of various Pokémon types, highlighting disparities in their occurrence. Notably, the Water type stands out as the most abundant, occurring 150 times, while the Flying type ranks as the least prevalent.
#Significance: This provides insights into the distribution of Pokemon types in the dataset and can be useful for balancing team compositions in the game. Also, by analyzing the frequency of types, you can identify which types are more prevalent, potentially indicating which types players favor for their effectiveness in battles. This data can help trainers understand common strengths and weaknesses in battles.
# Distribution of Speed by Type1:
# Mean Speed for every primary type
type1_avg_speed <- summarise(
  group_by(pokemon_data, Type1),
  avg_speed = mean(Speed)
)
# One bar per type; geom_col() takes the bar height directly from avg_speed
ggplot(type1_avg_speed, aes(Type1, avg_speed, fill = Type1)) +
  geom_col() +
  labs(x = "Type1", y = "Average Speed") +
  theme_minimal() +
  # Applied after theme_minimal() so the rotated labels are not reset
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Insights: It allows you to easily identify which Pokémon types tend to have higher or lower average Speed. Electric types rank highest in this regard, while Steel types occupy the lowest position.
# Significance: It provides a basic but effective way to convey key information about Pokémon characteristics to a broad audience
# Further Questions: Are there any specific Pokémon types with unusually high or low Speed values?
# Scatter plot of Attack vs. Defense, one point per Pokemon, coloured by primary type
ggplot(pokemon_data, aes(Attack, Defense, colour = Type1)) +
  geom_point()

#Insights: This plot shows how Attack and Defense stats are related for Pokémon. Clusters or patterns may emerge, indicating types that excel in offense or defense.
#Significance: It helps in evaluating the trade-off between offensive and defensive capabilities within Pokémon types.
#Further Questions: Are there Type1 categories that consistently have higher Attack and Defense stats? Outliers?