##
## The downloaded binary packages are in
## /var/folders/p0/800mfk513715kyz1k35_86r80000gn/T//RtmpCUyDA1/downloaded_packages
##
## The downloaded binary packages are in
## /var/folders/p0/800mfk513715kyz1k35_86r80000gn/T//RtmpCUyDA1/downloaded_packages
##
## The downloaded binary packages are in
## /var/folders/p0/800mfk513715kyz1k35_86r80000gn/T//RtmpCUyDA1/downloaded_packages
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
# 2. Read the dataset
# The path is relative, so the CSV is looked up in the current working directory.
pokemon_data <- read.csv(file = "PokemonStats.csv")
# I will choose 'Total' column as the response variable as it represents the overall power of a Pokémon.
# Categorical Variable Selection for ANOVA
# We'll start with the 'Type1' column as our explanatory variable and set up our null and alternative hypotheses for the ANOVA test
# Null Hypothesis: The mean 'Total' is the same across all 'Type1' categories.
# Alternative Hypothesis: At least one 'Type1' category has a different mean 'Total'.
# We'll check the number of categories in 'Type1'. If there are more than 10, we'll consider consolidating them
# Collect the distinct 'Type1' labels and count them, to decide whether the
# variable has too many levels to use as-is
unique_types <- unique(pokemon_data[["Type1"]])
num_unique_types <- length(unique_types)
# Show the count first, then the labels themselves
num_unique_types
## [1] 18
unique_types
## [1] "Grass" "Fire" "Water" "Bug" "Normal" "Dark"
## [7] "Poison" "Electric" "Ground" "Ice" "Fairy" "Steel"
## [13] "Fighting" "Psychic" "Rock" "Ghost" "Dragon" "Flying"
# There are 18 unique categories in the 'Type1' column, which is more than 10
# Tabulate how many Pokémon fall under each 'Type1' label; the counts guide
# which labels get merged in the consolidation step
type1_frequencies <- table(pokemon_data[["Type1"]])
type1_frequencies
##
## Bug Dark Dragon Electric Fairy Fighting Fire Flying
## 91 56 49 73 31 50 75 10
## Ghost Grass Ground Ice Normal Poison Psychic Rock
## 47 105 46 43 131 45 82 67
## Steel Water
## 43 150
# Types with fewer than `min_type_count` Pokémon are pooled into a single
# "Other" category. This reduces the number of categories while keeping every
# row in the analysis.
min_type_count <- 50
rare_types <- names(type1_frequencies)[as.vector(type1_frequencies) < min_type_count]
type1_labels <- as.character(pokemon_data$Type1)
# The test is a plain logical vector built with %in%. ifelse() returns a value
# with the attributes of its test, so building the test by indexing
# `type1_frequencies` row by row would carry the table's names/dims over to the
# new column instead of producing a plain character vector.
pokemon_data$Consolidated_Type1 <- ifelse(
  type1_labels %in% rare_types,
  "Other",
  type1_labels
)
# Checking the new frequency distribution of 'Consolidated_Type1'
consolidated_type1_frequencies <- table(pokemon_data$Consolidated_Type1)
consolidated_type1_frequencies
##
## Bug Dark Electric Fighting Fire Grass Normal Other
## 91 56 73 50 75 105 131 314
## Psychic Rock Water
## 82 67 150
# One-way ANOVA: fit 'Total' against the consolidated type with aov(), then
# request the Type II table via Anova() and display it
anova_model <- aov(formula = Total ~ Consolidated_Type1, data = pokemon_data)
anova_table <- Anova(anova_model, type = 2)
anova_table
## Anova Table (Type II tests)
##
## Response: Total
## Sum Sq Df F value Pr(>F)
## Consolidated_Type1 812313 10 5.7685 1.646e-08 ***
## Residuals 16658825 1183
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Given the very low p-value (much less than 0.05), we reject the null hypothesis. This suggests that at least one 'Consolidated_Type1' category has a different mean 'Total' compared to the others.
# Conclusion:
#There is enough evidence to conclude that the mean 'Total' power varies across different Pokémon types in the 'Consolidated_Type1' column. For those interested in the data, this means that a Pokémon's type can influence its overall power, and not all types have Pokémon with the same average power.
# Continuous Variable Selection for Regression
# We'll consider 'HP' as a continuous explanatory variable for the response 'Total'.
# Scatter plot of 'Total' against 'HP', used to judge by eye whether a straight
# line is a reasonable description of the relationship
ggplot(data = pokemon_data, mapping = aes(x = HP, y = Total)) +
  geom_point(alpha = 0.5) +
  labs(
    title = "Relationship between Total Power and HP of Pokémon",
    x = "HP",
    y = "Total Power"
  ) +
  theme_minimal() +
  theme(panel.grid.major = element_line(colour = "grey", linetype = "dotted"))

# The scatter plot suggests a roughly linear relationship between 'Total' and 'HP'. As the HP of a Pokémon increases, its overall power (Total) tends to increase as well.
# Simple linear regression with 'Total' as the response and 'HP' as the only
# predictor
lin_reg <- lm(formula = Total ~ HP, data = pokemon_data)
# Estimated intercept and slope.
# NOTE(review): `coef` and `residuals` reuse the names of stats functions;
# calls such as coef(lin_reg) still resolve to the functions, but the reuse is
# easy to misread.
coef <- coef(lin_reg)
# Predictions for every row of the data and the matching raw residuals
y_pred <- predict(lin_reg, newdata = pokemon_data)
residuals <- pokemon_data[["Total"]] - y_pred
# Fit summaries: share of variance explained and root mean squared error
r2 <- summary(lin_reg)[["r.squared"]]
rmse <- sqrt(mean(residuals^2))
list(
  Coefficients = coef,
  R_squared = r2,
  RMSE = rmse
)
## $Coefficients
## (Intercept) HP
## 231.948783 2.952137
##
## $R_squared
## [1] 0.4293987
##
## $RMSE
## [1] 91.37446
# The linear regression model results
# 1. Coefficient for 'HP' (slope): Approximately 2.95. This means that for every unit increase in 'HP', the 'Total' power of a Pokémon increases by roughly 2.95 units, on average.
# 2. Intercept: Approximately 231.95. This is the estimated 'Total' power when 'HP' is zero.
# 3. R-squared: 0.4294. This indicates that about 42.94% of the variability in 'Total' can be explained by 'HP'.
# 4. Root Mean Squared Error (RMSE): 91.37. This is a measure of the model's prediction error.
# We can conclude that 'HP' is a significant predictor of a Pokémon's total power. However, since the R-squared value is less than 0.5, other factors also play a crucial role in determining a Pokémon's overall power
# Interpretation:
# The coefficient of 'HP' indicates its positive relationship with the 'Total' power of a Pokémon. If one were to optimize a Pokémon's power, ensuring it has a high 'HP' would be beneficial. However, one should also consider other attributes, as 'HP' alone explains less than half of the variability in a Pokémon's total power.
# Creating diagnostic plots to identify any potential issues with the model. We'll create a residual plot and a QQ-plot to check for homoscedasticity and normality of residuals, respectively.
# Residuals and fitted values are both extracted from the fitted model so the
# two vectors always describe the same rows. Pairing residuals(lin_reg) with
# predictions made for every row of the data would make data.frame() fail on a
# length mismatch if lm() had dropped any row because of missing values.
residuals <- residuals(lin_reg)
fitted_values <- fitted(lin_reg)
# Residuals against fitted values, with a dashed reference line at zero; used
# to judge whether the residual spread is roughly constant
residual_plot <- ggplot(
  data.frame(Predicted = fitted_values, Residuals = residuals),
  aes(x = Predicted, y = Residuals)
) +
  geom_point(alpha = 0.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  ggtitle('Residual Plot') +
  xlab('Predicted Values') +
  ylab('Residuals') +
  theme_minimal() +
  theme(panel.grid.major = element_line(colour = "grey", linetype = "dotted"))
# QQ-plot of the residuals. qqPlot() draws the figure when it is called; the
# value stored in `qq_plot` is what the function returns (the points it
# identified as most extreme), not a plot object.
qq_plot <- qqPlot(residuals, main = "QQ-Plot of Residuals", col = "green")

print(residual_plot)

# Prints the identifiers of the points flagged by qqPlot(), not a figure
print(qq_plot)
## [1] 152 306
# 1. Residual Plot:
#The residuals appear to be randomly scattered around the horizontal line at zero, suggesting that the assumption of homoscedasticity (constant variance of residuals) is reasonably met.
#There are no clear patterns or trends, which is a good sign.
# 2. QQ-Plot:
#The QQ-plot shows how well the residuals match a normal distribution.
#The points largely follow the straight line, suggesting that the residuals are approximately normally distributed. There are some deviations at the tails, but they aren't too concerning.
# Based on these plots, the linear regression model seems to meet the assumptions reasonably well
# Including Additional Variable(s) in the Regression Model
# We'll include the 'Consolidated_Type1' categorical variable in the regression model. This will allow us to examine if certain Pokémon types have a unique effect on the 'Total' power beyond the effect of 'HP'.
# Extended model: 'HP' plus 'Consolidated_Type1' entered as an additive
# categorical predictor; the summary reports each type relative to the
# reference level
extended_model <- lm(
  formula = Total ~ HP + Consolidated_Type1,
  data = pokemon_data
)
summary(extended_model)
##
## Call:
## lm(formula = Total ~ HP + Consolidated_Type1, data = pokemon_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -472.67 -59.90 1.11 59.71 306.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 211.57394 10.83470 19.527 < 2e-16 ***
## HP 2.98564 0.09775 30.543 < 2e-16 ***
## Consolidated_Type1Dark 20.50854 15.15567 1.353 0.176253
## Consolidated_Type1Electric 47.81021 13.95819 3.425 0.000635 ***
## Consolidated_Type1Fighting 20.37371 15.73298 1.295 0.195584
## Consolidated_Type1Fire 33.72902 13.90428 2.426 0.015424 *
## Consolidated_Type1Grass 10.13443 12.74719 0.795 0.426754
## Consolidated_Type1Normal -35.31522 12.28892 -2.874 0.004129 **
## Consolidated_Type1Other 28.06648 10.69051 2.625 0.008767 **
## Consolidated_Type1Psychic 54.36450 13.61599 3.993 6.94e-05 ***
## Consolidated_Type1Rock 33.05628 14.32762 2.307 0.021217 *
## Consolidated_Type1Water 9.21174 11.89051 0.775 0.438664
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 88.75 on 1182 degrees of freedom
## Multiple R-squared: 0.4671, Adjusted R-squared: 0.4621
## F-statistic: 94.18 on 11 and 1182 DF, p-value: < 2.2e-16
#The extended regression model results:
# 1. R-squared: 0.467. This indicates that the model explains 46.7% of the variability in the 'Total' power of Pokémon. This is an improvement from the model with only 'HP', which had an R-squared of 0.4294.
# 2. Coefficient for 'HP': 2.9856. As before, this suggests that for each unit increase in 'HP', the 'Total' power increases by approximately 2.9856 units.
# 3. Coefficients for 'Consolidated_Type1' Categories: These coefficients indicate the average difference in 'Total' power for each Pokémon type relative to the base category, which in this case is 'Bug'. For example, the coefficient for 'Dark' is 20.5085, suggesting that, on average, Pokémon of type 'Dark' have a 'Total' power that's higher by about 20.5085 units compared to 'Bug' Pokémon, after accounting for 'HP'. However, it's essential to note the p-values for each coefficient to determine its statistical significance.
# From the p-values, several Pokémon types show statistically significant differences in 'Total' power compared to the reference type 'Bug', after accounting for 'HP'.
#Interpretation:
#The inclusion of 'Consolidated_Type1' in the regression model provides valuable insights. It suggests that a Pokémon's type and its 'HP' both play crucial roles in determining its overall power. Certain types, like 'Electric' and 'Psychic', tend to have higher overall power than 'Bug' Pokémon (the reference type), even after adjusting for 'HP'. Conversely, 'Normal' Pokémon tend to have lower overall power compared to 'Bug' Pokémon after accounting for 'HP'.
# ANOTHER VARIABLE:
# 'Attack' is another important continuous variable that likely influences a Pokémon's total power. We could consider adding it to the model to see if it provides additional explanatory power beyond 'HP' and 'Consolidated_Type1'.
# INTERACTION TERM:
#Since 'HP' is a continuous variable and 'Consolidated_Type1' is categorical, an interaction term between them would capture any unique relationship between 'HP' and 'Total' for each Pokémon type.
# Updated model: adds 'Attack' as a second continuous predictor and an
# HP-by-type interaction, so the HP slope is allowed to differ between types
updated_model <- lm(
  formula = Total ~ HP + Attack + Consolidated_Type1 + HP:Consolidated_Type1,
  data = pokemon_data
)
summary(updated_model)
##
## Call:
## lm(formula = Total ~ HP + Attack + Consolidated_Type1 + HP:Consolidated_Type1,
## data = pokemon_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -181.25 -45.48 -4.60 39.45 349.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.24592 23.84024 3.324 0.000915 ***
## HP 2.77109 0.41317 6.707 3.09e-11 ***
## Attack 2.03795 0.07181 28.379 < 2e-16 ***
## Consolidated_Type1Dark 81.00349 33.15137 2.443 0.014695 *
## Consolidated_Type1Electric 46.67876 37.22657 1.254 0.210125
## Consolidated_Type1Fighting 74.28917 38.18856 1.945 0.051975 .
## Consolidated_Type1Fire 18.10942 38.46848 0.471 0.637900
## Consolidated_Type1Grass 22.67051 33.10201 0.685 0.493563
## Consolidated_Type1Normal 49.19708 28.42298 1.731 0.083735 .
## Consolidated_Type1Other 66.24447 26.15335 2.533 0.011441 *
## Consolidated_Type1Psychic 135.77746 31.52584 4.307 1.79e-05 ***
## Consolidated_Type1Rock 75.64100 35.89304 2.107 0.035295 *
## Consolidated_Type1Water 67.74873 28.66367 2.364 0.018262 *
## HP:Consolidated_Type1Dark -1.18991 0.49704 -2.394 0.016824 *
## HP:Consolidated_Type1Electric -0.03335 0.59352 -0.056 0.955202
## HP:Consolidated_Type1Fighting -1.57434 0.54769 -2.875 0.004120 **
## HP:Consolidated_Type1Fire -0.11793 0.57790 -0.204 0.838337
## HP:Consolidated_Type1Grass -0.33007 0.52058 -0.634 0.526169
## HP:Consolidated_Type1Normal -1.17118 0.44512 -2.631 0.008622 **
## HP:Consolidated_Type1Other -0.86008 0.42518 -2.023 0.043314 *
## HP:Consolidated_Type1Psychic -1.18369 0.48058 -2.463 0.013920 *
## HP:Consolidated_Type1Rock -1.17440 0.55042 -2.134 0.033078 *
## HP:Consolidated_Type1Water -0.93215 0.45347 -2.056 0.040041 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.46 on 1171 degrees of freedom
## Multiple R-squared: 0.704, Adjusted R-squared: 0.6984
## F-statistic: 126.6 on 22 and 1171 DF, p-value: < 2.2e-16
# insights:
# 1.R-squared: 0.704. This indicates that the model explains 70.4% of the variability in the 'Total' power of Pokémon. This is a significant improvement from our earlier models. The inclusion of 'Attack' and the interaction term adds substantial explanatory power.
# 2.Coefficient for 'HP': 2.7711. This value indicates the average change in 'Total' for each unit increase in 'HP' for the reference Pokémon type (Bug).
# 3.Coefficient for 'Attack': 2.0379. This suggests that for every unit increase in 'Attack', the 'Total' power of a Pokémon increases by approximately 2.0379 units, on average.
# 4.Interaction Coefficients: These coefficients capture how the slope of 'HP' on 'Total' differs for each Pokémon type relative to the reference type (Bug), beyond the main effects. For example, the interaction coefficient for 'Dark' is -1.1899, so the estimated 'HP' slope for 'Dark' Pokémon is about 2.7711 - 1.1899 = 1.58, noticeably weaker than the 2.77 estimated for 'Bug' Pokémon.
#Interpretation:
#The positive coefficients for both 'HP' and 'Attack' confirm that these attributes significantly influence a Pokémon's total power.
#The interaction term suggests that the influence of 'HP' on 'Total' power varies across different Pokémon types.
#Certain types, like 'Dark' and 'Normal', show weaker relationships between 'HP' and 'Total' compared to the reference type 'Bug', after adjusting for 'Attack'.
#Recommendations:
#For trainers looking to optimize a Pokémon's power, focusing on increasing both its 'HP' and 'Attack' would be beneficial.
#However, the effectiveness of increasing 'HP' might differ depending on the Pokémon's type. For instance, boosting 'HP' might be less effective for 'Dark' Pokémon compared to 'Bug' Pokémon in terms of increasing total power.