Data Source

Data: TidyTuesday – 2025-04-01 Pokémon week, based on a curated dataset of Pokémon attributes. Source file: https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-04-01/pokemon_df.csv

Load Packages

# Packages
library(tidytuesdayR)

## Warning: package 'tidytuesdayR' was built under R version 4.5.2

library(readr)

## Warning: package 'readr' was built under R version 4.5.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.5.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.2

library(GGally)

## Warning: package 'GGally' was built under R version 4.5.2

library(broom)     
library(stringr)

## Warning: package 'stringr' was built under R version 4.5.2

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.1

## Warning: package 'tibble' was built under R version 4.5.1

## Warning: package 'tidyr' was built under R version 4.5.1

## Warning: package 'purrr' was built under R version 4.5.1

## Warning: package 'forcats' was built under R version 4.5.1

## Warning: package 'lubridate' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Download the TidyTuesday release dated 2025-04-01 (requires network access).
tuesdata   <- tidytuesdayR::tt_load("2025-04-01")

## ---- Compiling #TidyTuesday Information for 2025-04-01 ----
## --- There is 1 file available ---
## 
## 
## ── Downloading files ───────────────────────────────────────────────────────────
## 
##   1 of 1: "pokemon_df.csv"

# Extract the release's only data file as the working data frame.
pokemon_df <- tuesdata$pokemon_df

Documentation

The dataset is described in the TidyTuesday README and in several blog posts analyzing this week’s data.

Key columns (from the TidyTuesday documentation):

- id – unique Pokémon ID
- pokemon – name
- species_id – species ID
- height, weight – physical dimensions
- base_experience – experience yield
- type_1, type_2 – primary and secondary type
- hp, attack, defense, special_attack, special_defense, speed – combat stats
- color_1, color_2, color_f – color codes
- egg_group_1, egg_group_2 – breeding groups
- generation_id – generation number

Description of the Data

# Print the structure of the raw data: dimensions, column types, first values.
str(pokemon_df)

## spc_tbl_ [949 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id             : num [1:949] 1 2 3 4 5 6 7 8 9 10 ...
##  $ pokemon        : chr [1:949] "bulbasaur" "ivysaur" "venusaur" "charmander" ...
##  $ species_id     : num [1:949] 1 2 3 4 5 6 7 8 9 10 ...
##  $ height         : num [1:949] 0.7 1 2 0.6 1.1 1.7 0.5 1 1.6 0.3 ...
##  $ weight         : num [1:949] 6.9 13 100 8.5 19 90.5 9 22.5 85.5 2.9 ...
##  $ base_experience: num [1:949] 64 142 236 62 142 240 63 142 239 39 ...
##  $ type_1         : chr [1:949] "grass" "grass" "grass" "fire" ...
##  $ type_2         : chr [1:949] "poison" "poison" "poison" NA ...
##  $ hp             : num [1:949] 45 60 80 39 58 78 44 59 79 45 ...
##  $ attack         : num [1:949] 49 62 82 52 64 84 48 63 83 30 ...
##  $ defense        : num [1:949] 49 63 83 43 58 78 65 80 100 35 ...
##  $ special_attack : num [1:949] 65 80 100 60 80 109 50 65 85 20 ...
##  $ special_defense: num [1:949] 65 80 100 50 65 85 64 80 105 20 ...
##  $ speed          : num [1:949] 45 60 80 65 80 100 43 58 78 45 ...
##  $ color_1        : chr [1:949] "#78C850" "#78C850" "#78C850" "#F08030" ...
##  $ color_2        : chr [1:949] "#A040A0" "#A040A0" "#A040A0" NA ...
##  $ color_f        : chr [1:949] "#81A763" "#81A763" "#81A763" NA ...
##  $ egg_group_1    : chr [1:949] "monster" "monster" "monster" "monster" ...
##  $ egg_group_2    : chr [1:949] "plant" "plant" "plant" "dragon" ...
##  $ url_icon       : chr [1:949] "//archives.bulbagarden.net/media/upload/7/7b/001MS6.png" "//archives.bulbagarden.net/media/upload/a/a0/002MS6.png" "//archives.bulbagarden.net/media/upload/0/07/003MS6.png" "//archives.bulbagarden.net/media/upload/7/7d/004MS6.png" ...
##  $ generation_id  : num [1:949] 1 1 1 1 1 1 1 1 1 1 ...
##  $ url_image      : chr [1:949] "https://raw.githubusercontent.com/HybridShivam/Pokemon/master/assets/images/001.png" "https://raw.githubusercontent.com/HybridShivam/Pokemon/master/assets/images/002.png" "https://raw.githubusercontent.com/HybridShivam/Pokemon/master/assets/images/003.png" "https://raw.githubusercontent.com/HybridShivam/Pokemon/master/assets/images/004.png" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   pokemon = col_character(),
##   ..   species_id = col_double(),
##   ..   height = col_double(),
##   ..   weight = col_double(),
##   ..   base_experience = col_double(),
##   ..   type_1 = col_character(),
##   ..   type_2 = col_character(),
##   ..   hp = col_double(),
##   ..   attack = col_double(),
##   ..   defense = col_double(),
##   ..   special_attack = col_double(),
##   ..   special_defense = col_double(),
##   ..   speed = col_double(),
##   ..   color_1 = col_character(),
##   ..   color_2 = col_character(),
##   ..   color_f = col_character(),
##   ..   egg_group_1 = col_character(),
##   ..   egg_group_2 = col_character(),
##   ..   url_icon = col_character(),
##   ..   generation_id = col_double(),
##   ..   url_image = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Per-column summary statistics for the size, experience and combat-stat
# columns of the raw data.
pokemon_df |>
  select(
    height, weight, base_experience,
    hp, attack, defense, special_attack, special_defense, speed
  ) |>
  summary()

##      height           weight       base_experience       hp        
##  Min.   : 0.100   Min.   :  0.10   Min.   : 36.0   Min.   :  1.00  
##  1st Qu.: 0.500   1st Qu.:  8.50   1st Qu.: 68.0   1st Qu.: 50.00  
##  Median : 1.000   Median : 28.80   Median :157.0   Median : 65.00  
##  Mean   : 1.228   Mean   : 66.21   Mean   :150.5   Mean   : 68.95  
##  3rd Qu.: 1.500   3rd Qu.: 66.60   3rd Qu.:184.0   3rd Qu.: 80.00  
##  Max.   :14.500   Max.   :999.90   Max.   :608.0   Max.   :255.00  
##      attack          defense       special_attack   special_defense 
##  Min.   :  5.00   Min.   :  5.00   Min.   : 10.00   Min.   : 20.00  
##  1st Qu.: 55.00   1st Qu.: 50.00   1st Qu.: 50.00   1st Qu.: 50.00  
##  Median : 75.00   Median : 70.00   Median : 65.00   Median : 70.00  
##  Mean   : 79.47   Mean   : 74.07   Mean   : 72.81   Mean   : 72.22  
##  3rd Qu.:100.00   3rd Qu.: 90.00   3rd Qu.: 95.00   3rd Qu.: 90.00  
##  Max.   :190.00   Max.   :230.00   Max.   :194.00   Max.   :230.00  
##      speed       
##  Min.   :  5.00  
##  1st Qu.: 45.00  
##  Median : 65.00  
##  Mean   : 69.02  
##  3rd Qu.: 90.00  
##  Max.   :180.00

To get an overview of how the numeric combat stats relate, I create a smaller numeric subset:

# Names of the six combat-stat columns; reused by the later sections.
stats_cols <- c(
  "hp", "attack", "defense",
  "special_attack", "special_defense", "speed"
)

# Subset of the raw data holding only those stat columns.
pokemon_stats <- select(pokemon_df, all_of(stats_cols))

# Pairwise plot matrix of the combat stats.
GGally::ggpairs(pokemon_stats)

Cleaning and Preparation

The Pokémon data are already quite clean, but I still:

Standardize the Pokémon name (optional, for nicer labels).
Remove rows with missing values in key numeric columns.
Create a modeling dataset focused on the variables relevant to HP prediction.

# Build the analysis table: title-case the names for nicer labels, then keep
# only rows that are complete in every numeric column used downstream.
# special_attack and special_defense are checked too because the Q4
# regression and the Q5 clustering use them; a missing value there would make
# lm() drop rows, so its predictions would no longer line up with the rows of
# pokemon_clean.
pokemon_clean <- pokemon_df |>
  mutate(
    pokemon = str_to_title(pokemon)
  ) |>
  filter(
    !is.na(hp),
    !is.na(attack),
    !is.na(defense),
    !is.na(special_attack),
    !is.na(special_defense),
    !is.na(speed),
    !is.na(height),
    !is.na(weight),
    !is.na(base_experience)
  )

# Same per-column summary as before, now on the cleaned data, to confirm what
# the cleaning step changed.
pokemon_clean |>
  select(
    height, weight, base_experience,
    hp, attack, defense, special_attack, special_defense, speed
  ) |>
  summary()

##      height           weight       base_experience       hp        
##  Min.   : 0.100   Min.   :  0.10   Min.   : 36.0   Min.   :  1.00  
##  1st Qu.: 0.500   1st Qu.:  8.50   1st Qu.: 68.0   1st Qu.: 50.00  
##  Median : 1.000   Median : 28.80   Median :157.0   Median : 65.00  
##  Mean   : 1.228   Mean   : 66.21   Mean   :150.5   Mean   : 68.95  
##  3rd Qu.: 1.500   3rd Qu.: 66.60   3rd Qu.:184.0   3rd Qu.: 80.00  
##  Max.   :14.500   Max.   :999.90   Max.   :608.0   Max.   :255.00  
##      attack          defense       special_attack   special_defense 
##  Min.   :  5.00   Min.   :  5.00   Min.   : 10.00   Min.   : 20.00  
##  1st Qu.: 55.00   1st Qu.: 50.00   1st Qu.: 50.00   1st Qu.: 50.00  
##  Median : 75.00   Median : 70.00   Median : 65.00   Median : 70.00  
##  Mean   : 79.47   Mean   : 74.07   Mean   : 72.81   Mean   : 72.22  
##  3rd Qu.:100.00   3rd Qu.: 90.00   3rd Qu.: 95.00   3rd Qu.: 90.00  
##  Max.   :190.00   Max.   :230.00   Max.   :194.00   Max.   :230.00  
##      speed       
##  Min.   :  5.00  
##  1st Qu.: 45.00  
##  Median : 65.00  
##  Mean   : 69.02  
##  3rd Qu.: 90.00  
##  Max.   :180.00

Final Results

Q1 – Distribution and correlation of combat stats

# Reshape the six combat stats to long form: one row per (Pokémon, stat) value.
pokemon_long <- pivot_longer(
  select(pokemon_clean, all_of(stats_cols)),
  cols = everything(),
  names_to = "stat",
  values_to = "value"
)

# One histogram panel per stat, each panel with its own axis ranges.
pokemon_long |>
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  facet_wrap(~ stat, scales = "free") +
  labs(
    title = "Distributions of Pokémon Combat Stats",
    x = "Stat value",
    y = "Count"
  )

# Correlation matrix of the combat stats.
pokemon_clean |>
  select(all_of(stats_cols)) |>
  cor()

##                        hp    attack     defense special_attack special_defense
## hp              1.0000000 0.4266479  0.26085534      0.3737768       0.3700109
## attack          0.4266479 1.0000000  0.43962628      0.3848831       0.2518813
## defense         0.2608553 0.4396263  1.00000000      0.2237726       0.5314911
## special_attack  0.3737768 0.3848831  0.22377262      1.0000000       0.4924357
## special_defense 0.3700109 0.2518813  0.53149106      0.4924357       1.0000000
## speed           0.1433248 0.3586703 -0.02229393      0.4471509       0.2116434
##                       speed
## hp               0.14332480
## attack           0.35867033
## defense         -0.02229393
## special_attack   0.44715089
## special_defense  0.21164343
## speed            1.00000000

Q2 – Predicting HP from other attributes

# Q2 model: linear regression of HP on size, experience yield and three of
# the combat stats (attack, defense, speed).
hp_model <- lm(
  hp ~ height + weight + base_experience + attack + defense + speed,
  data = pokemon_clean
)

# Coefficient table, residual quantiles and overall fit statistics.
hp_model |>
  summary()

## 
## Call:
## lm(formula = hp ~ height + weight + base_experience + attack + 
##     defense + speed, data = pokemon_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.225 -10.628  -2.452   8.080 114.688 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     55.283570   2.268826  24.367  < 2e-16 ***
## height           2.360488   0.650023   3.631 0.000297 ***
## weight           0.037077   0.006928   5.352  1.1e-07 ***
## base_experience  0.250225   0.012220  20.477  < 2e-16 ***
## attack           0.062068   0.024873   2.495 0.012753 *  
## defense         -0.234316   0.025455  -9.205  < 2e-16 ***
## speed           -0.245044   0.025644  -9.556  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.24 on 942 degrees of freedom
## Multiple R-squared:  0.5087, Adjusted R-squared:  0.5056 
## F-statistic: 162.6 on 6 and 942 DF,  p-value: < 2.2e-16

# One-row tibble of model-level fit statistics for hp_model.
broom::glance(hp_model)

## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.509         0.506  18.2      163. 1.22e-141     6 -4098. 8213. 8252.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

# Draw the lm diagnostic plots for hp_model in a 2 x 2 grid.
# par() returns the settings it replaced, so restore exactly those afterwards
# instead of assuming the device was previously at mfrow = c(1, 1).
old_par <- par(mfrow = c(2, 2))
plot(hp_model)

par(old_par)

Q3 – Identifying “over-tanky” Pokémon (large positive HP residuals)

# Attach the model's fitted HP and the observed-minus-fitted gap to every row.
hp_results <- mutate(
  pokemon_clean,
  hp_pred = fitted(hp_model),
  hp_residual = hp - hp_pred
)

# The 15 Pokémon whose observed HP most exceeds the fitted value.
top_over_hp <- hp_results |>
  select(pokemon, type_1, type_2, hp, hp_pred, hp_residual) |>
  arrange(desc(hp_residual)) |>
  slice_head(n = 15)

top_over_hp

## # A tibble: 15 × 6
##    pokemon          type_1  type_2      hp hp_pred hp_residual
##    <chr>            <chr>   <chr>    <dbl>   <dbl>       <dbl>
##  1 Wobbuffet        psychic <NA>       190    75.3       115. 
##  2 Guzzlord         dark    dragon     223   113.        110. 
##  3 Chansey          normal  <NA>       250   145.        105. 
##  4 Alomomola        water   <NA>       165    70.6        94.4
##  5 Zygarde-Complete dragon  ground     216   125.         90.6
##  6 Drifblim         ghost   flying     150    77.3        72.7
##  7 Solgaleo         psychic steel      137    65.5        71.5
##  8 Lunala           psychic ghost      137    65.6        71.4
##  9 Wailmer          water   <NA>       130    66.3        63.7
## 10 Munchlax         normal  <NA>       135    74.8        60.2
## 11 Slaking          normal  <NA>       150    89.9        60.1
## 12 Blissey          normal  <NA>       255   197.         57.5
## 13 Aurorus          rock    ice        123    69.7        53.3
## 14 Nihilego         rock    poison     109    55.7        53.3
## 15 Lanturn          water   electric   125    72.8        52.2

# Horizontal bar chart of the largest positive HP residuals, ordered by
# residual and filled by primary type.
top_over_hp |>
  ggplot(aes(
    x = reorder(pokemon, hp_residual),
    y = hp_residual,
    fill = type_1
  )) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Pokémon with Much Higher HP Than Expected",
    x = "Pokémon",
    y = "HP residual (observed - predicted)"
  ) +
  theme(legend.position = "bottom")

Q4 – Predicting Pokémon HP using multivariate regression

# Refit HP on the earlier predictors plus special_attack and special_defense.
# NOTE: this reassigns `hp_model`, replacing the model fitted in Q2.
hp_model <- lm(
  hp ~ height + weight + base_experience + attack + defense +
    special_attack + special_defense + speed,
  data = pokemon_clean
)

# Store the model summary so it can be printed after the outlier table is built.
hp_model_summary <- hp_model |>
  summary()

# Per-row prediction and observed-minus-predicted residual.
pokemon_regression <- mutate(
  pokemon_clean,
  hp_pred = predict(hp_model),
  hp_residual = hp - hp_pred
)

# Ten largest positive residuals, i.e. HP furthest above the prediction.
top_hp_outliers <- pokemon_regression |>
  select(pokemon, hp, hp_pred, hp_residual) |>
  arrange(desc(hp_residual)) |>
  slice_head(n = 10)

hp_model_summary

## 
## Call:
## lm(formula = hp ~ height + weight + base_experience + attack + 
##     defense + special_attack + special_defense + speed, data = pokemon_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.480 -10.270  -2.138   7.829 111.923 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     59.071206   2.614573  22.593  < 2e-16 ***
## height           2.727561   0.655633   4.160 3.47e-05 ***
## weight           0.036278   0.006897   5.260 1.78e-07 ***
## base_experience  0.281054   0.015759  17.834  < 2e-16 ***
## attack           0.047388   0.026246   1.806  0.07131 .  
## defense         -0.235222   0.027157  -8.661  < 2e-16 ***
## special_attack  -0.077753   0.025661  -3.030  0.00251 ** 
## special_defense -0.036156   0.032249  -1.121  0.26251    
## speed           -0.235174   0.025770  -9.126  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.14 on 940 degrees of freedom
## Multiple R-squared:  0.5147, Adjusted R-squared:  0.5106 
## F-statistic: 124.6 on 8 and 940 DF,  p-value: < 2.2e-16

# Print the ten Pokémon with the largest positive HP residuals.
top_hp_outliers

## # A tibble: 10 × 4
##    pokemon             hp hp_pred hp_residual
##    <chr>            <dbl>   <dbl>       <dbl>
##  1 Guzzlord           223   111.        112. 
##  2 Wobbuffet          190    79.1       111. 
##  3 Chansey            250   155.         94.9
##  4 Alomomola          165    74.6        90.4
##  5 Zygarde-Complete   216   129.         87.1
##  6 Lunala             137    59.6        77.4
##  7 Solgaleo           137    61.4        75.6
##  8 Drifblim           150    77.5        72.5
##  9 Wailmer            130    66.0        64.0
## 10 Munchlax           135    73.7        61.3

Q5 – Clustering Pokémon into combat archetypes (offensive/defensive/speedy)

# Scale the combat stats taken from pokemon_clean (not from the raw
# pokemon_df), so the k-means labels have exactly one entry per row of
# pokemon_clean, in the same order, even when the cleaning step drops rows.
pokemon_scaled <- pokemon_clean |>
  select(all_of(stats_cols)) |>
  scale()

# Fixed seed plus 25 random starts so the cluster assignment is reproducible.
set.seed(123)
k3 <- kmeans(pokemon_scaled, centers = 3, nstart = 25)

# Attach each row's cluster label as a factor.
pokemon_clustered <- pokemon_clean |>
  mutate(cluster = factor(k3$cluster))

# Scatter of Attack against Defense, coloured by assigned cluster.
cluster_plot <- pokemon_clustered |>
  ggplot(aes(x = attack, y = defense, color = cluster)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(
    title = "Pokémon Combat Archetypes ",
    x = "Attack",
    y = "Defense",
    color = "Cluster"
  ) +
  theme_minimal()

# Number of Pokémon assigned to each cluster.
cluster_sizes <- table(pokemon_clustered[["cluster"]])

cluster_sizes

## 
##   1   2   3 
## 286 382 281

# Render the Attack vs Defense cluster scatter plot.
cluster_plot

Pokemon

Issue Description

Questions

Data Source

Documentation

Description of the Data

Final Results