library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Reading the dataset (the path is resolved relative to the working directory)
pokemon_data <- read.csv("./PokemonStats.csv")
# Calculating 50% of the dataset's length. floor() keeps the sample size a
# whole number of rows even when the dataset has an odd number of rows.
subsample_length <- floor(nrow(pokemon_data) * 0.5)
cat(paste("Subsample Length is", subsample_length, "rows"))
## Subsample Length is 597 rows
# Columns for sampling
selected_columns <- c("Type1", "Type2", "Total", "HP", "Attack", "Defense")
# Number of bootstrap-style subsamples to draw
n_subsamples <- 10L
# Fix the RNG seed so the subsamples (and every figure quoted from them in the
# commentary below) can be reproduced on a re-run.
set.seed(42)
# Collecting random samples with replacement. Each element of `subsamples` is
# a data frame holding only `selected_columns`, with `subsample_length` rows
# drawn from `pokemon_data`.
subsamples <- lapply(seq_len(n_subsamples), function(i) {
  pokemon_data %>%
    select(all_of(selected_columns)) %>%
    # slice_sample() replaces the superseded sample_n(); floor() guards
    # against a fractional size, which slice_sample() rejects.
    slice_sample(n = floor(subsample_length), replace = TRUE)
})
# Preview each of the 10 subsamples: a header line, its first six rows, and a
# blank separator line.
for (idx in seq_len(10L)) {
  current <- subsamples[[idx]]
  cat("Subsample", idx, ":\n")
  print(head(current, n = 6L))
  cat("\n")
}
## Subsample 1 :
## Type1 Type2 Total HP Attack Defense
## 1 Flying Dragon 535 85 70 80
## 2 Electric Steel 325 25 35 70
## 3 Steel 630 70 140 230
## 4 Dark Fairy 370 65 60 45
## 5 Normal Ground 423 85 56 77
## 6 Water Flying 485 85 40 70
##
## Subsample 2 :
## Type1 Type2 Total HP Attack Defense
## 1 Fairy Steel 506 85 75 77
## 2 Water 316 50 53 48
## 3 Normal 490 104 91 63
## 4 Poison 500 105 105 75
## 5 Ice Psychic 455 65 50 35
## 6 Grass Ice 494 90 92 75
##
## Subsample 3 :
## Type1 Type2 Total HP Attack Defense
## 1 Water 580 100 75 115
## 2 Dark 455 70 58 58
## 3 Normal Flying 262 40 60 30
## 4 Flying 580 79 115 70
## 5 Fighting 310 50 68 60
## 6 Rock Ground 390 55 95 115
##
## Subsample 4 :
## Type1 Type2 Total HP Attack Defense
## 1 Normal 670 110 160 110
## 2 Bug Poison 400 70 90 70
## 3 Rock Electric 570 100 134 110
## 4 Grass Ghost 530 78 107 75
## 5 Fire Poison 570 80 70 60
## 6 Bug Flying 474 70 80 102
##
## Subsample 5 :
## Type1 Type2 Total HP Attack Defense
## 1 Poison 273 46 57 40
## 2 Bug Steel 495 70 135 105
## 3 Dragon 580 200 100 50
## 4 Electric 205 20 40 15
## 5 Dark 220 35 55 35
## 6 Steel Ghost 448 59 110 150
##
## Subsample 6 :
## Type1 Type2 Total HP Attack Defense
## 1 Normal Flying 278 45 50 43
## 2 Normal 440 65 70 60
## 3 Rock 487 75 117 65
## 4 Electric 270 59 45 50
## 5 Psychic 315 90 65 65
## 6 Rock 580 80 100 200
##
## Subsample 7 :
## Type1 Type2 Total HP Attack Defense
## 1 Electric Ice 520 50 65 107
## 2 Rock Ground 300 50 64 50
## 3 Psychic Ghost 680 137 113 89
## 4 Steel Rock 530 70 110 180
## 5 Grass 530 70 85 65
## 6 Water 280 41 63 40
##
## Subsample 8 :
## Type1 Type2 Total HP Attack Defense
## 1 Electric 330 40 30 50
## 2 Fighting 505 90 130 80
## 3 Steel Psychic 300 57 24 86
## 4 Fire 505 73 76 75
## 5 Bug 224 40 29 45
## 6 Water Ground 384 75 65 55
##
## Subsample 9 :
## Type1 Type2 Total HP Attack Defense
## 1 Bug Ground 424 60 79 105
## 2 Ground 670 100 150 140
## 3 Ground 265 10 55 25
## 4 Steel Psychic 600 80 135 130
## 5 Normal 288 48 48 48
## 6 Dark Fire 570 55 80 80
##
## Subsample 10 :
## Type1 Type2 Total HP Attack Defense
## 1 Rock Dark 600 100 134 110
## 2 Water 530 70 85 65
## 3 Psychic Steel 680 97 157 127
## 4 Grass Psychic 530 95 95 85
## 5 Electric 510 90 75 85
## 6 Dragon 410 66 117 70
# We can compare basic statistics (like mean, median, standard deviation, etc.) for the continuous columns across all sub samples. This will give us an idea of the variability among the sub samples.
# Building one single-row summary per subsample. Iterating over
# seq_along(subsamples) keeps this in step with however many subsamples were
# actually drawn, instead of assuming exactly 10.
all_stats <- lapply(seq_along(subsamples), function(i) {
  subsamples[[i]] %>%
    summarise(
      Subsample = i,
      # Mean, median and standard deviation of every numeric stat. The
      # resulting columns are Mean_Total, Median_Total, SD_Total, Mean_HP, ...
      across(
        c(Total, HP, Attack, Defense),
        list(Mean = mean, Median = median, SD = sd),
        .names = "{.fn}_{.col}"
      )
    )
})
# Consolidating all stats into a single data frame (one row per subsample)
consolidated_stats <- bind_rows(all_stats)
# Displaying the consolidated stats using kable
kable(consolidated_stats)
## | Subsample| Mean_Total| Median_Total| SD_Total|  Mean_HP| Median_HP|    SD_HP| Mean_Attack| Median_Attack| SD_Attack| Mean_Defense| Median_Defense| SD_Defense|
## |---------:|----------:|------------:|--------:|--------:|---------:|--------:|-----------:|-------------:|---------:|------------:|--------------:|----------:|
## |         1|   445.4573|          460| 117.1370| 71.66834|        70| 27.35126|    80.34506|            80|  31.31109|     77.81072|             75|   33.11049|
## |         2|   440.1826|          470| 118.2490| 69.51256|        68| 24.22875|    79.56114|            75|  31.96446|     73.81407|             70|   29.88115|
## |         3|   448.6281|          473| 116.9926| 71.97822|        70| 27.04932|    82.38526|            80|  32.03494|     75.75712|             70|   30.81275|
## |         4|   440.0687|          465| 120.7889| 72.06700|        70| 27.60615|    82.26801|            80|  31.05589|     72.71692|             70|   28.46490|
## |         5|   439.2194|          467| 132.5946| 70.53936|        68| 28.54751|    80.37186|            78|  31.79452|     74.19765|             70|   32.35112|
## |         6|   446.8961|          464| 122.4848| 71.49246|        69| 25.53289|    83.09380|            80|  32.64217|     75.70184|             70|   30.81300|
## |         7|   444.6248|          470| 118.6917| 72.89782|        70| 26.73499|    81.69347|            80|  29.90423|     77.14238|             72|   30.92144|
## |         8|   439.2730|          455| 121.8672| 69.74372|        68| 24.55728|    80.85092|            79|  32.80744|     74.56449|             70|   30.07058|
## |         9|   436.5997|          455| 127.8192| 69.54606|        65| 27.21201|    80.46901|            80|  33.24464|     74.43719|             70|   33.04870|
## |        10|   443.6382|          473| 128.2434| 71.52094|        70| 27.83313|    82.77554|            80|  33.03049|     74.87270|             70|   30.59506|
# The mean values for Total, HP, Attack, and Defense vary slightly across the 10 subsamples. For instance, the mean Total score fluctuates between approximately 437 and 449 across subsamples.
# With an average Mean_HP of around 71.1, it appears that most Pokémon have a moderate amount of health. HP is crucial as it determines how much damage a Pokémon can take before fainting.
# The Mean_HP values are concentrated between roughly 69.5 and 73, meaning that the average health stat is consistent from one subsample to the next.
# The Mean_Attack and Mean_Defense values have strong positive correlations with the Mean_Total. This suggests that Pokémon with higher attack or defense stats tend to have higher overall stats.
# The standard deviations for attributes like SD_Total, SD_HP, SD_Attack, and SD_Defense provide insights into the variability of these stats among Pokémon. For instance, a higher standard deviation in SD_Attack would mean that there's a wider range of attack values among Pokémon.
# Future question:
# What are the most common type combinations, and how do their stats compare?
# To identify potential anomalies, we can visually inspect the distribution of the continuous data across subsamples using boxplots.
# Stack the 10 subsamples into one data frame, tagging every row with the
# index of the subsample it came from (stored in `subsample_id`).
tagged_subsamples <- vector("list", 10L)
for (idx in seq_len(10L)) {
  tagged <- subsamples[[idx]]
  tagged[["subsample_id"]] <- idx
  tagged_subsamples[[idx]] <- tagged
}
combined_data <- bind_rows(tagged_subsamples)
# Reshaping the data to a long format for easy plotting: every column except
# the type labels and the subsample identifier becomes a (Stat, Value) pair.
# pivot_longer() replaces the superseded gather().
long_data <- combined_data %>%
  pivot_longer(
    cols = -c(Type1, Type2, subsample_id),
    names_to = "Stat",
    values_to = "Value"
  )
# Filtering for the columns of interest
plot_data <- long_data %>%
  filter(Stat %in% c("Total", "HP", "Attack", "Defense"))
# Boxplots of each stat, one box per subsample and one panel per stat. Every
# panel gets its own axis scales because the stats live on different ranges.
ggplot(data = plot_data, mapping = aes(factor(subsample_id), Value)) +
  geom_boxplot() +
  facet_wrap(vars(Stat), scales = "free") +
  ggtitle("Distribution of Stats across Subsamples") +
  xlab("Subsample Index") +
  ylab("Value") +
  theme_minimal()

# The central tendency (median) of the data is quite similar across all subsamples for each of the continuous variables (Total, HP, Attack, and Defense).
# There are outliers present in several subsamples. Some subsamples have outliers in the upper range, while others have them in the lower range. This variation in outliers indicates that what might be considered an anomaly in one subsample might not be in another.
# For consistency, we can check the frequency of categorical variables (Type1 and Type2) across subsamples. If certain categories appear frequently in all subsamples, they are consistent features of the dataset.
# Calculating frequencies of Type1 and Type2 across subsamples. The result has
# one row per (subsample_id, Type_Category, Type_Value) with its count `n`.
# pivot_longer() replaces the superseded gather().
type_frequencies <- combined_data %>%
  pivot_longer(
    cols = c(Type1, Type2),
    names_to = "Type_Category",
    values_to = "Type_Value"
  ) %>%
  count(subsample_id, Type_Category, Type_Value) %>%
  arrange(Type_Category, desc(n))
# Keeping the 5 most frequent types for each Type_Category in each subsample.
top_types <- type_frequencies %>%
  # Single-type Pokémon have a blank Type2; a blank is "no type", not a type,
  # so it must not take one of the top-5 slots.
  filter(!is.na(Type_Value), nzchar(trimws(Type_Value))) %>%
  group_by(subsample_id, Type_Category) %>%
  # Rank explicitly by count instead of relying on the row order left behind
  # by the arrange() above; with_ties = FALSE caps each group at 5 rows.
  slice_max(order_by = n, n = 5, with_ties = FALSE) %>%
  ungroup()
# Side-by-side bars of the type counts held in `top_types`, grouped by
# subsample, with one panel per type column (Type1 / Type2).
ggplot(
  data = top_types,
  mapping = aes(factor(subsample_id), n, fill = Type_Value)
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(Type_Category), scales = "free") +
  ggtitle("Top 5 Types in Type1 and Type2 across Subsamples") +
  xlab("Subsample Index") +
  ylab("Frequency") +
  theme_minimal() +
  # Legend goes under the panels and its title is dropped
  theme(legend.title = element_blank(), legend.position = "bottom")

# Insights
# Variability Among Subsamples: While the central tendency of the subsamples is quite consistent, there are variations in the spread and presence of outliers. This emphasizes the importance of having a sufficiently large sample size when drawing conclusions about a population, as different samples can present different perspectives.
# Anomalies and Context: What might be considered an outlier in one subsample might not be in another. This highlights the importance of context when identifying anomalies.
# Consistency in Categorical Data: Certain categories, like Water for Type1 and Flying for Type2, are consistently prevalent across subsamples.
# Significance
# This demonstrates the impact of sampling variability on data analysis. Different samples can yield different insights, underscoring the importance of comprehensive data collection and the potential pitfalls of drawing conclusions from limited data.
# Further Questions:
# How would the insights change if we were to take larger or smaller subsamples?
# How would the insights change if we sampled without replacement?