library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.3     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Reading the dataset
pokemon_data <- read.csv("./PokemonStats.csv")

# Each subsample holds 50% of the dataset's rows. floor() keeps the size a
# whole number when the dataset has an odd number of rows, so neither the
# message below nor the sampling size ever carries a fractional row count.
subsample_fraction <- 0.5
subsample_length <- floor(nrow(pokemon_data) * subsample_fraction)

cat(paste("Subsample Length is", subsample_length, "rows"))
## Subsample Length is 597 rows
# Columns for sampling
selected_columns <- c("Type1", "Type2", "Total", "HP", "Attack", "Defense")

# Number of random subsamples to collect
n_subsamples <- 10

# Fix the RNG seed so the subsamples, and every table and plot derived from
# them, can be reproduced on a re-run.
set.seed(42)

# Select the columns once instead of on every iteration of the sampling step.
sampling_frame <- pokemon_data %>%
  select(all_of(selected_columns))

# Collecting the random samples with replacement. lapply() builds the list in
# one pass instead of growing it element by element; floor() guarantees a
# whole-number sample size for slice_sample().
subsamples <- lapply(seq_len(n_subsamples), function(i) {
  sampling_frame %>%
    slice_sample(n = floor(subsample_length), replace = TRUE)
})

# Show the first rows of every subsample. Iterating over seq_along(subsamples)
# keeps the loop in step with however many subsamples were actually collected.
for (i in seq_along(subsamples)) {
  cat(paste("Subsample", i, ":\n"))
  print(head(subsamples[[i]]))
  cat("\n")
}
## Subsample 1 :
##      Type1  Type2 Total HP Attack Defense
## 1   Flying Dragon   535 85     70      80
## 2 Electric  Steel   325 25     35      70
## 3    Steel          630 70    140     230
## 4     Dark  Fairy   370 65     60      45
## 5   Normal Ground   423 85     56      77
## 6    Water Flying   485 85     40      70
## 
## Subsample 2 :
##    Type1   Type2 Total  HP Attack Defense
## 1  Fairy   Steel   506  85     75      77
## 2  Water           316  50     53      48
## 3 Normal           490 104     91      63
## 4 Poison           500 105    105      75
## 5    Ice Psychic   455  65     50      35
## 6  Grass     Ice   494  90     92      75
## 
## Subsample 3 :
##      Type1  Type2 Total  HP Attack Defense
## 1    Water          580 100     75     115
## 2     Dark          455  70     58      58
## 3   Normal Flying   262  40     60      30
## 4   Flying          580  79    115      70
## 5 Fighting          310  50     68      60
## 6     Rock Ground   390  55     95     115
## 
## Subsample 4 :
##    Type1    Type2 Total  HP Attack Defense
## 1 Normal            670 110    160     110
## 2    Bug   Poison   400  70     90      70
## 3   Rock Electric   570 100    134     110
## 4  Grass    Ghost   530  78    107      75
## 5   Fire   Poison   570  80     70      60
## 6    Bug   Flying   474  70     80     102
## 
## Subsample 5 :
##      Type1 Type2 Total  HP Attack Defense
## 1   Poison         273  46     57      40
## 2      Bug Steel   495  70    135     105
## 3   Dragon         580 200    100      50
## 4 Electric         205  20     40      15
## 5     Dark         220  35     55      35
## 6    Steel Ghost   448  59    110     150
## 
## Subsample 6 :
##      Type1  Type2 Total HP Attack Defense
## 1   Normal Flying   278 45     50      43
## 2   Normal          440 65     70      60
## 3     Rock          487 75    117      65
## 4 Electric          270 59     45      50
## 5  Psychic          315 90     65      65
## 6     Rock          580 80    100     200
## 
## Subsample 7 :
##      Type1  Type2 Total  HP Attack Defense
## 1 Electric    Ice   520  50     65     107
## 2     Rock Ground   300  50     64      50
## 3  Psychic  Ghost   680 137    113      89
## 4    Steel   Rock   530  70    110     180
## 5    Grass          530  70     85      65
## 6    Water          280  41     63      40
## 
## Subsample 8 :
##      Type1   Type2 Total HP Attack Defense
## 1 Electric           330 40     30      50
## 2 Fighting           505 90    130      80
## 3    Steel Psychic   300 57     24      86
## 4     Fire           505 73     76      75
## 5      Bug           224 40     29      45
## 6    Water  Ground   384 75     65      55
## 
## Subsample 9 :
##    Type1   Type2 Total  HP Attack Defense
## 1    Bug  Ground   424  60     79     105
## 2 Ground           670 100    150     140
## 3 Ground           265  10     55      25
## 4  Steel Psychic   600  80    135     130
## 5 Normal           288  48     48      48
## 6   Dark    Fire   570  55     80      80
## 
## Subsample 10 :
##      Type1   Type2 Total  HP Attack Defense
## 1     Rock    Dark   600 100    134     110
## 2    Water           530  70     85      65
## 3  Psychic   Steel   680  97    157     127
## 4    Grass Psychic   530  95     95      85
## 5 Electric           510  90     75      85
## 6   Dragon           410  66    117      70
# We can compare basic statistics (like mean, median, standard deviation, etc.) for the continuous columns across all sub samples. This will give us an idea of the variability among the sub samples.
# Creating a list to store statistics for each sub sample
# Continuous columns to summarise in every subsample
stat_columns <- c("Total", "HP", "Attack", "Defense")

# Calculating statistics for each subsample: one single-row data frame per
# subsample holding its index plus the mean, median and standard deviation of
# every continuous column. The .names pattern yields Mean_Total, Median_Total,
# SD_Total, Mean_HP, ... in that order.
all_stats <- lapply(seq_along(subsamples), function(i) {
  subsamples[[i]] %>%
    summarise(
      Subsample = i,
      across(
        all_of(stat_columns),
        list(Mean = mean, Median = median, SD = sd),
        .names = "{.fn}_{.col}"
      )
    )
})

# Consolidating all stats into a single data frame (one row per subsample)
consolidated_stats <- bind_rows(all_stats)

# Displaying the consolidated stats using kable
kable(consolidated_stats)
Subsample Mean_Total Median_Total SD_Total Mean_HP Median_HP SD_HP Mean_Attack Median_Attack SD_Attack Mean_Defense Median_Defense SD_Defense
1 445.4573 460 117.1370 71.66834 70 27.35126 80.34506 80 31.31109 77.81072 75 33.11049
2 440.1826 470 118.2490 69.51256 68 24.22875 79.56114 75 31.96446 73.81407 70 29.88115
3 448.6281 473 116.9926 71.97822 70 27.04932 82.38526 80 32.03494 75.75712 70 30.81275
4 440.0687 465 120.7889 72.06700 70 27.60615 82.26801 80 31.05589 72.71692 70 28.46490
5 439.2194 467 132.5946 70.53936 68 28.54751 80.37186 78 31.79452 74.19765 70 32.35112
6 446.8961 464 122.4848 71.49246 69 25.53289 83.09380 80 32.64217 75.70184 70 30.81300
7 444.6248 470 118.6917 72.89782 70 26.73499 81.69347 80 29.90423 77.14238 72 30.92144
8 439.2730 455 121.8672 69.74372 68 24.55728 80.85092 79 32.80744 74.56449 70 30.07058
9 436.5997 455 127.8192 69.54606 65 27.21201 80.46901 80 33.24464 74.43719 70 33.04870
10 443.6382 473 128.2434 71.52094 70 27.83313 82.77554 80 33.03049 74.87270 70 30.59506
# The mean values for Total, HP, Attack, and Defense vary slightly across the 10 subsamples. For instance, the mean Total score fluctuates between approximately 437 and 449 across subsamples.

# With an average Mean_HP of around 71.1 across the subsamples, it appears that most Pokémon have a moderate amount of health. HP is crucial as it determines how much damage a Pokémon can take before fainting.

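# The distribution of Mean_HP is concentrated in a narrow band (roughly 69.5 to 72.9), meaning that the estimate of average health is consistent from one subsample to the next. Individual Pokémon still differ considerably in HP, as the SD_HP values of about 24 to 29 show.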

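# The Mean_Attack and Mean_Defense values appear to be positively related to Mean_Total across the subsamples (this is read off the table above; the correlation itself is not computed in this script). This suggests that Pokémon with higher attack or defense stats tend to have higher overall stats.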

# The standard deviations (SD_Total, SD_HP, SD_Attack, and SD_Defense) provide insights into the variability of these stats among Pokémon. For instance, a higher SD_Attack would mean that there's a wider range of attack values among Pokémon.

# Future question:

# What are the most common type combinations, and how do their stats compare?
# To identify potential anomalies, we can visually inspect the distribution of the continuous data across subsamples using boxplots.
# Combining subsamples into a single data frame with an integer identifier for
# each subsample. The id stays numeric so that factor(subsample_id) later
# orders the subsamples 1, 2, ..., 10 rather than alphabetically.
combined_data <- bind_rows(lapply(seq_along(subsamples), function(i) {
  subsamples[[i]] %>%
    mutate(subsample_id = i)
}))

# Reshaping the data to a long format for easy plotting: every column other
# than the type and id columns becomes a (Stat, Value) pair.
long_data <- combined_data %>%
  pivot_longer(
    cols = -c(Type1, Type2, subsample_id),
    names_to = "Stat",
    values_to = "Value"
  )

# Filtering for the columns of interest
plot_data <- long_data %>%
  filter(Stat %in% c("Total", "HP", "Attack", "Defense"))

# One boxplot per subsample for every stat. Each stat is drawn in its own
# panel with free axes, so panels are not forced onto a shared scale.
ggplot(
  data = plot_data,
  mapping = aes(y = Value, x = factor(subsample_id))
) +
  theme_minimal() +
  labs(
    y = "Value",
    x = "Subsample Index",
    title = "Distribution of Stats across Subsamples"
  ) +
  facet_wrap(facets = ~ Stat, scales = "free") +
  geom_boxplot()

#The central tendency (median) of the data is quite similar across all subsamples for each of the continuous variables (Total, HP, Attack, and Defense).

#There are outliers present in several sub samples. Some sub samples have outliers in the upper range, while others have them in the lower range. This variation in outliers indicates that what might be considered an anomaly in one subsample might not be in another.
#For consistency, we can check the frequency of categorical variables (Type1 and Type2) across sub samples. If certain categories appear frequently in all sub samples, they are consistent features of the data set.


# Calculating frequencies of Type1 and Type2 across subsamples: stack the two
# type columns into (Type_Category, Type_Value) pairs and count each value
# within every subsample.
type_frequencies <- combined_data %>%
  pivot_longer(
    cols = c(Type1, Type2),
    names_to = "Type_Category",
    values_to = "Type_Value"
  ) %>%
  count(subsample_id, Type_Category, Type_Value) %>%
  arrange(Type_Category, desc(n))

# Keeping the 5 most frequent types for each Type_Category in each subsample.
# Blank values are dropped first: a Pokémon without a secondary type has an
# empty Type2, which is not a type and would otherwise take a top-5 slot.
# slice_max() ranks on n directly, so the result does not depend on the row
# order left by the arrange() above.
top_types <- type_frequencies %>%
  filter(!is.na(Type_Value), Type_Value != "") %>%
  group_by(subsample_id, Type_Category) %>%
  slice_max(n, n = 5, with_ties = FALSE) %>%
  ungroup()

# Side-by-side bars of the pre-computed counts (n) per subsample, coloured by
# type, with one panel for Type1 and one for Type2.
ggplot(
  data = top_types,
  mapping = aes(fill = Type_Value, y = n, x = factor(subsample_id))
) +
  facet_wrap(facets = ~ Type_Category, scales = "free") +
  geom_bar(position = "dodge", stat = "identity") +
  # theme_minimal() must come before theme() so the legend tweaks are kept.
  theme_minimal() +
  theme(legend.title = element_blank(), legend.position = "bottom") +
  labs(
    y = "Frequency",
    x = "Subsample Index",
    title = "Top 5 Types in Type1 and Type2 across Subsamples"
  )

# Insights 

# Variability Among Subsamples: While the central tendency of the subsamples is quite consistent, there are variations in the spread and presence of outliers. This emphasizes the importance of having a sufficiently large sample size when drawing conclusions about a population, as different samples can present different perspectives.

#Anomalies and Context: What might be considered an outlier in one subsample might not be in another. This highlights the importance of context when identifying anomalies.

# Consistency in Categorical Data: Certain categories, like Water for Type1 and Flying for Type2, are consistently prevalent across subsamples. 


# Significance 

# This demonstrates the impact of sampling variability on data analysis. Different samples can yield different insights, underscoring the importance of comprehensive data collection and the potential pitfalls of drawing conclusions from limited data. 

# Further Questions:
# How would the insights change if we were to take larger or smaller subsamples?
# How would the insights change if we sampled without replacement?