Week 4 Data Dive

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.3     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Reading the dataset (the path is resolved against the working directory)
pokemon_data <- read.csv("./PokemonStats.csv")

# Calculating 50% of the dataset's length.
# floor() keeps the size a whole number when the row count is odd, so the
# length reported below is always a valid number of rows to draw.
subsample_length <- floor(nrow(pokemon_data) * 0.5)

cat(paste("Subsample Length is", subsample_length, "rows"))

## Subsample Length is 597 rows

# Columns for sampling
selected_columns <- c("Type1", "Type2", "Total", "HP", "Attack", "Defense")

# Fix the RNG seed so the subsamples, and every statistic derived from them,
# are the same on each run of the report.
set.seed(42)

# Collecting 10 random samples with replacement.
# lapply() builds the list in one step instead of growing it inside a loop;
# slice_sample() is the current replacement for the superseded sample_n().
# as.integer() guarantees a whole-number size even if subsample_length is
# fractional.
subsamples <- lapply(seq_len(10), function(idx) {
  pokemon_data %>%
    select(all_of(selected_columns)) %>%
    slice_sample(n = as.integer(subsample_length), replace = TRUE)
})

# Print the first rows of every subsample under a numbered heading.
# seq_along() tracks the actual length of `subsamples`, so this preview stays
# correct if the number of subsamples drawn is ever changed.
for (i in seq_along(subsamples)) {
  cat(paste("Subsample", i, ":\n"))
  print(head(subsamples[[i]]))
  cat("\n")
}

## Subsample 1 :
##      Type1  Type2 Total HP Attack Defense
## 1   Flying Dragon   535 85     70      80
## 2 Electric  Steel   325 25     35      70
## 3    Steel          630 70    140     230
## 4     Dark  Fairy   370 65     60      45
## 5   Normal Ground   423 85     56      77
## 6    Water Flying   485 85     40      70
## 
## Subsample 2 :
##    Type1   Type2 Total  HP Attack Defense
## 1  Fairy   Steel   506  85     75      77
## 2  Water           316  50     53      48
## 3 Normal           490 104     91      63
## 4 Poison           500 105    105      75
## 5    Ice Psychic   455  65     50      35
## 6  Grass     Ice   494  90     92      75
## 
## Subsample 3 :
##      Type1  Type2 Total  HP Attack Defense
## 1    Water          580 100     75     115
## 2     Dark          455  70     58      58
## 3   Normal Flying   262  40     60      30
## 4   Flying          580  79    115      70
## 5 Fighting          310  50     68      60
## 6     Rock Ground   390  55     95     115
## 
## Subsample 4 :
##    Type1    Type2 Total  HP Attack Defense
## 1 Normal            670 110    160     110
## 2    Bug   Poison   400  70     90      70
## 3   Rock Electric   570 100    134     110
## 4  Grass    Ghost   530  78    107      75
## 5   Fire   Poison   570  80     70      60
## 6    Bug   Flying   474  70     80     102
## 
## Subsample 5 :
##      Type1 Type2 Total  HP Attack Defense
## 1   Poison         273  46     57      40
## 2      Bug Steel   495  70    135     105
## 3   Dragon         580 200    100      50
## 4 Electric         205  20     40      15
## 5     Dark         220  35     55      35
## 6    Steel Ghost   448  59    110     150
## 
## Subsample 6 :
##      Type1  Type2 Total HP Attack Defense
## 1   Normal Flying   278 45     50      43
## 2   Normal          440 65     70      60
## 3     Rock          487 75    117      65
## 4 Electric          270 59     45      50
## 5  Psychic          315 90     65      65
## 6     Rock          580 80    100     200
## 
## Subsample 7 :
##      Type1  Type2 Total  HP Attack Defense
## 1 Electric    Ice   520  50     65     107
## 2     Rock Ground   300  50     64      50
## 3  Psychic  Ghost   680 137    113      89
## 4    Steel   Rock   530  70    110     180
## 5    Grass          530  70     85      65
## 6    Water          280  41     63      40
## 
## Subsample 8 :
##      Type1   Type2 Total HP Attack Defense
## 1 Electric           330 40     30      50
## 2 Fighting           505 90    130      80
## 3    Steel Psychic   300 57     24      86
## 4     Fire           505 73     76      75
## 5      Bug           224 40     29      45
## 6    Water  Ground   384 75     65      55
## 
## Subsample 9 :
##    Type1   Type2 Total  HP Attack Defense
## 1    Bug  Ground   424  60     79     105
## 2 Ground           670 100    150     140
## 3 Ground           265  10     55      25
## 4  Steel Psychic   600  80    135     130
## 5 Normal           288  48     48      48
## 6   Dark    Fire   570  55     80      80
## 
## Subsample 10 :
##      Type1   Type2 Total  HP Attack Defense
## 1     Rock    Dark   600 100    134     110
## 2    Water           530  70     85      65
## 3  Psychic   Steel   680  97    157     127
## 4    Grass Psychic   530  95     95      85
## 5 Electric           510  90     75      85
## 6   Dragon           410  66    117      70

# We can compare basic statistics (mean, median, and standard deviation) for the continuous columns across all subsamples. This will give us an idea of the variability among the subsamples.

# Building one single-row summary per subsample.
# across() applies mean, median and sd to each continuous column, and .names
# reproduces the <Stat>_<Column> naming (Mean_Total, Median_Total, SD_Total,
# Mean_HP, ...) in the same column order as writing each expression by hand.
# seq_along() follows however many subsamples were actually drawn.
all_stats <- lapply(seq_along(subsamples), function(idx) {
  subsamples[[idx]] %>%
    summarise(
      Subsample = idx,
      across(
        c(Total, HP, Attack, Defense),
        list(Mean = mean, Median = median, SD = sd),
        .names = "{.fn}_{.col}"
      )
    )
})

# Consolidating all stats into a single data frame (one row per subsample)
consolidated_stats <- bind_rows(all_stats)

# Displaying the consolidated stats using kable
kable(consolidated_stats)

Subsample	Mean_Total	Median_Total	SD_Total	Mean_HP	Median_HP	SD_HP	Mean_Attack	Median_Attack	SD_Attack	Mean_Defense	Median_Defense	SD_Defense
1	445.4573	460	117.1370	71.66834	70	27.35126	80.34506	80	31.31109	77.81072	75	33.11049
2	440.1826	470	118.2490	69.51256	68	24.22875	79.56114	75	31.96446	73.81407	70	29.88115
3	448.6281	473	116.9926	71.97822	70	27.04932	82.38526	80	32.03494	75.75712	70	30.81275
4	440.0687	465	120.7889	72.06700	70	27.60615	82.26801	80	31.05589	72.71692	70	28.46490
5	439.2194	467	132.5946	70.53936	68	28.54751	80.37186	78	31.79452	74.19765	70	32.35112
6	446.8961	464	122.4848	71.49246	69	25.53289	83.09380	80	32.64217	75.70184	70	30.81300
7	444.6248	470	118.6917	72.89782	70	26.73499	81.69347	80	29.90423	77.14238	72	30.92144
8	439.2730	455	121.8672	69.74372	68	24.55728	80.85092	79	32.80744	74.56449	70	30.07058
9	436.5997	455	127.8192	69.54606	65	27.21201	80.46901	80	33.24464	74.43719	70	33.04870
10	443.6382	473	128.2434	71.52094	70	27.83313	82.77554	80	33.03049	74.87270	70	30.59506

# The mean values for Total, HP, Attack, and Defense vary slightly across the 10 subsamples. For instance, the mean Total score fluctuates between approximately 437 and 449 across subsamples.

# With an average Mean_HP of around 71.1, it appears that most Pokémon have a moderate amount of health. HP is crucial as it determines how much damage a Pokémon can take before fainting.

# The Mean_HP values are concentrated between roughly 69.5 and 73 across the subsamples, meaning that the average health stat is consistent from one subsample to the next.

# The Mean_Attack and Mean_Defense values have strong positive correlations with the Mean_Total. This suggests that Pokémon with higher attack or defense stats tend to have higher overall stats.

# The standard deviations (SD_Total, SD_HP, SD_Attack, and SD_Defense) provide insight into the variability of these stats among Pokémon. For instance, a higher SD_Attack would mean that there's a wider range of attack values among Pokémon.

# Future question:

# What are the most common type combinations, and how do their stats compare?

# To identify potential anomalies, we can visually inspect the distribution of the continuous data across subsamples using boxplots.

# Combining subsamples into a single data frame with an integer identifier
# for each subsample. The id is kept numeric so factor() orders the boxplots
# 1, 2, ..., 10 rather than alphabetically.
combined_data <- bind_rows(lapply(seq_along(subsamples), function(idx) {
  mutate(subsamples[[idx]], subsample_id = idx)
}))

# Reshaping to long format (one row per Pokémon/stat pair) for easy plotting.
# pivot_longer() replaces the superseded gather().
long_data <- combined_data %>%
  pivot_longer(
    cols = -c(Type1, Type2, subsample_id),
    names_to = "Stat",
    values_to = "Value"
  )

# Filtering for the columns of interest
plot_data <- long_data %>%
  filter(Stat %in% c("Total", "HP", "Attack", "Defense"))

# Plotting the boxplots: one panel per stat, one box per subsample.
# scales = "free" lets each panel use its own axis range.
ggplot(plot_data, aes(x = factor(subsample_id), y = Value)) +
  geom_boxplot() +
  facet_wrap(~ Stat, scales = "free") +
  labs(
    title = "Distribution of Stats across Subsamples",
    x = "Subsample Index",
    y = "Value"
  ) +
  theme_minimal()

# The central tendency (median) of the data is quite similar across all subsamples for each of the continuous variables (Total, HP, Attack, and Defense).

# There are outliers present in several subsamples. Some subsamples have outliers in the upper range, while others have them in the lower range. This variation in outliers indicates that what might be considered an anomaly in one subsample might not be in another.

# For consistency, we can check the frequency of categorical variables (Type1 and Type2) across subsamples. If certain categories appear frequently in all subsamples, they are consistent features of the dataset.


# Calculating frequencies of Type1 and Type2 across subsamples.
# pivot_longer() replaces the superseded gather(). Blank/missing values are
# dropped before counting: an empty Type2 means "no secondary type" and would
# otherwise be ranked as if it were a type of its own.
type_frequencies <- combined_data %>%
  pivot_longer(
    cols = c(Type1, Type2),
    names_to = "Type_Category",
    values_to = "Type_Value"
  ) %>%
  filter(!is.na(Type_Value), Type_Value != "") %>%
  count(subsample_id, Type_Category, Type_Value) %>%
  arrange(Type_Category, desc(n))

# Keeping the 5 most frequent types for each Type_Category in each subsample.
# slice_max() ranks by the count explicitly instead of relying on the row
# order left by a previous arrange(); with_ties = FALSE caps each group at 5.
top_types <- type_frequencies %>%
  group_by(subsample_id, Type_Category) %>%
  slice_max(order_by = n, n = 5, with_ties = FALSE) %>%
  ungroup()

# Plotting the frequencies: one panel per type slot, bars grouped by subsample
ggplot(top_types, aes(x = factor(subsample_id), y = n, fill = Type_Value)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Type_Category, scales = "free") +
  labs(
    title = "Top 5 Types in Type1 and Type2 across Subsamples",
    x = "Subsample Index",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom", legend.title = element_blank())

# Insights 

# Variability Among Subsamples: While the central tendency of the subsamples is quite consistent, there are variations in the spread and presence of outliers. This emphasizes the importance of having a sufficiently large sample size when drawing conclusions about a population, as different samples can present different perspectives.

# Anomalies and Context: What might be considered an outlier in one subsample might not be in another. This highlights the importance of context when identifying anomalies.

# Consistency in Categorical Data: Certain categories, like Water for Type1 and Flying for Type2, are consistently prevalent across subsamples. 


# Significance 

# This demonstrates the impact of sampling variability on data analysis. Different samples can yield different insights, underscoring the importance of comprehensive data collection and the potential pitfalls of drawing conclusions from limited data. 

# Further Questions:
# How would the insights change if we were to take larger or smaller subsamples?
# How would the insights change if we sampled without replacement?

Week 4 Data Dive

Navdeep Metchu