# Load necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)

# Read the merged data set from disk into a data frame.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where that file exists.
data <- read.csv(file = "D:/2024-2025/FA24/Stats/mergedfile.csv")

# Fix the RNG seed so the random draws that follow can be reproduced
set.seed(seed = 123)

# Draw a random subsample of rows from `data`, sampling with replacement.
#
# Arguments:
#   data         Data frame (or tibble) to sample rows from.
#   sample_size  Number of rows to draw: a single finite, non-negative number.
#                Fractional values are rounded down to a whole row count.
#
# Returns `data` restricted to the sampled rows. Because rows are drawn with
# replacement, the same row can appear more than once.
generate_subsample <- function(data, sample_size) {
  # Fail early with a clear message instead of an error from deep inside dplyr
  stopifnot(
    is.numeric(sample_size),
    length(sample_size) == 1L,
    is.finite(sample_size),
    sample_size >= 0
  )

  # slice_sample() requires a whole number of rows, so round down explicitly
  # rather than relying on silent truncation
  n_rows <- floor(sample_size)

  # slice_sample() is the maintained replacement for the superseded sample_n()
  data |> slice_sample(n = n_rows, replace = TRUE)
}

# Create 5 random subsamples, each half the size of the original data.
# nrow(data) * 0.5 is fractional when the row count is odd, so round down to
# get a whole number of rows to draw.
sample_size <- floor(nrow(data) * 0.5)
subsamples <- map(seq_len(5), \(i) generate_subsample(data, sample_size))

# Expose each subsample as its own data frame
df_1 <- subsamples[[1]]
df_2 <- subsamples[[2]]
df_3 <- subsamples[[3]]
df_4 <- subsamples[[4]]
df_5 <- subsamples[[5]]
# Collapse a subsample to one row per Sector/Industry pair, reporting the mean
# of each metric column with missing values ignored. The result is ungrouped.
summarize_by_sector_industry <- function(df) {
  # Mean that skips NA values; shared by every summary column below
  mean_without_na <- function(values) mean(values, na.rm = TRUE)

  grouped <- group_by(df, Sector, Industry)

  summarize(
    grouped,
    mean_adj_close = mean_without_na(Adj.Close),
    mean_marketcap = mean_without_na(Marketcap),
    mean_current_price = mean_without_na(Currentprice),
    mean_ebitda = mean_without_na(Ebitda),
    mean_revenue_growth = mean_without_na(Revenuegrowth),
    .groups = "drop"
  )
}

# Build one Sector/Industry summary table per subsample
summaries <- map(subsamples, summarize_by_sector_industry)

Subsamples

The summaries of all five subsample data frames are shown below:

# Print the Sector/Industry summary of each of the five subsamples in turn,
# starting with subsample 1
print(summaries[[1]])
## # A tibble: 56 × 7
##    Sector  Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
##    <chr>   <chr>             <dbl>          <dbl>              <dbl>       <dbl>
##  1 Basic … Special…          122.         3.61e10              183.      1.69e 9
##  2 Commun… Interne…           59.8        2.05e12              167.      1.15e11
##  3 Commun… Telecom…           13.9        1.41e11               19.7     4.21e10
##  4 Consum… Auto & …           64.8        1.34e10               85.8     9.63e 8
##  5 Consum… Auto Pa…           52.4        1.22e10               52.5     2.40e 9
##  6 Consum… Interne…           66.8        1.86e12              177.      1.04e11
##  7 Consum… Packagi…           50.5        1.78e10              101.      1.79e 9
##  8 Consum… Resorts…           40.6        8.28e 9               38.3     3.71e 9
##  9 Consum… Special…          365.         2.69e10             1087.      2.83e 9
## 10 Consum… Travel …          711.         7.39e10             1265.      4.55e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
# Sector/Industry summary for subsample 2
print(summaries[[2]])
## # A tibble: 56 × 7
##    Sector  Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
##    <chr>   <chr>             <dbl>          <dbl>              <dbl>       <dbl>
##  1 Basic … Special…          124.         3.67e10              186.      1.75e 9
##  2 Commun… Interne…           58.2        2.05e12              167.      1.15e11
##  3 Commun… Telecom…           14.0        1.41e11               19.7     4.21e10
##  4 Consum… Auto & …           64.2        1.34e10               85.8     9.63e 8
##  5 Consum… Auto Pa…           52.6        1.23e10               53.0     2.42e 9
##  6 Consum… Interne…           68.0        1.86e12              177.      1.04e11
##  7 Consum… Packagi…           49.3        1.78e10               99.4     1.79e 9
##  8 Consum… Resorts…           39.8        8.28e 9               38.3     3.71e 9
##  9 Consum… Special…          366.         2.70e10             1097.      2.84e 9
## 10 Consum… Travel …          740.         7.51e10             1331.      4.63e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
# Sector/Industry summary for subsample 3
print(summaries[[3]])
## # A tibble: 56 × 7
##    Sector  Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
##    <chr>   <chr>             <dbl>          <dbl>              <dbl>       <dbl>
##  1 Basic … Special…          121.         3.64e10              185.      1.72e 9
##  2 Commun… Interne…           57.9        2.05e12              167.      1.15e11
##  3 Commun… Telecom…           14.1        1.41e11               19.7     4.21e10
##  4 Consum… Auto & …           64.4        1.34e10               85.8     9.63e 8
##  5 Consum… Auto Pa…           52.7        1.24e10               53.2     2.43e 9
##  6 Consum… Interne…           68.8        1.86e12              177.      1.04e11
##  7 Consum… Packagi…           49.6        1.78e10               99.0     1.79e 9
##  8 Consum… Resorts…           41.0        8.28e 9               38.3     3.71e 9
##  9 Consum… Special…          345.         2.63e10             1051.      2.79e 9
## 10 Consum… Travel …          733.         7.50e10             1330.      4.63e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
# Sector/Industry summary for subsample 4
print(summaries[[4]])
## # A tibble: 56 × 7
##    Sector  Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
##    <chr>   <chr>             <dbl>          <dbl>              <dbl>       <dbl>
##  1 Basic … Special…          122.         3.69e10              187.      1.77e 9
##  2 Commun… Interne…           58.3        2.05e12              167.      1.15e11
##  3 Commun… Telecom…           14.0        1.41e11               19.7     4.21e10
##  4 Consum… Auto & …           64.1        1.34e10               85.8     9.63e 8
##  5 Consum… Auto Pa…           52.7        1.24e10               53.3     2.43e 9
##  6 Consum… Interne…           66.3        1.86e12              177.      1.04e11
##  7 Consum… Packagi…           50.4        1.78e10               98.9     1.79e 9
##  8 Consum… Resorts…           41.3        8.28e 9               38.3     3.71e 9
##  9 Consum… Special…          355.         2.67e10             1070.      2.83e 9
## 10 Consum… Travel …          730.         7.50e10             1323.      4.61e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
# Sector/Industry summary for subsample 5
print(summaries[[5]])
## # A tibble: 56 × 7
##    Sector  Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
##    <chr>   <chr>             <dbl>          <dbl>              <dbl>       <dbl>
##  1 Basic … Special…          122.         3.63e10              184.      1.71e 9
##  2 Commun… Interne…           58.5        2.05e12              167.      1.15e11
##  3 Commun… Telecom…           14.0        1.41e11               19.7     4.21e10
##  4 Consum… Auto & …           63.6        1.34e10               85.8     9.63e 8
##  5 Consum… Auto Pa…           53.0        1.24e10               53.2     2.43e 9
##  6 Consum… Interne…           67.8        1.86e12              177.      1.04e11
##  7 Consum… Packagi…           50.3        1.77e10               98.0     1.79e 9
##  8 Consum… Resorts…           40.6        8.28e 9               38.3     3.71e 9
##  9 Consum… Special…          358.         2.67e10             1074.      2.82e 9
## 10 Consum… Travel …          736.         7.47e10             1321.      4.63e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>

Questions

How are they different?

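  • Key Differences: The subsamples differ from one another because each is drawn with replacement, so every subsample contains a different mix of rows, with some rows repeated and others left out. Some subsamples might over-represent certain sectors or industries, leading to different values for key metrics like Adj.Close, Marketcap, or Ebitda. In the output above, for example, the mean Adj.Close of the group in row 10 is 711 in subsample 1 but 740 in subsample 2, whereas the mean Marketcap of the group in row 2 is 2.05e12 in all five subsamples. Likewise, one subsample may have an average Adj.Close that is higher in the Technology sector, while another might have a lower average due to a different mix of companies included.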

  • Sector Performance: Across different subsamples, sectors such as Healthcare or Energy might exhibit inconsistent behavior in terms of price level (Adj.Close) or market capitalization (Marketcap).

Anomalies

  • Anomalies Due to Random Sampling: In one subsample, you might observe a much higher Adj Close or Revenuegrowth in a particular sector, say Technology, which might appear as an outlier. However, this could simply be due to the random oversampling of companies that performed particularly well in that subsample. If other subsamples do not exhibit the same extreme values, this would suggest the anomaly is driven by random sampling rather than genuine market behavior.

  • Sector Skew: If one subsample is heavily skewed toward a sector like Utilities, which is traditionally less volatile, it might show lower variability in key metrics (like Adj Close). In contrast, another subsample might show much more volatility if it has a higher proportion of Technology companies, which are generally more dynamic in pricing.

Consistencies

  • Consistent Sectors: Certain sectors, like Financials or Consumer Staples, may show consistent behavior across subsamples. For instance, their Marketcap or Revenuegrowth might display low standard deviations across different subsamples, indicating stability.

  • Stable Metrics: Metrics like Ebitda or Marketcap for large, established sectors (e.g., Energy or Financials) are likely to remain stable across all subsamples. These sectors are often less sensitive to market fluctuations and random sampling, indicating their performance is more predictable.

# Monte Carlo simulation: draw 100 subsamples of `sample_size` rows each
monte_carlo_simulation <- map(
  seq_len(100),
  \(i) generate_subsample(data, sample_size)
)

# Reduce every simulated subsample to its Sector/Industry summary table
monte_carlo_summaries <- map(
  monte_carlo_simulation,
  summarize_by_sector_industry
)

# Stack the summaries into a single data frame for further analysis; the
# `simulation` column identifies which list element each row came from
combined_summaries <- bind_rows(monte_carlo_summaries, .id = "simulation")


# For every Sector/Industry pair, compute the average and the standard
# deviation of the per-simulation means across all simulations
final_results <- summarize(
  group_by(combined_summaries, Sector, Industry),
  avg_mean_adj_close = mean(mean_adj_close, na.rm = TRUE),
  avg_mean_marketcap = mean(mean_marketcap, na.rm = TRUE),
  avg_mean_current_price = mean(mean_current_price, na.rm = TRUE),
  sd_adj_close = sd(mean_adj_close, na.rm = TRUE),
  sd_marketcap = sd(mean_marketcap, na.rm = TRUE),
  sd_current_price = sd(mean_current_price, na.rm = TRUE),
  .groups = "drop"
)

# Print the across-simulation averages and standard deviations, one row per
# Sector/Industry pair
print(final_results)
## # A tibble: 56 × 8
##    Sector  Industry avg_mean_adj_close avg_mean_marketcap avg_mean_current_price
##    <chr>   <chr>                 <dbl>              <dbl>                  <dbl>
##  1 Basic … Special…              122.             3.60e10                  183. 
##  2 Commun… Interne…               58.6            2.05e12                  167. 
##  3 Commun… Telecom…               14.0            1.41e11                   19.7
##  4 Consum… Auto & …               64.4            1.34e10                   85.8
##  5 Consum… Auto Pa…               53.1            1.24e10                   53.1
##  6 Consum… Interne…               68.4            1.86e12                  177. 
##  7 Consum… Packagi…               49.6            1.78e10                   98.2
##  8 Consum… Resorts…               40.4            8.28e 9                   38.3
##  9 Consum… Special…              365.             2.68e10                 1081. 
## 10 Consum… Travel …              723.             7.47e10                 1322. 
## # ℹ 46 more rows
## # ℹ 3 more variables: sd_adj_close <dbl>, sd_marketcap <dbl>,
## #   sd_current_price <dbl>