# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
# Read the merged dataset from disk into a data frame
data <- read.csv(file = "D:/2024-2025/FA24/Stats/mergedfile.csv")
# Fix the RNG seed so the random draws that follow are reproducible
set.seed(seed = 123)
# Draw a random subsample of rows from a data frame, with replacement.
#   data:        data frame to sample rows from
#   sample_size: number of rows to draw
# Returns the sampled rows as produced by sample_n(); because sampling is
# done with replacement, the same source row may appear more than once.
generate_subsample <- function(data, sample_size) {
  sample_n(data, size = sample_size, replace = TRUE)
}
# Create 5 random subsamples, each 50% of the original data size.
# floor() makes the row count an explicit whole number: when nrow(data) is
# odd, nrow(data) * 0.5 is fractional and would otherwise be left to the
# sampling routine to coerce.
sample_size <- floor(nrow(data) * 0.5)
subsamples <- map(seq_len(5), \(i) generate_subsample(data, sample_size))
# Expose each subsample as its own data frame
df_1 <- subsamples[[1]]
df_2 <- subsamples[[2]]
df_3 <- subsamples[[3]]
df_4 <- subsamples[[4]]
df_5 <- subsamples[[5]]
# Compute per-(Sector, Industry) means of the price and fundamentals columns.
#   df: data frame with columns Sector, Industry, Adj.Close, Marketcap,
#       Currentprice, Ebitda and Revenuegrowth
# Returns an ungrouped summary with one row per Sector/Industry pair.
# Missing values are ignored when averaging (na.rm = TRUE).
summarize_by_sector_industry <- function(df) {
  grouped <- group_by(df, Sector, Industry)
  summarise(
    grouped,
    mean_adj_close = mean(Adj.Close, na.rm = TRUE),
    mean_marketcap = mean(Marketcap, na.rm = TRUE),
    mean_current_price = mean(Currentprice, na.rm = TRUE),
    mean_ebitda = mean(Ebitda, na.rm = TRUE),
    mean_revenue_growth = mean(Revenuegrowth, na.rm = TRUE),
    .groups = "drop"
  )
}
# Build one Sector/Industry summary table per subsample
summaries <- map(subsamples, summarize_by_sector_industry)
The summaries for all five subsamples are shown below:
# Print the Sector/Industry summary of subsample 1
print(summaries[[1L]])
## # A tibble: 56 × 7
## Sector Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Basic … Special… 122. 3.61e10 183. 1.69e 9
## 2 Commun… Interne… 59.8 2.05e12 167. 1.15e11
## 3 Commun… Telecom… 13.9 1.41e11 19.7 4.21e10
## 4 Consum… Auto & … 64.8 1.34e10 85.8 9.63e 8
## 5 Consum… Auto Pa… 52.4 1.22e10 52.5 2.40e 9
## 6 Consum… Interne… 66.8 1.86e12 177. 1.04e11
## 7 Consum… Packagi… 50.5 1.78e10 101. 1.79e 9
## 8 Consum… Resorts… 40.6 8.28e 9 38.3 3.71e 9
## 9 Consum… Special… 365. 2.69e10 1087. 2.83e 9
## 10 Consum… Travel … 711. 7.39e10 1265. 4.55e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
# Print the Sector/Industry summary of subsample 2
print(summaries[[2L]])
## # A tibble: 56 × 7
## Sector Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Basic … Special… 124. 3.67e10 186. 1.75e 9
## 2 Commun… Interne… 58.2 2.05e12 167. 1.15e11
## 3 Commun… Telecom… 14.0 1.41e11 19.7 4.21e10
## 4 Consum… Auto & … 64.2 1.34e10 85.8 9.63e 8
## 5 Consum… Auto Pa… 52.6 1.23e10 53.0 2.42e 9
## 6 Consum… Interne… 68.0 1.86e12 177. 1.04e11
## 7 Consum… Packagi… 49.3 1.78e10 99.4 1.79e 9
## 8 Consum… Resorts… 39.8 8.28e 9 38.3 3.71e 9
## 9 Consum… Special… 366. 2.70e10 1097. 2.84e 9
## 10 Consum… Travel … 740. 7.51e10 1331. 4.63e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
# Print the Sector/Industry summary of subsample 3
print(summaries[[3L]])
## # A tibble: 56 × 7
## Sector Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Basic … Special… 121. 3.64e10 185. 1.72e 9
## 2 Commun… Interne… 57.9 2.05e12 167. 1.15e11
## 3 Commun… Telecom… 14.1 1.41e11 19.7 4.21e10
## 4 Consum… Auto & … 64.4 1.34e10 85.8 9.63e 8
## 5 Consum… Auto Pa… 52.7 1.24e10 53.2 2.43e 9
## 6 Consum… Interne… 68.8 1.86e12 177. 1.04e11
## 7 Consum… Packagi… 49.6 1.78e10 99.0 1.79e 9
## 8 Consum… Resorts… 41.0 8.28e 9 38.3 3.71e 9
## 9 Consum… Special… 345. 2.63e10 1051. 2.79e 9
## 10 Consum… Travel … 733. 7.50e10 1330. 4.63e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
# Print the Sector/Industry summary of subsample 4
print(summaries[[4L]])
## # A tibble: 56 × 7
## Sector Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Basic … Special… 122. 3.69e10 187. 1.77e 9
## 2 Commun… Interne… 58.3 2.05e12 167. 1.15e11
## 3 Commun… Telecom… 14.0 1.41e11 19.7 4.21e10
## 4 Consum… Auto & … 64.1 1.34e10 85.8 9.63e 8
## 5 Consum… Auto Pa… 52.7 1.24e10 53.3 2.43e 9
## 6 Consum… Interne… 66.3 1.86e12 177. 1.04e11
## 7 Consum… Packagi… 50.4 1.78e10 98.9 1.79e 9
## 8 Consum… Resorts… 41.3 8.28e 9 38.3 3.71e 9
## 9 Consum… Special… 355. 2.67e10 1070. 2.83e 9
## 10 Consum… Travel … 730. 7.50e10 1323. 4.61e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
# Print the Sector/Industry summary of subsample 5
print(summaries[[5L]])
## # A tibble: 56 × 7
## Sector Industry mean_adj_close mean_marketcap mean_current_price mean_ebitda
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Basic … Special… 122. 3.63e10 184. 1.71e 9
## 2 Commun… Interne… 58.5 2.05e12 167. 1.15e11
## 3 Commun… Telecom… 14.0 1.41e11 19.7 4.21e10
## 4 Consum… Auto & … 63.6 1.34e10 85.8 9.63e 8
## 5 Consum… Auto Pa… 53.0 1.24e10 53.2 2.43e 9
## 6 Consum… Interne… 67.8 1.86e12 177. 1.04e11
## 7 Consum… Packagi… 50.3 1.77e10 98.0 1.79e 9
## 8 Consum… Resorts… 40.6 8.28e 9 38.3 3.71e 9
## 9 Consum… Special… 358. 2.67e10 1074. 2.82e 9
## 10 Consum… Travel … 736. 7.47e10 1321. 4.63e 9
## # ℹ 46 more rows
## # ℹ 1 more variable: mean_revenue_growth <dbl>
Key Differences: The subsamples can vary considerably, especially given that they are drawn with replacement. Some subsamples might over-represent certain sectors or industries, leading to different values for key metrics like Adj Close, Marketcap, or Ebitda. For example, one subsample may have an average Adj Close that is higher in the Technology sector, while another might have a lower average due to a different mix of companies included.
Sector Performance: Across different subsamples, sectors such as Healthcare or Energy might exhibit inconsistent behavior in terms of average price level (Adj Close) or market cap fluctuations.
Anomalies Due to Random Sampling: In one subsample, you might observe a much higher Adj Close or Revenuegrowth in a particular sector, say Technology, which might appear as an outlier. However, this could simply be due to the random oversampling of companies that performed particularly well in that subsample. If other subsamples do not exhibit the same extreme values, this would suggest the anomaly is driven by random sampling rather than genuine market behavior.
Sector Skew: If one subsample is heavily skewed toward a sector like Utilities, which is traditionally less volatile, it might show lower variability in key metrics (like Adj Close). In contrast, another subsample might show much more volatility if it has a higher proportion of Technology companies, which are generally more dynamic in pricing.
Consistent Sectors: Certain sectors, like Financials or Consumer Staples, may show consistent behavior across subsamples. For instance, their Marketcap or Revenuegrowth might display low standard deviations across different subsamples, indicating stability.
Stable Metrics: Metrics like Ebitda or Marketcap for large, established sectors (e.g., Energy or Financials) are likely to remain stable across all subsamples. These sectors are often less sensitive to market fluctuations and random sampling, indicating their performance is more predictable.
# Monte Carlo simulation: draw 100 subsamples of `sample_size` rows each
monte_carlo_simulation <- map(
  1:100,
  \(i) generate_subsample(data, sample_size)
)
# Summarize every simulated subsample by Sector and Industry
monte_carlo_summaries <- map(
  monte_carlo_simulation,
  summarize_by_sector_industry
)
# Stack the per-simulation summaries into one data frame; the `simulation`
# column identifies which list element each row came from
combined_summaries <- bind_rows(monte_carlo_summaries, .id = "simulation")
# Group by Sector and Industry, then compute the average and the standard
# deviation of each per-simulation group mean across all simulations
final_results <- summarise(
  group_by(combined_summaries, Sector, Industry),
  avg_mean_adj_close = mean(mean_adj_close, na.rm = TRUE),
  avg_mean_marketcap = mean(mean_marketcap, na.rm = TRUE),
  avg_mean_current_price = mean(mean_current_price, na.rm = TRUE),
  sd_adj_close = sd(mean_adj_close, na.rm = TRUE),
  sd_marketcap = sd(mean_marketcap, na.rm = TRUE),
  sd_current_price = sd(mean_current_price, na.rm = TRUE),
  .groups = "drop"
)
# Show the aggregated results
print(final_results)
## # A tibble: 56 × 8
## Sector Industry avg_mean_adj_close avg_mean_marketcap avg_mean_current_price
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Basic … Special… 122. 3.60e10 183.
## 2 Commun… Interne… 58.6 2.05e12 167.
## 3 Commun… Telecom… 14.0 1.41e11 19.7
## 4 Consum… Auto & … 64.4 1.34e10 85.8
## 5 Consum… Auto Pa… 53.1 1.24e10 53.1
## 6 Consum… Interne… 68.4 1.86e12 177.
## 7 Consum… Packagi… 49.6 1.78e10 98.2
## 8 Consum… Resorts… 40.4 8.28e 9 38.3
## 9 Consum… Special… 365. 2.68e10 1081.
## 10 Consum… Travel … 723. 7.47e10 1322.
## # ℹ 46 more rows
## # ℹ 3 more variables: sd_adj_close <dbl>, sd_marketcap <dbl>,
## # sd_current_price <dbl>