I am going to create multiple random samples from the existing dataset by generating subsamples of about 50 percent of the original dataset, so that we can analyze and understand the variability in both categorical and continuous data. Each sample is stored in a separate data frame for easy access and comparison.

# Load necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readxl)

Load the dataset using the provided path

# Read the Udemy courses file; read.csv() parses it as comma-separated text
# even though the file name ends in ".xls"
df_1 <- read.csv(file = "/Users/umakasi/Downloads/udemy_courses.csv.xls")

Check if the data is loaded correctly and is a data frame

# Make sure the loaded object is a plain data frame
df_1 <- as.data.frame(df_1)

# Print the structure (column names and types) to confirm the load worked;
# this call must sit on its own line, otherwise it is part of the comment
# above and never runs
str(df_1)

Set the number of samples to 5

# Number of random subsamples to draw from df_1 (used as the lapply() range
# when the samples are created)
num_samples <- 5

Samples are 50% of the data with replacement

# Fraction of the original rows drawn for each subsample (50%)
fraction <- 0.5

# Fix the RNG seed so the subsamples, and every result derived from them,
# are reproducible from run to run
set.seed(123)

# Draw `num_samples` subsamples with replacement. seq_len() gives an empty
# sequence when num_samples is 0, whereas 1:num_samples would give c(1, 0).
samples <- lapply(seq_len(num_samples), function(i) {
  sample_n(df_1, size = ceiling(nrow(df_1) * fraction), replace = TRUE)
})

#nrow() returns the number of rows in the dataset

Assigning each sample to separate variables

# Unpack the list of subsamples into individually named data frames
df_1_sample <- samples[[1L]]
df_2_sample <- samples[[2L]]
df_3_sample <- samples[[3L]]
df_4_sample <- samples[[4L]]
df_5_sample <- samples[[5L]]

# Report the dimensions (rows, columns) of the first subsample
dim(df_1_sample)
## [1] 1839   12
# Preview the first six rows of the first subsample
head(df_1_sample, n = 6L)
##   course_id                                                 course_title
## 1     81064                               Easy Forex and Futures Trading
## 2    301442                 Black Algo Trading: Build Your Trading Robot
## 3    637452                             Complete Guide to Business Loans
## 4    476880      Financial Model Fundamentals-Learn the Essential Skills
## 5    411168 Trading Inside Bars - Master 1 Easy Pattern To Be Successful
## 6    250902                                           Nest Egg Investing
##                                                                                url
## 1                            https://www.udemy.com/easy-forex-and-futures-trading/
## 2                                  https://www.udemy.com/build-your-trading-robot/
## 3                          https://www.udemy.com/complete-guide-to-business-loans/
## 4   https://www.udemy.com/financial-model-fundamentals-learn-the-essential-skills/
## 5 https://www.udemy.com/trading-inside-bars-find-setups-today-make-money-tomorrow/
## 6                                          https://www.udemy.com/nest-egg-secrets/
##   is_paid price num_subscribers num_reviews num_lectures              level
## 1    True   100             218           5           66         All Levels
## 2    True   200           20195        1113          227         All Levels
## 3    True    30              35           4           35     Beginner Level
## 4    True    75              47           5           27         All Levels
## 5    True   125            2996         200           23 Intermediate Level
## 6    True    45               6           0           12     Beginner Level
##   content_duration  published_timestamp          subject
## 1             14.5 2013-12-31T01:22:44Z Business Finance
## 2             16.0 2014-10-27T22:01:36Z Business Finance
## 3              1.5 2015-12-04T05:11:46Z Business Finance
## 4              2.5 2015-06-01T20:58:11Z Business Finance
## 5              2.0 2015-03-09T00:24:38Z Business Finance
## 6              2.0 2014-06-27T15:15:12Z Business Finance
# Keep one summary per subsample under its own name so that no result is
# overwritten by the next assignment
summary_1 <- summary(df_1_sample)
summary_2 <- summary(df_2_sample)
summary_3 <- summary(df_3_sample)
summary_4 <- summary(df_4_sample)
summary_5 <- summary(df_5_sample)

# Print the summaries one after another so the subsamples can be compared
print(summary_1)
print(summary_2)
print(summary_3)
print(summary_4)
print(summary_5)
##    course_id       course_title           url              is_paid         
##  Min.   :   8325   Length:1839        Length:1839        Length:1839       
##  1st Qu.: 403604   Class :character   Class :character   Class :character  
##  Median : 672086   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 669604                                                           
##  3rd Qu.: 951331                                                           
##  Max.   :1275872                                                           
##      price        num_subscribers  num_reviews     num_lectures   
##  Min.   :  0.00   Min.   :    0   Min.   :    0   Min.   :  5.00  
##  1st Qu.: 20.00   1st Qu.:  111   1st Qu.:    4   1st Qu.: 14.00  
##  Median : 45.00   Median :  878   Median :   16   Median : 24.00  
##  Mean   : 66.57   Mean   : 2982   Mean   :  155   Mean   : 40.48  
##  3rd Qu.: 95.00   3rd Qu.: 2532   3rd Qu.:   65   3rd Qu.: 45.50  
##  Max.   :200.00   Max.   :72932   Max.   :15117   Max.   :544.00  
##     level           content_duration  published_timestamp   subject         
##  Length:1839        Min.   : 0.1333   Length:1839         Length:1839       
##  Class :character   1st Qu.: 1.0000   Class :character    Class :character  
##  Mode  :character   Median : 2.0000   Mode  :character    Mode  :character  
##                     Mean   : 4.1156                                         
##                     3rd Qu.: 4.5000                                         
##                     Max.   :70.0000

#By generating random subsamples that each represent 50% of the original dataset, we efficiently simulate the process of collecting data from a larger population. This enhances the reliability of statistical modeling, enabling better decision-making and making the analysis more efficient and structured.

#From this output we can see some course IDs and their corresponding course titles

Group each subsample by subject and compute the mean price per subject

# Load necessary libraries
library(dplyr)
library(ggplot2)

# Average course price per subject for a single subsample.
#
# df_sample: data frame with `subject` and `price` columns.
# Returns one row per subject with a `mean_price` column; missing prices are
# ignored and the result carries no grouping.
mean_price_by_subject <- function(df_sample) {
  by_subject <- group_by(df_sample, subject)
  summarise(
    by_subject,
    mean_price = mean(price, na.rm = TRUE),
    .groups = "drop"
  )
}

# Per-subject mean price for every subsample, stacked into one table with a
# `Sample` column naming the subsample each row came from
mean_price_results <- bind_rows(Map(
  function(sample_df, sample_label) {
    mutate(mean_price_by_subject(sample_df), Sample = sample_label)
  },
  list(df_1_sample, df_2_sample, df_3_sample, df_4_sample, df_5_sample),
  c("Sample 1", "Sample 2", "Sample 3", "Sample 4", "Sample 5")
))

# Grouped bar chart: one cluster of bars per subject, one bar per subsample
ggplot(
  data = mean_price_results,
  mapping = aes(x = subject, y = mean_price, fill = Sample)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Mean Price by Subject for All Subsamples",
    x = "Subject",
    y = "Mean Price"
  ) +
  theme_minimal() +
  # Tilt the subject labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Here the mean prices are visualized by subject for each subsample, which gives insight into pricing strategies and course offerings. The differences between the samples are small, from which we can conclude that the samples are consistent.

Scatter plot of Price vs subject with anomalies highlighted

# Load necessary libraries
library(dplyr)
library(ggplot2)

# Flag price anomalies within one subsample using Z-scores.
#
# df_sample: data frame with a numeric `price` column.
# threshold: absolute Z-score above which a price is flagged as an anomaly
#            (default 2).
# Returns `df_sample` with two added columns:
#   z_price - price standardised against the subsample's own mean and
#             standard deviation (missing prices are ignored in both)
#   anomaly - TRUE where abs(z_price) exceeds `threshold`
identify_anomalies <- function(df_sample, threshold = 2) {
  df_sample %>%
    mutate(
      z_price = (price - mean(price, na.rm = TRUE)) / sd(price, na.rm = TRUE),
      # .env$ makes sure the function argument is used even if the data
      # happens to contain a column called `threshold`
      anomaly = abs(z_price) > .env$threshold
    )
}

# Flag anomalies in every subsample and stack the results, adding a `Sample`
# column that names the subsample each row came from
anomalies_combined <- bind_rows(Map(
  function(sample_df, sample_label) {
    mutate(identify_anomalies(sample_df), Sample = sample_label)
  },
  list(df_1_sample, df_2_sample, df_3_sample, df_4_sample, df_5_sample),
  c("Sample 1", "Sample 2", "Sample 3", "Sample 4", "Sample 5")
))

# Jittered scatter plot of price by subject, one panel per subsample, with
# points coloured by the anomaly flag
ggplot(
  data = anomalies_combined,
  mapping = aes(x = subject, y = price, color = anomaly)
) +
  # Jitter horizontally only, so the plotted prices stay exact
  geom_jitter(width = 0.2, height = 0, alpha = 0.3, size = 2) +
  facet_wrap(vars(Sample)) +
  labs(
    title = "Anomalies in Price by Subject for Each Subsample",
    x = "Subject",
    y = "Price"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#I used a scatter plot to visualize the anomalies: points with no anomaly are red and points flagged as anomalies are blue. From this visualization we can say there are few anomalies across the samples. #What counts as an anomaly depends on the context and distribution of the data in each subsample; it can depend on price outliers, course levels, etc.

Monte Carlo Simulation for a specific group

# Load necessary libraries
library(dplyr)

# Monte Carlo resampling of the mean price per subject.
#
# df_sample:    data frame with `subject` and `price` columns.
# n_iterations: number of resampling iterations (default 1000); 0 gives an
#               empty result.
# fraction:     share of the rows of `df_sample` drawn, with replacement, in
#               each iteration (default 0.5).
# Returns a data frame with one row per iteration and subject, holding
#   `iteration` (the iteration index, stored as character), `subject` and
#   `mean_price` (missing prices are ignored).
monte_carlo_simulation_by_subject <- function(df_sample,
                                              n_iterations = 1000,
                                              fraction = 0.5) {
  # The resample size is the same in every iteration, so compute it once
  resample_size <- ceiling(nrow(df_sample) * fraction)

  # seq_len() handles n_iterations = 0 correctly (1:0 would run twice), and
  # lapply() builds the list in one go rather than growing it in a loop
  results <- lapply(seq_len(n_iterations), function(i) {
    df_sample %>%
      sample_n(size = resample_size, replace = TRUE) %>%
      group_by(subject) %>%
      summarize(mean_price = mean(price, na.rm = TRUE), .groups = "drop")
  })

  # Stack all iterations; `.id` records which iteration each row came from
  bind_rows(results, .id = "iteration")
}

# Iteration count shared by every Monte Carlo run below
n_iterations <- 1000

# Simulate each subsample in turn and stack the results, adding a `Sample`
# column that names the subsample each row came from
mc_results_combined <- bind_rows(Map(
  function(sample_df, sample_label) {
    mutate(
      monte_carlo_simulation_by_subject(sample_df, n_iterations),
      Sample = sample_label
    )
  },
  list(df_1_sample, df_2_sample, df_3_sample, df_4_sample, df_5_sample),
  c("Sample 1", "Sample 2", "Sample 3", "Sample 4", "Sample 5")
))

# Mean, standard deviation, minimum and maximum of the simulated mean prices
# for every combination of subsample and subject
summary_mc <- summarise(
  group_by(mc_results_combined, Sample, subject),
  Mean = mean(mean_price, na.rm = TRUE),
  SD = sd(mean_price, na.rm = TRUE),
  Min = min(mean_price, na.rm = TRUE),
  Max = max(mean_price, na.rm = TRUE),
  .groups = "drop"
)

# Show the summary table
print(summary_mc)
## # A tibble: 20 × 6
##    Sample   subject              Mean    SD   Min   Max
##    <chr>    <chr>               <dbl> <dbl> <dbl> <dbl>
##  1 Sample 1 Business Finance     68.2  3.67  57.2  80.8
##  2 Sample 1 Graphic Design       55.4  4.22  42.9  71  
##  3 Sample 1 Musical Instruments  48.3  3.43  38.8  60.7
##  4 Sample 1 Web Development      79.2  3.93  66.8  92.3
##  5 Sample 2 Business Finance     70.2  3.62  58.4  80.9
##  6 Sample 2 Graphic Design       53.2  4.45  41.8  70.6
##  7 Sample 2 Musical Instruments  49.4  3.09  40.7  59.4
##  8 Sample 2 Web Development      81.4  4.11  68.9  93.3
##  9 Sample 3 Business Finance     68.7  3.87  57.1  79.7
## 10 Sample 3 Graphic Design       55.0  4.33  43.5  70.5
## 11 Sample 3 Musical Instruments  49.8  2.78  41.8  58.2
## 12 Sample 3 Web Development      76.9  3.98  65.1  88.4
## 13 Sample 4 Business Finance     71.4  3.70  60.2  83.1
## 14 Sample 4 Graphic Design       57.3  4.81  44.4  72.0
## 15 Sample 4 Musical Instruments  50.1  3.21  40.5  60.5
## 16 Sample 4 Web Development      78.7  3.89  65.8  93.8
## 17 Sample 5 Business Finance     65.5  3.50  52.0  76.1
## 18 Sample 5 Graphic Design       55.9  4.86  42.9  74.6
## 19 Sample 5 Musical Instruments  49.9  3.19  40.6  62.1
## 20 Sample 5 Web Development      81.6  4.03  69.5  93.1
# Overlay the simulated mean-price densities of all subsamples, one panel per
# subject, each panel with its own axis scales
ggplot(
  data = mc_results_combined,
  mapping = aes(x = mean_price, fill = Sample)
) +
  geom_density(alpha = 0.5) +
  facet_wrap(vars(subject), scales = "free") +
  labs(
    title = "Density Plot of Mean Prices by Subject from Monte Carlo Simulations",
    x = "Mean Price",
    fill = "Sample"
  ) +
  theme_minimal()

#The monte_carlo_simulation_by_subject function resamples from the subsample and computes the mean price grouped by subject across multiple iterations. #Results from all iterations are combined, and summary statistics (mean, standard deviation, minimum, maximum) are computed for each sample and subject. #Finally, a density plot is generated to visualize the distribution of mean prices by subject across all simulations. #The density plots are almost the same across the samples for each subject.