# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readxl)
# Read the Udemy course listing as comma-separated text (the file name ends in
# ".xls", but it is parsed with read.csv() here) and coerce the result to a
# plain data frame. Run str(df_1) interactively to inspect the column types.
df_1 <- as.data.frame(
  read.csv("/Users/umakasi/Downloads/udemy_courses.csv.xls")
)
# How many subsamples to draw
num_samples <- 5
# Share of the original rows that each subsample contains (50%)
fraction <- 0.5
# Draw the subsamples with replacement; every subsample has
# ceiling(nrow(df_1) * fraction) rows, where nrow() is the row count of df_1.
samples <- lapply(seq_len(num_samples), function(sample_idx) {
  rows_per_sample <- ceiling(fraction * nrow(df_1))
  sample_n(df_1, size = rows_per_sample, replace = TRUE)
})
# Give each subsample its own name for the analyses further down
df_1_sample <- samples[[1]]
df_2_sample <- samples[[2]]
df_3_sample <- samples[[3]]
df_4_sample <- samples[[4]]
df_5_sample <- samples[[5]]
# Inspect the dimensions and the leading rows of the first subsample
dim(df_1_sample)
## [1] 1839 12
head(df_1_sample)
## course_id course_title
## 1 81064 Easy Forex and Futures Trading
## 2 301442 Black Algo Trading: Build Your Trading Robot
## 3 637452 Complete Guide to Business Loans
## 4 476880 Financial Model Fundamentals-Learn the Essential Skills
## 5 411168 Trading Inside Bars - Master 1 Easy Pattern To Be Successful
## 6 250902 Nest Egg Investing
## url
## 1 https://www.udemy.com/easy-forex-and-futures-trading/
## 2 https://www.udemy.com/build-your-trading-robot/
## 3 https://www.udemy.com/complete-guide-to-business-loans/
## 4 https://www.udemy.com/financial-model-fundamentals-learn-the-essential-skills/
## 5 https://www.udemy.com/trading-inside-bars-find-setups-today-make-money-tomorrow/
## 6 https://www.udemy.com/nest-egg-secrets/
## is_paid price num_subscribers num_reviews num_lectures level
## 1 True 100 218 5 66 All Levels
## 2 True 200 20195 1113 227 All Levels
## 3 True 30 35 4 35 Beginner Level
## 4 True 75 47 5 27 All Levels
## 5 True 125 2996 200 23 Intermediate Level
## 6 True 45 6 0 12 Beginner Level
## content_duration published_timestamp subject
## 1 14.5 2013-12-31T01:22:44Z Business Finance
## 2 16.0 2014-10-27T22:01:36Z Business Finance
## 3 1.5 2015-12-04T05:11:46Z Business Finance
## 4 2.5 2015-06-01T20:58:11Z Business Finance
## 5 2.0 2015-03-09T00:24:38Z Business Finance
## 6 2.0 2014-06-27T15:15:12Z Business Finance
# Summarise every subsample. Each summary is stored under its own name;
# previously all five were assigned to `summary_1`, so only the summary of
# sample 5 survived and was printed.
summary_1 <- summary(df_1_sample)
summary_2 <- summary(df_2_sample)
summary_3 <- summary(df_3_sample)
summary_4 <- summary(df_4_sample)
summary_5 <- summary(df_5_sample)
# Print all five summaries with labels so the subsamples can be compared.
# Sample 5 is printed last.
print(list(
  "Sample 1" = summary_1,
  "Sample 2" = summary_2,
  "Sample 3" = summary_3,
  "Sample 4" = summary_4,
  "Sample 5" = summary_5
))
## course_id course_title url is_paid
## Min. : 8325 Length:1839 Length:1839 Length:1839
## 1st Qu.: 403604 Class :character Class :character Class :character
## Median : 672086 Mode :character Mode :character Mode :character
## Mean : 669604
## 3rd Qu.: 951331
## Max. :1275872
## price num_subscribers num_reviews num_lectures
## Min. : 0.00 Min. : 0 Min. : 0 Min. : 5.00
## 1st Qu.: 20.00 1st Qu.: 111 1st Qu.: 4 1st Qu.: 14.00
## Median : 45.00 Median : 878 Median : 16 Median : 24.00
## Mean : 66.57 Mean : 2982 Mean : 155 Mean : 40.48
## 3rd Qu.: 95.00 3rd Qu.: 2532 3rd Qu.: 65 3rd Qu.: 45.50
## Max. :200.00 Max. :72932 Max. :15117 Max. :544.00
## level content_duration published_timestamp subject
## Length:1839 Min. : 0.1333 Length:1839 Length:1839
## Class :character 1st Qu.: 1.0000 Class :character Class :character
## Mode :character Median : 2.0000 Mode :character Mode :character
## Mean : 4.1156
## 3rd Qu.: 4.5000
## Max. :70.0000
#After generating random subsamples that represent 50% of the original dataset, we efficiently simulate the process of collecting data from larger population. It enhances the reliability of statical modeling for enabling better decision-making and making the analysis more efficient and structured.
#from here we received some course id’s and their relative course titles
# Load necessary libraries
library(dplyr)
library(ggplot2)
# Mean course price per subject for one subsample.
#   df_sample: data frame with at least the columns `subject` and `price`.
# Returns an ungrouped table with one row per subject and the columns
# `subject` and `mean_price`; missing prices are ignored when averaging.
mean_price_by_subject <- function(df_sample) {
  by_subject <- group_by(df_sample, subject)
  summarise(
    by_subject,
    mean_price = mean(price, na.rm = TRUE),
    .groups = "drop"
  )
}
# Compute the per-subject mean price of every subsample and tag each result
# with the subsample it came from before stacking them into one table.
mean_price_results <- bind_rows(
  mutate(mean_price_by_subject(df_1_sample), Sample = "Sample 1"),
  mutate(mean_price_by_subject(df_2_sample), Sample = "Sample 2"),
  mutate(mean_price_by_subject(df_3_sample), Sample = "Sample 3"),
  mutate(mean_price_by_subject(df_4_sample), Sample = "Sample 4"),
  mutate(mean_price_by_subject(df_5_sample), Sample = "Sample 5")
)
# Grouped bar chart: one cluster of bars per subject, one bar per subsample
ggplot(
  data = mean_price_results,
  mapping = aes(x = subject, y = mean_price, fill = Sample)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Mean Price by Subject for All Subsamples",
    x = "Subject",
    y = "Mean Price"
  ) +
  theme_minimal() +
  # Tilt the subject labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#Here meanprices are visualized by subject of each subsample providing pricing strategies and course offering.The difference between the samples are less by this we can understand that the samples are consistent
# Load necessary libraries
library(dplyr)
library(ggplot2)
# Flag price anomalies in one subsample using Z-scores.
#   df_sample: data frame with a numeric `price` column.
#   threshold: a price is flagged when the absolute value of its Z-score is
#              greater than this value (default 2, i.e. Z > 2 or Z < -2).
# Returns `df_sample` with two extra columns:
#   z_price - (price - mean(price)) / sd(price), with missing prices ignored
#             when computing the mean and standard deviation;
#   anomaly - TRUE when abs(z_price) > threshold.
# Rows with a missing price get NA in both columns.
identify_anomalies <- function(df_sample, threshold = 2) {
  stopifnot(
    is.numeric(threshold),
    length(threshold) == 1,
    !is.na(threshold)
  )
  df_sample %>%
    mutate(
      z_price = (price - mean(price, na.rm = TRUE)) / sd(price, na.rm = TRUE),
      # .env$ makes sure the function argument is used even if the data
      # happens to contain a column named `threshold`.
      anomaly = abs(z_price) > .env$threshold
    )
}
# Flag the anomalies of every subsample separately, label each result with
# its subsample, and stack everything into a single table.
anomalies_combined <- bind_rows(
  mutate(identify_anomalies(df_1_sample), Sample = "Sample 1"),
  mutate(identify_anomalies(df_2_sample), Sample = "Sample 2"),
  mutate(identify_anomalies(df_3_sample), Sample = "Sample 3"),
  mutate(identify_anomalies(df_4_sample), Sample = "Sample 4"),
  mutate(identify_anomalies(df_5_sample), Sample = "Sample 5")
)
# Scatter plot of price by subject, one panel per subsample, coloured by the
# anomaly flag. Points are jittered horizontally only, so prices stay exact.
ggplot(
  data = anomalies_combined,
  mapping = aes(x = subject, y = price, color = anomaly)
) +
  geom_point(
    position = position_jitter(width = 0.2, height = 0),
    alpha = 0.3,
    size = 2
  ) +
  facet_wrap(vars(Sample)) +
  labs(
    title = "Anomalies in Price by Subject for Each Subsample",
    x = "Subject",
    y = "Price"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#I used scatter plot to visualise the anomalies, if there’s no anomaly it is red and if anomaly is present it is blue. By this visualization we can say there are less anomalies between the samples. #anomaly is based on the context and distribution of the data in each subsample. It depends on price outliers, course levels etc..,
# Load necessary libraries
library(dplyr)
# Monte Carlo simulation of the mean price per subject.
#   df_sample:    data frame with the columns `subject` and `price`.
#   n_iterations: number of resampling rounds (default 1000); 0 is allowed
#                 and yields an empty result.
#   fraction:     share of the rows of `df_sample` drawn (with replacement)
#                 in every round (default 0.5).
# Every round draws ceiling(nrow(df_sample) * fraction) rows with replacement
# and computes the mean price per subject with mean_price_by_subject().
# Returns all rounds stacked into one table, with an `iteration` column that
# identifies the round each row came from.
monte_carlo_simulation_by_subject <- function(df_sample, n_iterations = 1000,
                                              fraction = 0.5) {
  stopifnot(
    is.numeric(n_iterations), length(n_iterations) == 1, n_iterations >= 0,
    is.numeric(fraction), length(fraction) == 1, fraction > 0
  )
  # The resample size does not change between rounds, so compute it once.
  resample_size <- ceiling(nrow(df_sample) * fraction)
  # seq_len() gives an empty sequence for n_iterations = 0, whereas
  # 1:n_iterations would iterate over c(1, 0).
  results <- lapply(seq_len(n_iterations), function(i) {
    resampled <- sample_n(df_sample, size = resample_size, replace = TRUE)
    mean_price_by_subject(resampled)
  })
  # Stack the per-round tables; `.id` records the position of each round.
  bind_rows(results, .id = "iteration")
}
# Resampling rounds per subsample
n_iterations <- 1000
# Simulate each subsample separately and label the rows with their subsample
mc_results_combined <- bind_rows(
  mutate(
    monte_carlo_simulation_by_subject(df_1_sample, n_iterations),
    Sample = "Sample 1"
  ),
  mutate(
    monte_carlo_simulation_by_subject(df_2_sample, n_iterations),
    Sample = "Sample 2"
  ),
  mutate(
    monte_carlo_simulation_by_subject(df_3_sample, n_iterations),
    Sample = "Sample 3"
  ),
  mutate(
    monte_carlo_simulation_by_subject(df_4_sample, n_iterations),
    Sample = "Sample 4"
  ),
  mutate(
    monte_carlo_simulation_by_subject(df_5_sample, n_iterations),
    Sample = "Sample 5"
  )
)
# Mean, spread and range of the simulated mean prices for every combination
# of subsample and subject
summary_mc <- summarise(
  group_by(mc_results_combined, Sample, subject),
  Mean = mean(mean_price, na.rm = TRUE),
  SD = sd(mean_price, na.rm = TRUE),
  Min = min(mean_price, na.rm = TRUE),
  Max = max(mean_price, na.rm = TRUE),
  .groups = "drop"
)
# Show the summary statistics
print(summary_mc)
## # A tibble: 20 × 6
## Sample subject Mean SD Min Max
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Sample 1 Business Finance 68.2 3.67 57.2 80.8
## 2 Sample 1 Graphic Design 55.4 4.22 42.9 71
## 3 Sample 1 Musical Instruments 48.3 3.43 38.8 60.7
## 4 Sample 1 Web Development 79.2 3.93 66.8 92.3
## 5 Sample 2 Business Finance 70.2 3.62 58.4 80.9
## 6 Sample 2 Graphic Design 53.2 4.45 41.8 70.6
## 7 Sample 2 Musical Instruments 49.4 3.09 40.7 59.4
## 8 Sample 2 Web Development 81.4 4.11 68.9 93.3
## 9 Sample 3 Business Finance 68.7 3.87 57.1 79.7
## 10 Sample 3 Graphic Design 55.0 4.33 43.5 70.5
## 11 Sample 3 Musical Instruments 49.8 2.78 41.8 58.2
## 12 Sample 3 Web Development 76.9 3.98 65.1 88.4
## 13 Sample 4 Business Finance 71.4 3.70 60.2 83.1
## 14 Sample 4 Graphic Design 57.3 4.81 44.4 72.0
## 15 Sample 4 Musical Instruments 50.1 3.21 40.5 60.5
## 16 Sample 4 Web Development 78.7 3.89 65.8 93.8
## 17 Sample 5 Business Finance 65.5 3.50 52.0 76.1
## 18 Sample 5 Graphic Design 55.9 4.86 42.9 74.6
## 19 Sample 5 Musical Instruments 49.9 3.19 40.6 62.1
## 20 Sample 5 Web Development 81.6 4.03 69.5 93.1
# Density of the simulated mean prices: one panel per subject (each with its
# own axis ranges) and one semi-transparent curve per subsample.
ggplot(
  data = mc_results_combined,
  mapping = aes(x = mean_price, fill = Sample)
) +
  geom_density(alpha = 0.5) +
  facet_wrap(vars(subject), scales = "free") +
  labs(
    title = "Density Plot of Mean Prices by Subject from Monte Carlo Simulations",
    x = "Mean Price",
    fill = "Sample"
  ) +
  theme_minimal()
#The monte_carlo_simulation_by_subject function samples from the subsample and computes the mean price grouped by subject across multiple iterations. #Results from all iterations are combined, and summary statistics (mean, standard deviation, minimum, maximum) are computed for each subject. #Finally, a density plot is generated to visualize the distribution of mean prices by subject across all simulations. #The density plot is almost same for the samples of the subject.