library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(readr)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.2 ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(conflicted)
# Read the data set and preview it (head() shows the first 6 rows by default)
data <- read.csv("dataset.csv")
head(data)
# Tell conflicted to resolve the filter() name clash in favour of dplyr
conflicted::conflicts_prefer(dplyr::filter)
[conflicted] Will prefer dplyr::filter over any other package.
# Keep only explicit tracks, then draw a random sample of 9,000 rows
# (without replacement).
# The seed is set before sampling so the same 9,000 rows are drawn on every
# run; previously this step ran before any set.seed() call and was therefore
# not reproducible.
set.seed(123)
sample_data <- data |>
  filter(explicit == "True") |>
  sample_n(9000)
# From here on `data` refers to the 9,000-row explicit-only sample
data <- sample_data
nrow(data)
[1] 9000
# Preview the filtered data
head(data)
library(dplyr)
# Fix the RNG state so the five samples below are identical on every run
set.seed(123)
# Each sample holds half as many rows as `data`
sample_size <- round(0.5 * nrow(data))
# Draw five samples with replacement, in the order df_1 ... df_5
df_1 <- sample_n(data, sample_size, replace = TRUE)
df_2 <- sample_n(data, sample_size, replace = TRUE)
df_3 <- sample_n(data, sample_size, replace = TRUE)
df_4 <- sample_n(data, sample_size, replace = TRUE)
df_5 <- sample_n(data, sample_size, replace = TRUE)
# Confirm that every sample has the expected dimensions
dim(df_1)
dim(df_2)
dim(df_3)
dim(df_4)
dim(df_5)
[1] 4500 21
[1] 4500 21
[1] 4500 21
[1] 4500 21
[1] 4500 21
# Inspect the structure of df_1 to check that it contains both categorical
# (character) and continuous (numeric) variables
str(df_1)
'data.frame': 4500 obs. of 21 variables:
$ X : int 18659 85341 59510 33375 18378 112004 60876 71107 15666 39768 ...
$ track_id : chr "1Ep3W7DYe6JxWsWHb3SKL1" "72wdyMXmB7cw11nURyEPKH" "05wAwEhXvp9ftiptQ9xbXl" "3xScUbjCK3Df7yedTWVQA4" ...
$ artists : chr "Mo Mandel" "Oslo Ess" "Vista Versicle" "Slowly Slowly" ...
$ album_name : chr "Negative Reinforcement" "Uleste Bøker Og Utgåtte Sko" "Needles" "Forget You" ...
$ track_name : chr "Pee Pee Treats" "Alt Jeg Trenger" "Needles" "Forget You" ...
$ popularity : int 21 40 1 40 23 52 18 58 54 39 ...
$ duration_ms : int 149941 258586 200071 200506 278626 210146 198341 164760 156825 192472 ...
$ explicit : chr "True" "True" "True" "True" ...
$ danceability : num 0.758 0.489 0.501 0.545 0.564 0.795 0.849 0.542 0.568 0.815 ...
$ energy : num 0.642 0.853 0.989 0.904 0.938 0.874 0.705 0.932 0.867 0.45 ...
$ key : int 6 9 2 1 5 1 2 4 0 2 ...
$ loudness : num -12.07 -4.92 -1.62 -2.96 -6.82 ...
$ mode : int 1 1 1 1 1 1 1 1 1 1 ...
$ speechiness : num 0.954 0.0384 0.11 0.0663 0.938 0.0506 0.141 0.05 0.15 0.395 ...
$ acousticness : num 0.87 0.0271 0.000715 0.0297 0.781 0.0812 0.0334 0.0747 0.209 0.0303 ...
$ instrumentalness: num 0.00 0.00 2.81e-06 0.00 0.00 9.06e-02 2.90e-05 0.00 0.00 4.66e-06 ...
$ liveness : num 0.773 0.199 0.2 0.064 0.93 0.065 0.305 0.613 0.407 0.344 ...
$ valence : num 0.5 0.511 0.113 0.839 0.356 0.2 0.35 0.698 0.441 0.228 ...
$ tempo : num 70.4 140.1 90 160 76.8 ...
$ time_signature : int 3 4 4 4 4 4 4 4 4 4 ...
$ track_genre : chr "comedy" "punk-rock" "iranian" "emo" ...
# Class of every column of df_1 as a named character vector.
# vapply() guarantees a character result; a column with several classes is
# collapsed to "a/b" instead of silently turning the whole result into a
# list, as sapply() would.
vapply(df_1, \(col) paste(class(col), collapse = "/"), character(1))
X track_id artists album_name track_name popularity duration_ms
"integer" "character" "character" "character" "character" "integer" "integer"
explicit danceability energy key loudness mode speechiness
"character" "numeric" "numeric" "integer" "numeric" "integer" "numeric"
acousticness instrumentalness liveness valence tempo time_signature track_genre
"numeric" "numeric" "numeric" "numeric" "numeric" "integer" "character"
# Convert every character column of df_1 to a factor so that it is treated
# as categorical. across() replaces the superseded mutate_if().
df_1 <- df_1 |>
  mutate(across(where(is.character), as.factor))
# Count of categorical (factor) variables. vapply() always returns a logical
# vector, so sum() also works for a data frame without columns.
sum(vapply(df_1, is.factor, logical(1)))
[1] 6
# Count of numeric variables. is.numeric() is TRUE for integer as well as
# double columns, so this also counts integer columns such as X (presumably
# a row index), key, mode and time_signature; treat it as an upper bound on
# the number of truly continuous variables.
sum(vapply(df_1, is.numeric, logical(1)))
[1] 15
# Count the categorical and numeric columns of df_1 in a single one-row table.
# vapply() is used instead of sapply() so the result is always a logical
# vector, whatever the shape of the input.
df_1 |>
  summarize(
    num_cats = sum(vapply(df_1, is.factor, logical(1))),
    num_nums = sum(vapply(df_1, is.numeric, logical(1)))
  )
library(dplyr)
# Names of the categorical (factor) variables
cat_vars <- df_1 |>
  select(where(is.factor)) |>
  names()
print(cat_vars)
[1] "track_id" "artists" "album_name" "track_name" "explicit" "track_genre"
library(dplyr)
# Collect the names of all numeric columns of df_1 and show them
continuous_vars <- names(select(df_1, where(is.numeric)))
print(continuous_vars)
[1] "X" "popularity" "duration_ms" "danceability" "energy" "key"
[7] "loudness" "mode" "speechiness" "acousticness" "instrumentalness" "liveness"
[13] "valence" "tempo" "time_signature"
# Mean of every numeric column for each track_name in df_1.
# na.rm is passed through an anonymous function because supplying extra
# arguments via across()'s `...` is deprecated as of dplyr 1.1.0.
df_1 |>
  group_by(track_name) |>
  summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
Warning: There was 1 warning in `summarize()`.
ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
ℹ In group 1: `track_name = "'98 To Piano"`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.
# Previously
across(a:b, mean, na.rm = TRUE)
# Now
across(a:b, \(x) mean(x, na.rm = TRUE))
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# df_2
# Load necessary library
library(dplyr)
# Convert character columns to factors first, as is done for df_1.
# Without this step df_2 contains no factor columns, so the categorical
# count below is 0 and cat_vars_2 is empty.
df_2 <- df_2 |>
  mutate(across(where(is.character), as.factor))
# Summarize the count of categorical (factor) and numeric variables in df_2
df_2 |>
  summarize(
    num_cats = sum(vapply(df_2, is.factor, logical(1))),
    num_nums = sum(vapply(df_2, is.numeric, logical(1)))
  )
# Extract and print names of categorical variables
cat_vars_2 <- df_2 |>
  select(where(is.factor)) |>
  names()
print(cat_vars_2)
character(0)
# Names of the numeric columns in df_2
continuous_vars_2 <- names(select(df_2, where(is.numeric)))
print(continuous_vars_2)
[1] "X" "popularity" "duration_ms" "danceability" "energy" "key"
[7] "loudness" "mode" "speechiness" "acousticness" "instrumentalness" "liveness"
[13] "valence" "tempo" "time_signature"
# Mean of every numeric column for each track_genre in df_2.
# na.rm is passed through an anonymous function because supplying extra
# arguments via across()'s `...` is deprecated as of dplyr 1.1.0.
df_2 |>
  group_by(track_genre) |>
  summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
# df_3
library(dplyr)
# Convert character columns to factors first, as is done for df_1.
# Without this step df_3 contains no factor columns and the categorical
# count below is 0.
df_3 <- df_3 |>
  mutate(across(where(is.character), as.factor))
# Count of categorical (factor) and numeric variables in df_3
df_3 |>
  summarize(
    num_cats = sum(vapply(df_3, is.factor, logical(1))),
    num_nums = sum(vapply(df_3, is.numeric, logical(1)))
  )
NA
# Names of the factor columns in df_3
cat_vars_3 <- names(select(df_3, where(is.factor)))
print(cat_vars_3)
character(0)
# Names of the numeric columns in df_3
continuous_vars_3 <- names(select(df_3, where(is.numeric)))
print(continuous_vars_3)
[1] "X" "popularity" "duration_ms" "danceability" "energy" "key"
[7] "loudness" "mode" "speechiness" "acousticness" "instrumentalness" "liveness"
[13] "valence" "tempo" "time_signature"
# Mean of every numeric column for each track_id in df_3.
# na.rm is passed through an anonymous function because supplying extra
# arguments via across()'s `...` is deprecated as of dplyr 1.1.0.
df_3 |>
  group_by(track_id) |>
  summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
NA
library(dplyr)
# df_4
# Convert character columns to factors first, as is done for df_1.
# Without this step df_4 contains no factor columns, so the categorical
# count below is 0 and cat_vars_4 is empty.
df_4 <- df_4 |>
  mutate(across(where(is.character), as.factor))
# Count of categorical (factor) and numeric variables in df_4
df_4 |>
  summarize(
    num_cats = sum(vapply(df_4, is.factor, logical(1))),
    num_nums = sum(vapply(df_4, is.numeric, logical(1)))
  )
# Names of the categorical variables
cat_vars_4 <- df_4 |>
  select(where(is.factor)) |>
  names()
print(cat_vars_4)
character(0)
# Names of the numeric columns in df_4
continuous_vars_4 <- names(select(df_4, where(is.numeric)))
print(continuous_vars_4)
[1] "X" "popularity" "duration_ms" "danceability" "energy" "key"
[7] "loudness" "mode" "speechiness" "acousticness" "instrumentalness" "liveness"
[13] "valence" "tempo" "time_signature"
# Mean of every numeric column for each value of artists in df_4.
# na.rm is passed through an anonymous function because supplying extra
# arguments via across()'s `...` is deprecated as of dplyr 1.1.0.
df_4 |>
  group_by(artists) |>
  summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
NA
library(dplyr)
# df_5
# Convert character columns to factors first, as is done for df_1.
# Without this step df_5 contains no factor columns, so the categorical
# count below is 0 and cat_vars_5 is empty.
df_5 <- df_5 |>
  mutate(across(where(is.character), as.factor))
# Count of categorical (factor) and numeric variables in df_5
df_5 |>
  summarize(
    num_cats = sum(vapply(df_5, is.factor, logical(1))),
    num_nums = sum(vapply(df_5, is.numeric, logical(1)))
  )
# Names of the categorical variables
cat_vars_5 <- df_5 |>
  select(where(is.factor)) |>
  names()
print(cat_vars_5)
character(0)
# Names of the numeric columns in df_5
continuous_vars_5 <- names(select(df_5, where(is.numeric)))
print(continuous_vars_5)
[1] "X" "popularity" "duration_ms" "danceability" "energy" "key"
[7] "loudness" "mode" "speechiness" "acousticness" "instrumentalness" "liveness"
[13] "valence" "tempo" "time_signature"
# Mean of every numeric column for each album_name in df_5.
# na.rm is passed through an anonymous function because supplying extra
# arguments via across()'s `...` is deprecated as of dplyr 1.1.0.
df_5 |>
  group_by(album_name) |>
  summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
# Print summary() for each of the five samples, in order df_1 ... df_5
sample_list <- list(df_1, df_2, df_3, df_4, df_5)
lapply(sample_list, summary)
[[1]]
X track_id artists album_name
Min. : 446 41oY4WCTj5kccfesTVFnvN: 8 Jhayco;Bad Bunny : 34 Feliz Cumpleaños con Perreo: 34
1st Qu.: 28849 3F3uRFEXzCby4QCWoVmIA5: 7 Jack Harlow : 31 Halloween con perreito : 24
Median : 48174 0FYD3N8NI3yQ8mKvIMDJol: 6 Daddy Yankee;Bad Bunny: 30 Bad Vibes Forever : 19
Mean : 51140 19kiEGUN1df3omU4ChyjEX: 6 XXXTENTACION : 29 Un Verano Sin Ti : 19
3rd Qu.: 71931 5ls7oqk4nnxgPAJmGqZyTw: 6 Feid : 27 El perreo es el futuro : 17
Max. :112976 7LtXmsv58ijM3k2dP6h6fR: 6 Jhayco : 25 Frescura y Perreo : 16
(Other) :4461 (Other) :4324 (Other) :4371
track_name popularity duration_ms explicit danceability energy key
CÓMO SE SIENTE - Remix: 34 Min. : 0.00 Min. : 32306 True:4500 Min. :0.0614 Min. :0.0423 Min. : 0.000
X ÚLTIMA VEZ : 30 1st Qu.:20.00 1st Qu.: 162990 1st Qu.:0.5300 1st Qu.:0.5827 1st Qu.: 2.000
First Class : 25 Median :36.00 Median : 194266 Median :0.6580 Median :0.7320 Median : 6.000
Normal : 25 Mean :36.17 Mean : 206347 Mean :0.6385 Mean :0.7218 Mean : 5.331
Pantysito : 23 3rd Qu.:56.00 3rd Qu.: 232690 3rd Qu.:0.7730 3rd Qu.:0.8820 3rd Qu.: 8.000
HISTORY : 16 Max. :98.00 Max. :4246206 Max. :0.9630 Max. :0.9990 Max. :11.000
(Other) :4347
loudness mode speechiness acousticness instrumentalness liveness valence
Min. :-24.843 Min. :0.0000 Min. :0.0243 Min. :0.00000 Min. :0.0000000 Min. :0.0241 Min. :0.0224
1st Qu.: -7.975 1st Qu.:0.0000 1st Qu.:0.0592 1st Qu.:0.00957 1st Qu.:0.0000000 1st Qu.:0.1040 1st Qu.:0.3040
Median : -5.899 Median :1.0000 Median :0.1130 Median :0.09650 Median :0.0000016 Median :0.1490 Median :0.4685
Mean : -6.446 Mean :0.5878 Mean :0.1910 Mean :0.21213 Mean :0.0524307 Mean :0.2394 Mean :0.4705
3rd Qu.: -4.245 3rd Qu.:1.0000 3rd Qu.:0.2382 3rd Qu.:0.32650 3rd Qu.:0.0004930 3rd Qu.:0.3190 3rd Qu.:0.6390
Max. : 1.682 Max. :1.0000 Max. :0.9650 Max. :0.97700 Max. :0.9630000 Max. :0.9920 Max. :0.9750
tempo time_signature track_genre
Min. : 35.39 Min. :1.000 comedy : 302
1st Qu.: 96.96 1st Qu.:4.000 emo : 217
Median :119.72 Median :4.000 sad : 214
Mean :121.10 Mean :3.966 j-dance : 178
3rd Qu.:140.12 3rd Qu.:4.000 funk : 157
Max. :205.66 Max. :5.000 dancehall: 147
(Other) :3285
[[2]]
X track_id artists album_name track_name popularity duration_ms
Min. : 59 Length:4500 Length:4500 Length:4500 Length:4500 Min. : 0.0 Min. : 31186
1st Qu.: 23701 Class :character Class :character Class :character Class :character 1st Qu.:21.0 1st Qu.: 163545
Median : 48113 Mode :character Mode :character Mode :character Mode :character Median :38.0 Median : 194746
Mean : 50408 Mean :37.1 Mean : 205129
3rd Qu.: 71777 3rd Qu.:56.0 3rd Qu.: 231208
Max. :112983 Max. :98.0 Max. :1482242
explicit danceability energy key loudness mode speechiness
Length:4500 Min. :0.0614 Min. :0.0423 Min. : 0.00 Min. :-24.843 Min. :0.0000 Min. :0.02440
Class :character 1st Qu.:0.5240 1st Qu.:0.5820 1st Qu.: 2.00 1st Qu.: -7.992 1st Qu.:0.0000 1st Qu.:0.05828
Mode :character Median :0.6580 Median :0.7290 Median : 6.00 Median : -5.923 Median :1.0000 Median :0.11000
Mean :0.6374 Mean :0.7196 Mean : 5.36 Mean : -6.537 Mean :0.5842 Mean :0.19303
3rd Qu.:0.7710 3rd Qu.:0.8790 3rd Qu.: 8.00 3rd Qu.: -4.496 3rd Qu.:1.0000 3rd Qu.:0.25000
Max. :0.9800 Max. :1.0000 Max. :11.00 Max. : 0.915 Max. :1.0000 Max. :0.96500
acousticness instrumentalness liveness valence tempo time_signature track_genre
Min. :0.00000 Min. :0.0000000 Min. :0.0196 Min. :0.0215 Min. : 35.39 Min. :1.000 Length:4500
1st Qu.:0.00941 1st Qu.:0.0000000 1st Qu.:0.1020 1st Qu.:0.2970 1st Qu.: 96.12 1st Qu.:4.000 Class :character
Median :0.09620 Median :0.0000013 Median :0.1430 Median :0.4710 Median :119.97 Median :4.000 Mode :character
Mean :0.21220 Mean :0.0491172 Mean :0.2342 Mean :0.4683 Mean :121.60 Mean :3.951
3rd Qu.:0.33200 3rd Qu.:0.0005072 3rd Qu.:0.3120 3rd Qu.:0.6350 3rd Qu.:143.01 3rd Qu.:4.000
Max. :0.99500 Max. :0.9710000 Max. :0.9920 Max. :0.9890 Max. :213.78 Max. :5.000
[[3]]
X track_id artists album_name track_name popularity duration_ms
Min. : 243 Length:4500 Length:4500 Length:4500 Length:4500 Min. : 0.00 Min. : 31186
1st Qu.: 29386 Class :character Class :character Class :character Class :character 1st Qu.:19.00 1st Qu.: 162592
Median : 48520 Mode :character Mode :character Mode :character Mode :character Median :36.00 Median : 193985
Mean : 51616 Mean :35.64 Mean : 204265
3rd Qu.: 72043 3rd Qu.:56.00 3rd Qu.: 232627
Max. :112983 Max. :98.00 Max. :1482242
explicit danceability energy key loudness mode speechiness
Length:4500 Min. :0.0614 Min. :0.0759 Min. : 0.000 Min. :-24.843 Min. :0.0000 Min. :0.0243
Class :character 1st Qu.:0.5240 1st Qu.:0.5810 1st Qu.: 2.000 1st Qu.: -7.963 1st Qu.:0.0000 1st Qu.:0.0596
Mode :character Median :0.6570 Median :0.7360 Median : 6.000 Median : -5.921 Median :1.0000 Median :0.1110
Mean :0.6394 Mean :0.7227 Mean : 5.421 Mean : -6.496 Mean :0.5716 Mean :0.1925
3rd Qu.:0.7760 3rd Qu.:0.8840 3rd Qu.: 8.000 3rd Qu.: -4.327 3rd Qu.:1.0000 3rd Qu.:0.2470
Max. :0.9800 Max. :1.0000 Max. :11.000 Max. : 1.104 Max. :1.0000 Max. :0.9650
acousticness instrumentalness liveness valence tempo time_signature track_genre
Min. :0.00000 Min. :0.0000000 Min. :0.0268 Min. :0.0215 Min. : 45.10 Min. :1.000 Length:4500
1st Qu.:0.00876 1st Qu.:0.0000000 1st Qu.:0.1040 1st Qu.:0.3040 1st Qu.: 96.96 1st Qu.:4.000 Class :character
Median :0.09965 Median :0.0000013 Median :0.1480 Median :0.4850 Median :119.97 Median :4.000 Mode :character
Mean :0.21426 Mean :0.0495060 Mean :0.2366 Mean :0.4793 Mean :121.77 Mean :3.964
3rd Qu.:0.33125 3rd Qu.:0.0004478 3rd Qu.:0.3160 3rd Qu.:0.6560 3rd Qu.:143.00 3rd Qu.:4.000
Max. :0.99200 Max. :0.9710000 Max. :0.9920 Max. :0.9700 Max. :208.95 Max. :5.000
[[4]]
X track_id artists album_name track_name popularity duration_ms
Min. : 247 Length:4500 Length:4500 Length:4500 Length:4500 Min. : 0.00 Min. : 31186
1st Qu.: 28760 Class :character Class :character Class :character Class :character 1st Qu.:20.00 1st Qu.: 163611
Median : 48608 Mode :character Mode :character Mode :character Mode :character Median :38.00 Median : 193145
Mean : 51938 Mean :36.78 Mean : 204492
3rd Qu.: 72226 3rd Qu.:56.00 3rd Qu.: 229466
Max. :112983 Max. :98.00 Max. :1482242
explicit danceability energy key loudness mode speechiness
Length:4500 Min. :0.0614 Min. :0.0423 Min. : 0.000 Min. :-24.843 Min. :0.0000 Min. :0.02420
Class :character 1st Qu.:0.5200 1st Qu.:0.5900 1st Qu.: 2.000 1st Qu.: -7.915 1st Qu.:0.0000 1st Qu.:0.05847
Mode :character Median :0.6535 Median :0.7310 Median : 6.000 Median : -5.907 Median :1.0000 Median :0.11000
Mean :0.6345 Mean :0.7229 Mean : 5.438 Mean : -6.496 Mean :0.5802 Mean :0.19126
3rd Qu.:0.7710 3rd Qu.:0.8830 3rd Qu.: 9.000 3rd Qu.: -4.414 3rd Qu.:1.0000 3rd Qu.:0.24900
Max. :0.9710 Max. :1.0000 Max. :11.000 Max. : 1.821 Max. :1.0000 Max. :0.96500
acousticness instrumentalness liveness valence tempo time_signature track_genre
Min. :0.000001 Min. :0.0000000 Min. :0.0196 Min. :0.0215 Min. : 35.39 Min. :1.000 Length:4500
1st Qu.:0.008735 1st Qu.:0.0000000 1st Qu.:0.1040 1st Qu.:0.2980 1st Qu.: 97.01 1st Qu.:4.000 Class :character
Median :0.097700 Median :0.0000016 Median :0.1450 Median :0.4640 Median :120.00 Median :4.000 Mode :character
Mean :0.211339 Mean :0.0525719 Mean :0.2341 Mean :0.4670 Mean :122.20 Mean :3.959
3rd Qu.:0.331000 3rd Qu.:0.0005490 3rd Qu.:0.3140 3rd Qu.:0.6362 3rd Qu.:142.96 3rd Qu.:4.000
Max. :0.995000 Max. :0.9950000 Max. :0.9920 Max. :0.9890 Max. :206.76 Max. :5.000
[[5]]
X track_id artists album_name track_name popularity duration_ms
Min. : 59 Length:4500 Length:4500 Length:4500 Length:4500 Min. : 0.00 Min. : 31240
1st Qu.: 28288 Class :character Class :character Class :character Class :character 1st Qu.:21.00 1st Qu.: 161989
Median : 48168 Mode :character Mode :character Mode :character Mode :character Median :38.00 Median : 193309
Mean : 50792 Mean :36.92 Mean : 204436
3rd Qu.: 72004 3rd Qu.:56.00 3rd Qu.: 232616
Max. :112905 Max. :98.00 Max. :1101318
explicit danceability energy key loudness mode speechiness
Length:4500 Min. :0.1110 Min. :0.0678 Min. : 0.000 Min. :-24.843 Min. :0.0000 Min. :0.0244
Class :character 1st Qu.:0.5190 1st Qu.:0.5800 1st Qu.: 2.000 1st Qu.: -7.932 1st Qu.:0.0000 1st Qu.:0.0583
Mode :character Median :0.6495 Median :0.7275 Median : 6.000 Median : -5.982 Median :1.0000 Median :0.1090
Mean :0.6322 Mean :0.7193 Mean : 5.382 Mean : -6.491 Mean :0.5702 Mean :0.1894
3rd Qu.:0.7660 3rd Qu.:0.8810 3rd Qu.: 8.000 3rd Qu.: -4.473 3rd Qu.:1.0000 3rd Qu.:0.2460
Max. :0.9660 Max. :1.0000 Max. :11.000 Max. : 1.821 Max. :1.0000 Max. :0.9650
acousticness instrumentalness liveness valence tempo time_signature track_genre
Min. :0.000001 Min. :0.0000000 Min. :0.0197 Min. :0.0256 Min. : 35.39 Min. :1.000 Length:4500
1st Qu.:0.008170 1st Qu.:0.0000000 1st Qu.:0.1040 1st Qu.:0.2990 1st Qu.: 96.88 1st Qu.:4.000 Class :character
Median :0.094600 Median :0.0000012 Median :0.1480 Median :0.4780 Median :119.93 Median :4.000 Mode :character
Mean :0.210667 Mean :0.0504810 Mean :0.2360 Mean :0.4739 Mean :121.93 Mean :3.958
3rd Qu.:0.323000 3rd Qu.:0.0003340 3rd Qu.:0.3150 3rd Qu.:0.6470 3rd Qu.:143.98 3rd Qu.:4.000
Max. :0.995000 Max. :0.9710000 Max. :0.9920 Max. :0.9750 Max. :213.78 Max. :5.000
When comparing random sub-samples of tracks, the following patterns stand out:
Popularity: Tracks with higher popularity (a popularity score of 80 or more) have higher energy, danceability, and loudness values. In contrast, less popular tracks have lower values for these features.
Genre-Based Differences: Acoustic tracks have lower energy and danceability, with higher acousticness and instrumentalness compared to electronic or pop genres.
For this dataset, what counts as an anomaly varies across sub-samples. For example:
A track with a popularity score of 50 or below is an anomaly in a sub-sample of tracks with popularity mostly above 70. Conversely, in a sub-sample with mostly low-popularity tracks, a track with popularity over 70 is unusual.
A track with exceptionally low energy (e.g., below 0.2) is an anomaly in a sub-sample focused on energetic genres like pop or EDM, but normal in a sub-sample of classical or acoustic music.
Duration (ms): A track that is significantly longer (e.g., over 7 minutes) is an anomaly in a typical 3-4 minute pop/rock sub-sample but not unusual in a classical or ambient sub-sample.
More generally, investigating random samples in this way has a significant effect on the conclusions we draw. If we rely on random subsampling without ensuring that all types of variables—especially categorical ones—are represented, we risk drawing conclusions from incomplete or biased samples. For instance, if a subsample omits important categorical variables, we might overlook patterns or relationships that exist between these variables and the outcome of interest. (Note that sampling rows never removes columns: a sub-sample only appears to lack categorical variables when its character columns have not been converted to factors.)
In the future, this experience suggests that it’s important to: 1. Examine sampling methods carefully to ensure that both categorical and numeric variables are appropriately represented. 2. Use stratified sampling or ensure a balanced representation of categories if we’re working with imbalanced datasets or when certain groups are crucial for the analysis. 3. Consider the context of missing variables when interpreting results—if a certain category is underrepresented, it might skew results or lead to faulty conclusions about the relationships within the data.
By refining the sampling process and ensuring balanced representation, we can draw more robust, unbiased conclusions, making our analyses more reliable.
# Just trying Monte Carlo Simulation
# NOTE: each iteration resamples rows only, which cannot change the class of
# any column, so cat_count and num_count are identical in every iteration by
# construction.
# Number of simulations
num_simulations <- 1000
# Store results of each simulation (preallocated list)
results <- vector("list", num_simulations)
# The row count does not change inside the loop, so compute it once
n_rows <- nrow(df_1)
# Simulation process; seq_len() yields an empty loop if num_simulations is 0,
# whereas 1:num_simulations would iterate over c(1, 0)
for (i in seq_len(num_simulations)) {
  # Randomly resample df_1's rows with replacement (same size as df_1)
  sampled_data <- df_1[sample(n_rows, size = n_rows, replace = TRUE), ]
  # Count categorical and numeric variables in the sampled data;
  # vapply() guarantees a logical vector for sum()
  cat_count <- sum(vapply(sampled_data, is.factor, logical(1)))
  num_count <- sum(vapply(sampled_data, is.numeric, logical(1)))
  # Store the results
  results[[i]] <- list(cat_count = cat_count, num_count = num_count)
}
# Convert results into a dataframe for easier analysis
simulation_results <- bind_rows(results)
# Share of simulations that contained at least one categorical variable
cat_appearance_rate <- mean(simulation_results$cat_count > 0)
# Tabulate how many simulations contained at least one categorical variable
table(simulation_results$cat_count > 0)
TRUE
1000
# Report the share of simulations with at least one categorical variable
rate_message <- paste(
  "Rate of Categorical Variables in Subsamples:",
  cat_appearance_rate
)
print(rate_message)
[1] "Rate of Categorical Variables in Subsamples: 1"
TRUE: This indicates that in each of the 1,000 simulations, at least one categorical variable was present in the subsample. So, in every random subsample, there was at least one categorical variable included.
1000: This is the number of simulations for which the condition was TRUE, i.e. all 1,000 iterations of the Monte Carlo simulation.
Rate of Categorical Variables in Subsamples: 1: This means that in all 1,000 subsamples, categorical variables are present. The rate of 1 signifies that categorical variables were always included in the subsamples. So, there was no case where categorical variables were completely missing in any of the simulations.
Categorical Variables are Always Represented: The subsampling strategy, as set up, always includes categorical variables. This suggests that, under the conditions of our simulation (sample size, replacement), our random sampling procedure consistently includes categorical variables.
No Missing Categories: The fact that the rate is 1 implies that in our subsampling approach, categorical variables are always present, which is a positive finding. It means we don’t need to worry about missing out on categorical variables under the current setup.
I could expand the simulation to track how often different categories appear and explore if some categories are more prone to being sampled over others.