Now we will draw 5 samples from the original data set and name them df_1, df_2, df_3, df_4, and df_5. Each sample is collected by randomly selecting rows from the dataset (with replacement), keeping all columns. These subsamples should each include both categorical and continuous (numeric) data.

# Load necessary libraries
library(dplyr)

# Fix the RNG state so the five draws below are reproducible
set.seed(123)

# Row count of the full dataset, and the size of each subsample (half of it)
num_rows <- nrow(data)
half_rows <- floor(0.5 * num_rows)

# Draw five subsamples of half_rows rows each; rows are picked with
# replacement, so the same row may appear more than once in a subsample
df_1 <- data[sample.int(num_rows, size = half_rows, replace = TRUE), ]
df_2 <- data[sample.int(num_rows, size = half_rows, replace = TRUE), ]
df_3 <- data[sample.int(num_rows, size = half_rows, replace = TRUE), ]
df_4 <- data[sample.int(num_rows, size = half_rows, replace = TRUE), ]
df_5 <- data[sample.int(num_rows, size = half_rows, replace = TRUE), ]

We will display the first few rows of each sample.

The sample 1

# Preview the first rows of subsample 1 (head() shows six rows by default)
head(df_1)

##       idx                       transaction_id    user_id transaction_date
## 2463 2463 6e9aaecd-6d8d-4614-9618-9249c8d7c913 USER_08108 2024-02-17 09:43
## 2511 2511 4897848c-d285-4e7d-877a-3b7e42c3937f USER_07206 2024-02-21 00:46
## 2227 2227 da2b83e0-5022-497e-87e9-dde5bd325bbc USER_05183 2024-01-31 03:49
## 526   526 3167b375-b764-4d20-8fcd-84615cbad432 USER_08995 2023-09-26 04:45
## 4291 4291 5955ca9b-0304-4008-95d5-47c45415e9fc USER_04125 2024-06-29 21:42
## 2986 2986 21419024-75d9-46ef-928c-8e3935cfbd6e USER_02349 2024-03-27 04:11
##       product_category        product_name  merchant_name product_amount
## 2463 Streaming Service          Basic Plan        Netflix         755.49
## 2511        Bus Ticket       Non-AC Seater MakeMyTrip Bus        2253.46
## 2227  Electricity Bill    Residential Bill           BSES         128.21
## 526     Gaming Credits    Legendary Weapon      Free Fire        2359.70
## 4291      Rent Payment 1BHK Apartment Rent       Nestaway        3252.79
## 2986 Streaming Service       Student Offer        Hotstar        1652.35
##      transaction_fee cashback loyalty_points payment_method transaction_status
## 2463           36.79    94.56            141 Wallet Balance         Successful
## 2511           15.38    56.24             59  Bank Transfer         Successful
## 2227           25.41    51.28            222 Wallet Balance         Successful
## 526            44.21    50.01            490            UPI            Pending
## 4291           35.53    22.54            844  Bank Transfer             Failed
## 2986           38.07    81.17            702 Wallet Balance         Successful
##      merchant_id device_type location
## 2463  MERCH_0228     Android    Urban
## 2511  MERCH_0332         iOS    Urban
## 2227  MERCH_0916     Android Suburban
## 526   MERCH_0758     Android    Urban
## 4291  MERCH_0217         iOS    Urban
## 2986  MERCH_0440     Android    Urban

The sample 2

# Preview the first rows of subsample 2 (head() shows six rows by default)
head(df_2)

##       idx                       transaction_id    user_id transaction_date
## 754   754 a5c55dcf-8dd1-43d1-ab58-36602030967f USER_09609 2023-10-12 02:40
## 2813 2813 35098c21-1943-42f9-9b5c-4edcd8ab6fcf USER_09786 2024-03-14 11:38
## 4702 4702 690508d9-cd89-40b2-ba42-c386ff4201db USER_02781 2024-07-29 17:01
## 2581 2581 800df07c-3cfd-489f-9c91-b06d0ea40747 USER_00161 2024-02-25 17:25
## 2468 2468 3ea438e5-acc1-45b8-b615-abda3fd0985e USER_04602 2024-02-17 17:19
## 3927 3927 1122df72-43d1-4feb-9b03-636d6f77db16 USER_00977 2024-06-03 11:59
##       product_category              product_name merchant_name product_amount
## 754      Internet Bill     Broadband 50Mbps Plan       Hathway        4909.70
## 2813     Education Fee Python Programming Course       Vedantu        5177.35
## 4702 Streaming Service             Student Offer          ZEE5        9805.53
## 2581      Rent Payment          Co-working Space   Housing.com        6479.56
## 2468    Flight Booking      Bangalore to Kolkata    MakeMyTrip        7504.16
## 3927      Movie Ticket                  3 Idiots   SPI Cinemas         542.76
##      transaction_fee cashback loyalty_points payment_method transaction_status
## 754             6.72    21.52            227 Wallet Balance         Successful
## 2813           31.29    79.01            129     Debit Card         Successful
## 4702           45.21    72.15             51 Wallet Balance         Successful
## 2581           41.07    37.74            844    Credit Card             Failed
## 2468           10.10    11.18            735    Credit Card         Successful
## 3927           26.01    85.77            820    Credit Card         Successful
##      merchant_id device_type location
## 754   MERCH_0950         Web    Urban
## 2813  MERCH_0447     Android Suburban
## 4702  MERCH_0355         iOS    Urban
## 2581  MERCH_0926     Android    Urban
## 2468  MERCH_0678     Android Suburban
## 3927  MERCH_0498     Android    Urban

The sample 3

# Preview the first rows of subsample 3 (head() shows six rows by default)
head(df_3)

##       idx                       transaction_id    user_id transaction_date
## 3805 3805 ef72d2b6-3546-49ad-8910-e3044b6bb54a USER_00534 2024-05-25 15:37
## 4599 4599 1923ce46-f3ff-4f55-a13a-81927b0dc9c2 USER_04643 2024-07-22 18:28
## 3362 3362 f80abda3-c503-4b4f-8195-c72dce348433 USER_04861 2024-04-24 06:42
## 4065 4065 72457eb4-c2bc-472e-a4f5-b9ed98a7cc55 USER_03237 2024-06-14 05:50
## 3845 3845 7cc0e519-6d67-4aa4-ac3c-7637bd83683d USER_03871 2024-05-28 16:29
## 1873 1873 645362b8-bdc8-48a4-8ba6-80b7b6d2a8c3 USER_05353 2024-01-03 21:31
##      product_category          product_name         merchant_name
## 3805    Education Fee        IELTS Training              Coursera
## 4599        Taxi Fare      Airport Transfer                Rapido
## 3362       Water Bill   Commercial Property Hyderabad Water Board
## 4065        Taxi Fare        Rental Package                  Uber
## 3845        Taxi Fare        Rental Package                Jugnoo
## 1873    Internet Bill Broadband 50Mbps Plan             Jio Fiber
##      product_amount transaction_fee cashback loyalty_points payment_method
## 3805        6198.02           27.58    30.02            392            UPI
## 4599        6066.91           30.37    33.47            173     Debit Card
## 3362        7026.12           10.88    99.96            328 Wallet Balance
## 4065        8214.70            4.55    76.09             96  Bank Transfer
## 3845        1422.18           12.09    63.56             67  Bank Transfer
## 1873        4846.55            0.45    71.20            895            UPI
##      transaction_status merchant_id device_type location
## 3805         Successful  MERCH_0120     Android    Urban
## 4599         Successful  MERCH_0849     Android    Urban
## 3362         Successful  MERCH_0598     Android Suburban
## 4065         Successful  MERCH_0852     Android    Urban
## 3845         Successful  MERCH_0043         iOS    Urban
## 1873         Successful  MERCH_0323         iOS    Urban

The sample 4

# Preview the first rows of subsample 4 (head() shows six rows by default)
head(df_4)

##       idx                       transaction_id    user_id transaction_date
## 3218 3218 f2fa2caf-4f2d-4865-aff9-528f5869d42d USER_00295 2024-04-14 00:46
## 384   384 6a207895-38bc-41d9-b2b5-48674247e183 USER_08989 2023-09-17 05:55
## 1928 1928 a034e187-4c6f-4796-9a1c-68d5d23349e9 USER_03450 2024-01-08 02:20
## 627   627 93b911e2-2660-4a04-b1bb-0934529bf842 USER_05563 2023-10-03 03:11
## 4987 4987 de0930db-c7b0-4ec8-91e7-fe22331f7642 USER_06427 2024-08-17 09:28
## 4365 4365 27309781-1ce6-46b7-a688-5b363328a28d USER_04781 2024-07-04 17:27
##      product_category           product_name merchant_name product_amount
## 3218    Hotel Booking                  Suite        Treebo        1383.56
## 384     Internet Bill     Fiber 100Mbps Plan       Hathway        1317.00
## 1928     Rent Payment  Commercial Space Rent      NoBroker        5885.22
## 627     Internet Bill Business Internet Plan     Jio Fiber        2228.22
## 4987  Mobile Recharge       Annual Plan 2999           Jio        3352.43
## 4365     Rent Payment       PG Accommodation      NoBroker         234.25
##      transaction_fee cashback loyalty_points payment_method transaction_status
## 3218           19.75    25.64            748            UPI         Successful
## 384            49.72    17.21            190 Wallet Balance         Successful
## 1928           44.32    51.88            365            UPI         Successful
## 627            30.48    35.30            885 Wallet Balance         Successful
## 4987           21.01    18.73            316    Credit Card         Successful
## 4365           22.17    79.62            873     Debit Card         Successful
##      merchant_id device_type location
## 3218  MERCH_0554     Android Suburban
## 384   MERCH_0022         iOS    Urban
## 1928  MERCH_0176         iOS Suburban
## 627   MERCH_0266     Android    Urban
## 4987  MERCH_0096     Android    Rural
## 4365  MERCH_0068         Web    Rural

The sample 5

# Preview the first rows of subsample 5 (head() shows six rows by default)
head(df_5)

##       idx                       transaction_id    user_id transaction_date
## 4302 4302 57baf3ed-a809-4118-b929-68de04a4d4eb USER_09534 2024-06-30 09:48
## 4290 4290 0c1337a9-500d-4702-b814-f77b111d5ae2 USER_02841 2024-06-29 21:30
## 1445 1445 bde9e0d4-a31b-43fb-afe3-8c48e6e2c963 USER_09078 2023-12-03 13:51
## 4573 4573 d6d5cb9b-fe77-46fc-b857-d2a7a184fe6a USER_06332 2024-07-20 15:29
## 770   770 2f2178ab-52a4-4144-a134-91e0eac258ea USER_07853 2023-10-13 07:22
## 3565 3565 ca5cda0d-fbed-4e90-bb9c-5662687593f2 USER_06133 2024-05-08 18:28
##      product_category           product_name   merchant_name product_amount
## 4302         Gas Bill      Piped Natural Gas       Adani Gas        9385.57
## 4290       Water Bill Residential Connection Delhi Jal Board        7979.07
## 1445 Electricity Bill      Agricultural Bill      Tata Power        1040.50
## 4573    Internet Bill   Student Special Plan  BSNL Broadband        9992.47
## 770         Gift Card   Rs. 5000 Luxury Card   Shoppers Stop        4316.54
## 3565 Electricity Bill        Commercial Bill            BSES        9899.56
##      transaction_fee cashback loyalty_points payment_method transaction_status
## 4302           21.39    41.20            770  Bank Transfer         Successful
## 4290            3.05    31.16            726  Bank Transfer         Successful
## 1445           17.70    39.88            216    Credit Card         Successful
## 4573           34.61    72.06            124     Debit Card         Successful
## 770             6.94    46.12            666    Credit Card         Successful
## 3565           37.15    37.94            229     Debit Card         Successful
##      merchant_id device_type location
## 4302  MERCH_0478     Android    Urban
## 4290  MERCH_0470     Android    Urban
## 1445  MERCH_0290         iOS    Urban
## 4573  MERCH_0678     Android    Rural
## 770   MERCH_0533     Android    Urban
## 3565  MERCH_0645     Android    Urban

Scrutinize: To scrutinize the subsamples, we can compare various statistics, such as the distribution of categorical variables and summary statistics for numeric variables. We use the group_by() function to do this.

Now we will compare categorical data: we’ll count how often each category (e.g., product_category, transaction_status) appears in each subsample.

# Count how often each value of a categorical column occurs in a data frame.
#
# Args:
#   df:       Data frame (or tibble) holding the column to tabulate.
#   variable: Name of the column, given as a single string
#             (e.g. "product_category").
#
# Returns:
#   One row per distinct value of `variable`, with a `count` column holding
#   the number of rows for that value, sorted from most to least frequent.
compare_categorical <- function(df, variable) {
  df %>%
    # `.data[[variable]]` looks the name up in `df` only, so a missing column
    # raises an error instead of silently grouping by a same-named object
    # from the calling environment (which `!!sym(variable)` could do).
    group_by(.data[[variable]]) %>%
    summarise(count = n()) %>%
    arrange(desc(count))
}

# Tabulate product_category separately for each of the five subsamples
cat_df1 <- df_1 %>% compare_categorical("product_category")
cat_df2 <- df_2 %>% compare_categorical("product_category")
cat_df3 <- df_3 %>% compare_categorical("product_category")
cat_df4 <- df_4 %>% compare_categorical("product_category")
cat_df5 <- df_5 %>% compare_categorical("product_category")

We will display the category counts for each of the compared samples.

# Print the product_category counts for subsample 1
cat_df1

## # A tibble: 20 × 2
##    product_category  count
##    <chr>             <int>
##  1 Streaming Service   152
##  2 Education Fee       151
##  3 Hotel Booking       145
##  4 Movie Ticket        142
##  5 Electricity Bill    138
##  6 Rent Payment        137
##  7 Gaming Credits      130
##  8 Water Bill          127
##  9 Gift Card           125
## 10 Mobile Recharge     123
## 11 Food Delivery       120
## 12 Bus Ticket          118
## 13 Loan Repayment      118
## 14 Internet Bill       117
## 15 Flight Booking      115
## 16 Online Shopping     114
## 17 Gas Bill            111
## 18 Grocery Shopping    110
## 19 Insurance Premium   106
## 20 Taxi Fare           101

# Print the product_category counts for subsample 2
cat_df2

## # A tibble: 20 × 2
##    product_category  count
##    <chr>             <int>
##  1 Streaming Service   161
##  2 Education Fee       155
##  3 Gas Bill            150
##  4 Electricity Bill    141
##  5 Food Delivery       135
##  6 Movie Ticket        135
##  7 Water Bill          135
##  8 Hotel Booking       130
##  9 Taxi Fare           130
## 10 Rent Payment        128
## 11 Internet Bill       127
## 12 Mobile Recharge     123
## 13 Online Shopping     120
## 14 Bus Ticket          119
## 15 Loan Repayment      118
## 16 Grocery Shopping    111
## 17 Insurance Premium   105
## 18 Gift Card            98
## 19 Gaming Credits       93
## 20 Flight Booking       86

# Print the product_category counts for subsample 3
cat_df3

## # A tibble: 20 × 2
##    product_category  count
##    <chr>             <int>
##  1 Movie Ticket        154
##  2 Streaming Service   146
##  3 Water Bill          142
##  4 Electricity Bill    139
##  5 Education Fee       137
##  6 Hotel Booking       135
##  7 Gas Bill            131
##  8 Rent Payment        130
##  9 Taxi Fare           129
## 10 Grocery Shopping    125
## 11 Internet Bill       125
## 12 Food Delivery       123
## 13 Gift Card           122
## 14 Bus Ticket          118
## 15 Gaming Credits      115
## 16 Online Shopping     114
## 17 Insurance Premium   111
## 18 Mobile Recharge     110
## 19 Flight Booking      101
## 20 Loan Repayment       93

# Print the product_category counts for subsample 4
cat_df4

## # A tibble: 20 × 2
##    product_category  count
##    <chr>             <int>
##  1 Streaming Service   160
##  2 Hotel Booking       148
##  3 Water Bill          143
##  4 Taxi Fare           138
##  5 Electricity Bill    136
##  6 Movie Ticket        136
##  7 Rent Payment        135
##  8 Education Fee       131
##  9 Food Delivery       130
## 10 Loan Repayment      125
## 11 Gas Bill            124
## 12 Gift Card           123
## 13 Mobile Recharge     118
## 14 Bus Ticket          114
## 15 Gaming Credits      114
## 16 Online Shopping     113
## 17 Grocery Shopping    110
## 18 Insurance Premium   109
## 19 Flight Booking      100
## 20 Internet Bill        93

# Print the product_category counts for subsample 5
cat_df5

## # A tibble: 20 × 2
##    product_category  count
##    <chr>             <int>
##  1 Education Fee       151
##  2 Streaming Service   144
##  3 Water Bill          140
##  4 Taxi Fare           134
##  5 Grocery Shopping    133
##  6 Rent Payment        132
##  7 Movie Ticket        130
##  8 Gas Bill            129
##  9 Bus Ticket          127
## 10 Internet Bill       125
## 11 Mobile Recharge     123
## 12 Electricity Bill    122
## 13 Food Delivery       121
## 14 Hotel Booking       121
## 15 Online Shopping     117
## 16 Flight Booking      116
## 17 Insurance Premium   114
## 18 Gaming Credits      109
## 19 Loan Repayment      108
## 20 Gift Card           104

Now we will compare numeric data: we’ll compute summary statistics (mean, median, etc.) for numeric variables like product_amount or cashback.

# Compute summary statistics for one numeric column of a data frame.
#
# Args:
#   df:       Data frame (or tibble) holding the column to summarise.
#   variable: Name of the numeric column, given as a single string
#             (e.g. "product_amount").
#
# Returns:
#   A one-row summary with the columns mean, median, sd, min and max.
#   Missing values are dropped before each statistic is computed.
compare_numeric <- function(df, variable) {
  # `.data[[variable]]` looks the name up in `df` only, so a missing column
  # raises an error instead of silently summarising a same-named object from
  # the calling environment (which `!!sym(variable)` could do).
  df %>%
    summarise(mean = mean(.data[[variable]], na.rm = TRUE),
              median = median(.data[[variable]], na.rm = TRUE),
              sd = sd(.data[[variable]], na.rm = TRUE),
              min = min(.data[[variable]], na.rm = TRUE),
              max = max(.data[[variable]], na.rm = TRUE))
}

# Summarise product_amount separately for each of the five subsamples
num_df1 <- df_1 %>% compare_numeric("product_amount")
num_df2 <- df_2 %>% compare_numeric("product_amount")
num_df3 <- df_3 %>% compare_numeric("product_amount")
num_df4 <- df_4 %>% compare_numeric("product_amount")
num_df5 <- df_5 %>% compare_numeric("product_amount")

We will display the summary statistics of each data frame.

# Print the product_amount summary statistics for subsample 1
num_df1

##       mean  median       sd   min     max
## 1 5033.087 4981.41 2878.074 11.75 9992.47

# Print the product_amount summary statistics for subsample 2
num_df2

##       mean  median       sd   min     max
## 1 4967.581 4932.35 2880.227 13.33 9993.54

# Print the product_amount summary statistics for subsample 3
num_df3

##       mean  median       sd   min     max
## 1 4906.752 4923.43 2876.466 10.09 9993.54

# Print the product_amount summary statistics for subsample 4
num_df4

##       mean   median       sd   min     max
## 1 5015.371 5005.695 2881.142 10.09 9993.54

# Print the product_amount summary statistics for subsample 5
num_df5

##       mean  median       sd   min     max
## 1 5031.011 5025.56 2908.699 10.09 9994.83

Visualization of comparisons

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Gather the five product_amount statistics of every subsample in long format
# (one row per sample/statistic pair) so they can be filtered and plotted.
subsample_amounts <- list(
  df_1 = df_1$product_amount,
  df_2 = df_2$product_amount,
  df_3 = df_3$product_amount,
  df_4 = df_4$product_amount,
  df_5 = df_5$product_amount
)

# Statistic label -> function computing it; the order fixes the row order
amount_stats <- list(
  "Mean" = mean,
  "Median" = median,
  "Standard Deviation" = sd,
  "Min" = min,
  "Max" = max
)

summary_stats <- data.frame(
  Sample = rep(names(subsample_amounts), each = length(amount_stats)),
  Statistic = rep(names(amount_stats), times = length(subsample_amounts)),
  # Outer loop over samples, inner loop over statistics, flattened to match
  # the Sample/Statistic columns above
  Value = unlist(
    lapply(subsample_amounts, function(amounts) {
      vapply(
        amount_stats,
        function(stat) stat(amounts, na.rm = TRUE),
        numeric(1)
      )
    }),
    use.names = FALSE
  )
)

Means for all samples

# Bar chart of the mean product_amount, one bar per subsample
summary_stats %>%
  filter(Statistic == "Mean") %>%
  ggplot(aes(x = Sample, y = Value, fill = Sample)) +
  geom_col() +
  labs(title = "Mean Product Amount by Sample", x = "Sample", y = "Mean") +
  theme_minimal()

Observation: Sample 1 has the highest mean product amount, indicating that transactions in that subsample have, on average, larger values. Sample 3 has the lowest mean, indicating smaller average spending amounts. A higher mean can be driven by a few large purchases of high-value products such as electronics or luxury goods. A lower mean can reflect frequent small transactions such as everyday purchases (e.g., groceries, low-cost services).

Medians for all samples

# Bar chart of the median product_amount, one bar per subsample
summary_stats %>%
  filter(Statistic == "Median") %>%
  ggplot(aes(x = Sample, y = Value, fill = Sample)) +
  geom_col() +
  labs(title = "Median Product Amount by Sample", x = "Sample", y = "Median") +
  theme_minimal()

Observation: The median of Sample 5 is the highest and the median of Sample 3 is the lowest of all samples. The median reflects the “typical” transaction amount, which is less affected by extreme values. A higher median indicates that most transactions in a subsample consist of moderately expensive purchases such as electronics, while a lower median points to more routine, inexpensive transactions such as groceries.

Standard deviation for all samples

# Bar chart of the standard deviation of product_amount, one bar per subsample
summary_stats %>%
  filter(Statistic == "Standard Deviation") %>%
  ggplot(aes(x = Sample, y = Value, fill = Sample)) +
  geom_col() +
  labs(title = "Standard Deviation of Product Amount by Sample", x = "Sample", y = "Standard Deviation") +
  theme_minimal()

Observations: Sample 5 has the highest standard deviation and Sample 3 has the lowest. A high standard deviation could reflect a mix of different types of purchases, e.g. large electronics purchases alongside smaller daily purchases. A lower standard deviation indicates that transactions are more uniform in size, potentially from more homogeneous purchasing behavior such as repetitive, small transactions.

Minimum for all samples

# Bar chart of the minimum product_amount, one bar per subsample
summary_stats %>%
  filter(Statistic == "Min") %>%
  ggplot(aes(x = Sample, y = Value, fill = Sample)) +
  geom_col() +
  labs(title = "Minimum Product Amount by Sample", x = "Sample", y = "Min") +
  theme_minimal()

Observation: The minimum is highest in Sample 2 and lowest in Samples 3, 4, and 5 (which share the same minimum), because even the smallest transaction in the high-minimum sample is slightly larger than in the others. Small or negligible transactions may come from refunds, discounts, or low-cost items, which are common across subsamples regardless of category.

Maximum for all samples

# Bar chart of the maximum product_amount, one bar per subsample
summary_stats %>%
  filter(Statistic == "Max") %>%
  ggplot(aes(x = Sample, y = Value, fill = Sample)) +
  geom_col() +
  labs(title = "Maximum Product Amount by Sample", x = "Sample", y = "Max") +
  theme_minimal()

Observation: Subsample 5 shows a higher maximum value than the others, indicating that it includes some very large transactions. The lower maximum in subsample 1 means that its most expensive transaction is still moderate in size. A higher maximum reflects high-value purchases, such as luxury goods or significant service payments. A lower maximum indicates that customers in this subsample did not make any very large transactions, sticking to lower-value purchases.

Anomaly in One Sub-Sample

High Transaction Amounts: In a sub-sample with lower average spending (e.g., groceries or low-cost services), a very high transaction amount (e.g., a $10,000 purchase) would be considered an anomaly. However, in a sub-sample consisting of higher spending categories (e.g., electronics or luxury products), such a transaction would be typical and not considered an anomaly.

Low Transaction Amounts: In sub-samples where high-value purchases are the norm (electronics), a very low transaction amount (under $5) might be seen as anomalous.

This could indicate an unusual product, a refund, or a data entry error. In other sub-samples, particularly for categories like groceries or services, small transactions are common, and thus this wouldn’t be considered an anomaly. Thus, what constitutes an anomaly largely depends on the context of the sub-sample and the expected range of transaction amounts within that category.

Consistent Aspects Across All Sub-Samples:

Small Transactions: Even in sub-samples where the overall spending is higher (e.g., electronics or travel), there may still be a small portion of low-value transactions that are consistent across all sub-samples. These could be for low-cost services, accessories, or small products.

Transaction Frequency: The number of transactions in each sub-sample might be somewhat consistent if the data was sampled with equal probability across the dataset. Even if the amounts differ, the count of transactions across categories might stay roughly stable.

Payment Method Distribution: If a certain payment method (credit card or mobile wallet) is dominant in the dataset, this could be consistent across all sub-samples, showing similar adoption of payment methods regardless of the specific product category or transaction amount. These consistent elements provide insights into the broader spending behavior and purchasing trends across all transaction types.

OPTIONAL I WORKED ON IT

Monte Carlo simulations

library(dplyr)
library(ggplot2)

# Reset the RNG so the Monte Carlo draws are reproducible, then draw five
# samples of 50% of the rows each (with replacement) into a list
set.seed(123)
samples <- lapply(seq_len(5), function(draw) {
  sample_frac(data, size = 0.5, replace = TRUE)
})
# Summarise product_amount for every Monte Carlo sample in one pass.
# The table is built from whole vectors instead of rbind()-ing one row per
# loop iteration (which re-copies the growing data frame each time), and it
# follows length(samples) rather than a hard-coded 1:5.

# Apply one statistic to the product_amount column of each sample,
# ignoring missing values; returns one number per sample.
spending_stat <- function(stat_fun) {
  vapply(
    samples,
    function(s) stat_fun(s$product_amount, na.rm = TRUE),
    numeric(1),
    USE.NAMES = FALSE
  )
}

summary_stats <- data.frame(
  # Labels "Sample 1", "Sample 2", ... in the order the samples were drawn
  Sample = sprintf("Sample %d", seq_along(samples)),
  Mean_Spending = spending_stat(mean),
  Median_Spending = spending_stat(median),
  SD_Spending = spending_stat(sd)
)

# Overlay the three spending statistics for each sample:
# bars = mean, diamond points = median, error bars = mean +/- one SD.
ggplot(summary_stats, aes(x = Sample)) +
  # Layers, in drawing order
  geom_col(
    aes(y = Mean_Spending, fill = "Mean"),
    alpha = 0.6, color = "darkblue", linewidth = 1
  ) +
  geom_point(
    aes(y = Median_Spending, color = "Median"),
    size = 4, shape = 18
  ) +
  geom_errorbar(
    aes(ymin = Mean_Spending - SD_Spending, ymax = Mean_Spending + SD_Spending),
    width = 0.2, color = "black", linewidth = 1.2
  ) +
  # Numeric labels: mean above its bar top, median below its point
  geom_text(
    aes(y = Mean_Spending, label = round(Mean_Spending, 2)),
    vjust = -1, size = 3
  ) +
  geom_text(
    aes(y = Median_Spending, label = round(Median_Spending, 2), color = "Median"),
    vjust = 1.5, size = 3
  ) +
  # Legend entries for the mapped "Mean" fill and "Median" colour
  scale_fill_manual(name = "Legend", values = c("Mean" = "lightblue")) +
  scale_color_manual(name = "Legend", values = c("Median" = "red")) +
  labs(
    title = "Mean, Median, and Standard Deviation of Spending Across 5 Samples",
    x = "Samples", y = "Spending Amount"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5),
    legend.position = "top"
  )

Observations from the Monte Carlo Simulation and Plot:

Categories like “Travel” and “Electronics” are likely to show higher variability in spending. This is because large purchases, like vacations or expensive electronic items, can skew both the average and the total spending. Categories with Consistent Spending: Categories such as “Groceries” and “Food & Beverages” typically show more consistent spending patterns across simulations, as these are everyday, frequent purchases that have less variability in price compared to luxury or high-cost items. Anomalies in Certain Subsamples: In some subsamples, anomalies may appear in categories like “Luxury” or “Electronics” if large transactions are over-represented. However, such outliers would not be present in categories with low transaction variance like “Groceries.” Monte Carlo simulations smooth out these anomalies, showing what a “normal” average should look like after accounting for randomness.

Week 4 prob

2024-09-25

OPTIONAL I WORKED ON IT

Monte Carlo simulations