library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

Our population dataset

# Read the obesity dataset CSV; the resulting data frame `df` is treated as
# the population from which the samples are drawn.
df <- read.csv(
  file = "~/Downloads/ObesityDataSet_raw_and_data_sinthetic.csv",
  header = TRUE
)

Mode function

# Return the most frequent value of a vector.
#
# x: an atomic vector (numeric, character, factor, ...).
#
# Returns a length-one vector holding the value that occurs most often in `x`.
# When several values share the highest count, the one that appears first in
# `x` is returned, because `unique()` keeps first-appearance order and
# `which.max()` picks the first maximum.
mode_fun <- function(x) {
  distinct_vals <- unique(x)
  # Position of every element of `x` within the distinct values.
  positions <- match(x, distinct_vals)
  # Number of occurrences of each distinct value, in `distinct_vals` order.
  freq <- tabulate(positions)
  distinct_vals[which.max(freq)]
}

5 Random Samples

# Draw five independent simple random samples (rows chosen without
# replacement) from the population `df`.
#
# The seed is fixed so that the per-sample tables and the written comparison
# of the samples refer to the same draws every time the script is run; without
# it each run produces different samples.
set.seed(1)

# Rows per sample. Presumably about half of the population (the conclusion
# text speaks of a 50% sample) -- confirm against nrow(df).
sample_size <- 1056

df_1 <- df[sample(nrow(df), sample_size), ]
df_2 <- df[sample(nrow(df), sample_size), ]
df_3 <- df[sample(nrow(df), sample_size), ]
df_4 <- df[sample(nrow(df), sample_size), ]
df_5 <- df[sample(nrow(df), sample_size), ]

Sample_1

# Per-obesity-class profile of sample 1: group size, number of walkers and
# bikers, most frequent FAF value, and mean Weight / Height / NCP.
# Classes are listed from highest to lowest mean weight.
df_1 |>
  group_by(NObeyesdad) |>
  summarise(
    count = n(),
    num_walker = sum(MTRANS == 'Walking'),
    num_biker = sum(MTRANS == 'Bike'),
    mode_exercise = mode_fun(FAF),
    mean_weight = mean(Weight),
    mean_height = mean(Height),
    mean_meals = mean(NCP)
  ) |>
  arrange(desc(mean_weight))
## # A tibble: 7 × 8
##   NObeyesdad    count num_walker num_biker mode_exercise mean_weight mean_height
##   <chr>         <int>      <int>     <int>         <dbl>       <dbl>       <dbl>
## 1 Obesity_Type…   168          0         0             0       121.         1.69
## 2 Obesity_Type…   143          1         0             0       115.         1.77
## 3 Obesity_Type…   180          2         0             0        93.1        1.70
## 4 Overweight_L…   135          5         0             0        82.1        1.70
## 5 Overweight_L…   154          6         1             0        73.7        1.68
## 6 Normal_Weight   149         17         1             1        61.8        1.67
## 7 Insufficient…   127          2         0             2        49.7        1.69
## # ℹ 1 more variable: mean_meals <dbl>

Sample_2

# Per-obesity-class profile of sample 2: group size, number of walkers and
# bikers, most frequent FAF value, and mean Weight / Height / NCP.
# Classes are listed from highest to lowest mean weight.
df_2 |>
  group_by(NObeyesdad) |>
  summarise(
    count = n(),
    num_walker = sum(MTRANS == 'Walking'),
    num_biker = sum(MTRANS == 'Bike'),
    mode_exercise = mode_fun(FAF),
    mean_weight = mean(Weight),
    mean_height = mean(Height),
    mean_meals = mean(NCP)
  ) |>
  arrange(desc(mean_weight))
## # A tibble: 7 × 8
##   NObeyesdad    count num_walker num_biker mode_exercise mean_weight mean_height
##   <chr>         <int>      <int>     <int>         <dbl>       <dbl>       <dbl>
## 1 Obesity_Type…   156          0         0             0       119.         1.68
## 2 Obesity_Type…   140          0         0             0       115.         1.77
## 3 Obesity_Type…   178          2         0             0        92.1        1.69
## 4 Overweight_L…   143          0         0             0        82.3        1.71
## 5 Overweight_L…   149          6         1             0        74.8        1.69
## 6 Normal_Weight   143         15         2             1        62.1        1.68
## 7 Insufficient…   147          5         0             2        49.4        1.68
## # ℹ 1 more variable: mean_meals <dbl>

Sample_3

# Per-obesity-class profile of sample 3: group size, number of walkers and
# bikers, most frequent FAF value, and mean Weight / Height / NCP.
# Classes are listed from highest to lowest mean weight.
df_3 |>
  group_by(NObeyesdad) |>
  summarise(
    count = n(),
    num_walker = sum(MTRANS == 'Walking'),
    num_biker = sum(MTRANS == 'Bike'),
    mode_exercise = mode_fun(FAF),
    mean_weight = mean(Weight),
    mean_height = mean(Height),
    mean_meals = mean(NCP)
  ) |>
  arrange(desc(mean_weight))
## # A tibble: 7 × 8
##   NObeyesdad    count num_walker num_biker mode_exercise mean_weight mean_height
##   <chr>         <int>      <int>     <int>         <dbl>       <dbl>       <dbl>
## 1 Obesity_Type…   182          0         0             0       121.         1.69
## 2 Obesity_Type…   139          0         1             0       115.         1.77
## 3 Obesity_Type…   179          1         0             0        92.7        1.69
## 4 Overweight_L…   143          4         0             0        82.1        1.70
## 5 Overweight_L…   136          6         1             0        74.2        1.69
## 6 Normal_Weight   143         15         2             1        61.7        1.67
## 7 Insufficient…   134          1         0             2        49.5        1.68
## # ℹ 1 more variable: mean_meals <dbl>

Sample_4

# Per-obesity-class profile of sample 4: group size, number of walkers and
# bikers, most frequent FAF value, and mean Weight / Height / NCP.
# Classes are listed from highest to lowest mean weight.
df_4 |>
  group_by(NObeyesdad) |>
  summarise(
    count = n(),
    num_walker = sum(MTRANS == 'Walking'),
    num_biker = sum(MTRANS == 'Bike'),
    mode_exercise = mode_fun(FAF),
    mean_weight = mean(Weight),
    mean_height = mean(Height),
    mean_meals = mean(NCP)
  ) |>
  arrange(desc(mean_weight))
## # A tibble: 7 × 8
##   NObeyesdad    count num_walker num_biker mode_exercise mean_weight mean_height
##   <chr>         <int>      <int>     <int>         <dbl>       <dbl>       <dbl>
## 1 Obesity_Type…   169          0         0             0       121.         1.69
## 2 Obesity_Type…   155          1         1             0       116.         1.77
## 3 Obesity_Type…   165          1         0             0        92.0        1.68
## 4 Overweight_L…   137          2         0             0        81.8        1.70
## 5 Overweight_L…   146          7         1             1        74.0        1.69
## 6 Normal_Weight   158         17         2             1        61.8        1.67
## 7 Insufficient…   126          2         0             2        50.0        1.69
## # ℹ 1 more variable: mean_meals <dbl>

Sample_5

# Per-obesity-class profile of sample 5: group size, number of walkers and
# bikers, most frequent FAF value, and mean Weight / Height / NCP.
# Classes are listed from highest to lowest mean weight.
df_5 |>
  group_by(NObeyesdad) |>
  summarise(
    count = n(),
    num_walker = sum(MTRANS == 'Walking'),
    num_biker = sum(MTRANS == 'Bike'),
    mode_exercise = mode_fun(FAF),
    mean_weight = mean(Weight),
    mean_height = mean(Height),
    mean_meals = mean(NCP)
  ) |>
  arrange(desc(mean_weight))
## # A tibble: 7 × 8
##   NObeyesdad    count num_walker num_biker mode_exercise mean_weight mean_height
##   <chr>         <int>      <int>     <int>         <dbl>       <dbl>       <dbl>
## 1 Obesity_Type…   168          0         0             0       121.         1.69
## 2 Obesity_Type…   129          0         0             0       115.         1.77
## 3 Obesity_Type…   178          2         0             0        93.9        1.70
## 4 Overweight_L…   155          1         0             0        82.6        1.71
## 5 Overweight_L…   144          6         1             1        73.6        1.68
## 6 Normal_Weight   146         20         2             0        62.5        1.68
## 7 Insufficient…   136          4         0             2        50.1        1.69
## # ℹ 1 more variable: mean_meals <dbl>

How different are they?

  1. The five sample data frames have different counts for each obesity level, although each of them contains a total of 1,056 rows.

  2. There are very slight differences in the means of the variables across samples, which is consistent with the central limit theorem: the means of repeated samples are approximately normally distributed around the true population mean.

What would you have called an anomaly in one sub-sample that you wouldn’t in another?

  1. There are no walkers in Obesity_Type_II in the df_2, df_3 and df_5 samples (assuming this is the norm), but there is 1 walker in Obesity_Type_II in df_1 and df_4 (this would be called an anomaly).

  2. Similarly, the number of Obesity_Type_II bikers is 0 in df_1, df_2 and df_5, but there is 1 Obesity_Type_II biker in df_3 and df_4.

  3. The mode of the variable FAF (frequency of workout) for Normal_Weight is 1 in df_1, df_2, df_3 and df_4, but in df_5 it is 0 (anomaly).

What aspects of the data are consistent among all sub_samples?

  1. mean_weight, mean_height and mean_meals are very consistent among all the samples.

  2. num_walker for certain obesity categories(‘Insufficient_Weight’, ‘Normal_Weight’, ‘Obesity_Type_III’) are also consistent(no significant variations).

Incorporating Monte Carlo simulations

Using Monte Carlo simulations to generate weights and heights, and fitting them into their obesity categories based on observations

# Fix the RNG state so the simulated points are the same on every run.
set.seed(4)

# Draw 10 weights and 10 heights independently from uniform distributions.
# NOTE(review): the bounds are presumably the observed ranges of Weight and
# Height in `df` -- confirm.
weights <- runif(10, min = 39, max = 173)
heights <- runif(10, min = 1.45, max = 1.98)

# Pair the draws up and number the points 1..10 so that each one can be
# referred to by its index.
gen_data <- data.frame(
  Weight = weights,
  Height = heights,
  index = seq_along(weights)
)
gen_data
##       Weight   Height index
## 1  117.49724 1.849978     1
## 2   40.19874 1.601580     2
## 3   78.36111 1.503028     3
## 4   76.16824 1.955656     4
## 5  148.01894 1.670272     5
## 6   73.89732 1.691204     6
## 7  136.07039 1.964659     7
## 8  160.41635 1.759514     8
## 9  166.17139 1.959968     9
## 10  48.80136 1.853702    10

Plotting our generated data along with original dataset

# Scatter plot of Weight against Height showing the simulated points together
# with the original observations, so each simulated point can be assigned to
# an obesity class by eye.
ggplot() +
  # Simulated points, fully opaque.
  geom_point(
    data = gen_data,
    mapping = aes(x = Height, y = Weight),
    alpha = 1
  ) +
  # Original observations, semi-transparent and coloured by obesity class.
  geom_point(
    data = df[, c('Weight', 'Height', 'NObeyesdad')],
    mapping = aes(x = Height, y = Weight, color = NObeyesdad),
    alpha = 0.4
  ) +
  # Index label for every simulated point; vjust offsets the label vertically
  # so that it does not sit on top of the point.
  geom_text(
    data = gen_data,
    mapping = aes(x = Height, y = Weight, vjust = -1, label = index)
  ) +
  theme_minimal()

Results from Observation:

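| Obesity category    | gen_data datapoint indexes |
|---------------------|----------------------------|
| Obesity_Type_III    | 5, 8, 9                    |
| Obesity_Type_II     | 1*, 7                      |
| Obesity_Type_I      | 3                          |
| Overweight_Level_II | -                          |
| Overweight_Level_I  | 6                          |
| Normal_Weight       | 4                          |
| Insufficient_Weight | 2, 10                      |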

*Note: the datapoint with index 1 lies on the border between Obesity_Type_I and Obesity_Type_II.

Effects on drawing conclusions about the data in the future

This analysis has improved our understanding of sampling distribution and solidified the concept of the Central Limit Theorem. This investigation enhances statistical reasoning, enabling more accurate and actionable conclusions in the future.

Conclusion from Analysis

  1. Because each sample contains 50% of the population, the dispersion of the sample means is small; that is, the sample means lie close to the true mean of the population.

  2. None of the anomalies observed is critical, because all 5 samples represent the population very well.

  3. The Monte Carlo simulations used above rely on a uniform probability distribution to generate random sample data points. Consequently, the generated data points are spread out across the range, as the probability is the same for all values within the minimum and maximum range.

Further Investigation

The effect of decreasing and increasing the sample size on different statistical metrics.