library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Our population dataset
# Read the full obesity dataset; the samples drawn later are taken from it.
df <- read.csv(
  file = "~/Downloads/ObesityDataSet_raw_and_data_sinthetic.csv",
  header = TRUE
)
Mode function
# Return the most frequent value of `x`.
# When several values share the highest count, the one that appears first
# in `x` is returned.
mode_fun <- function(x) {
  distinct_vals <- unique(x)
  # Position of each element of `x` within the distinct values.
  positions <- match(x, distinct_vals)
  # Occurrence count per distinct value, in first-appearance order.
  freq <- tabulate(positions)
  distinct_vals[which.max(freq)]
}
# Draw five independent samples of 1056 rows each, without replacement,
# from the population data frame.
# NOTE(review): no seed is set before these draws in the visible code, so
# each run produces different samples -- confirm this is intended.
df_1 <- df[sample.int(nrow(df), size = 1056), ]
df_2 <- df[sample.int(nrow(df), size = 1056), ]
df_3 <- df[sample.int(nrow(df), size = 1056), ]
df_4 <- df[sample.int(nrow(df), size = 1056), ]
df_5 <- df[sample.int(nrow(df), size = 1056), ]
# Per-category summary of sample 1, ordered from highest to lowest mean weight.
df_1 |>
  group_by(NObeyesdad) |>
  summarise(
    count = n(),
    num_walker = sum(MTRANS == "Walking"),
    num_biker = sum(MTRANS == "Bike"),
    mode_exercise = mode_fun(FAF),
    mean_weight = mean(Weight),
    mean_height = mean(Height),
    mean_meals = mean(NCP)
  ) |>
  arrange(desc(mean_weight))
## # A tibble: 7 × 8
## NObeyesdad count num_walker num_biker mode_exercise mean_weight mean_height
## <chr> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Obesity_Type… 168 0 0 0 121. 1.69
## 2 Obesity_Type… 143 1 0 0 115. 1.77
## 3 Obesity_Type… 180 2 0 0 93.1 1.70
## 4 Overweight_L… 135 5 0 0 82.1 1.70
## 5 Overweight_L… 154 6 1 0 73.7 1.68
## 6 Normal_Weight 149 17 1 1 61.8 1.67
## 7 Insufficient… 127 2 0 2 49.7 1.69
## # ℹ 1 more variable: mean_meals <dbl>
# Per-category summary of sample 2, ordered from highest to lowest mean weight.
df_2 |>
  group_by(NObeyesdad) |>
  summarise(
    count = n(),
    num_walker = sum(MTRANS == "Walking"),
    num_biker = sum(MTRANS == "Bike"),
    mode_exercise = mode_fun(FAF),
    mean_weight = mean(Weight),
    mean_height = mean(Height),
    mean_meals = mean(NCP)
  ) |>
  arrange(desc(mean_weight))
## # A tibble: 7 × 8
## NObeyesdad count num_walker num_biker mode_exercise mean_weight mean_height
## <chr> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Obesity_Type… 156 0 0 0 119. 1.68
## 2 Obesity_Type… 140 0 0 0 115. 1.77
## 3 Obesity_Type… 178 2 0 0 92.1 1.69
## 4 Overweight_L… 143 0 0 0 82.3 1.71
## 5 Overweight_L… 149 6 1 0 74.8 1.69
## 6 Normal_Weight 143 15 2 1 62.1 1.68
## 7 Insufficient… 147 5 0 2 49.4 1.68
## # ℹ 1 more variable: mean_meals <dbl>
# Per-category summary of sample 3, ordered from highest to lowest mean weight.
df_3 |>
  group_by(NObeyesdad) |>
  summarise(
    count = n(),
    num_walker = sum(MTRANS == "Walking"),
    num_biker = sum(MTRANS == "Bike"),
    mode_exercise = mode_fun(FAF),
    mean_weight = mean(Weight),
    mean_height = mean(Height),
    mean_meals = mean(NCP)
  ) |>
  arrange(desc(mean_weight))
## # A tibble: 7 × 8
## NObeyesdad count num_walker num_biker mode_exercise mean_weight mean_height
## <chr> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Obesity_Type… 182 0 0 0 121. 1.69
## 2 Obesity_Type… 139 0 1 0 115. 1.77
## 3 Obesity_Type… 179 1 0 0 92.7 1.69
## 4 Overweight_L… 143 4 0 0 82.1 1.70
## 5 Overweight_L… 136 6 1 0 74.2 1.69
## 6 Normal_Weight 143 15 2 1 61.7 1.67
## 7 Insufficient… 134 1 0 2 49.5 1.68
## # ℹ 1 more variable: mean_meals <dbl>
# Per-category summary of sample 4, ordered from highest to lowest mean weight.
df_4 |>
  group_by(NObeyesdad) |>
  summarise(
    count = n(),
    num_walker = sum(MTRANS == "Walking"),
    num_biker = sum(MTRANS == "Bike"),
    mode_exercise = mode_fun(FAF),
    mean_weight = mean(Weight),
    mean_height = mean(Height),
    mean_meals = mean(NCP)
  ) |>
  arrange(desc(mean_weight))
## # A tibble: 7 × 8
## NObeyesdad count num_walker num_biker mode_exercise mean_weight mean_height
## <chr> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Obesity_Type… 169 0 0 0 121. 1.69
## 2 Obesity_Type… 155 1 1 0 116. 1.77
## 3 Obesity_Type… 165 1 0 0 92.0 1.68
## 4 Overweight_L… 137 2 0 0 81.8 1.70
## 5 Overweight_L… 146 7 1 1 74.0 1.69
## 6 Normal_Weight 158 17 2 1 61.8 1.67
## 7 Insufficient… 126 2 0 2 50.0 1.69
## # ℹ 1 more variable: mean_meals <dbl>
# Per-category summary of sample 5, ordered from highest to lowest mean weight.
df_5 |>
  group_by(NObeyesdad) |>
  summarise(
    count = n(),
    num_walker = sum(MTRANS == "Walking"),
    num_biker = sum(MTRANS == "Bike"),
    mode_exercise = mode_fun(FAF),
    mean_weight = mean(Weight),
    mean_height = mean(Height),
    mean_meals = mean(NCP)
  ) |>
  arrange(desc(mean_weight))
## # A tibble: 7 × 8
## NObeyesdad count num_walker num_biker mode_exercise mean_weight mean_height
## <chr> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Obesity_Type… 168 0 0 0 121. 1.69
## 2 Obesity_Type… 129 0 0 0 115. 1.77
## 3 Obesity_Type… 178 2 0 0 93.9 1.70
## 4 Overweight_L… 155 1 0 0 82.6 1.71
## 5 Overweight_L… 144 6 1 1 73.6 1.68
## 6 Normal_Weight 146 20 2 0 62.5 1.68
## 7 Insufficient… 136 4 0 2 50.1 1.69
## # ℹ 1 more variable: mean_meals <dbl>
All five sample data frames have different counts for each obesity level, although each of them contains a total of 1,056 rows of data.
There are very slight differences in the means of the variables, which is explained by the central limit theorem (the means of multiple samples are approximately normally distributed around the true mean of the population).
There are no walkers in Obesity_Type_II in the df_2, df_3 and df_5 samples (assuming this is the norm), but there is 1 walker in Obesity_Type_II in df_1 and df_4 (this is an anomaly).
Similarly, the number of Obesity_Type_II bikers is 1 in df_3 and df_4, but there are no Obesity_Type_II bikers in df_1, df_2 and df_5.
The mode of the variable FAF (frequency of workout) for Normal_Weight is 1 in df_1, df_2, df_3 and df_4, but in df_5 it is 0 (an anomaly).
mean_weight, mean_height and mean_meals are very consistent across all the samples.
num_walker for certain obesity categories (‘Insufficient_Weight’, ‘Normal_Weight’, ‘Obesity_Type_III’) is also consistent (no significant variation).
Using Monte Carlo simulations to generate weights and heights, and fitting them into their obesity categories based on observations
# Simulate data points by drawing weights and heights independently from
# uniform distributions over the hard-coded ranges below, then number each
# point so it can be labelled later.
set.seed(4) # fixed seed so the same points are generated on every run
n_points <- 10 # number of simulated points; defined once for both draws
weights <- runif(n_points, min = 39, max = 173)
heights <- runif(n_points, min = 1.45, max = 1.98)
gen_data <- data.frame(Weight = weights, Height = heights)
# seq_len() is used instead of 1:nrow() because 1:0 would yield c(1, 0)
# for an empty data frame.
gen_data$index <- seq_len(nrow(gen_data))
gen_data
## Weight Height index
## 1 117.49724 1.849978 1
## 2 40.19874 1.601580 2
## 3 78.36111 1.503028 3
## 4 76.16824 1.955656 4
## 5 148.01894 1.670272 5
## 6 73.89732 1.691204 6
## 7 136.07039 1.964659 7
## 8 160.41635 1.759514 8
## 9 166.17139 1.959968 9
## 10 48.80136 1.853702 10
Plotting our generated data along with original dataset
# Overlay the simulated points (labelled with their index) on a scatter of
# the original dataset, which is coloured by obesity category.
ggplot() +
  geom_point(data = gen_data, aes(x = Height, y = Weight), alpha = 1) +
  # Only the mapped columns are used, so the full data frame is passed as-is.
  geom_point(
    data = df,
    aes(x = Height, y = Weight, color = NObeyesdad),
    alpha = 0.4
  ) +
  # vjust is a fixed label offset, not a data mapping, so it sits outside aes().
  geom_text(
    data = gen_data,
    aes(x = Height, y = Weight, label = index),
    vjust = -1
  ) +
  theme_minimal()
Results from Observation:
| Obesity category | gen_data datapoint indexes |
|---|---|
| Obesity_Type_III | 5, 8, 9 |
| Obesity_Type_II | 1*, 7 |
| Obesity_Type_I | 3 |
| Overweight_Level_II | - |
| Overweight_Level_I | 6 |
| Normal_Weight | 4 |
| Insufficient_Weight | 2, 10 |
*Note: the datapoint with index 1 lies on the border between Obesity_Type_I and Obesity_Type_II.
This analysis has improved our understanding of sampling distribution and solidified the concept of the Central Limit Theorem. This investigation enhances statistical reasoning, enabling more accurate and actionable conclusions in the future.
Because each sample contains about 50% of the population, the dispersion of the sample means is small. That is, the sample means are not far from the true mean of the population.
None of the anomalies observed is critical, because all five samples represent the population very well.
The Monte Carlo simulations used above rely on a uniform probability distribution to generate random sample data points. Consequently, the generated data points are spread out across the range, as the probability is the same for all values within the minimum and maximum range.
The effect of decreasing and increasing the sample size on different statistical metrics.