Data Dive - Confidence Interval

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

# Load the raw obesity dataset from a local CSV file.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
obesity_data <- read.csv("/Users/ankit/Downloads/Obesity.csv")

# Working copy for the first analysis, so the raw data stays untouched
obesity_data1 <- obesity_data

DATA COMBINATION 1

# Prepare the table for combination 1 in a single pipeline:
#   1. derive BMI as Weight / Height^2
#      (presumably Weight in kg and Height in m -- confirm against the dataset)
#   2. keep only the relevant columns
#   3. drop rows labelled Insufficient_Weight
#   4. collapse the detailed labels into Normal / Overweight / Obese
obesity_data1 <- obesity_data1 %>%
  mutate(bmi = Weight / (Height * Height)) %>%
  select(bmi, NObeyesdad, Age) %>%
  filter(NObeyesdad != "Insufficient_Weight") %>%
  mutate(
    weight_category = case_when(
      NObeyesdad == "Normal_Weight" ~ "Normal",
      NObeyesdad %in% c("Overweight_Level_I", "Overweight_Level_II") ~ "Overweight",
      NObeyesdad %in% c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III") ~ "Obese"
    )
  )

# Scatter plot of BMI for each collapsed weight category
obesity_data1 %>%
  ggplot(mapping = aes(x = weight_category, y = bmi)) +
  geom_point()

Here, the Overweight category spans a narrow range of BMI values, while the Obese category spans a much wider range.

# Scatter plot of age for each collapsed weight category
obesity_data1 %>%
  ggplot(mapping = aes(x = weight_category, y = Age)) +
  geom_point()

We can see some outliers here. Most people who are obese are between 16 and 42 years of age. The Normal category contains fewer people, so its data points are sparser and more scattered. In the Overweight category, some individuals are over 50.

# Pearson correlation (the cor() default) between BMI and age
with(obesity_data1, cor(bmi, Age))

## [1] 0.1050587

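The correlation coefficient always lies between -1 and 1. Values close to -1 indicate a strong negative linear relationship, values close to 1 indicate a strong positive linear relationship, and values close to 0 indicate little or no linear relationship. Here we get roughly 0.105, a very weak positive correlation between BMI and age, which makes sense.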

# One-sample t-test on BMI; only the confidence interval for the mean is used
# (t.test() defaults to a 95% level)
result <- t.test(obesity_data1$bmi)

# Pull the interval out of the test object and display it
confidence_interval <- result[["conf.int"]]
print(confidence_interval)

## [1] 31.20226 31.83535
## attr(,"conf.level")
## [1] 0.95

We are 95% confident that the true population mean BMI lies between 31.2 and 31.8.

# Number of rows in each collapsed weight category
table(obesity_data1[["weight_category"]])

## 
##     Normal      Obese Overweight 
##        287        972        580

DATA COMBINATION 2

# Keep only the columns needed for the smoking/alcohol analysis and drop
# rows labelled Insufficient_Weight
obesity_data2 <- obesity_data %>%
  select(Age, SMOKE, CALC, NObeyesdad) %>%
  filter(NObeyesdad != "Insufficient_Weight")

# Encode smoking status as a 0/1 indicator; any other SMOKE value becomes NA
obesity_data2 <- mutate(
  obesity_data2,
  smoke_category = case_when(
    SMOKE == "no" ~ 0,
    SMOKE == "yes" ~ 1
  )
)

# Encode alcohol consumption (CALC) as an ordinal score.
# The "Always" level is mapped explicitly: without it, case_when() returns NA
# for that level, which makes substance_abuse NA and in turn makes cor()
# return NA.
# NOTE(review): assumes "Always" is the only CALC level besides the three
# original ones -- confirm with table(obesity_data2$CALC).
obesity_data2 <- obesity_data2 %>%
  mutate(
    alcohol_category = case_when(
      CALC == "no" ~ 0,
      CALC == "Sometimes" ~ 1,
      CALC == "Frequently" ~ 2,
      CALC == "Always" ~ 3
    )
  )

# Combined smoking + drinking score:
#   0 = neither smokes nor drinks
#   1 = smokes but does not drink, or drinks sometimes without smoking
#   2 = drinks frequently without smoking, or smokes and drinks sometimes
#   3 = smokes and drinks frequently, or always drinks without smoking
#   4 = smokes and always drinks
obesity_data2$substance_abuse <- obesity_data2$smoke_category + obesity_data2$alcohol_category

# Collapse the detailed NObeyesdad labels into three groups:
# Normal, Overweight (levels I and II) and Obese (types I to III)
obesity_data2 <- mutate(
  obesity_data2,
  weight_category = case_when(
    NObeyesdad == "Normal_Weight" ~ "Normal",
    NObeyesdad %in% c("Overweight_Level_I", "Overweight_Level_II") ~ "Overweight",
    NObeyesdad %in% c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III") ~ "Obese"
  )
)

VISUALIZATION

# Number of rows for every (substance_abuse, weight_category) combination,
# largest counts first
substance_obesity_relationship <- count(
  obesity_data2,
  substance_abuse,
  weight_category,
  sort = TRUE
)

# Side-by-side bars: one bar per weight category at each substance-use score
substance_obesity_relationship %>%
  ggplot(mapping = aes(x = substance_abuse, y = n, fill = weight_category)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.75)

## Warning: Removed 1 rows containing missing values (`geom_bar()`).

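This bar chart shows how the weight categories are distributed across the substance-use levels, which tells us whether smoking or drinking is associated with a higher chance of obesity.
Obese individuals form the largest group at every substance-use level, mostly because the data is skewed towards obese individuals. Compared with the obese population who neither smoke nor drink, the chance of being obese increases by about 100% among those who do.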

# Correlation between age and the combined substance-use score.
# substance_abuse can contain missing values (any row whose SMOKE or CALC
# level is not covered by the recoding), and cor() returns NA as soon as one
# input value is NA. Restricting the calculation to complete pairs ensures a
# coefficient is actually produced.
cor(obesity_data2$Age, obesity_data2$substance_abuse, use = "complete.obs")

## [1] NA

Substance abuse is a categorical variable. Since age and weight + height are the only combinations of numerical variables, the number of correlation coefficients that can be calculated between continuous variables is limited. Therefore, to show my understanding of the correlation coefficient, I have included age, which is a continuous variable, and substance abuse, which is a categorical variable.

# One-sample t-test on age; only the confidence interval for the mean is used
# (t.test() defaults to a 95% level)
result <- t.test(obesity_data2$Age)

# Pull the interval out of the test object and display it
confidence_interval <- result[["conf.int"]]
print(confidence_interval)

## [1] 24.68721 25.27784
## attr(,"conf.level")
## [1] 0.95

We are 95% confident that the average age of this population is between 24.69 and 25.28.

DATA COMBINATION 3

Since the continuous variables are limited and we have already used all of them, I will not repeat the correlation coefficient and confidence interval in the third example.

# Prepare the table for combination 3 in a single pipeline:
#   1. keep family history, physical-activity frequency (FAF) and the label
#   2. drop rows labelled Insufficient_Weight
#   3. collapse the detailed labels into Normal / Overweight / Obese
obesity_data3 <- obesity_data %>%
  select(family_history_with_overweight, FAF, NObeyesdad) %>%
  filter(NObeyesdad != "Insufficient_Weight") %>%
  mutate(
    weight_category = case_when(
      NObeyesdad == "Normal_Weight" ~ "Normal",
      NObeyesdad %in% c("Overweight_Level_I", "Overweight_Level_II") ~ "Overweight",
      NObeyesdad %in% c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III") ~ "Obese"
    )
  )

# Encode family history of overweight as a 0/1 indicator;
# any value other than "no"/"yes" becomes NA
obesity_data3 <- mutate(
  obesity_data3,
  family_history_with_overweight_numerical = case_when(
    family_history_with_overweight == "no" ~ 0,
    family_history_with_overweight == "yes" ~ 1
  )
)

# Recode physical-activity frequency (FAF) into an inverted score:
# 3 = No Activity, 2 = Some Activity, 1 = Moderate Activity, 0 = Most Activity
#
# FAF is rounded to the nearest level before matching. An exact `==`
# comparison on a numeric column turns every non-integer value into NA, and
# such rows would then be silently dropped by the filter on the combined
# score. For integer-valued FAF the rounding changes nothing.
# NOTE(review): assumes FAF is on a 0-3 scale and may hold fractional
# values -- confirm against the dataset.
obesity_data3 <- obesity_data3 %>%
  mutate(
    exercise_category = case_when(
      round(FAF) == 0 ~ 3,
      round(FAF) == 1 ~ 2,
      round(FAF) == 2 ~ 1,
      round(FAF) == 3 ~ 0
    )
  )

# Combined score: inverted exercise score plus the family-history indicator.
# Keep only rows scoring 4, which can only arise from a family history of
# overweight (1) together with no physical activity (3).
obesity_data3 <- obesity_data3 %>%
  mutate(
    family_history_physical_activity_relationship =
      exercise_category + family_history_with_overweight_numerical
  ) %>%
  filter(family_history_physical_activity_relationship == 4)

# Count the remaining rows per weight category, largest counts first
family_history_and_physical_act_relationship <- count(
  obesity_data3,
  family_history_with_overweight_numerical,
  weight_category,
  sort = TRUE
)

# Bar chart of those counts
family_history_and_physical_act_relationship %>%
  ggplot(mapping = aes(x = weight_category, y = n)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.75)

This chart shows that if you have a family history of being overweight and do not exercise, your chances of being Obese are much higher.

Data Dive - Confidence Interval

Jagriti Mahajan

2023-10-02