DATA DIVE 2

Reading CSV into Dataframe

Obesity <- read.csv('/Users/ankit/Downloads/Obesity.csv')

Loading the required libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Hyopothesis 1 : Younger Females smokes less than younger males.

min(Obesity$Age)

## [1] 14

max(Obesity$Age)

## [1] 61

#categorizing age into different age groups and creating a new dataframe: obesity age group
Obesity_Age_gp <- Obesity %>% 
  mutate(Age_gp = case_when(
    Age <=25 ~ "14-25",
    Age > 25 & Age <= 35 ~ "26-35",
    Age > 35 ~ "35+",
    TRUE ~ "other"
  )
)

#filter(Obesity_Age_gp, SMOKE == 'yes')
#nrow(filter(Obesity_Age_gp, SMOKE == 'yes'))


#dataframe named Obesity_Age_gp
smoker_by_gender_age <- Obesity_Age_gp %>%
  filter(SMOKE == 'yes') %>%
  group_by(Gender, Age_gp) %>%
  summarise(smoker_count = n())

## `summarise()` has grouped output by 'Gender'. You can override using the
## `.groups` argument.

# Print the result
print(smoker_by_gender_age)

## # A tibble: 6 × 3
## # Groups:   Gender [2]
##   Gender Age_gp smoker_count
##   <chr>  <chr>         <int>
## 1 Female 14-25            10
## 2 Female 26-35             2
## 3 Female 35+               3
## 4 Male   14-25             9
## 5 Male   26-35            18
## 6 Male   35+               2

total_by_gender_age <- Obesity_Age_gp %>%
  group_by(Gender, Age_gp) %>%
  summarise(total_count = n())

## `summarise()` has grouped output by 'Gender'. You can override using the
## `.groups` argument.

# Print the result
print(total_by_gender_age)

## # A tibble: 6 × 3
## # Groups:   Gender [2]
##   Gender Age_gp total_count
##   <chr>  <chr>        <int>
## 1 Female 14-25          689
## 2 Female 26-35          253
## 3 Female 35+            101
## 4 Male   14-25          680
## 5 Male   26-35          321
## 6 Male   35+             67

df2 <- smoker_by_gender_age %>% inner_join( total_by_gender_age, 
        by=c('Gender'='Gender', 'Age_gp'='Age_gp'))

df2$smoking_probability <- df2$smoker_count/df2$total_count

ANAMOLY

df2$is_anomaly <- ifelse(df2$smoking_probability < 0.01, "yes", "no")
df2

library(ggplot2)


# Create a bar chart
ggplot(data = df2, aes(x = Age_gp, y = smoking_probability, fill = Gender)) +
  geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75) +
  labs(x = "\n Age Group", y = "Probability\n", title = "\n Smoking Probability \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))

The least number of smokers are in 26-35 yers age group in female, which is only 0.8% and for the male of same age category, it is 5.6% . O.8 % is the lowest out of all different grps. There’s a possibility that women in that age group are generally in their child bearing age, that is why the number of feamle smokers are the least in that age category. In later life, the number of male and female smokers are somewhat comparable.

Hypothesis 2 : People having walk as mode of transportation come under normal weight category

df3 <- Obesity %>% 
  mutate(physical_activity = case_when(
    MTRANS %in% c("Public_Transportation", "Automobile", "Motorbike") ~ "No Physical Activity",
    MTRANS %in% c ("Walking","Bike") ~ "Physical Activity",
    TRUE ~ "other"
  )
)

df3 <- df3 %>%
  filter(NObeyesdad != 'Insufficient_Weight') 

df3 <- df3 %>% 
  mutate(Weight_Category = case_when(
    NObeyesdad %in% c("Normal_Weight") ~ "Normal Weight",
    TRUE ~ "Overweight + Obese"
  )
)

df3_normal_weight <- df3 %>%
  filter(Weight_Category == 'Normal Weight') %>%
  group_by(physical_activity) %>%
  summarise(normal_weight_count = n())


df3_total <- df3 %>%
  group_by(physical_activity) %>%
  summarise(total_count = n())


df3_results <- df3_normal_weight %>% inner_join( df3_total, 
        by=c('physical_activity'='physical_activity'))

df3_results$normal_weight_probability <- df3_results$normal_weight_count/df3_results$total_count

ggplot(data = df3_results, aes(x = physical_activity, y = normal_weight_probability)) +
  geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75) +
  labs(x = "\n Physical Activity", y = "Probability\n", title = "\n Probability of being Normal Weight considering physical activity\n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))

The bar chart shows there is some correlation between physical activity and probability of falling under normal weight category.

Hypothesis 3: People having a family history of being overweight come under overweight and obese weight category.

df4 <- Obesity %>%
  filter(NObeyesdad != 'Insufficient_Weight') 

df4 <- df4 %>% 
  mutate(weight_category = case_when(
    NObeyesdad %in% c("Normal_Weight") ~ "Normal_Weight",
    NObeyesdad %in% c("Overweight_Level_I", "Overweight_Level_II") ~ "Overweight",
    NObeyesdad %in% c ("Obesity_Type_I","Obesity_Type_II", "Obesity_Type_III") ~ "Obese",
    TRUE ~ "other"
  )
)

df4_family_history <- df4 %>%
  group_by(family_history_with_overweight) %>%
  summarise(family_history_records = n())

df4_family_history_and_weight_category <- df4 %>%
  group_by(family_history_with_overweight, weight_category) %>%
  summarise(weight_category_records = n())

## `summarise()` has grouped output by 'family_history_with_overweight'. You can
## override using the `.groups` argument.

df4_results <- df4_family_history %>% inner_join( df4_family_history_and_weight_category, 
        by=c('family_history_with_overweight'='family_history_with_overweight'))

df4_results$weight_category_probability <- df4_results$weight_category_records/df4_results$family_history_records

ggplot(data = df4_results, aes(x = weight_category, y = weight_category_probability, fill = family_history_with_overweight)) +
  geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75) +
  labs(x = "\n Weight Category", y = "Probability\n", title = "\n Probability of being in a weight category considering family history \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))

The bar chart shows that is some correlation between family history with overweight and obesity in people.

Which combinations never show up?

#smoke
smoke_counts <- table(Obesity$SMOKE)
print(smoke_counts)

## 
##   no  yes 
## 2067   44

#SCC(calorie consumption monitoring)
scc_counts <- table(Obesity$SCC)
print(scc_counts)

## 
##   no  yes 
## 2015   96

# Find the minimum and maximum values for MTRANS(mode of transport)
min(Obesity$MTRANS)

## [1] "Automobile"

max(Obesity$MTRANS)

## [1] "Walking"

Least common is Automobile and the most common is walking.

# Find the minimum and maximum values for NObeyesdad (weight category)
min(Obesity$NObeyesdad)

## [1] "Insufficient_Weight"

max(Obesity$NObeyesdad)

## [1] "Overweight_Level_II"

Least common is Insufficient Weight category and the most common is Overweight level 2 category..

DATA DIVE 2

Jagriti Mahajan

2023-09-11