Reading CSV into Dataframe
Obesity <- read.csv('/Users/ankit/Downloads/Obesity.csv')
Loading the required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Hyopothesis 1 : Younger Females smokes less than younger males.
min(Obesity$Age)
## [1] 14
max(Obesity$Age)
## [1] 61
#categorizing age into different age groups and creating a new dataframe: obesity age group
Obesity_Age_gp <- Obesity %>%
mutate(Age_gp = case_when(
Age <=25 ~ "14-25",
Age > 25 & Age <= 35 ~ "26-35",
Age > 35 ~ "35+",
TRUE ~ "other"
)
)
#filter(Obesity_Age_gp, SMOKE == 'yes')
#nrow(filter(Obesity_Age_gp, SMOKE == 'yes'))
#dataframe named Obesity_Age_gp
smoker_by_gender_age <- Obesity_Age_gp %>%
filter(SMOKE == 'yes') %>%
group_by(Gender, Age_gp) %>%
summarise(smoker_count = n())
## `summarise()` has grouped output by 'Gender'. You can override using the
## `.groups` argument.
# Print the result
print(smoker_by_gender_age)
## # A tibble: 6 × 3
## # Groups: Gender [2]
## Gender Age_gp smoker_count
## <chr> <chr> <int>
## 1 Female 14-25 10
## 2 Female 26-35 2
## 3 Female 35+ 3
## 4 Male 14-25 9
## 5 Male 26-35 18
## 6 Male 35+ 2
total_by_gender_age <- Obesity_Age_gp %>%
group_by(Gender, Age_gp) %>%
summarise(total_count = n())
## `summarise()` has grouped output by 'Gender'. You can override using the
## `.groups` argument.
# Print the result
print(total_by_gender_age)
## # A tibble: 6 × 3
## # Groups: Gender [2]
## Gender Age_gp total_count
## <chr> <chr> <int>
## 1 Female 14-25 689
## 2 Female 26-35 253
## 3 Female 35+ 101
## 4 Male 14-25 680
## 5 Male 26-35 321
## 6 Male 35+ 67
df2 <- smoker_by_gender_age %>% inner_join( total_by_gender_age,
by=c('Gender'='Gender', 'Age_gp'='Age_gp'))
df2$smoking_probability <- df2$smoker_count/df2$total_count
ANAMOLY
df2$is_anomaly <- ifelse(df2$smoking_probability < 0.01, "yes", "no")
df2
library(ggplot2)
# Create a bar chart
ggplot(data = df2, aes(x = Age_gp, y = smoking_probability, fill = Gender)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75) +
labs(x = "\n Age Group", y = "Probability\n", title = "\n Smoking Probability \n") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(face="bold", colour="red", size = 12),
axis.title.y = element_text(face="bold", colour="red", size = 12),
legend.title = element_text(face="bold", size = 10))
The least number of smokers are in 26-35 yers age group in female, which
is only 0.8% and for the male of same age category, it is 5.6% . O.8 %
is the lowest out of all different grps. There’s a possibility that
women in that age group are generally in their child bearing age, that
is why the number of feamle smokers are the least in that age category.
In later life, the number of male and female smokers are somewhat
comparable.
Hypothesis 2 : People having walk as mode of transportation come under normal weight category
df3 <- Obesity %>%
mutate(physical_activity = case_when(
MTRANS %in% c("Public_Transportation", "Automobile", "Motorbike") ~ "No Physical Activity",
MTRANS %in% c ("Walking","Bike") ~ "Physical Activity",
TRUE ~ "other"
)
)
df3 <- df3 %>%
filter(NObeyesdad != 'Insufficient_Weight')
df3 <- df3 %>%
mutate(Weight_Category = case_when(
NObeyesdad %in% c("Normal_Weight") ~ "Normal Weight",
TRUE ~ "Overweight + Obese"
)
)
df3_normal_weight <- df3 %>%
filter(Weight_Category == 'Normal Weight') %>%
group_by(physical_activity) %>%
summarise(normal_weight_count = n())
df3_total <- df3 %>%
group_by(physical_activity) %>%
summarise(total_count = n())
df3_results <- df3_normal_weight %>% inner_join( df3_total,
by=c('physical_activity'='physical_activity'))
df3_results$normal_weight_probability <- df3_results$normal_weight_count/df3_results$total_count
ggplot(data = df3_results, aes(x = physical_activity, y = normal_weight_probability)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75) +
labs(x = "\n Physical Activity", y = "Probability\n", title = "\n Probability of being Normal Weight considering physical activity\n") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(face="bold", colour="red", size = 12),
axis.title.y = element_text(face="bold", colour="red", size = 12),
legend.title = element_text(face="bold", size = 10))
The bar chart shows there is some correlation between physical activity
and probability of falling under normal weight category.
Hypothesis 3: People having a family history of being overweight come under overweight and obese weight category.
df4 <- Obesity %>%
filter(NObeyesdad != 'Insufficient_Weight')
df4 <- df4 %>%
mutate(weight_category = case_when(
NObeyesdad %in% c("Normal_Weight") ~ "Normal_Weight",
NObeyesdad %in% c("Overweight_Level_I", "Overweight_Level_II") ~ "Overweight",
NObeyesdad %in% c ("Obesity_Type_I","Obesity_Type_II", "Obesity_Type_III") ~ "Obese",
TRUE ~ "other"
)
)
df4_family_history <- df4 %>%
group_by(family_history_with_overweight) %>%
summarise(family_history_records = n())
df4_family_history_and_weight_category <- df4 %>%
group_by(family_history_with_overweight, weight_category) %>%
summarise(weight_category_records = n())
## `summarise()` has grouped output by 'family_history_with_overweight'. You can
## override using the `.groups` argument.
df4_results <- df4_family_history %>% inner_join( df4_family_history_and_weight_category,
by=c('family_history_with_overweight'='family_history_with_overweight'))
df4_results$weight_category_probability <- df4_results$weight_category_records/df4_results$family_history_records
ggplot(data = df4_results, aes(x = weight_category, y = weight_category_probability, fill = family_history_with_overweight)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75) +
labs(x = "\n Weight Category", y = "Probability\n", title = "\n Probability of being in a weight category considering family history \n") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(face="bold", colour="red", size = 12),
axis.title.y = element_text(face="bold", colour="red", size = 12),
legend.title = element_text(face="bold", size = 10))
The bar chart shows that is some correlation between family history with
overweight and obesity in people.
Which combinations never show up?
#smoke
smoke_counts <- table(Obesity$SMOKE)
print(smoke_counts)
##
## no yes
## 2067 44
#SCC(calorie consumption monitoring)
scc_counts <- table(Obesity$SCC)
print(scc_counts)
##
## no yes
## 2015 96
# Find the minimum and maximum values for MTRANS(mode of transport)
min(Obesity$MTRANS)
## [1] "Automobile"
max(Obesity$MTRANS)
## [1] "Walking"
Least common is Automobile and the most common is walking.
# Find the minimum and maximum values for NObeyesdad (weight category)
min(Obesity$NObeyesdad)
## [1] "Insufficient_Weight"
max(Obesity$NObeyesdad)
## [1] "Overweight_Level_II"
Least common is Insufficient Weight category and the most common is Overweight level 2 category..