This is the R Notebook for my final project. The sources I used for help with data visualization are listed in the bibliography of my paper. Thank you for taking the time to go through this and provide notes. All of the topics helped me a lot with editing this dataset. Enjoy! - Kendra Pinckney
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cowplot)
library(corrplot)
## corrplot 0.94 loaded
# Load the raw obesity dataset; the path is relative to the working directory.
obesity <- read.csv("ObesityDataSet_raw_and_data_sinthetic.csv")
# Report the number of rows and columns.
dim(obesity)
## [1] 2111 17
# Show every column's type together with its first few values.
str(obesity)
## 'data.frame': 2111 obs. of 17 variables:
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
# Per-column summary: quartiles/mean for numeric columns, length and class for
# character columns.
summary(obesity)
## Gender Age Height Weight
## Length:2111 Min. :14.00 Min. :1.450 Min. : 39.00
## Class :character 1st Qu.:19.95 1st Qu.:1.630 1st Qu.: 65.47
## Mode :character Median :22.78 Median :1.700 Median : 83.00
## Mean :24.31 Mean :1.702 Mean : 86.59
## 3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
## Max. :61.00 Max. :1.980 Max. :173.00
## family_history_with_overweight FAVC FCVC
## Length:2111 Length:2111 Min. :1.000
## Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Median :2.386
## Mean :2.419
## 3rd Qu.:3.000
## Max. :3.000
## NCP CAEC SMOKE CH2O
## Min. :1.000 Length:2111 Length:2111 Min. :1.000
## 1st Qu.:2.659 Class :character Class :character 1st Qu.:1.585
## Median :3.000 Mode :character Mode :character Median :2.000
## Mean :2.686 Mean :2.008
## 3rd Qu.:3.000 3rd Qu.:2.477
## Max. :4.000 Max. :3.000
## SCC FAF TUE CALC
## Length:2111 Min. :0.0000 Min. :0.0000 Length:2111
## Class :character 1st Qu.:0.1245 1st Qu.:0.0000 Class :character
## Mode :character Median :1.0000 Median :0.6253 Mode :character
## Mean :1.0103 Mean :0.6579
## 3rd Qu.:1.6667 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.0000
## MTRANS NObeyesdad
## Length:2111 Length:2111
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Preview the first six rows of the raw data.
head(obesity)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
# Drop the three columns that are not used in this analysis, replace the
# remaining abbreviated column codes with readable names, and derive BMI as
# weight (kg) divided by height (m) squared.
obesity <- obesity %>%
  select(-c(FCVC, CAEC, MTRANS)) %>%
  rename(
    eat_high_cal_food = FAVC,
    num_meals_daily   = NCP,
    smoke             = SMOKE,
    water_daily       = CH2O,
    monitor_calories  = SCC,
    physical_activity = FAF,
    tech_use          = TUE,
    alcohol_use       = CALC,
    obesity_level     = NObeyesdad
  ) %>%
  mutate(BMI = Weight / Height^2)
# Add 0/1 indicator columns for gender and for each yes/no answer.
# "Male" and "no" map to 0; every other non-missing value maps to 1.
obesity <- obesity %>%
  mutate(
    gender_num          = ifelse(Gender == "Male", 0, 1),
    overweight_fam_hist = ifelse(family_history_with_overweight == "no", 0, 1),
    high_cal_food_freq  = ifelse(eat_high_cal_food == "no", 0, 1),
    smoking             = ifelse(smoke == "no", 0, 1),
    monitor_daily_cal   = ifelse(monitor_calories == "no", 0, 1)
  )
# obesity
# Convert the weight category into a factor ordered by severity, then into its
# integer level code (1 = Insufficient_Weight ... 7 = Obesity_Type_III).
#
# The level names must match the values in the data exactly, because factor()
# turns any value that is not listed into NA. The previous level list used
# "Obesity_Level_*" names (the data uses "Obesity_Type_*") and fused the last
# two levels into a single string, so every obesity-type row became NA and was
# dropped by lm() -- the "972 observations deleted due to missingness" in the
# model summary recorded further down. Regenerate that summary after this fix.
obesity_factor <- factor(
  obesity$obesity_level,
  levels = c(
    "Insufficient_Weight", "Normal_Weight",
    "Overweight_Level_I", "Overweight_Level_II",
    "Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"
  )
)
# Fail loudly if a category present in the data is missing from the level list
# (values that were already NA in the data are allowed to stay NA).
stopifnot(all(is.na(obesity_factor) == is.na(obesity$obesity_level)))
obesity_numeric <- as.numeric(obesity_factor)
# gender and family history
# Turn both 0/1 indicator columns into factors inside the data frame ...
obesity <- obesity %>%
  mutate(across(c(gender_num, overweight_fam_hist), factor))
# ... and keep standalone factor copies, which the regression formula later in
# the notebook refers to by these names.
gender_num <- factor(obesity$gender_num)
fam_hist <- factor(obesity$overweight_fam_hist)
# Remove the original yes/no text columns (their 0/1 encodings are kept), then
# arrange the remaining columns in the order used for the rest of the analysis.
obesity <- obesity %>%
  select(
    -c(family_history_with_overweight, eat_high_cal_food, smoke, monitor_calories)
  ) %>%
  select(
    Gender, gender_num, Age, Height, Weight, BMI,
    obesity_level, num_meals_daily, monitor_daily_cal,
    overweight_fam_hist, water_daily,
    smoking, physical_activity, tech_use, alcohol_use,
    high_cal_food_freq
  )
# Scatter plot of weight against height, coloured by gender, with one LOESS
# trend line per gender drawn on top of the points (no confidence band).
ggplot(
  data = obesity,
  mapping = aes(x = Height, y = Weight, colour = Gender, group = Gender)
) +
  geom_point() +
  geom_smooth(mapping = aes(colour = Gender), method = "loess", se = FALSE) +
  scale_colour_manual(
    values = c("Female" = "deeppink2", "Male" = "darkslateblue")
  ) +
  labs(
    title  = "Height and Weight vs Gender",
    x      = "Height (m)",
    y      = "Weight (kg)",
    colour = "Gender"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Left-to-right order in which the weight categories should appear on the axis.
order <- c(
  "Insufficient_Weight", "Normal_Weight",
  "Overweight_Level_I", "Overweight_Level_II",
  "Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"
)
# Bar chart of the number of people in each weight category, with the count
# printed just above every bar. Bars are sorted by each category's position in
# `order`.
ggplot(
  data = obesity,
  mapping = aes(x = reorder(obesity_level, match(obesity_level, order)))
) +
  geom_bar(colour = "gray", fill = "darkcyan") +
  geom_text(
    mapping = aes(label = after_stat(count)),
    stat = "count",
    colour = "gray35",
    vjust = -0.5
  ) +
  scale_y_continuous(limits = c(0, 425)) +
  # Display names for the x-axis: the category names without underscores.
  scale_x_discrete(
    labels = c(
      "Insufficient_Weight" = "Insufficient Weight",
      "Normal_Weight"       = "Normal Weight",
      "Overweight_Level_I"  = "Overweight Level I",
      "Overweight_Level_II" = "Overweight Level II",
      "Obesity_Type_I"      = "Obesity Type I",
      "Obesity_Type_II"     = "Obesity Type II",
      "Obesity_Type_III"    = "Obesity Type III"
    )
  ) +
  labs(
    title = "Frequency of Weight Levels",
    x     = "Obesity Level",
    y     = "Number of People"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Store the weight category as a factor whose levels run from lightest to
# heaviest, so later plots show the categories in that order.
obesity <- obesity %>%
  mutate(
    obesity_level = factor(
      obesity_level,
      levels = c(
        "Insufficient_Weight", "Normal_Weight",
        "Overweight_Level_I", "Overweight_Level_II",
        "Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"
      )
    )
  )
# Side-by-side bar chart: number of people in each weight category, split by
# whether they report a family history of overweight (0 -> "No", 1 -> "Yes").
# Each bar is labelled with its count.
ggplot(
  data = obesity,
  mapping = aes(
    x = obesity_level,
    fill = factor(overweight_fam_hist, labels = c("No", "Yes"))
  )
) +
  geom_bar(width = 0.6, position = position_dodge(width = 0.8)) +
  geom_text(
    mapping = aes(label = after_stat(count)),
    stat = "count",
    position = position_dodge(width = 0.8),
    size = 3,
    colour = "gray35",
    vjust = -0.5
  ) +
  scale_fill_manual(values = c("No" = "red2", "Yes" = "darkcyan")) +
  # Display names for the x-axis: the category names without underscores.
  scale_x_discrete(
    labels = c(
      "Insufficient_Weight" = "Insufficient Weight",
      "Normal_Weight"       = "Normal Weight",
      "Overweight_Level_I"  = "Overweight Level I",
      "Overweight_Level_II" = "Overweight Level II",
      "Obesity_Type_I"      = "Obesity Type I",
      "Obesity_Type_II"     = "Obesity Type II",
      "Obesity_Type_III"    = "Obesity Type III"
    )
  ) +
  labs(
    title = "Obesity Level Distribution by Family History",
    x     = "Obesity Level",
    y     = "Number of People",
    fill  = "Family History"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Fit linear regression model
# Regress the numeric weight-category code on gender and family history.
# The data frame has to be supplied through lm()'s `data` argument; the earlier
# `obesity = obesity` is not an lm() argument and was discarded with the
# warning "extra argument 'obesity' will be disregarded".
# gender_num is read from `obesity`; obesity_numeric and fam_hist are not
# columns of it, so lm() picks them up from the workspace.
# NOTE(review): obesity_numeric is an ordinal category code treated as a
# continuous outcome here -- consider whether an ordinal model fits better.
model <- lm(obesity_numeric ~ gender_num + fam_hist, data = obesity)
# Summary of the model
summary(model)
##
## Call:
## lm(formula = obesity_numeric ~ gender_num + fam_hist, data = obesity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9447 -0.8140 0.0553 1.0553 2.1860
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.09372 0.06449 32.468 < 2e-16 ***
## gender_num1 -0.27971 0.06129 -4.564 5.57e-06 ***
## fam_hist1 0.85098 0.06512 13.068 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.02 on 1136 degrees of freedom
## (972 observations deleted due to missingness)
## Multiple R-squared: 0.1604, Adjusted R-squared: 0.1589
## F-statistic: 108.5 on 2 and 1136 DF, p-value: < 2.2e-16