library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(broom)
Warning: package 'broom' was built under R version 4.4.3
# Load the raw obesity dataset into `obesity_data` and preview its columns.
# NOTE(review): setwd() with an absolute, machine-specific path makes this
# script non-portable; it is kept here because later code may rely on the
# working directory. Consider a project-relative path instead.
setwd("C:/Users/user/Desktop/datasets")
obesity_data <- read.csv("ObesityDataSet_raw.csv")
# Compact overview: one line per column showing its type and first values.
glimpse(obesity_data)
Rows: 2,111
Columns: 17
$ Gender <chr> "Female", "Female", "Male", "Male", "Ma…
$ Age <dbl> 21, 21, 23, 27, 22, 29, 23, 22, 24, 22,…
$ Height <dbl> 1.62, 1.52, 1.80, 1.80, 1.78, 1.62, 1.5…
$ Weight <dbl> 64.0, 56.0, 77.0, 87.0, 89.8, 53.0, 55.…
$ family_history_with_overweight <chr> "yes", "yes", "yes", "no", "no", "no", …
$ FAVC <chr> "no", "no", "no", "no", "no", "yes", "y…
$ FCVC <dbl> 2, 3, 2, 3, 2, 2, 3, 2, 3, 2, 3, 2, 3, …
$ NCP <dbl> 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, …
$ CAEC <chr> "Sometimes", "Sometimes", "Sometimes", …
$ SMOKE <chr> "no", "yes", "no", "no", "no", "no", "n…
$ CH2O <dbl> 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, …
$ SCC <chr> "no", "yes", "no", "no", "no", "no", "n…
$ FAF <dbl> 0, 3, 2, 2, 0, 0, 1, 3, 1, 1, 2, 2, 2, …
$ TUE <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 2, 1, 0, …
$ CALC <chr> "no", "Sometimes", "Frequently", "Frequ…
$ MTRANS <chr> "Public_Transportation", "Public_Transp…
$ NObeyesdad <chr> "Normal_Weight", "Normal_Weight", "Norm…
# Per-column summary: quartiles/mean for numeric columns, length/class/mode
# for character columns.
obesity_data %>%
  summary()
Gender Age Height Weight
Length:2111 Min. :14.00 Min. :1.450 Min. : 39.00
Class :character 1st Qu.:19.95 1st Qu.:1.630 1st Qu.: 65.47
Mode :character Median :22.78 Median :1.700 Median : 83.00
Mean :24.31 Mean :1.702 Mean : 86.59
3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
Max. :61.00 Max. :1.980 Max. :173.00
family_history_with_overweight FAVC FCVC
Length:2111 Length:2111 Min. :1.000
Class :character Class :character 1st Qu.:2.000
Mode :character Mode :character Median :2.386
Mean :2.419
3rd Qu.:3.000
Max. :3.000
NCP CAEC SMOKE CH2O
Min. :1.000 Length:2111 Length:2111 Min. :1.000
1st Qu.:2.659 Class :character Class :character 1st Qu.:1.585
Median :3.000 Mode :character Mode :character Median :2.000
Mean :2.686 Mean :2.008
3rd Qu.:3.000 3rd Qu.:2.477
Max. :4.000 Max. :3.000
SCC FAF TUE CALC
Length:2111 Min. :0.0000 Min. :0.0000 Length:2111
Class :character 1st Qu.:0.1245 1st Qu.:0.0000 Class :character
Mode :character Median :1.0000 Median :0.6253 Mode :character
Mean :1.0103 Mean :0.6579
3rd Qu.:1.6667 3rd Qu.:1.0000
Max. :3.0000 Max. :2.0000
MTRANS NObeyesdad
Length:2111 Length:2111
Class :character Class :character
Mode :character Mode :character
# Count missing values per column: flag each cell as NA or not, then sum the
# flags column-wise.
obesity_data %>%
  is.na() %>%
  colSums()
Gender Age
0 0
Height Weight
0 0
family_history_with_overweight FAVC
0 0
FCVC NCP
0 0
CAEC SMOKE
0 0
CH2O SCC
0 0
FAF TUE
0 0
CALC MTRANS
0 0
NObeyesdad
0
# Understand the balance of the target variable's levels (NObeyesdad).
# count() produces one row per obesity level with its tally in column `n`,
# which geom_col() draws as the bar height.
obesity_data %>%
  count(NObeyesdad) %>%
  ggplot(aes(x = NObeyesdad, y = n, fill = NObeyesdad)) +
  geom_col() +
  labs(title = "obesity levels balance", x = "obesity level", y = "count") +
  theme_minimal() +
  # Rotate the x-axis labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))