Load required libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggridges)

Read data from file

# Load the obesity data set into `ds`, the data frame used by the rest of the
# script.
ds <- read.csv(
  "D://r_files//datasets//ObesityDataSet_raw_and_data_sinthetic_Original.csv"
)

Exploratory phase

Obesity levels

# Bar chart of the number of records in each obesity level (NObeyesdad).
ggplot(data = ds, mapping = aes(x = NObeyesdad)) +
  geom_bar() +
  ggtitle("OBESITY LEVELS", subtitle = "Measure Of Obesity Levels") +
  xlab("Obesity Levels") +
  ylab("Count") +
  theme(legend.position = "bottom")

Obesity levels against usage of technology devices grouped by smoking habit

# Box plots of TUE (labelled "Usage of Tech devices") for each obesity level,
# with one coloured box per SMOKE value.
ggplot(data = ds, mapping = aes(x = NObeyesdad, y = TUE, colour = SMOKE)) +
  geom_boxplot() +
  ggtitle("OBESITY LEVELS", subtitle = "Measure Of Obesity Levels") +
  xlab("Obesity Levels") +
  ylab("Usage of Tech devices") +
  theme(legend.position = "right")

Inferential And confirmatory Analysis

Hypothesis testing

# Numeric vs binary: distribution of Height within each FAVC group.
# A boxplot needs the grouping variable on the x axis; mapping the continuous
# Weight column to x only shifted the boxes horizontally without adding
# information. NOTE(review): Height is kept as the summarised variable because
# that is what the boxes already showed -- confirm Weight was not intended.
ggplot(ds, aes(x = FAVC, y = Height, fill = FAVC)) +
  geom_boxplot()

# Switch the default ggplot2 theme to the ggridges theme; theme_set() is
# global, so it also applies to every plot drawn after this point.
theme_set(theme_ridges())

# Ridgeline density of Weight, one ridge per obesity level.
# - after_stat(y) replaces the deprecated `..y..` notation (ggplot2 >= 3.4.0);
#   it colours each ridge by its position on the y axis.
# - linewidth replaces `size`, which this geom no longer recognises and
#   silently ignored.
ggplot(ds, aes(x = Weight, y = NObeyesdad)) +
  geom_density_ridges_gradient(
    aes(fill = after_stat(y)),
    scale = 3,
    linewidth = 0.3
  ) +
  scale_fill_gradientn(colours = c("blue", "red")) +
  labs(
    x = "Weight",
    y = "Obesity Level",
    caption = "Obesity levels Vs Weight"
  ) +
  theme(legend.position = "none")
## Picking joint bandwidth of 2.62

## Numerical summary univariate
# Overall mean of Weight across all records.
mean(ds[["Weight"]])
## [1] 86.58606
# Grouped summary: mean Weight within each obesity level.


ds %>%
  group_by(NObeyesdad) %>%
  summarise(mean(Weight))
## # A tibble: 7 × 2
##   NObeyesdad   `mean(Weight)`
##   <chr>                 <dbl>
## 1 Insufficient           49.9
## 2 Normal                 62.2
## 3 Obe_Type1              92.9
## 4 Obe_Type1I            115. 
## 5 Obe_Type1II           121. 
## 6 OverWgt-Lvl1           74.3
## 7 OverWgt-Lvl2           82.1
# Mean Weight within each obesity level (same summary as computed just above).
ds %>%
  group_by(NObeyesdad) %>%
  summarise(mean(Weight))
## # A tibble: 7 × 2
##   NObeyesdad   `mean(Weight)`
##   <chr>                 <dbl>
## 1 Insufficient           49.9
## 2 Normal                 62.2
## 3 Obe_Type1              92.9
## 4 Obe_Type1I            115. 
## 5 Obe_Type1II           121. 
## 6 OverWgt-Lvl1           74.3
## 7 OverWgt-Lvl2           82.1
## Test for single mean with known variance
#H0: Mean Weight= 75 units
#H1: Mean Weight < or > 75 (two sided)

#Idea is comparing observed mean from sample and
#assumed (hypothesized) mean
# Hypothesised mean Weight under H0; drawn as a reference line below.
AssumedMean <- 75

# `ds` is already in memory from the initial read.csv() call, so the file is
# not read a second time here.

# Density-scaled histogram of Weight with two dashed vertical lines:
# the sample mean ("ESTIMATED") and the hypothesised mean.
# - after_stat(density) replaces the deprecated `..density..` notation.
# - linewidth replaces `size`, which is deprecated for lines in ggplot2 3.4.0.
ggplot(ds, aes(x = Weight)) +
  geom_histogram(
    aes(y = after_stat(density)),
    color = "violet",
    fill = "pink"
  ) +
  geom_vline(
    aes(xintercept = mean(Weight), colour = "ESTIMATED"),
    linetype = "dashed",
    linewidth = 1,
    show.legend = TRUE
  ) +
  geom_vline(
    aes(xintercept = AssumedMean, colour = "Percieved mean"),
    linetype = "dashed",
    linewidth = 1,
    show.legend = TRUE
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Test for a single mean of a numeric variable with unknown variance
# (one-sample t-test).
#H0: Mean Weight=75 units
#H1: Mean Weight < or > 75 (two sided)

# mu must be the hypothesised mean stated in H0 (AssumedMean = 75); testing
# against any other value answers a different question. The confidence
# interval does not depend on mu and is reported at the 99% level.
t.test(ds$Weight, mu = AssumedMean, conf.level = 0.99)
## 
##  One Sample t-test
## 
## data:  ds$Weight
## t = 20.325, df = 2110, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 75
## 99 percent confidence interval:
##  85.11638 88.05573
## sample estimates:
## mean of x 
##  86.58606
# Independent two-sample t-test: mean Weight compared between the two SCC
# groups ("no" and "yes").
# H0: mean Weight does not differ between the SCC groups
# H1: mean Weight differs between the SCC groups

# Or equivalently

# H0: mean_no = mean_yes
# H1: mean_no != mean_yes

# Or equivalently

# H0: mean_no - mean_yes = 0
# H1: mean_no - mean_yes != 0 (>0 or <0) two sided
t.test(
  Weight ~ SCC,
  data = ds,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  Weight by SCC
## t = 16.15, df = 127.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  22.26646 28.48483
## sample estimates:
##  mean in group no mean in group yes 
##          87.74004          62.36440
#---------------------------------------------------
# Comparison of the mean of a numeric variable (Weight) across the levels of a
# polychotomous variable (NObeyesdad): one-way ANOVA.
anv <- aov(formula = Weight ~ NObeyesdad, data = ds)
summary(anv)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## NObeyesdad     6 1228371  204729    1967 <2e-16 ***
## Residuals   2104  219041     104                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#--------------------------------
# Chi-square test of independence
# Side-by-side bar chart of obesity-level counts, one bar per Gender.

ggplot(data = ds, mapping = aes(x = NObeyesdad, fill = Gender)) +
  geom_bar(position = "dodge") +
  ggtitle("Distribution of Obesity level By Gender") +
  xlab("CLASS") +
  ylab("COUNT") +
  labs(caption = "Obesity level") +
  theme(legend.position = "bottom")

# Contingency table of counts: obesity level (rows) by gender (columns).
table(ds[["NObeyesdad"]], ds[["Gender"]])
##               
##                Female Male
##   Insufficient    173   99
##   Normal          141  146
##   Obe_Type1       156  195
##   Obe_Type1I        2  295
##   Obe_Type1II     323    1
##   OverWgt-Lvl1    145  145
##   OverWgt-Lvl2    103  187
#H0: Two variables are independent
#H1: Two variables are not independent
# Pearson chi-squared test of independence between obesity level and gender.
# The two columns are passed directly; chisq.test() cross-tabulates them.

chisq.test(ds$NObeyesdad, ds$Gender)
## 
##  Pearson's Chi-squared test
## 
## data:  ds$NObeyesdad and ds$Gender
## X-squared = 657.75, df = 6, p-value < 2.2e-16
#-----------------------------------------------
#Proportion test
#We test equality of proportion 
#H0: all groups have equal proportions (are equally likely)
#H1: not all groups have equal proportions (at least one differs)

#or in other words, if there are k groups (or levels) then 

#H0: p1=p2=.......=pk
#H1: some p's are not equal

#If there are two groups (or levels) (k=2) we use prop.test
#(or the exact binom.test, as done below); if k > 2 use chisq.test

#In our data we take "family_history_with_overweight" that is binary (k=2)

#H0:p1=p2 or p1=p2 = 0.5 (because k = 2)
#H1:p1!=p2 

# Counts of each level of family_history_with_overweight.
table(ds[["family_history_with_overweight"]])
## 
##   no  yes 
##  385 1726
# Bar chart of the family_history_with_overweight counts; bars are coloured by
# level and the (redundant) legend is hidden.
ggplot(
  data = ds,
  mapping = aes(
    x = family_history_with_overweight,
    fill = family_history_with_overweight
  )
) +
  geom_bar(position = "dodge") +
  theme(legend.position = "none")

# Observed counts for the two levels of family_history_with_overweight.
table(ds$family_history_with_overweight)
## 
##   no  yes 
##  385 1726
# Exact binomial test of H0: p = 0.5 on the observed counts.
# binom.test() reads a length-2 `x` as c(successes, failures) and ignores `n`
# in that case, so the two-cell table is passed on its own; the first level
# ("no") is counted as the success.
binom.test(table(ds$family_history_with_overweight), p = 0.5)
## 
##  Exact binomial test
## 
## data:  table(ds$family_history_with_overweight)
## number of successes = 385, number of trials = 2111, p-value < 2.2e-16
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.1661182 0.1995247
## sample estimates:
## probability of success 
##               0.182378
#Alternatively with success and total
# binom.test(x, n, p): x is the count in the first table cell ("no"), n is the
# total number of observations, p is the success probability under H0.

binom.test(table(ds$family_history_with_overweight)[1],
           sum(table(ds$family_history_with_overweight)),p=0.5)
## 
##  Exact binomial test
## 
## data:  table(ds$family_history_with_overweight)[1] and sum(table(ds$family_history_with_overweight))
## number of successes = 385, number of trials = 2111, p-value < 2.2e-16
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.1661182 0.1995247
## sample estimates:
## probability of success 
##               0.182378
# Comparison of more than two levels: counts per obesity level.
table(ds[["NObeyesdad"]])
## 
## Insufficient       Normal    Obe_Type1   Obe_Type1I  Obe_Type1II OverWgt-Lvl1 
##          272          287          351          297          324          290 
## OverWgt-Lvl2 
##          290
# H0: all k obesity levels are equally likely (p1 = p2 = ... = pk = 1/k)
# H1: at least one level has a different probability


# Derive the counts and the number of levels from the data rather than
# hard-coding them.

cnt_tab <- ds %>%
  group_by(NObeyesdad) %>%
  summarise(cnt = n())

k <- nrow(cnt_tab)

# Goodness-of-fit test of the observed counts against equal probabilities.
chisq.test(
  x = cnt_tab$cnt,
  p = rep(1 / k, k)
)
## 
##  Chi-squared test for given probabilities
## 
## data:  cnt_tab$cnt
## X-squared = 14.331, df = 6, p-value = 0.02615
# When some counts are relatively very small, the p-value can be obtained by
# Monte Carlo simulation instead of the chi-squared approximation.
# Stand-alone example with hard-coded counts and five equally likely groups.

chisq.test(
  x = c(8, 34, 187, 374, 38),
  p = rep(0.2, times = 5),
  correct = TRUE,
  simulate.p.value = TRUE
)
## 
##  Chi-squared test for given probabilities with simulated p-value (based
##  on 2000 replicates)
## 
## data:  c(8, 34, 187, 374, 38)
## X-squared = 743.63, df = NA, p-value = 0.0004998
#---------------------------------------------------