library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggridges)
# Load the obesity data set from its local CSV file into `ds`.
ds <- read.csv(
  file = "D://r_files//datasets//ObesityDataSet_raw_and_data_sinthetic_Original.csv"
)
# Bar chart: number of records at each obesity level.
ggplot(data = ds, mapping = aes(x = NObeyesdad)) +
  geom_bar() +
  ggtitle("OBESITY LEVELS", subtitle = "Measure Of Obesity Levels") +
  xlab("Obesity Levels") +
  ylab("Count") +
  theme(legend.position = "bottom")
# Box plots of TUE per obesity level, with the box outline coloured by SMOKE.
ggplot(data = ds, mapping = aes(x = NObeyesdad, y = TUE, colour = SMOKE)) +
  geom_boxplot() +
  labs(
    x = "Obesity Levels",
    y = "Usage of Tech devices",
    title = "OBESITY LEVELS",
    subtitle = "Measure Of Obesity Levels"
  ) +
  theme(legend.position = "right")
# Numeric vs binary: distribution of Height within each FAVC group.
# The binary variable is mapped to x so that geom_boxplot() lays out one box
# per category; a continuous x (Weight) does not define categories for boxes.
# NOTE(review): Height is kept as the summarised variable because that is what
# the boxes were already computed from; swap in Weight if that was the intent.
ggplot(ds, aes(x = FAVC, y = Height, fill = FAVC)) +
  geom_boxplot()
# Ridgeline density plot of Weight, one ridge per obesity level.
# The fill gradient follows the computed y position of each ridge.
ggplot(ds, aes(x = Weight, y = NObeyesdad)) +
  geom_density_ridges_gradient(
    # after_stat(y) replaces the `..y..` notation deprecated in ggplot2 3.4.0.
    aes(fill = after_stat(y)),
    scale = 3,
    # Line thickness is set with `linewidth`; `size` was reported as an
    # unknown parameter and ignored.
    linewidth = 0.3
  ) +
  scale_fill_gradientn(colours = c("blue", "red")) +
  labs(
    x = "Weight",
    y = "Obesity Level",
    caption = 'Obesity levels Vs Weight'
  ) +
  theme(legend.position = "none")
## Picking joint bandwidth of 2.62
## Univariate numerical summary: overall mean of Weight
mean(ds[["Weight"]])
## [1] 86.58606
# Grouped summary: mean Weight within each obesity level.
ds %>%
  group_by(NObeyesdad) %>%
  summarise(mean(Weight))
## # A tibble: 7 × 2
## NObeyesdad `mean(Weight)`
## <chr> <dbl>
## 1 Insufficient 49.9
## 2 Normal 62.2
## 3 Obe_Type1 92.9
## 4 Obe_Type1I 115.
## 5 Obe_Type1II 121.
## 6 OverWgt-Lvl1 74.3
## 7 OverWgt-Lvl2 82.1
## Test for a single mean: visual comparison of sample and hypothesized mean
# H0: mean Weight = 75 units
# H1: mean Weight != 75 units (two-sided)
# The idea is to compare the mean observed in the sample with the
# assumed (hypothesized) mean.
AssumedMean <- 75
# `ds` is the data frame loaded at the top of the script; nothing modifies it
# in between, so the CSV file is not read a second time here.
ggplot(ds, aes(x = Weight)) +
  # after_stat(density) replaces the `..density..` notation deprecated in
  # ggplot2 3.4.0; bins = 30 states the value stat_bin() was defaulting to.
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    color = "violet",
    fill = "pink"
  ) +
  # Dashed vertical line at the sample mean of Weight.
  # `linewidth` replaces `size`, which is deprecated for lines since
  # ggplot2 3.4.0.
  geom_vline(
    aes(xintercept = mean(Weight), colour = "ESTIMATED"),
    linetype = "dashed",
    linewidth = 1,
    show.legend = TRUE
  ) +
  # Dashed vertical line at the hypothesized mean.
  geom_vline(
    aes(xintercept = AssumedMean, colour = "Percieved mean"),
    linetype = "dashed",
    linewidth = 1,
    show.legend = TRUE
  )
# One-sample t test for the mean of a numeric variable (variance unknown)
# H0: mean Weight = 75 units
# H1: mean Weight != 75 units (two-sided)
# mu is taken from AssumedMean (75) so the test matches the hypothesis stated
# above instead of an unrelated hard-coded value.
# NOTE: the recorded output that follows was produced with mu = 30; its
# t statistic and alternative-hypothesis line change when this is re-run.
t.test(ds$Weight, mu = AssumedMean, conf.level = 0.99)
##
## One Sample t-test
##
## data: ds$Weight
## t = 99.266, df = 2110, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 30
## 99 percent confidence interval:
## 85.11638 88.05573
## sample estimates:
## mean of x
## 86.58606
#Independent two-sample t test
#H0: Mean Weight of the two SCC groups does not differ significantly
#H1: Mean Weight of the two SCC groups differs significantly
#Or equivalently
#H0: mean_no = mean_yes
#H1: mean_no != mean_yes
#Or equivalently
#H0: mean_no - mean_yes = 0
#H1: mean_no - mean_yes != 0 (>0 or <0) two sided
# Welch two-sample t test of Weight between the SCC groups; the defaults
# (two-sided alternative, unequal variances) are spelled out explicitly.
t.test(
  Weight ~ SCC,
  data = ds,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: Weight by SCC
## t = 16.15, df = 127.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## 22.26646 28.48483
## sample estimates:
## mean in group no mean in group yes
## 87.74004 62.36440
#---------------------------------------------------
# One-way ANOVA: compare the mean of a numeric variable (Weight) across the
# levels of a multi-category variable (NObeyesdad).
anv <- aov(formula = Weight ~ NObeyesdad, data = ds)
summary(object = anv)
## Df Sum Sq Mean Sq F value Pr(>F)
## NObeyesdad 6 1228371 204729 1967 <2e-16 ***
## Residuals 2104 219041 104
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#--------------------------------
# Chi-square test of independence: obesity level vs gender.
# Side-by-side bars of the counts, followed by the contingency table.
ggplot(data = ds, mapping = aes(x = NObeyesdad, fill = Gender)) +
  geom_bar(position = "dodge") +
  labs(
    x = "CLASS",
    y = "COUNT",
    title = "Distribution of Obesity level By Gender",
    caption = "Obesity level"
  ) +
  theme(legend.position = "bottom")
table(ds$NObeyesdad, ds$Gender)
##
## Female Male
## Insufficient 173 99
## Normal 141 146
## Obe_Type1 156 195
## Obe_Type1I 2 295
## Obe_Type1II 323 1
## OverWgt-Lvl1 145 145
## OverWgt-Lvl2 103 187
# H0: obesity level and gender are independent
# H1: obesity level and gender are not independent
chisq.test(x = ds$NObeyesdad, y = ds$Gender)
##
## Pearson's Chi-squared test
##
## data: ds$NObeyesdad and ds$Gender
## X-squared = 657.75, df = 6, p-value < 2.2e-16
#-----------------------------------------------
#Proportion test
#We test equality of proportion
#H0:all groups have equal proportion or equally likely
#H1:all groups do not have equal proportion or equally likely
#or in other words, if there are k groups (or levels) then
#H0: p1=p2=.......=pk
#H1: some p's are not equal
#If there are two groups (or levels) (k=2) we use prop.test,
#if k > 2 use chisq.test
#In our data we take "family_history_with_overweight" that is binary (k=2)
#H0:p1=p2 or p1=p2 = 0.5 (because k = 2)
#H1:p1!=p2
# Frequency of each level of family_history_with_overweight.
table(ds[["family_history_with_overweight"]])
##
## no yes
## 385 1726
# Bar chart of the family-history counts (legend hidden because the fill
# repeats the x axis), then the same counts as a table.
ggplot(
  data = ds,
  mapping = aes(
    x = family_history_with_overweight,
    fill = family_history_with_overweight
  )
) +
  geom_bar(position = "dodge") +
  theme(legend.position = "none")
table(ds[["family_history_with_overweight"]])
##
## no yes
## 385 1726
# Exact binomial test of H0: p = 0.5 on the observed family-history counts.
# When x has length 2, binom.test() reads it as c(successes, failures) and
# ignores n, so both counts are passed as one vector taken from the data
# (first level = "success").
# NOTE: the recorded output that follows came from hard-coded example counts
# (156 and 234) and needs regenerating.
binom.test(as.vector(table(ds$family_history_with_overweight)), p = 0.5)
##
## Exact binomial test
##
## data: c(156, 234)
## number of successes = 156, number of trials = 390, p-value = 9.19e-05
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.3510188 0.4505109
## sample estimates:
## probability of success
## 0.4
# Same test expressed as number of successes (first table cell) and total
# number of trials; the default alternative and confidence level are explicit.
binom.test(
  x = table(ds$family_history_with_overweight)[1],
  n = sum(table(ds$family_history_with_overweight)),
  p = 0.5,
  alternative = "two.sided",
  conf.level = 0.95
)
##
## Exact binomial test
##
## data: table(ds$family_history_with_overweight)[1] and sum(table(ds$family_history_with_overweight))
## number of successes = 385, number of trials = 2111, p-value < 2.2e-16
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.1661182 0.1995247
## sample estimates:
## probability of success
## 0.182378
# Comparison across more than two levels: counts per obesity level.
table(ds[["NObeyesdad"]])
##
## Insufficient Normal Obe_Type1 Obe_Type1I Obe_Type1II OverWgt-Lvl1
## 272 287 351 297 324 290
## OverWgt-Lvl2
## 290
#H0: all seven level probabilities are the same
#H1: they are not all the same
# Derive the counts and the number of levels from the data rather than
# hard-coding them, then test the counts against equal probabilities 1/k.
cnt_tab <- summarise(group_by(ds, NObeyesdad), cnt = n())
k <- nrow(cnt_tab)
chisq.test(
  x = cnt_tab$cnt,
  p = rep(1 / k, times = k)
)
##
## Chi-squared test for given probabilities
##
## data: cnt_tab$cnt
## X-squared = 14.331, df = 6, p-value = 0.02615
# When some counts are relatively very small, use a simulated p-value.
# Stand-alone example: five observed counts tested against equal
# probabilities of 0.2 each.
chisq.test(
  x = c(8, 34, 187, 374, 38),
  p = rep(0.2, times = 5),
  simulate.p.value = TRUE,
  correct = TRUE
)
##
## Chi-squared test for given probabilities with simulated p-value (based
## on 2000 replicates)
##
## data: c(8, 34, 187, 374, 38)
## X-squared = 743.63, df = NA, p-value = 0.0004998
#---------------------------------------------------