Ravikumar Balar (S3798098) | Drashti Maniya (S3748944) | Ajaykumar Kothiya (S3793661)
Last updated: 25 October, 2019
# DataSource: https://www.kaggle.com/adu47249/obesity-stats
# Load the raw obesity data, keeping text columns as character vectors.
init_dataset <- read.csv("obesity.csv", stringsAsFactors = FALSE)

# Keep only the rows stratified by gender and drop the columns listed below.
dataset <- init_dataset %>%
  filter(StratificationCategory == "Gender") %>%
  select(-c("Age.years.", "Sample_Size", "Income", "Race.Ethnicity"))
knitr::kable(head(dataset,5L))

| YearStart | YearEnd | LocationAbbr | LocationDesc | Data_Value | Gender | StratificationCategory |
|---|---|---|---|---|---|---|
| 2011 | 2011 | AL | Alabama | 32.3 | Male | Gender |
| 2011 | 2011 | AL | Alabama | 31.8 | Female | Gender |
| 2011 | 2011 | AL | Alabama | 39.0 | Male | Gender |
| 2011 | 2011 | AL | Alabama | 30.5 | Female | Gender |
| 2011 | 2011 | AL | Alabama | 40.1 | Female | Gender |
## 'data.frame': 3814 obs. of 7 variables:
## $ YearStart : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ YearEnd : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ LocationAbbr : chr "AL" "AL" "AL" "AL" ...
## $ LocationDesc : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ Data_Value : num 32.3 31.8 39 30.5 40.1 47.9 26.6 22.3 45.4 39.6 ...
## $ Gender : chr "Male" "Female" "Male" "Female" ...
## $ StratificationCategory: chr "Gender" "Gender" "Gender" "Gender" ...
# Turn Gender into an ordered factor whose levels run Male, then Female
dataset$Gender <- ordered(dataset$Gender, levels = c("Male", "Female"))
# Keep the 2011 records and only the Gender and Data_Value columns
df <- dataset %>%
  filter(YearStart == 2011) %>%
  select(Gender, Data_Value)
# Filtered data frame
str(df)
## 'data.frame': 936 obs. of 2 variables:
## $ Gender : Ord.factor w/ 2 levels "Male"<"Female": 1 2 1 2 2 1 1 2 1 2 ...
## $ Data_Value: num 32.3 31.8 39 30.5 40.1 47.9 26.6 22.3 45.4 39.6 ...
# Replace outliers with the median of each group.
# Rows whose Data_Value appears in `box$out` are overwritten with
# `female_median` when Gender is "Female" and with `male_median` otherwise;
# all other rows keep their value. `box`, `female_median` and `male_median`
# are defined outside this excerpt.
# NOTE(review): the `%in%` test compares values only, not groups, so a value
# listed in `box$out` is replaced wherever it occurs, in either gender. If
# `box` holds per-group boxplot outliers (presumably from a
# Data_Value ~ Gender boxplot -- confirm), also matching on `box$group` would
# restrict the replacement to the group in which the value is an outlier.
df$Data_Value <- ifelse(df$Data_Value %in% box$out, ifelse(df$Gender=="Female", female_median,male_median), df$Data_Value)
# Box plot of the obesity values, one box per gender.
boxplot(Data_Value ~ Gender, df, xlab = "Gender", ylab = "Obesity")

# Descriptive statistics of Data_Value for each gender.
# Every statistic ignores missing values; `n` counts all rows in the group
# (missing or not) and `Missing` counts the rows whose Data_Value is NA.
summary_tab <- df %>%
  group_by(Gender) %>%
  summarise(
    Min = min(Data_Value, na.rm = TRUE),
    Q1 = quantile(Data_Value, probs = .25, na.rm = TRUE),
    Median = median(Data_Value, na.rm = TRUE),
    Q3 = quantile(Data_Value, probs = .75, na.rm = TRUE),
    Max = max(Data_Value, na.rm = TRUE),
    Mean = mean(Data_Value, na.rm = TRUE),
    SD = sd(Data_Value, na.rm = TRUE),
    n = n(),
    Missing = sum(is.na(Data_Value))
  )
knitr::kable(summary_tab,digits=round(1))

| Gender | Min | Q1 | Median | Q3 | Max | Mean | SD | n | Missing |
|---|---|---|---|---|---|---|---|---|---|
| Male | 15.3 | 25.8 | 31.9 | 41.0 | 62.3 | 33.7 | 9.9 | 468 | 0 |
| Female | 10.3 | 22.3 | 27.6 | 29.8 | 45.5 | 26.5 | 6.2 | 468 | 0 |
## [1] 153 398
## [1] 108 102
# Male: histogram of obesity values with a fitted normal curve overlaid.
# The x grid spans the observed range with one point per observation;
# `length.out` is spelled out instead of relying on partial matching of
# `length`.
xfit_male <- seq(
  min(df_male$Data_Value),
  max(df_male$Data_Value),
  length.out = length(df_male$Data_Value)
)
yfit_male <- dnorm(
  xfit_male,
  mean = mean(df_male$Data_Value),
  sd = sd(df_male$Data_Value)
)
h_male <- hist(
  df_male$Data_Value,
  col = "blue",
  main = "Male obesity",
  breaks = 10,
  xlab = "Obesity"
)
# Scale the density by bin width and sample size so the curve is on the same
# scale as the histogram bars. The bin width is read from `breaks`, which
# always has at least two entries; `mids[1:2]` would contain NA for a
# single-bin histogram.
male_bin_width <- diff(h_male$breaks[1:2])
yfit_male <- yfit_male * male_bin_width * length(df_male$Data_Value)
lines(xfit_male, yfit_male, lwd = 2)

# Female: histogram of obesity values, built the same way as for males.
xfit_female <- seq(
  min(df_female$Data_Value),
  max(df_female$Data_Value),
  length.out = length(df_female$Data_Value)
)
yfit_female <- dnorm(
  xfit_female,
  mean = mean(df_female$Data_Value),
  sd = sd(df_female$Data_Value)
)
h_female <- hist(
  df_female$Data_Value,
  col = "red",
  main = "Female obesity",
  breaks = 10,
  xlab = "Obesity"
)
# Same rescaling as for the male curve, using the female histogram's bins.
female_bin_width <- diff(h_female$breaks[1:2])
yfit_female <- yfit_female * female_bin_width * length(df_female$Data_Value)
lines(xfit_female, yfit_female, lwd=2)

Ho: Both genders have equal variance
Ha: Both genders have unequal variance
\[H_0: \mu_1 - \mu_2 = 0 \]
\[H_A: \mu_1 - \mu_2 \ne 0 \]
Ho: Both genders have equal mean obesity
Ha: Both genders have unequal mean obesity
\[t = \frac{\overline{x_1} - \overline{x_2}}{\sqrt{\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}}}\]
\[df = \frac{({\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}})^2}{{\frac{({\frac{s_1^2}{n_1}})^2}{n_1-1}}+{\frac{({\frac{s_2^2}{n_2}})^2}{n_2-1}}}\]
# Welch two-sample t-test of Data_Value by Gender: two-sided, with the
# group variances not assumed equal
res <- t.test(
  Data_Value ~ Gender,
  data = df,
  alternative = "two.sided",
  var.equal = FALSE
)
res
##
## Welch Two Sample t-test
##
## data: Data_Value by Gender
## t = 13.179, df = 784.52, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 6.082871 8.212000
## sample estimates:
## mean in group Male mean in group Female
## 33.67778 26.53034
## t
## 13.18
## df
## 785
## [1] 5.637934e-36
## [1] 6.08 8.21
## attr(,"conf.level")
## [1] 0.95