MATH1324 Assignment 3

Obesity problem by Gender

Ravikumar Balar (S3798098) | Drashti Maniya (S3748944) | Ajaykumar Kothiya (S3793661)

Last updated: 25 October, 2019

Problem Statement

Data Collection Process

# Data source: https://www.kaggle.com/adu47249/obesity-stats
# Read the raw CSV, keeping text columns as character vectors.
init_dataset <- read.csv("obesity.csv", stringsAsFactors = FALSE)
# Keep only the rows stratified by gender, then drop the age, sample-size,
# income and race/ethnicity columns.
dataset <- init_dataset %>%
  filter(StratificationCategory == "Gender") %>%
  select(-c("Age.years.", "Sample_Size", "Income", "Race.Ethnicity"))
# Show the first five rows as a formatted table.
knitr::kable(head(dataset, n = 5L))
YearStart YearEnd LocationAbbr LocationDesc Data_Value Gender StratificationCategory
2011 2011 AL Alabama 32.3 Male Gender
2011 2011 AL Alabama 31.8 Female Gender
2011 2011 AL Alabama 39.0 Male Gender
2011 2011 AL Alabama 30.5 Female Gender
2011 2011 AL Alabama 40.1 Female Gender

Summary of Data (Field/Variables)

# Print the structure of the gender-filtered dataset: the number of
# observations plus each column's name, type and first few values.
str(dataset)
## 'data.frame':    3814 obs. of  7 variables:
##  $ YearStart             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ YearEnd               : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ LocationAbbr          : chr  "AL" "AL" "AL" "AL" ...
##  $ LocationDesc          : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ Data_Value            : num  32.3 31.8 39 30.5 40.1 47.9 26.6 22.3 45.4 39.6 ...
##  $ Gender                : chr  "Male" "Female" "Male" "Female" ...
##  $ StratificationCategory: chr  "Gender" "Gender" "Gender" "Gender" ...
  1. YearStart, YearEnd : Start Year, End Year
  2. LocationAbbr, LocationDesc : Location abbreviation, location description
  3. Data_Value : Obesity index
  4. Gender : Gender(Male/Female)
  5. StratificationCategory : Data Stratification Category(Age/Income/Ethnicity/Gender)

Data Manipulation for experiment (Filter)

  1. Gender
  2. Data_Value(Obesity Index)
# Encode Gender as an ordered factor whose first level is "Male".
dataset$Gender <- factor(
  dataset$Gender,
  levels = c("Male", "Female"),
  labels = c("Male", "Female"),
  ordered = TRUE
)
# Restrict to records starting in 2011 and keep only the two analysis columns.
df <- dataset %>%
  filter(YearStart == 2011) %>%
  select(c("Gender", "Data_Value"))
# Inspect the structure of the filtered data frame.
str(df)
## 'data.frame':    936 obs. of  2 variables:
##  $ Gender    : Ord.factor w/ 2 levels "Male"<"Female": 1 2 1 2 2 1 1 2 1 2 ...
##  $ Data_Value: num  32.3 31.8 39 30.5 40.1 47.9 26.6 22.3 45.4 39.6 ...

Data Manipulation (Outliers) cntd..

# Split the analysis data into one data frame per gender.
df_male <- filter(df, Gender == "Male")
df_female <- filter(df, Gender == "Female")
# Number of observations in each group.
n_male <- nrow(df_male)
n_female <- nrow(df_female)
# Group medians, used later as the replacement value for outliers.
male_median <- median(df_male$Data_Value)
female_median <- median(df_female$Data_Value)

Data Manipulation (Outliers) cntd..

# Draw one boxplot of obesity per gender and keep the returned statistics;
# the outlying values are read from `box$out` in the next step.
box <- boxplot(
  Data_Value ~ Gender,
  data = df,
  xlab = "Gender",
  ylab = "Obesity"
)

Data Manipulation (Outliers) cntd..

# Replace each outlier with the median of its own gender group.
# boxplot() returns the outlying values in box$out and, for each of them, the
# index of the group it belongs to in box$group (an index into box$names).
# Value and group are paired before matching: matching on the value alone
# would also overwrite a non-outlying observation in one gender whose value
# happens to equal an outlier of the other gender.
outlier_keys <- paste(box$names[box$group], box$out)
is_outlier <- paste(df$Gender, df$Data_Value) %in% outlier_keys
group_median <- ifelse(df$Gender == "Female", female_median, male_median)
df$Data_Value <- ifelse(is_outlier, group_median, df$Data_Value)
# NOTE(review): df_male / df_female were created before this replacement and
# still hold the original values - confirm whether later steps that use them
# should see the cleaned data instead.
# Redraw the boxplots to check the result of the replacement.
boxplot(Data_Value ~ Gender, data = df, xlab = "Gender", ylab = "Obesity")

Descriptive Statistics Summary

# Per-gender descriptive statistics of the obesity value: five-number summary,
# mean, standard deviation, row count and number of missing values.
summary_tab <- df %>%
  group_by(Gender) %>%
  summarise(
    Min = min(Data_Value, na.rm = TRUE),
    Q1 = quantile(Data_Value, probs = 0.25, na.rm = TRUE),
    Median = median(Data_Value, na.rm = TRUE),
    Q3 = quantile(Data_Value, probs = 0.75, na.rm = TRUE),
    Max = max(Data_Value, na.rm = TRUE),
    Mean = mean(Data_Value, na.rm = TRUE),
    SD = sd(Data_Value, na.rm = TRUE),
    n = n(),
    Missing = sum(is.na(Data_Value))
  )
# Render the table rounded to one decimal place.
knitr::kable(summary_tab, digits = 1)
Gender Min Q1 Median Q3 Max Mean SD n Missing
Male 15.3 25.8 31.9 41.0 62.3 33.7 9.9 468 0
Female 10.3 22.3 27.6 29.8 45.5 26.5 6.2 468 0

Hypothesis Testing (Normality test)

# Male Q-Q plot: compare the observed male obesity values with a normal
# distribution. The data are passed to qqPlot() directly. Wrapping them in
# rnorm() would plot freshly simulated standard-normal draws instead (a vector
# first argument only tells rnorm() how many values to generate), which always
# look normal and say nothing about the data.
qqPlot(df_male$Data_Value, ylab = "Male obesity")

## [1] 153 398

Hypothesis Testing (Normality test) cntd..

# Female Q-Q plot: compare the observed female obesity values with a normal
# distribution. The data are passed to qqPlot() directly. Wrapping them in
# rnorm() would plot freshly simulated standard-normal draws instead (a vector
# first argument only tells rnorm() how many values to generate), which always
# look normal and say nothing about the data.
qqPlot(df_female$Data_Value, col = "red", ylab = "Female obesity")

## [1] 108 102

Hypothesis Testing (Normality test) cntd..

# Histogram of male obesity with a fitted normal curve overlaid.
h_male <- hist(
  df_male$Data_Value,
  col = "blue",
  main = "Male obesity",
  breaks = 10,
  xlab = "Obesity"
)
# Evaluation grid spanning the observed range, one point per observation.
xfit_male <- seq(
  min(df_male$Data_Value),
  max(df_male$Data_Value),
  length.out = length(df_male$Data_Value)
)
# Normal density at the sample mean/SD, scaled by bin width times sample size
# so the curve is on the same count scale as the histogram bars.
yfit_male <- dnorm(
  xfit_male,
  mean = mean(df_male$Data_Value),
  sd = sd(df_male$Data_Value)
) * diff(h_male$mids[1:2]) * length(df_male$Data_Value)
lines(xfit_male, yfit_male, lwd = 2)

Hypothesis Testing (Normality test) cntd..

# Histogram of female obesity with a fitted normal curve overlaid.
h_female <- hist(
  df_female$Data_Value,
  col = "red",
  main = "Female obesity",
  breaks = 10,
  xlab = "Obesity"
)
# Evaluation grid spanning the observed range, one point per observation.
xfit_female <- seq(
  min(df_female$Data_Value),
  max(df_female$Data_Value),
  length.out = length(df_female$Data_Value)
)
# Normal density at the sample mean/SD, scaled by bin width times sample size
# so the curve is on the same count scale as the histogram bars.
yfit_female <- dnorm(
  xfit_female,
  mean = mean(df_female$Data_Value),
  sd = sd(df_female$Data_Value)
) * diff(h_female$mids[1:2]) * length(df_female$Data_Value)
lines(xfit_female, yfit_female, lwd = 2)

Hypothesis Testing (Variance Check) cntd..

H0: Both genders have equal variance

Ha: The two genders have unequal variances

# Levene's test comparing the variance of the obesity value (Data_Value)
# between the two Gender groups in df
leveneTest(Data_Value~Gender,data=df)

Hypothesis Testing (Two-sample t-test) cntd..

\[H_0: \mu_1 - \mu_2 = 0 \]

\[H_A: \mu_1 - \mu_2 \ne 0 \]

H0: Both genders have the same mean obesity

Ha: The two genders have different mean obesity

\[t = \frac{\overline{x_1} - \overline{x_2}}{\sqrt{\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}}}\]

\[df = \frac{({\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}})^2}{{\frac{({\frac{s_1^2}{n_1}})^2}{n_1-1}}+{\frac{({\frac{s_2^2}{n_2}})^2}{n_2-1}}}\]

Hypothesis Testing (Two-sample t-test - Unequal variance) cntd..

# Two-sided Welch (unequal-variance) two-sample t-test of the obesity value
# between the gender groups; the result is stored in `res` and printed.
res <- t.test(
  Data_Value ~ Gender,
  data = df,
  alternative = "two.sided",
  var.equal = FALSE
)
res
## 
##  Welch Two Sample t-test
## 
## data:  Data_Value by Gender
## t = 13.179, df = 784.52, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  6.082871 8.212000
## sample estimates:
##   mean in group Male mean in group Female 
##             33.67778             26.53034
# t statistic of the test, rounded to two decimal places
round(res[["statistic"]], digits = 2)
##     t 
## 13.18

Hypothesis Testing (Two-sample t-test - Unequal variance) cntd..

# df: degrees of freedom of the test, rounded to the nearest whole number
round(res[["parameter"]])
##  df 
## 785
# p-value of the test, printed at full precision
res[["p.value"]]
## [1] 5.637934e-36
# Confidence interval for the difference in means, rounded to two decimals
round(res[["conf.int"]], digits = 2)
## [1] 6.08 8.21
## attr(,"conf.level")
## [1] 0.95

Discussion

References