Data Source:

Heinz G, Peterson LJ, Johnson RW, Kerk CJ. 2003. Exploring Relationships in Body Dimensions. Journal of Statistics Education 11(2). http://jse.amstat.org/v11n2/datasets.heinz.html

Problem Statement

Load Packages

library(readr) # Useful for importing data
library(dplyr) # Useful for data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#library(magrittr) # Useful for Forward-Pipe operator
library(ggplot2)

Data

The body measurements data were imported, and the data frame was subset to keep only the two variables of interest: hgt (height, in cm) and sex.

# Read the body-dimension measurements; readr prints the parsed column
# specification shown below.
data <- read_csv(file = "bdims.csv")
## Rows: 507 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (25): bia.di, bii.di, bit.di, che.de, che.di, elb.di, wri.di, kne.di, an...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the two columns used in the rest of the analysis.
data_subset <- data[, c("hgt", "sex")]

Summary Statistics

# This is a chunk for Summary Statistics section. 

# Shapiro-Wilk normality test on the pooled heights (both sexes together).
shapiro.test(data_subset$hgt)
## 
##  Shapiro-Wilk normality test
## 
## data:  data_subset$hgt
## W = 0.99233, p-value = 0.01045

# Recode the sex indicator as a labelled factor: "1" -> Male, "0" -> Female.
data_subset[["sex"]] <- factor(
  data_subset[["sex"]],
  levels = c("1", "0"),
  labels = c("Male", "Female")
)

# Confirm the resulting column types.
str(data_subset)
## tibble [507 × 2] (S3: tbl_df/tbl/data.frame)
##  $ hgt: num [1:507] 174 175 194 186 187 ...
##  $ sex: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
# Split the sample into one data frame per sex (both columns are retained).
males <- filter(data_subset, sex == "Male")
females <- filter(data_subset, sex == "Female")

# Descriptive statistics for each group.
summary(males)
##       hgt            sex     
##  Min.   :157.2   Male  :247  
##  1st Qu.:172.9   Female:  0  
##  Median :177.8               
##  Mean   :177.7               
##  3rd Qu.:182.7               
##  Max.   :198.1
summary(females)
##       hgt            sex     
##  Min.   :147.2   Male  :  0  
##  1st Qu.:160.0   Female:260  
##  Median :164.5               
##  Mean   :164.9               
##  3rd Qu.:169.5               
##  Max.   :182.9

# Plain numeric height vectors, used by the plots and tests that follow.
m1 <- males[["hgt"]]
f1 <- females[["hgt"]]

# Male heights: kernel density estimate, drawn and then filled.
d1 <- density(m1)
plot(
  d1,
  col = "blue",
  main = "Density curve for Male Height",
  xlab = "Male Height (in cm)"
)
polygon(d1, col = "red", border = "black")

# Female heights: same display with the colours swapped.
d2 <- density(f1)
plot(
  d2,
  col = "red",
  main = "Density curve for Female Height",
  xlab = "Female Height (in cm)"
)
polygon(d2, col = "blue", border = "black")

# Shapiro-Wilk normality test on the male heights only.
# The printed "data:" label is taken from the argument expression, so the
# call is kept exactly as written.
shapiro.test(m1)
## 
##  Shapiro-Wilk normality test
## 
## data:  m1
## W = 0.99358, p-value = 0.3716
# Same test on the female heights only.
shapiro.test(f1)
## 
##  Shapiro-Wilk normality test
## 
## data:  f1
## W = 0.99283, p-value = 0.2437

Distribution Fitting

# This is a chunk for Distribution Fitting section. 

# Male heights: compare the observed distribution with a normal distribution
# that has the same mean and standard deviation.

# Histogram on the density scale with a kernel density estimate (black) and
# the fitted normal density (purple) overlaid.
h1 <- hist(
  m1,
  breaks = 12,
  density = 20,
  col = "red",
  xlab = "Male Height (in cm)",
  main = "Histogram of Male height",
  freq = FALSE,
  ylim = c(0, 0.08)
)
lines(density(m1))
xfit <- seq(min(m1), max(m1), length.out = 40)
yfit <- dnorm(xfit, mean = mean(m1), sd = sd(m1))
lines(xfit, yfit, col = "purple", lwd = 2)

# ECDF of the observed heights with the fitted normal CDF overlaid.
# The previous version plotted the ECDF of rnorm() draws, which describes the
# simulation rather than the data and changes on every run.
plot(
  ecdf(m1),
  main = "Empirical cumulative distribution function(Male Heights)"
)
curve(
  pnorm(x, mean = mean(m1), sd = sd(m1)),
  add = TRUE,
  col = "purple",
  lwd = 2
)

# Normal QQ plot of the standardized observed heights; points close to the
# 45-degree reference line indicate agreement with a normal distribution.
standard_norm_male <- (m1 - mean(m1)) / sd(m1)
qqnorm(standard_norm_male)
abline(0, 1)

# Female heights: compare the observed distribution with a normal distribution
# that has the same mean and standard deviation.

# Histogram on the density scale with a kernel density estimate (black) and
# the fitted normal density (purple) overlaid.
h2 <- hist(
  f1,
  breaks = 15,
  density = 20,
  col = "blue",
  xlab = "Female Height (in cm)",
  main = "Histogram of Female height",
  freq = FALSE,
  ylim = c(0, 0.10)
)
lines(density(f1))
xfit <- seq(min(f1), max(f1), length.out = 40)
yfit <- dnorm(xfit, mean = mean(f1), sd = sd(f1))
lines(xfit, yfit, col = "purple", lwd = 2)

# ECDF of the observed heights with the fitted normal CDF overlaid.
# The previous version plotted the ECDF of rnorm() draws, which describes the
# simulation rather than the data and changes on every run.
plot(
  ecdf(f1),
  main = "Empirical cumulative distribution function(Female Heights)"
)
curve(
  pnorm(x, mean = mean(f1), sd = sd(f1)),
  add = TRUE,
  col = "purple",
  lwd = 2
)

# Normal QQ plot of the standardized observed heights; points close to the
# 45-degree reference line indicate agreement with a normal distribution.
standard_norm_female <- (f1 - mean(f1)) / sd(f1)
qqnorm(standard_norm_female)
abline(0, 1)

Interpretation

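The variable ‘hgt’ is approximately normally distributed for both males and females. I determined this by running the Shapiro-Wilk normality test on each group (p = 0.3716 for males and p = 0.2437 for females, so normality is not rejected at the 5% level), by inspecting the QQ-plots, and by plotting each histogram with a normal distribution overlay and a density curve. The pooled sample, by contrast, does not pass the test (p = 0.01045), which is expected when two groups with different means are combined. The insight gained from this investigation is that males are, on average, taller than females (mean 177.7 cm versus 164.9 cm).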