Download data file

# Show each chunk's source code in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

# Clear the console
cat("\014")

# Download the workspace only when no local copy exists, so re-running the
# report does not hit the network every time.
# mode = "wb" requests a binary transfer: .RData is a binary format and a
# text-mode transfer on Windows can corrupt it.
bdims_file <- "bdims.RData"
if (!file.exists(bdims_file)) {
  download.file(
    "http://www.openintro.org/stat/data/bdims.RData",
    destfile = bdims_file,
    mode = "wb"
  )
}
# Restores the saved objects (including the `bdims` data frame used below).
load(bdims_file)

# Preview the first rows of the data set.
head(bdims)
##   bia.di bii.di bit.di che.de che.di elb.di wri.di kne.di ank.di sho.gi
## 1   42.9   26.0   31.5   17.7   28.0   13.1   10.4   18.8   14.1  106.2
## 2   43.7   28.5   33.5   16.9   30.8   14.0   11.8   20.6   15.1  110.5
## 3   40.1   28.2   33.3   20.9   31.7   13.9   10.9   19.7   14.1  115.1
## 4   44.3   29.9   34.0   18.4   28.2   13.9   11.2   20.9   15.0  104.5
## 5   42.5   29.9   34.0   21.5   29.4   15.2   11.6   20.7   14.9  107.5
## 6   43.3   27.0   31.5   19.6   31.3   14.0   11.5   18.8   13.9  119.8
##   che.gi wai.gi nav.gi hip.gi thi.gi bic.gi for.gi kne.gi cal.gi ank.gi
## 1   89.5   71.5   74.5   93.5   51.5   32.5   26.0   34.5   36.5   23.5
## 2   97.0   79.0   86.5   94.8   51.5   34.4   28.0   36.5   37.5   24.5
## 3   97.5   83.2   82.9   95.0   57.3   33.4   28.8   37.0   37.3   21.9
## 4   97.0   77.8   78.8   94.0   53.0   31.0   26.2   37.0   34.8   23.0
## 5   97.5   80.0   82.5   98.5   55.4   32.0   28.4   37.7   38.6   24.4
## 6   99.9   82.5   80.1   95.3   57.5   33.0   28.0   36.6   36.1   23.5
##   wri.gi age  wgt   hgt sex
## 1   16.5  21 65.6 174.0   1
## 2   17.0  23 71.8 175.3   1
## 3   16.9  28 80.7 193.5   1
## 4   16.6  23 72.6 186.5   1
## 5   18.0  22 78.8 187.2   1
## 6   16.9  21 74.8 181.5   1
# Split the measurements by the `sex` column: rows coded 1 go to `mdims`
# (used below as the men's data), rows coded 0 go to `fdims` (the women's).
# which() drops rows where the comparison is NA, matching subset().
mdims <- bdims[which(bdims$sex == 1), , drop = FALSE]
fdims <- bdims[which(bdims$sex == 0), , drop = FALSE]

Exercise 1

# Exercise 1: Make a histogram of men’s heights and a histogram of women’s heights. How would you compare the various aspects of the two distributions?

# Sample mean and standard deviation of height for each sex; these are the
# parameters of the normal curves overlaid on the histograms below.
mhgtmean <- mean(mdims$hgt)
mhgtsd   <- sd(mdims$hgt)

fhgtmean <- mean(fdims$hgt)
fhgtsd   <- sd(fdims$hgt)

# Men's heights: density-scaled histogram with the fitted normal curve.
# The curve grid spans the observed range instead of a fixed 140:190, which
# stops short of the tallest men (e.g. 193.5 in the data preview) and would
# leave the red curve truncated on the right.
hist(mdims$hgt, probability = TRUE)
x <- seq(from = min(mdims$hgt), to = max(mdims$hgt), length.out = 200)
y <- dnorm(x = x, mean = mhgtmean, sd = mhgtsd)
lines(x = x, y = y, col = "red")

# Women's heights: same construction, with the grid again taken from the data.
hist(fdims$hgt, probability = TRUE)
x <- seq(from = min(fdims$hgt), to = max(fdims$hgt), length.out = 200)
y <- dnorm(x = x, mean = fhgtmean, sd = fhgtsd)
lines(x = x, y = y, col = "blue")

# Answer: Based on the histograms, both distributions appear to be approximately normal

Exercise 2

# Exercise 2: Based on this plot, does it appear that the data follow a nearly normal distribution?

# Density-scaled histogram of the female heights (freq = FALSE is the same
# request as probability = TRUE), overlaid with the normal density that uses
# the sample mean and standard deviation.
hist(fdims$hgt, freq = FALSE)
x <- seq(140, 190)
y <- dnorm(x, fhgtmean, fhgtsd)
lines(x, y, col = "blue")

# Normal Q-Q plot of the same heights with its reference line.
qqnorm(fdims$hgt)
qqline(fdims$hgt)

# Answer: Yes, based on the histogram as well as the Q-Q plot, the distribution of female heights appears to be approximately normal

Exercise 3

# Exercise 3: Make a normal probability plot of sim_norm. Do all of the points fall on the line? How does this plot compare to the probability plot for the real data?

# Simulate a normal sample with the same size, mean and standard deviation as
# the female heights.
sim_norm <- rnorm(n = length(fdims$hgt), mean = fhgtmean, sd = fhgtsd)
qqnorm(sim_norm, main = "QQ Plot simulated Data")
# The reference line must be fitted to the sample that is plotted (sim_norm).
# Drawing the line for fdims$hgt instead overlays the real data's line on the
# simulated points and makes the simulated sample look non-normal.
qqline(sim_norm)

# Answer: Not every point falls exactly on the line, even though the data were
# drawn from a normal distribution. With the line fitted to the simulated
# sample, the points should track it closely, with small deviations in the
# tails, much like the probability plot for the real data.

Exercise 4

# Exercise 4: Does the normal probability plot for fdims$hgt look similar to the plots created for the simulated data? 
# That is, do plots provide evidence that the female heights are nearly normal?

# qqnormsim() is not defined in the visible part of this file; presumably it
# is provided by the loaded bdims.RData workspace or a course package
# (TODO confirm). Here it is applied to the female heights.
qqnormsim(fdims$hgt)

# Answer: Yes, the plot for the real female heights looks similar to the plots of the simulated normal data, so the female heights appear to be nearly normal

Exercise 5

# Exercise 5: Using the same technique, determine whether or not female weights appear to come from a normal distribution.

# Same check as for the heights, applied to the female weights. qqnormsim()
# is not defined in the visible part of this file; presumably it is provided
# by the loaded workspace or a course package (TODO confirm).
qqnormsim(fdims$wgt)

# Answer: Yes, the weight also appears to be normal

Exercise 6

# Exercise 6: Write out two probability questions that you would like to answer; 
# one regarding female heights and one regarding female weights. 
# Calculate those probabilities using both the theoretical normal distribution 
# as well as the empirical distribution (four probabilities in all). 
# Which variable, height or weight, had a closer agreement between the two methods?

# Question 1: What is the probability that a female aged 20 or younger is shorter than 160 cm?
# Rows with sex == 0 and age <= 20 (the young female subgroup).
fdimsNew <- bdims[which(bdims$sex == 0 & bdims$age <= 20), , drop = FALSE]
fhgtmeanNew <- mean(fdimsNew[["hgt"]])
fhgtsdNew <- sd(fdimsNew[["hgt"]])

# Theoretical P(height < 160) under a normal model that uses the subgroup's
# own mean and standard deviation.
pnorm(160, mean = fhgtmeanNew, sd = fhgtsdNew)
## [1] 0.2697275
# Empirical proportion of the subgroup with height below 160.
mean(fdimsNew[["hgt"]] < 160)
## [1] 0.2291667
# Answer: The empirical probability is 0.2292 (22.92%); the theoretical normal probability is 0.2697 (26.97%)

# Question 2: What is the probability that a female aged more than 35 weighs more than 65 kg?
# Rows with sex == 0 and age > 35 (the older female subgroup).
fdimsNew2 <- subset(bdims, sex == 0 & age > 35)
# NOTE: despite the "hgt" in their names, these two hold the mean and standard
# deviation of WEIGHT (wgt) for the subgroup.
fhgtmeanNew2 <- mean(fdimsNew2$wgt)
fhgtsdNew2   <- sd(fdimsNew2$wgt)

# Theoretical P(weight > 65). pnorm() returns the lower tail P(X <= q) by
# default, which does not match the question or the empirical calculation
# below, so the upper tail is requested explicitly.
# The value shown is 1 - 0.5801419, the complement of the lower-tail result.
pnorm(q = 65, mean = fhgtmeanNew2, sd = fhgtsdNew2, lower.tail = FALSE)
## [1] 0.4198581
# Empirical proportion of the subgroup weighing more than 65.
sum(fdimsNew2$wgt > 65) / length(fdimsNew2$wgt)
## [1] 0.3157895
# Answer: The empirical probability is 0.3158 (31.58%); the theoretical normal
# probability is about 0.4199 (41.99%). Height showed closer agreement between
# the two methods (0.2697 vs 0.2292) than weight (0.4199 vs 0.3158).