Download data file
# Global knitr option: echo the source of every chunk in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
# Clear the console (writes the form-feed character).
cat("\014")
# Fetch the data file only when no local copy exists, so re-running the
# script does not download it again. mode = "wb" requests a binary
# transfer explicitly, which .RData files require on Windows.
data_file <- "bdims.RData"
if (!file.exists(data_file)) {
  download.file(
    "http://www.openintro.org/stat/data/bdims.RData",
    destfile = data_file,
    mode = "wb"
  )
}
# Restore the objects saved in the file (including bdims) into the workspace.
load(data_file)
# Preview the first rows of bdims.
head(bdims)
## bia.di bii.di bit.di che.de che.di elb.di wri.di kne.di ank.di sho.gi
## 1 42.9 26.0 31.5 17.7 28.0 13.1 10.4 18.8 14.1 106.2
## 2 43.7 28.5 33.5 16.9 30.8 14.0 11.8 20.6 15.1 110.5
## 3 40.1 28.2 33.3 20.9 31.7 13.9 10.9 19.7 14.1 115.1
## 4 44.3 29.9 34.0 18.4 28.2 13.9 11.2 20.9 15.0 104.5
## 5 42.5 29.9 34.0 21.5 29.4 15.2 11.6 20.7 14.9 107.5
## 6 43.3 27.0 31.5 19.6 31.3 14.0 11.5 18.8 13.9 119.8
## che.gi wai.gi nav.gi hip.gi thi.gi bic.gi for.gi kne.gi cal.gi ank.gi
## 1 89.5 71.5 74.5 93.5 51.5 32.5 26.0 34.5 36.5 23.5
## 2 97.0 79.0 86.5 94.8 51.5 34.4 28.0 36.5 37.5 24.5
## 3 97.5 83.2 82.9 95.0 57.3 33.4 28.8 37.0 37.3 21.9
## 4 97.0 77.8 78.8 94.0 53.0 31.0 26.2 37.0 34.8 23.0
## 5 97.5 80.0 82.5 98.5 55.4 32.0 28.4 37.7 38.6 24.4
## 6 99.9 82.5 80.1 95.3 57.5 33.0 28.0 36.6 36.1 23.5
## wri.gi age wgt hgt sex
## 1 16.5 21 65.6 174.0 1
## 2 17.0 23 71.8 175.3 1
## 3 16.9 28 80.7 193.5 1
## 4 16.6 23 72.6 186.5 1
## 5 18.0 22 78.8 187.2 1
## 6 16.9 21 74.8 181.5 1
# Split the measurements by the sex code: rows coded 1 go to mdims and rows
# coded 0 go to fdims. which() drops rows whose comparison is NA, matching
# what subset() does.
mdims <- bdims[which(bdims$sex == 1), , drop = FALSE]
fdims <- bdims[which(bdims$sex == 0), , drop = FALSE]
Exercise 1
# Exercise 1: Make a histogram of men’s heights and a histogram of women’s heights. How would you compare the various aspects of the two distributions?

# Sample mean and standard deviation of height for each group; these
# parameterise the normal curves overlaid on the histograms below.
mhgtmean <- mean(mdims$hgt)
mhgtsd <- sd(mdims$hgt)
fhgtmean <- mean(fdims$hgt)
fhgtsd <- sd(fdims$hgt)

# Men: density-scaled histogram with the fitted normal curve in red.
# The curve grid spans the observed heights instead of a fixed 140:190,
# because the data contain heights above 190 cm and a fixed grid would
# cut the curve off before the right tail.
hist(mdims$hgt, probability = TRUE)
x <- seq(min(mdims$hgt), max(mdims$hgt), length.out = 200)
y <- dnorm(x = x, mean = mhgtmean, sd = mhgtsd)
lines(x = x, y = y, col = "red")

# Women: same construction, fitted normal curve in blue.
hist(fdims$hgt, probability = TRUE)
x <- seq(min(fdims$hgt), max(fdims$hgt), length.out = 200)
y <- dnorm(x = x, mean = fhgtmean, sd = fhgtsd)
lines(x = x, y = y, col = "blue")

# Answer: Based on the histograms, both distributions appear to be approximately normal.
Exercise 2
# Exercise 2: Based on this plot, does it appear that the data follow a nearly normal distribution?

# Density-scaled histogram of female heights, with the normal density for
# mean fhgtmean and standard deviation fhgtsd drawn over it in blue.
hist(fdims$hgt, freq = FALSE)
x <- seq.int(140L, 190L)
y <- dnorm(x, fhgtmean, fhgtsd)
lines(x, y, col = "blue")

# Normal Q-Q plot of the same heights, followed by its reference line.
qqnorm(fdims$hgt)
qqline(y = fdims$hgt)

# Answer: Yes. Both the histogram and the Q-Q plot suggest that female heights are approximately normally distributed.
Exercise 3
# Exercise 3: Make a normal probability plot of sim_norm. Do all of the points fall on the line? How does this plot compare to the probability plot for the real data?

# Simulate as many normal draws as there are female heights, using the
# sample mean and standard deviation of the real data.
sim_norm <- rnorm(n = length(fdims$hgt), mean = fhgtmean, sd = fhgtsd)
qqnorm(sim_norm, main = "QQ Plot simulated Data")
# The reference line must be computed from the sample that was plotted
# (sim_norm), not from fdims$hgt; otherwise the points are judged against
# a line that belongs to a different data set.
qqline(sim_norm)

# Answer: Not every point falls exactly on the line; some deviation, mostly in
# the tails, is expected from random sampling even though sim_norm is drawn
# from a normal distribution by construction. The earlier impression that many
# points fall off the line came from drawing the reference line for fdims$hgt
# instead of sim_norm. NOTE(review): re-inspect the regenerated plot and
# compare it with the Q-Q plot of the real data before finalising this answer.
Exercise 4
# Exercise 4: Does the normal probability plot for fdims$hgt look similar to the plots created for the simulated data?
# That is, do plots provide evidence that the female heights are nearly normal?
# NOTE(review): qqnormsim() is not defined in the visible part of this file;
# presumably it is the lab helper restored by load("bdims.RData") -- confirm
# it is available before this call.
qqnormsim(fdims$hgt)

# Answer: Yes. The plot for fdims$hgt looks similar to the plots of the simulated normal samples, which supports female heights being nearly normal.
Exercise 5
# Exercise 5: Using the same technique, determine whether or not female weights appear to come from a normal distribution.
# NOTE(review): qqnormsim() is not defined in the visible part of this file;
# presumably it is the lab helper restored by load("bdims.RData") -- confirm
# it is available before this call.
qqnormsim(fdims$wgt)

# Answer: Yes, female weights also appear to be approximately normally distributed.
Exercise 6
# Exercise 6: Write out two probability questions that you would like to answer;
# one regarding female heights and one regarding female weights.
# Calculate those probabilities using both the theoretical normal distribution
# as well as the empirical distribution (four probabilities in all).
# Which variable, height or weight, had a closer agreement between the two methods?
# Question 1: What is the probability that a female aged 20 or younger is shorter than 160 cm?
# (The wording matches the computation below: age <= 20 and the lower tail, height < 160.)
fdimsNew <- subset(bdims, sex == 0 & age <= 20)
fhgtmeanNew <- mean(fdimsNew$hgt)
fhgtsdNew <- sd(fdimsNew$hgt)
# Theoretical: P(height < 160) under a normal distribution with the
# subgroup's sample mean and standard deviation.
pnorm(q = 160, mean = fhgtmeanNew, sd = fhgtsdNew)
## [1] 0.2697275
# Empirical: proportion of observed heights below 160 (the mean of a
# logical vector is the share of TRUE values).
mean(fdimsNew$hgt < 160)
## [1] 0.2291667
# Answer: The theoretical probability is 0.2697 (26.97%); the empirical probability is 0.2292 (22.92%)
# Question 2: What is the probability that a female aged more than 35 weighs more than 65 kg?
fdimsNew2 <- subset(bdims, sex == 0 & age > 35)
# Note: despite the "hgt" in their names, these two variables hold the mean
# and standard deviation of weight for the subgroup.
fhgtmeanNew2 <- mean(fdimsNew2$wgt)
fhgtsdNew2 <- sd(fdimsNew2$wgt)
# Theoretical: the question asks for P(weight > 65), so the upper tail is
# needed. pnorm() returns the lower tail P(weight < 65) by default, which
# would not be comparable with the empirical proportion below.
pnorm(q = 65, mean = fhgtmeanNew2, sd = fhgtsdNew2, lower.tail = FALSE)
## [1] 0.4198581
# Empirical: proportion of observed weights above 65.
mean(fdimsNew2$wgt > 65)
## [1] 0.3157895
# Answer: The theoretical probability is 0.4199 (41.99%); the empirical probability is 0.3158 (31.58%)
# Comparing the two methods: height showed closer agreement (0.2697 vs 0.2292)
# than weight (0.4199 vs 0.3158).