# This data set contains measurements from 247 men and 260 women, most of whom were considered healthy young adults.
# Fetch the OpenIntro body-dimensions workspace and load it into the session.
# The download is skipped when a local copy already exists, so the script can
# be re-run without network access; delete bdims.RData to force a fresh copy.
if (!file.exists("bdims.RData")) {
  tryCatch(
    # mode = "wb" requests a binary transfer explicitly so the .RData file is
    # never altered by text-mode line-ending translation.
    download.file("http://www.openintro.org/stat/data/bdims.RData",
                  destfile = "bdims.RData", mode = "wb"),
    error = function(e) {
      # Remove any partial file so the next run retries the download
      unlink("bdims.RData")
      stop(e)
    }
  )
}
# load() restores the saved objects (the code below relies on `bdims`) into
# the current environment.
load("bdims.RData")
# Preview the data: print the first six observations across all columns
head(x = bdims, n = 6L)
## bia.di bii.di bit.di che.de che.di elb.di wri.di kne.di ank.di sho.gi
## 1 42.9 26.0 31.5 17.7 28.0 13.1 10.4 18.8 14.1 106.2
## 2 43.7 28.5 33.5 16.9 30.8 14.0 11.8 20.6 15.1 110.5
## 3 40.1 28.2 33.3 20.9 31.7 13.9 10.9 19.7 14.1 115.1
## 4 44.3 29.9 34.0 18.4 28.2 13.9 11.2 20.9 15.0 104.5
## 5 42.5 29.9 34.0 21.5 29.4 15.2 11.6 20.7 14.9 107.5
## 6 43.3 27.0 31.5 19.6 31.3 14.0 11.5 18.8 13.9 119.8
## che.gi wai.gi nav.gi hip.gi thi.gi bic.gi for.gi kne.gi cal.gi ank.gi
## 1 89.5 71.5 74.5 93.5 51.5 32.5 26.0 34.5 36.5 23.5
## 2 97.0 79.0 86.5 94.8 51.5 34.4 28.0 36.5 37.5 24.5
## 3 97.5 83.2 82.9 95.0 57.3 33.4 28.8 37.0 37.3 21.9
## 4 97.0 77.8 78.8 94.0 53.0 31.0 26.2 37.0 34.8 23.0
## 5 97.5 80.0 82.5 98.5 55.4 32.0 28.4 37.7 38.6 24.4
## 6 99.9 82.5 80.1 95.3 57.5 33.0 28.0 36.6 36.1 23.5
## wri.gi age wgt hgt sex
## 1 16.5 21 65.6 174.0 1
## 2 17.0 23 71.8 175.3 1
## 3 16.9 28 80.7 193.5 1
## 4 16.6 23 72.6 186.5 1
## 5 18.0 22 78.8 187.2 1
## 6 16.9 21 74.8 181.5 1
# List the column names of the data frame (25 variables in total)
colnames(bdims)
## [1] "bia.di" "bii.di" "bit.di" "che.de" "che.di" "elb.di" "wri.di"
## [8] "kne.di" "ank.di" "sho.gi" "che.gi" "wai.gi" "nav.gi" "hip.gi"
## [15] "thi.gi" "bic.gi" "for.gi" "kne.gi" "cal.gi" "ank.gi" "wri.gi"
## [22] "age" "wgt" "hgt" "sex"
# Split the observations by sex: rows coded 1 form the men's data frame and
# rows coded 0 the women's. which() keeps only rows where the comparison is
# TRUE, so rows with a missing sex code are left out, just as subset() does.
mdims <- bdims[which(bdims$sex == 1), , drop = FALSE]
fdims <- bdims[which(bdims$sex == 0), , drop = FALSE]
# EXERCISE 1 ----
# Distribution of men's heights on the original centimetre scale
with(mdims, hist(hgt, main = "Men's Heights (cm)", xlab = "Height (cm)"))

# Five-number summary plus the mean for the same measurements
with(mdims, summary(hgt))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 157.2 172.9 177.8 177.7 182.7 198.1
# Men's heights re-expressed in inches, which are easier to relate to.
# Dividing by 2.54 converts centimetres to inches (1 in = 2.54 cm).
minches <- mdims[["hgt"]] / 2.54
# Ask for more breaks (about 12) so the shape shows in more detail
hist(
  minches,
  breaks = 12,
  main = "Men's Heights (inches)",
  xlab = "Height (in)"
)

# Location summary first (its recorded output follows), then the spread
summary(minches)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 61.89 68.07 70.00 69.98 71.91 77.99
sd(minches)
## [1] 2.8282
# Distribution of women's heights on the original centimetre scale
with(fdims, hist(hgt, main = "Women's Heights (cm)", xlab = "Height (cm)"))

# Five-number summary plus the mean for the same measurements
with(fdims, summary(hgt))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 147.2 160.0 164.5 164.9 169.5 182.9
# Women's heights re-expressed in inches, which are easier to relate to.
# Dividing by 2.54 converts centimetres to inches (1 in = 2.54 cm).
finches <- fdims[["hgt"]] / 2.54
hist(
  finches,
  breaks = 20,
  main = "Women's Heights (inches)",
  xlab = "Height (in)"
)

# Location summary first (its recorded output follows), then the spread
summary(finches)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 57.95 62.99 64.76 64.91 66.73 72.01
sd(finches)
## [1] 2.576615
# Plot men and women next to each other in a one-row, two-column layout.
# par() returns the previous settings; they are saved here and restored after
# the second panel so the two-panel layout does not leak into later plots.
old_par <- par(mfrow = c(1, 2))
# Both panels share the same x and y limits and show both means
# (blue = men, red = women) so the two distributions can be compared directly.
hist(mdims$hgt, xlim = c(140, 200), ylim = c(0, 80),
     main = "Men's Heights", xlab = "cm")
abline(v = mean(mdims$hgt), col = "blue", lwd = 3)
abline(v = mean(fdims$hgt), col = "red", lwd = 3)
hist(fdims$hgt, xlim = c(140, 200), ylim = c(0, 80),
     main = "Women's Heights", xlab = "cm")
abline(v = mean(mdims$hgt), col = "blue", lwd = 3)
abline(v = mean(fdims$hgt), col = "red", lwd = 3)
# Back to the previous plotting layout
par(old_par)

# The histograms for men and women heights look fairly normal.
# Men have a much stronger peak, with many men falling around the median (~70").
# Women have an unexpected second peak at ~ 63" (159 cm).
# EXERCISE 2 ----
# Keep the sample mean and standard deviation of women's heights as named
# objects; they parameterise the normal model here and are reused further down.
fhgtmean <- with(fdims, mean(hgt))
fhgtsd <- with(fdims, sd(hgt))
# Histogram on the density scale so a normal curve can be laid over it
hist(
  fdims$hgt,
  breaks = 15,
  probability = TRUE,
  ylim = c(0, 0.09),
  main = "Density Histogram of Women's Heights",
  xlab = "Height (cm)"
)
# Evaluate the fitted normal density at each whole centimetre from 140 to 190
# and draw it on top of the histogram in blue
x <- seq(from = 140, to = 190)
y <- dnorm(x, fhgtmean, fhgtsd)
lines(x, y, col = "blue")

# Adding breaks=15 allows you to see that the distribution does not look quite normal. There is a second peak at 158-159 cm. With fewer breaks, it's easy to miss this second peak. But the normal curve superimposed on the histogram makes this distribution look otherwise fairly normal.
# POST-DISCUSSION W/PROF - This curve is not considered bimodal. It would be considered a normal distribution
# Run the Shapiro-Wilk test on women's heights as a formal check of the
# visual impression; its null hypothesis is that the sample comes from a
# normal distribution.
# The argument is written as fdims$hgt on purpose: the "data:" line of the
# printed result echoes this expression.
shapiro.test(fdims$hgt)
##
## Shapiro-Wilk normality test
##
## data: fdims$hgt
## W = 0.99283, p-value = 0.2437
# p-value is 0.2437 (> 0.05) ==> NO COMPELLING EVIDENCE FOR NON-NORMALITY. We fail to reject normality, so treating the heights as normal is reasonable.
# EXERCISE 3 ----
# Normal probability (Q-Q) plot of women's heights with a reference line
with(fdims, {
  qqnorm(hgt, main = "Normal Q-Q Plot Women's Heights")
  qqline(hgt)
})

# The points do not all fall along the straight line. There are points off the line on the lower and upper ends.
# Draw a simulated sample from the normal model fitted to women's heights:
# one value per woman, using the stored mean and standard deviation.
# No seed is set in the code shown here, so the sample differs between runs.
sim_norm <- rnorm(nrow(fdims), mean = fhgtmean, sd = fhgtsd)
# Q-Q plot of the simulated sample, for comparison with the real data
qqnorm(y = sim_norm)
qqline(y = sim_norm)

# This Q-Q plot looks a bit different from the Q-Q plot based on real data. The lower outliers (z scores < -2 ) are below the line instead of above.
# EXERCISE 4 ----
# Generate more Q-Q plots of simulated normal distributions of women's heights
# to compare against the Q-Q plot of the real data.
# NOTE(review): qqnormsim() is not defined in this script and no package is
# attached here; presumably it is restored by load("bdims.RData") - confirm.
qqnormsim(fdims$hgt)

# Two of these plots have lower outliers above the line, just like the normal probability plot based on real data. So these plots do provide evidence that female heights are fairly normal.
# EXERCISE 5 ----
# Normal probability (Q-Q) plot of women's weights with a reference line
with(fdims, {
  qqnorm(wgt, main = "Normal Q-Q Plot Women's Weights")
  qqline(wgt)
})

# This Q-Q plot does not look normal. There is a slight U-shape.
# Generate Q-Q plots of simulated data for women's weights, to compare with
# the Q-Q plot of the real weights.
# NOTE(review): qqnormsim() is not defined in this script and no package is
# attached here; presumably it is restored by load("bdims.RData") - confirm.
qqnormsim(fdims$wgt)

# These Q-Q plots look more or less the same - they all have straight lines. The Q-Q plot based on real data above shows a bit more skew on both ends.
# The female weight data appears to come from a normal distribution, but there is a strong right skew or larger outliers than a perfectly normal distribution.
# To check the visual guess, plot the histogram for female weights.
# breaks=30 asks for roughly 30 bins so the tails show in finer detail.
# No main/xlab is given, so the plot title and x-axis label are derived from
# the argument expression fdims$wgt.
hist(fdims$wgt,breaks=30)

# The histogram confirms the right skew
# EXERCISE 6 ----
# Calculate the theoretical probability that a randomly chosen woman is taller
# than 182 cm under the normal model fitted to women's heights.
# lower.tail = FALSE returns the upper-tail area directly instead of
# subtracting the lower tail from 1.
pnorm(q = 182, mean = fhgtmean, sd = fhgtsd, lower.tail = FALSE)
## [1] 0.004434387
# Calculate the empirical probability: the mean of a logical vector is the
# proportion of TRUE values, i.e. the share of women taller than 182 cm.
mean(fdims$hgt > 182)
## [1] 0.003846154
# Question 1 - What is the probability that a randomly chosen woman weighs less than 110 pounds (50 kg)?
fwgtmean <- mean(fdims$wgt)
fwgtsd <- sd(fdims$wgt)
# Theoretical probability
pnorm(q = 50, mean = fwgtmean, sd = fwgtsd)
## [1] 0.135143
# Empirical probability
mean(fdims$wgt < 50)
## [1] 0.1038462
# Question 2 - What is the probability that a randomly chosen woman's height is between 60 inches (152 cm) and 63 inches (160 cm)?
# Theoretical probability: P(height < 160) - P(height < 152)
fminheight <- pnorm(q = 152, mean = fhgtmean, sd = fhgtsd)
fmaxheight <- pnorm(q = 160, mean = fhgtmean, sd = fhgtsd)
fmaxheight - fminheight
## [1] 0.2036941
# Empirical probability: share of heights in [152, 160). The proportion is
# taken over the height column itself rather than the length of the weight
# column.
mean(fdims$hgt >= 152 & fdims$hgt < 160)
## [1] 0.1730769
## [1] 0.1730769
# ON YOUR OWN ----
# Question 1 ----
# Histograms 1a-1d are each matched to one of the lettered Q-Q plots (A-D).
# NOTE(review): plots A-D are not produced by this script; presumably they
# come from the lab handout - confirm.
# No main/xlab is supplied to any of these calls, so each plot's title and
# x-axis label are derived from the argument expression.

# 1a. Histogram for Female Biiliac (pelvic) diameter
hist(fdims$bii.di, breaks=20)

# This plot belongs to Q-Q plot B - because it has a left skew
# 1b. Histogram for Female Elbow diameter
hist(fdims$elb.di, breaks=30)

# This plot belongs to Q-Q plot C - because it is reasonably bell-shaped
# 1c. Histogram for General Age (uses the full bdims data, not only fdims)
hist(bdims$age, breaks=30)

# This plot belongs to Q-Q plot D - because it has a strong right skew and looks more exponential than 1d
# 1d. Histogram for Female Chest depth
hist(fdims$che.de, breaks=30)

# This plot belongs to Q-Q plot A - because it looks normal with a right skew
# CHECK QUESTION 1 ANSWERS
# Each histogram above is checked by drawing the Q-Q plot of the same
# variable, with a reference line added.

# 1a: female biiliac diameter
with(fdims, {
  qqnorm(bii.di)
  qqline(bii.di)
})

# 1b: female elbow diameter
with(fdims, {
  qqnorm(elb.di)
  qqline(elb.di)
})

# 1c: age, taken from the full data set
with(bdims, {
  qqnorm(age)
  qqline(age)
})

# 1d: female chest depth
with(fdims, {
  qqnorm(che.de)
  qqline(che.de)
})

# Question 2 ----
# Why the slight step-wise pattern in Plots C and D ??
# I'm not sure what is meant by "step" pattern. I see small "steps" in all the plots. I don't see any distinct difference in "steps" in Plots C and D.
# FOLLOW-UP
# Question 3 ----
# Normal Q-Q plot of female knee diameter with a reference line
with(fdims, {
  qqnorm(kne.di)
  qqline(kne.di)
})

# Knee diameter looks right skewed.
# Check the answer read off the Q-Q plot by drawing the histogram of the same
# variable (female knee diameter). Default binning is used, and the plot title
# and x-axis label are derived from the argument expression.
hist(fdims$kne.di)

# Histogram confirms the right skew