#The data
library(DATA606)
## Loading required package: shiny
## Warning: package 'shiny' was built under R version 3.5.2
## Loading required package: openintro
## Warning: package 'openintro' was built under R version 3.5.2
## Please visit openintro.org for free statistics materials
## 
## Attaching package: 'openintro'
## The following objects are masked from 'package:datasets':
## 
##     cars, trees
## Loading required package: OIdata
## Warning: package 'OIdata' was built under R version 3.5.2
## Loading required package: RCurl
## Warning: package 'RCurl' was built under R version 3.5.2
## Loading required package: bitops
## Loading required package: maps
## Warning: package 'maps' was built under R version 3.5.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:openintro':
## 
##     diamonds
## Loading required package: markdown
## Warning: package 'markdown' was built under R version 3.5.2
## 
## Welcome to CUNY DATA606 Statistics and Probability for Data Analytics 
## This package is designed to support this course. The text book used 
## is OpenIntro Statistics, 3rd Edition. You can read this by typing 
## vignette('os3') or visit www.OpenIntro.org. 
##  
## The getLabs() function will return a list of the labs available. 
##  
## The demo(package='DATA606') will list the demos that are available.
## 
## Attaching package: 'DATA606'
## The following object is masked from 'package:utils':
## 
##     demo
# startLab() is provided by the DATA606 package. Per the echoed output below it
# switches the working directory to the package's Lab3 folder; the relative
# path passed to load() depends on that, so keep these statements in this order.
startLab('Lab3')
## Setting working directory to C:/Users/zahir/Documents/R/win-library/3.5/DATA606/labs/Lab3
## [1] "C:/Users/zahir/Documents/Data 606/Lab3/zahir-normal_distribution.Rmd"
# Load the lab's .RData file (expected to define the `bdims` data frame used
# throughout the rest of the script) and preview its first rows.
load("more/bdims.RData")
head(bdims)
##   bia.di bii.di bit.di che.de che.di elb.di wri.di kne.di ank.di sho.gi
## 1   42.9   26.0   31.5   17.7   28.0   13.1   10.4   18.8   14.1  106.2
## 2   43.7   28.5   33.5   16.9   30.8   14.0   11.8   20.6   15.1  110.5
## 3   40.1   28.2   33.3   20.9   31.7   13.9   10.9   19.7   14.1  115.1
## 4   44.3   29.9   34.0   18.4   28.2   13.9   11.2   20.9   15.0  104.5
## 5   42.5   29.9   34.0   21.5   29.4   15.2   11.6   20.7   14.9  107.5
## 6   43.3   27.0   31.5   19.6   31.3   14.0   11.5   18.8   13.9  119.8
##   che.gi wai.gi nav.gi hip.gi thi.gi bic.gi for.gi kne.gi cal.gi ank.gi
## 1   89.5   71.5   74.5   93.5   51.5   32.5   26.0   34.5   36.5   23.5
## 2   97.0   79.0   86.5   94.8   51.5   34.4   28.0   36.5   37.5   24.5
## 3   97.5   83.2   82.9   95.0   57.3   33.4   28.8   37.0   37.3   21.9
## 4   97.0   77.8   78.8   94.0   53.0   31.0   26.2   37.0   34.8   23.0
## 5   97.5   80.0   82.5   98.5   55.4   32.0   28.4   37.7   38.6   24.4
## 6   99.9   82.5   80.1   95.3   57.5   33.0   28.0   36.6   36.1   23.5
##   wri.gi age  wgt   hgt sex
## 1   16.5  21 65.6 174.0   1
## 2   17.0  23 71.8 175.3   1
## 3   16.9  28 80.7 193.5   1
## 4   16.6  23 72.6 186.5   1
## 5   18.0  22 78.8 187.2   1
## 6   16.9  21 74.8 181.5   1
#Creating a subset
# Split the sample on the `sex` indicator: rows coded 0 become `fdims` and rows
# coded 1 become `mdims` (treated below as the women's and men's samples).
# which() drops rows where the comparison is NA, matching subset().
fdims <- bdims[which(bdims[["sex"]] == 0), ]
mdims <- bdims[which(bdims[["sex"]] == 1), ]

#Exercise 1: Make a histogram of men's heights and a histogram of women's heights. How would you compare the various aspects of the two distributions?
# Two plotting panels in one row so the histograms can be compared side by side.
par(mfrow = c(1, 2))
with(mdims, hist(hgt, xlab = "Height", main = "Height of men"))
with(fdims, hist(hgt, xlab = "Height", main = "Height of women"))

# Numeric summaries of the same two height vectors.
summary(mdims[["hgt"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   157.2   172.9   177.8   177.7   182.7   198.1
summary(fdims[["hgt"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   147.2   160.0   164.5   164.9   169.5   182.9
##The mean of the male height is 178 whereas for female it is 165.
##The men's heights look more normally distributed, whereas the women's distribution is skewed left.
##Both distributions are unimodal.

#The Normal Distribution
# Sample standard deviation and mean of the women's heights; these are the
# parameters of the normal curve drawn over the histogram (and reused later).
fhgtsd   <- sd(fdims[["hgt"]])
fhgtmean <- mean(fdims[["hgt"]])
# Return to a single plotting panel, then draw the histogram on a density
# scale so the normal density can be overlaid on the same axis.
par(mfrow = c(1, 1))
with(
  fdims,
  hist(hgt, main = "Height of women", xlab = "Height", ylim = c(0, 0.06),
       probability = TRUE)
)
# Evaluate the fitted normal density at each integer from 140 to 190 and
# connect the points with a blue line.
x <- seq(140, 190)
y <- dnorm(x, mean = fhgtmean, sd = fhgtsd)
lines(x, y, col = "blue")

#Exercise 2: Based on this plot, does it appear that the data follow a nearly normal distribution?
##The data seems to follow a normal distribution but we should check with a QQplot
# Normal Q-Q plot of the women's heights, with the reference line added on top.
with(fdims, {
  qqnorm(hgt)
  qqline(hgt)
})

##The data points are very close to the line, but there are some outliers at both the top and the bottom.

#Evaluating the Normal Distribution
# Simulate one value per woman in the sample from a normal distribution that
# uses the sample's own mean and standard deviation.
sim_norm <- rnorm(n = length(fdims$hgt), mean = fhgtmean, sd = fhgtsd)

#Exercise 3: Make a normal probability plot of sim_norm. Do all of the points fall on the line? How does this plot compare to the probability plot for the real data?
# The exercise asks for the Q-Q plot of the simulated sample itself, so plot
# sim_norm directly (previously it was generated but never plotted).
qqnorm(sim_norm)
qqline(sim_norm)

# qqnormsim() comes from the DATA606 package; it presumably draws the real
# data next to several freshly simulated samples (the answers below refer to
# "the simulated charts") -- confirm against the package documentation.
# Kept because Exercise 4 compares the real-data plot with those panels.
qqnormsim(fdims$hgt)

##Not all of the points fall on the line.
##The simulated charts look very similar to the chart using real data.

#Exercise 4: Does the normal probability plot for fdims$hgt look similar to the plots created for the simulated data? That is, do plots provide evidence that the female heights are nearly normal?
##Yes they look very similar.
##Yes the plots provide evidence that female heights are nearly normal.

#Exercise 5: Using the same technique, determine whether or not female weights appear to come from a normal distribution.
# Sample standard deviation and mean of the women's weights, used as the
# parameters of the overlaid normal curve (and reused in Exercise 6).
fwgtsd   <- sd(fdims[["wgt"]])
fwgtmean <- mean(fdims[["wgt"]])
# Density-scaled histogram so the normal density shares its y-axis.
with(
  fdims,
  hist(wgt, main = "Weight of women", xlab = "Weight", probability = TRUE)
)
# Fitted normal density evaluated at each integer from 40 to 110, drawn in blue.
x <- seq(40, 110)
y <- dnorm(x, mean = fwgtmean, sd = fwgtsd)
lines(x, y, col = "blue")

##The weight distribution seems to be skewed to the right.
##we will plot a QQ chart to check
# Normal Q-Q plot of the women's weights with its reference line.
with(fdims, {
  qqnorm(wgt)
  qqline(wgt)
})

# Same comparison-against-simulations helper that was used for the heights.
qqnormsim(fdims$wgt)

##There are too many outliers in the real data qqplot which reconfirms the data does not follow a normal distribution.

#Normal Probabilities
##"What is the probability that a randomly chosen young adult female is taller than 6 feet (about 182 cm)?"
# Theoretical answer: upper-tail area above 182 under the normal curve fitted
# to the women's heights.
1 - pnorm(182, mean = fhgtmean, sd = fhgtsd)
## [1] 0.004434387
##Calculate empirically
# Share of women in the sample whose recorded height exceeds 182.
with(fdims, sum(hgt > 182) / length(hgt))
## [1] 0.003846154
##The probabilities are not equal but are close.

#Exercise 6: Write out two probability questions that you would like to answer; one regarding female heights and one regarding female weights. Calculate those probabilities using both the theoretical normal distribution as well as the empirical distribution (four probabilities in all). Which variable, height or weight, had a closer agreement between the two methods?
##What is the probability that the height of a woman is between 160 and 170 cm?
# Theoretical: difference between the fitted normal CDF at 170 and at 160.
diff(pnorm(c(160, 170), mean = fhgtmean, sd = fhgtsd))
## [1] 0.5550392
# Empirical: share of sampled women with a height in [160, 170].
with(fdims, sum(hgt >= 160 & hgt <= 170) / length(hgt))
## [1] 0.5846154
##What is the probability that the weight of a woman is over 65 kg?
# Theoretical: upper-tail area above 65 under the fitted normal for weights.
1 - pnorm(65, mean = fwgtmean, sd = fwgtsd)
## [1] 0.3236397
# Empirical share of sampled women at or above 65.
# NOTE(review): the question says "over 65 kg" but this comparison is inclusive
# (>= 65), unlike the strict > used for the 182 cm height check -- confirm
# which is intended.
with(fdims, sum(wgt >= 65) / length(wgt))
## [1] 0.2615385
##The variable height has a closer agreement.

#ON YOUR OWN
##1. Now let's consider some of the other variables in the body dimensions data set. Using the figures at the end of the exercises, match the histogram to its normal probability plot. All of the variables have been standardized (first subtract the mean, then divide by the standard deviation), so the units won't be of any help. If you are uncertain based on these figures, generate the plots in R to check.

##a)The histogram for female biiliac (pelvic) diameter (bii.di) belongs to normal probability plot letter
# Q-Q plot of the women's bii.di column, for matching against the lettered figures.
with(fdims, {
  qqnorm(bii.di)
  qqline(bii.di)
})

##Answer is B

##b)The histogram for female elbow diameter (elb.di) belongs to normal probability plot letter
with(fdims, {
  qqnorm(elb.di)
  qqline(elb.di)
})

##Answer is C

##c)The histogram for general age (age) belongs to normal probability plot letter
# Age is taken from the full data set (bdims), not only from the women's subset.
with(bdims, {
  qqnorm(age)
  qqline(age)
})

##Answer is D

##d)The histogram for female chest depth (che.de) belongs to normal probability plot letter
with(fdims, {
  qqnorm(che.de)
  qqline(che.de)
})

##Answer is A

##2: Note that normal probability plots C and D have a slight stepwise pattern. Why do you think this is the case?
##This is because the variable age takes discrete values, which are then converted to z-scores. The variable elbow diameter
##is recorded to 1 decimal place. If the variables were measured in smaller units, the stepwise pattern would be less pronounced.

##3: As you can see, normal probability plots can be used both to assess normality and visualize skewness. Make a normal probability plot for female knee diameter (kne.di). Based on this normal probability plot, is this variable left skewed, symmetric, or right skewed? Use a histogram to confirm your findings.
# Normal Q-Q plot of the women's knee diameter with its reference line.
with(fdims, {
  qqnorm(kne.di)
  qqline(kne.di)
})

##The data seem to be right skewed, as there are more outliers at the top
# Histogram of the same column to confirm the shape suggested by the Q-Q plot.
with(
  fdims,
  hist(kne.di, xlab = "Knee Diameter in cm", main = "Female knee diameter")
)

##The histogram is clearly right skewed.