#Chapter 2 Homework by: Jared Ratka
library('plyr')
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('ggplot2')
library('readr')
library('readxl')



#1. A test is graded from 0 to 50, with an average score of 35 and a standard deviation of 10. For comparison to other tests, it would be convenient to rescale to a mean of 100 and standard deviation of 15.
  #(a)  How can the scores be linearly transformed to have this new mean and standard deviation?
  #(b)  There is another linear transformation that also rescales the scores to have mean 100 and standard deviation 15. What is it, and why would you NOT want to use it for this purpose?
  
    #A) By using the linear transformation of X; Y = aX+b we can set up two equations and solve for the unknowns:
      #100 = a * 35  + b
      #225 = a^2 * 100
      #Solving for a and b we get: a = 1.5 and b = 47.5
      #Therefore, the linear transformation needed is Y = 1.5X + 47.5
   # B) By solving 225 = a^2 * 100 for a really gets us that "a" can be either positive or negative 1.5, so the other linear transformation would be:
      #Y = -1.5X + 152.5
     # Using this transformation can cause an originally lower score to score higher than an originally higher score post-transformation. For example:
       # A student with an original score of 25 would score 115 post-transformation. While a student who originally scored 35 would score a 100 post-transformation.
      #Using this transformation would skew the results to an unfair outcome.
#2. The following are the proportions of girl births in Vienna for each month in 1908 and 1909 (out of an average of 3900 births per month):
girls.1908 <- c(.4777, .4875, .4859, .4754, .4874, .4864, .4813, .4787, .4895, .4797, .4876, .4859)
girls.1909 <- c(.4857, .4907, .5010, .4903, .4860, .4911, .4871, .4725, .4822, .4870, .4823, .4973)
n <- 3900
girls <- c(girls.1908, girls.1909)
#a) Compute the standard deviation of these proportions and compare to the standard deviation that would be expected if the sexes of babies were inde- pendently decided with a constant probability over the 24-month period.
#b) The actual and theoretical standard deviations from (a) differ, of course. Is this difference statistically significant? 
  #A) 
std.girls <- sd(girls)
std.girls
## [1] 0.006409724
p.girls <- mean(girls)
p.girls
## [1] 0.485675
stddev.expected <- sqrt(p.girls*(1-p.girls)/n)
stddev.expected #the standard deviation
## [1] 0.008003121
  #B)
dof <- 23
rho <- 0.05
alpha <- qchisq(rho/2., dof)
upper.deviation <- sqrt(dof*(std.girls^2)/alpha)
beta <- qchisq(1 - (rho/2.), dof)
lower.deviation <- sqrt(dof*(std.girls^2)/beta)
upper.deviation
## [1] 0.008991309
lower.deviation
## [1] 0.004981725
# The standard deviation of 0.0064 falls within the range of the upper and lower deviations, therefore it is not significant.

#3. Demonstration of the Central Limit Theorem: let x = x1 + · · · + x20, the sum of 20 independent Uniform(0,1) random variables. In R, create 1000 simulations of x and plot their histogram. On the histogram, overlay a graph of the normal density function. Comment on any differences between the histogram and the curve.
N <- 20
a <- 0
b <- 1
mean.single <- 0.5 * (a + b)
mean.sum <- N * mean.single
var.single <- (1/12.) * ((b-a)^2)
sd.sum <- sqrt(N * var.single)
mean.sum
## [1] 10
var.single
## [1] 0.08333333
sd.sum
## [1] 1.290994
sim <- 1000
obs <- runif(N * sim)
y <- matrix(obs, nrow=sim, ncol=N, byrow = F)
thedata <- data.frame(out = rowSums(y))
ggplot(thedata, aes(x=out), family = "Courier") + geom_histogram(aes(y=..density..), binwidth = 1, alpha =0.1) + geom_density() + 
  stat_function(fun=dnorm, args=list(mean=mean.sum, sd=sd.sum), lwd = 2, col = 'blue', alpha = 0.2) +
  ggtitle("Central Limit Theorom") + xlab("Value") + ylab("Density")

#It is very similar and it differs the most at the top of the curve, but it does get even closer if you increase the amount of simulations.

#4. Distribution of averages and differences: the heights of men in the United States are approximately normally distributed with mean 69.1 inches and standard deviation 2.9 inches. The heights of women are approximately normally distributed with mean 63.7 inches and standard deviation 2.7 inches. Let x be the average height of 100 randomly sampled men, and y be the average height of 100 randomly sampled women. In R, create 1000 simulations of x - y and plot their histogram. Using the simulations, compute the mean and standard deviation of the distribution of x - y and compare to their exact values.
N <- 100
men.ht.mean <- 69.1
men.ht.dev <- 2.9
women.ht.mean <- 63.7
women.ht.dev <- 2.7
sim <- 1000
obs.men <- rnorm(N * sim, mean = men.ht.mean, sd = men.ht.dev)
obs.women <- rnorm(N * sim, mean = women.ht.mean, sd = women.ht.dev)
y_men <- matrix(obs.men, nrow=sim, ncol=N, byrow = F)
y_women <- matrix(obs.women, nrow=sim, ncol=N, byrow = F)
ht.data <- data.frame(ht.men = rowMeans(y_men), ht.women = rowMeans(y_women))
ht.data['diff'] = ht.data['ht.men'] - ht.data['ht.women']
ggplot(ht.data, aes(x=diff)) + geom_histogram() + ggtitle("x-y") + xlab("Value") + ylab("Counts")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#theoretical mean
theo.mean <- men.ht.mean - women.ht.mean
theo.mean
## [1] 5.4
#mean using the simulations
colwise(mean)(ht.data['diff'])
##       diff
## 1 5.403482
#theoretical standard deviation
theo.mean <- sqrt(men.ht.dev^2 + women.ht.dev^2)
theo.mean
## [1] 3.962323
#standard deviation using the simulations
colwise(sd)(ht.data['diff'])
##        diff
## 1 0.3956065
#both the mean and the standard deviation of the simulations are close to the theoretical values.

#5. Correlated random variables: suppose that the heights of husbands and wives have a correlation of 0.3. Let x and y be the heights of a married couple chosen at random. What are the mean and standard deviation of the average height, (x + y)/2?
men.ht.mean <- 69.1
men.ht.dev <- 2.9
women.ht.mean <- 63.7
women.ht.dev <- 2.7
rho <- 0.3

#mean of the avverage height
mean.avg <- (men.ht.mean + women.ht.mean)/2
mean.avg
## [1] 66.4
#standard deviation of the average height
dev.all <- (men.ht.dev^2 + women.ht.dev^2 + 2 * rho * men.ht.dev * women.ht.dev)^(1/2)
dev.all
## [1] 4.516415