This is entirely my own work except as noted at the end of the document.

require(ggplot2)
require(PASWR)

primaryColor = "black"
secondaryColor = "red"

ed.histo <- function(data, col1=primaryColor, col2=secondaryColor) {
  p <- ggplot(as.data.frame(data),aes(x=data),color=col1)
  p + geom_histogram(aes(y = ..density..), fill = col2)+ geom_density(color =   col1, size = 1.5)
}

ed.q <- function(data, col1=primaryColor, col2=secondaryColor) {
  p2 <- ggplot(as.data.frame(data),aes(sample=data))
  p2 + stat_qq(color=col2)
}

importData <- function(dataset) {
  url = url(paste("http://www1.appstate.edu/~arnholta/Data/", dataset, ".csv", sep=""))
  read.csv(file=url)
}

tboot <- function(data,B=10^4,conf.level=0.95){
  alpha <- 1 - conf.level
  xbar <- mean(data)
  S <- sd(data)
  n <- length(data)
  xbarstar <- numeric(B)
  Sstar <- numeric(B)
  for(i in 1:B){
    xs <-sample(data,size=length(data),replace=T)
    xbarstar[i] <- mean(xs)
    Sstar[i] <- sd(xs)
  }
  
  TS <- (xbarstar - xbar)/(Sstar/sqrt(n))
  Q1 <- quantile(TS, prob = alpha/2)
  Q2 <- quantile(TS, prob = 1- alpha/2)
  CI <- c(xbar - Q2*S/sqrt(n), xbar - Q1*S/sqrt(n))
  CIP <- quantile(xbarstar,prob = c(alpha/2,1- alpha/2))
  data.frame("boot.conf"=CI, "percentile.conf"=CIP)
}

Prob1 - Import the data set Spruce into R.

# Your code here
Spruce <- importData("Spruce")
head(Spruce)
  Tree Competition Fertilizer Height0 Height5 Diameter0 Diameter5
1    1          NC          F    15.0    60.0  1.984375       7.4
2    2          NC          F     9.0    45.2  1.190625       5.2
3    3          NC          F    12.0    42.0  1.785937       5.7
4    4          NC          F    13.7    49.5  1.587500       6.4
5    5          NC          F    12.0    47.3  1.587500       6.2
6    6          NC          F    12.0    56.4  1.587500       7.4
  Ht.change Di.change
1      45.0  5.415625
2      36.2  4.009375
3      30.0  3.914062
4      35.8  4.812500
5      35.3  4.612500
6      44.4  5.812500
ed.histo(Spruce$Ht.change)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ed.q(Spruce$Ht.change)

The data has normal distribution.

# Your code here
spruceT <- t.test(Spruce$Ht.change)

I am 95% confident that the change in height for a spruce tree in a 5 year period within (28.336845, 33.5298216)

With Fertilizer

# Your code here
ed.histo(Spruce$Ht.change[Spruce$Fertilizer == "F"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ed.q(Spruce$Ht.change[Spruce$Fertilizer == "F"])

Without Fertilizer

ed.histo(Spruce$Ht.change[Spruce$Fertilizer != "F"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ed.q(Spruce$Ht.change[Spruce$Fertilizer != "F"])

* Find the 95% one-sided lower \(t\) confidence bound for the difference in mean heights (\(\mu_F - \mu_{NF}\)) over the 5-year period of the study and give a sentence interpreting your interval.

# Your code here
spruce_t2 <- t.test(Spruce$Ht.change~Spruce$Fertilizer,alternative="greater")

I am 95% confident that the difference in heights between spruce trees that were fertilized and those that were not during a 5 year period is at least 11.4666429.

Prob2 - Consider the data set Girls2004 with birth weights of baby girls born in Wyoming or Alaska.

# Your code here
Girls <- importData("Girls2004")
head(Girls)
  ID State MothersAge Smoker Weight Gestation
1  1    WY      15-19     No   3085        40
2  2    WY      35-39     No   3515        39
3  3    WY      25-29     No   3775        40
4  4    WY      20-24     No   3265        39
5  5    WY      25-29     No   2970        40
6  6    WY      20-24     No   2850        38

Wyoming

# Your code here
ed.histo(Girls$Weight[Girls$State == "WY"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ed.q(Girls$Weight[Girls$State == "WY"])

Alaska

ed.histo(Girls$Weight[Girls$State == "AK"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ed.q(Girls$Weight[Girls$State == "AK"])

# Your code here
girlsT <- t.test(Girls$Weight[Girls$State == "AK"],Girls$Weight[Girls$State == "WY"])

I am 95% confident that the difference in weights between girls born in Wyoming and girls born in Alaska is within (83.2939508, 533.6060492).

Smokers

ed.histo(Girls$Weight[Girls$Smoker == "Yes"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ed.q(Girls$Weight[Girls$Smoker == "Yes"])

Non Smokers

ed.histo(Girls$Weight[Girls$Smoker == "No"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ed.q(Girls$Weight[Girls$Smoker == "No"])

# Your code here
girlsT <- t.test(Girls$Weight[Girls$Smoker == "Yes"],Girls$Weight[Girls$Smoker == "No"])
girlsT

    Welch Two Sample t-test

data:  Girls$Weight[Girls$Smoker == "Yes"] and Girls$Weight[Girls$Smoker == "No"]
t = -1.8552, df = 14.35, p-value = 0.08423
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -617.9197   44.0330
sample estimates:
mean of x mean of y 
 3114.636  3401.580 

Prob3 - Import the FlightDelays data set into R. Although the data represent all flights for United Airlines and American Airlines in May and June 2009, assume for this exercise that these flights are a sample from all flights flown by the two airlines under similar conditions. We will compare the lengths of flight delays between the two airlines.

# Your code here
Flight <- importData("FlightDelays")
head(Flight)
  ID Carrier FlightNo Destination DepartTime Day Month FlightLength Delay
1  1      UA      403         DEN      4-8am Fri   May          281    -1
2  2      UA      405         DEN     8-Noon Fri   May          277   102
3  3      UA      409         DEN      4-8pm Fri   May          279     4
4  4      UA      511         ORD     8-Noon Fri   May          158    -2
5  5      UA      667         ORD      4-8am Fri   May          143    -3
6  6      UA      669         ORD      4-8am Fri   May          150     0
  Delayed30
1        No
2       Yes
3        No
4        No
5        No
6        No

United Airlines*

# Your code here
ed.histo(Flight$Delay[Flight$Carrier == "UA"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ed.q(Flight$Delay[Flight$Carrier == "UA"])

American Airlines

ed.histo(Flight$Delay[Flight$Carrier == "AA"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ed.q(Flight$Delay[Flight$Carrier == "AA"])

flight_t <- t.test(Flight$Delay[Flight$Carrier == "UA"],Flight$Delay[Flight$Carrier == "AA"])
flight_t

    Welch Two Sample t-test

data:  Flight$Delay[Flight$Carrier == "UA"] and Flight$Delay[Flight$Carrier == "AA"]
t = 3.8255, df = 1843.8, p-value = 0.0001349
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 2.868194 8.903198
sample estimates:
mean of x mean of y 
 15.98308  10.09738 

I am 95% confident that the difference in delay times between United Airlines and American Airlines is within (2.8681941, 8.9031985).

Prob4 - Run a simulation to see if the \(t\) ratio \(T = (\bar{X} -\mu)/(S/\sqrt{n})\) has a \(t\) distribution or even an approximate \(t\) distribution when the samples are drawn from a nonnormal distribution. Be sure to superimpose the appropriate \(t\) density curve over the density of your simulated \(T\). Try two different nonnormal distributions \(\left( Unif(a = 0, b = 1), Exp(\lambda = 1) \right)\) and remember to see if sample size makes a difference (use \(n = 15\) and \(n=500\)).

# Your code here
n <- 15
mu <- 0
N <- rexp(n)
#Tb <- mean(N)-mu/sqrt(n)
curve(dt(x,n-1),-5,5)
curve(dexp(x,1),add=T)

SOULTION:

Prob5 - One question is the 2002 General Social Survey asked participants whom they voted for in the 2000 election. Of the 980 women who voted, 459 voted for Bush. Of the 759 men who voted, 426 voted for Bush.

# Your code here
testWoman <- prop.test(459, 980)
testWoman

    1-sample proportions test with continuity correction

data:  459 out of 980, null probability 0.5
X-squared = 3.7969, df = 1, p-value = 0.05135
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
 0.4368038 0.5001819
sample estimates:
        p 
0.4683673 
# Your code here
testMen <- prop.test(426, 980)
testMen

    1-sample proportions test with continuity correction

data:  426 out of 980, null probability 0.5
X-squared = 16.458, df = 1, p-value = 4.974e-05
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
 0.4034682 0.4664379
sample estimates:
        p 
0.4346939 
# Your code here

SOLUTION:

Prob6 - A retail store wishes to conduct a marketing survey of its customers to see if customers would favor longer store hours. How many people should be in their sample if the marketers want their margin of error to be at most 3% with 95% confidence, assuming

# Your code here
err <- 0.03
p_sqiggle <- .5
sample_size <- ceiling((p_sqiggle*(1-p_sqiggle))/(err/1.96)^2)

You would need a mimimum of [1068] people

# Your code here
p_sqiggle <- .65
sample_size <- ceiling((p_sqiggle*(1-p_sqiggle))/(err/1.96)^2)

Since 65% of respondants favored longer hours, the store should use a sample size of [972] people.

Prob7 - Suppose researchers wish to study the effectiveness of a new drug to alleviate hives due to math anxiety. Seven hundred math students are randomly assigned to take either this drug or a placebo. Suppose 34 of the 350 students who took the drug break out in hives compared to 56 of the 350 students who took the placebo.

# Your code here
testDrug <- prop.test(34,350)
testDrug

    1-sample proportions test with continuity correction

data:  34 out of 350, null probability 0.5
X-squared = 225.6, df = 1, p-value < 2.2e-16
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
 0.06913692 0.13429260
sample estimates:
         p 
0.09714286 

I am 95% confident that the proportion of students taking the drug who break out in hives is within (0.0691369, 0.1342926)

# Your code here
testPlacebo <- prop.test(56,350)
testPlacebo

    1-sample proportions test with continuity correction

data:  56 out of 350, null probability 0.5
X-squared = 160.48, df = 1, p-value < 2.2e-16
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
 0.1240384 0.2036158
sample estimates:
   p 
0.16 

I am 95% confident that the proportion of students taking the placebo who break out in hives is within (0.1240384,0.2036158)

No conclusion can be reached.

# Your code here
x <- c(34, 56)
y <- c(350, 350)
testHives <- prop.test(x, y)
testHives

    2-sample test for equality of proportions with continuity
    correction

data:  x out of y
X-squared = 5.623, df = 1, p-value = 0.01773
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.11508783 -0.01062645
sample estimates:
    prop 1     prop 2 
0.09714286 0.16000000 

SOLUTION:

Prob8 - An article in the March 2003 New England Journal of Medicine describes a study to see if aspirin is effective in reducing the incidence of colorectal adenomas, a precursor to most colorectal cancers (Sandler et al. (2003)). Of 517 patients in the study, 259 were randomly assigned to receive aspirin and the remaining 258 received a placebo. One or more adenomas were found in 44 of the aspirin group and 70 in the placebo group. Find a 95% one-sided upper bound for the difference in proportions \((p_A - p_P)\) and interpret your interval.

# Your code here
x <- c(44, 70)
y <- c(259, 258)
test <- prop.test(x, y, alternative="less")

SOLUTION:

Prob9 - The data set Bangladesh has measurements on water quality from 271 wells in Bangladsesh. There are two missing values in the chlorine variable. Use the following R code to remove these two observations.

> chlorine <- with(Bangladesh, Chlorine[!is.na(Chlorine)])

# Your code here
Banga <- importData("Bangladesh")
chlorine <- with(Banga, Chlorine[!is.na(Chlorine)])
# Your code here
ed.histo(chlorine)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ed.q(chlorine)

The average cholorine level from 271 wells in Bangladsesh is [78.0840149] and has a standard deviation of [210.019193]. The distribution is skewed to the right, and appears to be exponential.

# Your code here
test <- t.test(chlorine)

SOLUTION:

# Your code here
test <- tboot(chlorine)
test
      boot.conf percentile.conf
97.5%  56.75082        55.11407
2.5%  111.73520       105.22474

I would report the bootstrap confidence interval (56.7508192, 111.7352039) because it accounts for skewness.

# Your code here
skew <- skewness(Banga$Arsenic, na.rm = T)
test <- tboot(Banga$Arsenic)

SOLUTION:

Prob10 - The data set MnGroundwater has measurements on water quality of 895 randomly selected wells in Minnesota.

# Your code here
Water <- importData("MnGroundwater")
head(Water)
  County        Aquifer.Group Water.Level Alkalinity Aluminum Arsenic
1 Aitkin surficial Quaternary          55     137000    0.059   1.810
2 Aitkin    buried Quaternary          30     214000    2.380   0.059
3 Aitkin    buried Quaternary          20     120000    0.410   1.440
4 Aitkin    buried Quaternary           3     283000  158.190   6.340
5 Aitkin    buried Quaternary           0     236000    0.059  10.170
6 Aitkin    buried Quaternary          30     229000    0.059   6.900
  Chloride Lead  pH              Basin.Name
1      490 0.17 7.1 Upper Mississippi River
2    89250 0.18 7.6 Upper Mississippi River
3      300 0.52 6.9 Upper Mississippi River
4      780 1.15 8.2 Upper Mississippi River
5     5090 0.02 7.9 Upper Mississippi River
6     2590 0.12 7.8 Upper Mississippi River
# Your code here
ed.histo(Water$Alkalinity)
ed.q(Water$Alkalinity)

The data has a normal distribution with its Alkalinity and has a mean of 2.906826810^{5} and a standard deviation of 1.083343410^{5}.

# Your code here
test <- t.test(Water$Alkalinity)

I am 95% confident that the mean \(\mu\) of alkalinity levels in Minnesota wells is within (2.83575610{5},2.977897610{5}).

# Your code here

SOLUTION:

Prob11 Consider the babies born in Texas in 2004 (TXBirths2004). We will compare the weights of babies born to nonsmokers and smokers.

# Your code here
Texas <- importData("TXBirths2004")
head(Texas)
  ID MothersAge Smoker Gender Weight Gestation Number Multiple
1  1      20-24     No   Male   3033        39      1       No
2  2      20-24     No   Male   3232        40      1       No
3  3      25-29     No Female   3317        37      1       No
4  4      25-29     No Female   2560        36      1       No
5  5      15-19     No Female   2126        37      1       No
6  6      30-34     No Female   2948        38      1       No
# Your code here
smokers <- sum(Texas$Smoker == "Yes")
nonsmokers <- sum(Texas$Smoker == "No")

SOLUTION:

# Your code here
ggplot(data = Texas, aes(x = Weight)) + geom_histogram() + facet_grid(Smoker~.)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = Texas, aes(sample = Weight)) + stat_qq() + facet_grid(Smoker~.)
smokersEDA <- c(mean(Texas$Weight[Texas$Smoker == "Yes"]), sd(Texas$Weight[Texas$Smoker == "Yes"]))
nonsmokersEDA <- c(mean(Texas$Weight[Texas$Smoker == "No"]), sd(Texas$Weight[Texas$Smoker == "No"]))

The distribution of smokers is approximately normal with a mean of 3205.9888889 and a standard-deviation of 504.2439104. The distribution of weights among non-smokers is a little bit skewed to the left, with a mean of 3287.493654 and a standard-deviation of 554.4828601.

# Your code here
Weight.Smokers <- subset(Texas, select=Weight, Smoker == "Yes", drop=TRUE)
Weight.Non <- subset(Texas, select=Weight, Smoker == "No", drop=TRUE)
thetahat <- mean(Weight.Smokers) - mean(Weight.Non)
nx <- length(Weight.Smokers)
ny <- length(Weight.Non)
SE <- sqrt(var(Weight.Smokers)/nx + var(Weight.Non)/ny)
N <- 10^4
Tstar <- numeric(N)
DM <- numeric(N)
set.seed(1)
for(i in 1:N)
{
  bootx <- sample(Weight.Smokers, nx, replace=TRUE)
  booty <- sample(Weight.Non, ny, replace=TRUE)
  Tstar[i] <- (mean(bootx) - mean(booty) - thetahat) /
    sqrt(var(bootx)/nx + var(booty)/ny)
  DM[i] <- mean(bootx) - mean(booty)
}
CItboot <- thetahat - quantile(Tstar, c(.975, .025)) * SE
names(CItboot) <- NULL
CItboot
[1] -189.41687   28.80408
CIperct <- quantile(DM, c(0.025, 0.975))
CIperct
      2.5%      97.5% 
-188.62885   26.01667 
t.test(Weight.Smokers, Weight.Non)$conf
[1] -190.69149   27.68196
attr(,"conf.level")
[1] 0.95

The 95% bootstrap \(t\) interval for the difference in means is (-189.4168666, 28.8040847). For comparison, the formula \(t\) interval is (-190.6914856,27.6819554) and the bootstrap percentile interval is (-188.6288527, 26.0166706).

# Your code here
CItboot <- thetahat - quantile(Tstar, .975) * SE
names(CItboot) <- NULL
CItboot
[1] -189.4169
test <- t.test(Weight.Smokers, Weight.Non, alternative="g")

Using a one sided test I was able to find that the difference in weights of Texan babies is at least -189.4168666. This is as compared to the formula \(t\) interval lower limit of -172.88094.