This is entirely our own work except as noted at the end of the document.

1 Import the data set Spruce into R.

*Note: data set information can be found at http://www1.appstate.edu/~arnholta/Data/

Spruce <- read.csv("http://www1.appstate.edu/~arnholta/Data/Spruce.csv")
head(Spruce)
  Tree Competition Fertilizer Height0 Height5 Diameter0 Diameter5
1    1          NC          F    15.0    60.0  1.984375       7.4
2    2          NC          F     9.0    45.2  1.190625       5.2
3    3          NC          F    12.0    42.0  1.785937       5.7
4    4          NC          F    13.7    49.5  1.587500       6.4
5    5          NC          F    12.0    47.3  1.587500       6.2
6    6          NC          F    12.0    56.4  1.587500       7.4
  Ht.change Di.change
1      45.0  5.415625
2      36.2  4.009375
3      30.0  3.914062
4      35.8  4.812500
5      35.3  4.612500
6      44.4  5.812500
summary(Spruce)
      Tree       Competition Fertilizer    Height0         Height5     
 Min.   : 1.00   C :36       F :36      Min.   : 9.00   Min.   :24.00  
 1st Qu.:18.75   NC:36       NF:36      1st Qu.:13.28   1st Qu.:37.92  
 Median :36.50                          Median :14.75   Median :45.30  
 Mean   :36.50                          Mean   :14.57   Mean   :45.51  
 3rd Qu.:54.25                          3rd Qu.:16.00   3rd Qu.:53.25  
 Max.   :72.00                          Max.   :18.70   Max.   :68.00  
   Diameter0       Diameter5        Ht.change       Di.change    
 Min.   :1.191   Min.   : 2.700   Min.   : 8.30   Min.   :1.019  
 1st Qu.:1.786   1st Qu.: 4.450   1st Qu.:23.20   1st Qu.:2.712  
 Median :1.984   Median : 5.750   Median :30.10   Median :3.915  
 Mean   :1.935   Mean   : 5.931   Mean   :30.93   Mean   :3.996  
 3rd Qu.:1.984   3rd Qu.: 7.100   3rd Qu.:38.17   3rd Qu.:5.116  
 Max.   :2.381   Max.   :11.300   Max.   :51.50   Max.   :8.919  

1.1 Create exploratory plots to check the distribution of the variable Ht.change.

EDA.hist(Spruce$Ht.change)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EDA.qq(Spruce$Ht.change)

1.2 Find a 95% \(t\) confidence interval for the mean height change over the 5-year period of the study and give a sentence interpreting your interval.

spruce_t <- t.test(Spruce$Ht.change)
head(spruce_t)
$statistic
      t 
23.7549 

$parameter
df 
71 

$p.value
[1] 1.636099e-35

$conf.int
[1] 28.33685 33.52982
attr(,"conf.level")
[1] 0.95

$estimate
mean of x 
 30.93333 

$null.value
mean 
   0 
spruce_t$conf.int[1]
[1] 28.33685
spruce_t$conf.int[2]
[1] 33.52982

SOLUTION: We are 95% confident that the mean height change for a spruce tree over the 5-year period is within (28.33685, 33.52982) .

1.3 Create exploratory plots to compare the distributions of the variable Ht.change for the seedlings in the fertilized and nonfertilized plots.

#With Fertilizer
EDA.hist(Spruce$Ht.change[Spruce$Fertilizer == "F"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EDA.qq(Spruce$Ht.change[Spruce$Fertilizer == "F"])

#Without Fertilizer 
EDA.hist(Spruce$Ht.change[Spruce$Fertilizer != "F"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EDA.qq(Spruce$Ht.change[Spruce$Fertilizer != "F"])

  • Find the 95% one-sided lower \(t\) confidence bound for the difference in mean heights (\(\mu_F - \mu_{NF}\)) over the 5-year period of the study and give a sentence interpreting your interval.
spruce_t2 <- t.test(Spruce$Ht.change~Spruce$Fertilizer,alternative="greater")
head(spruce_t2)
$statistic
       t 
7.558587 

$parameter
     df 
69.6971 

$p.value
[1] 6.067505e-11

$conf.int
[1] 11.46664      Inf
attr(,"conf.level")
[1] 0.95

$estimate
 mean in group F mean in group NF 
        38.28889         23.57778 

$null.value
difference in means 
                  0 
spruce_t2$conf.int[1]
[1] 11.46664

SOLUTION: We are 95% confident that the difference in heights between spruce trees that were fertilized and those that were not during a 5 year period is at least 11.46664.

2 Consider the data set Girls2004 with birth weights of baby girls born in Wyoming or Alaska.

Girls2004 <- read.csv("http://www1.appstate.edu/~arnholta/Data/Girls2004.csv")
Girls2004
   ID State MothersAge Smoker Weight Gestation
1   1    WY      15-19     No   3085        40
2   2    WY      35-39     No   3515        39
3   3    WY      25-29     No   3775        40
4   4    WY      20-24     No   3265        39
5   5    WY      25-29     No   2970        40
6   6    WY      20-24     No   2850        38
7   7    WY      20-24     No   2737        38
8   8    WY      25-29     No   3515        37
9   9    WY      25-29     No   3742        39
10 10    WY      35-39     No   3570        40
11 11    WY      20-24     No   3834        41
12 12    WY      20-24    Yes   3090        39
13 13    WY      25-29    Yes   3350        40
14 14    WY      30-34     No   3292        37
15 15    WY      15-19     No   3317        40
16 16    WY      30-34     No   2485        37
17 17    WY      20-24     No   3215        39
18 18    WY      20-24     No   3230        40
19 19    WY      30-34     No   3345        39
20 20    WY      25-29     No   3050        41
21 21    WY      30-34     No   2212        37
22 22    WY      35-39     No   3605        39
23 23    WY      30-34     No   2722        39
24 24    WY      30-34     No   2880        39
25 25    WY      20-24     No   3610        39
26 26    WY      30-34     No   3355        39
27 27    WY      20-24     No   3995        41
28 28    WY      20-24    Yes   2948        39
29 29    WY      35-39     No   3345        41
30 30    WY      30-34    Yes   2892        39
31 31    WY      20-24     No   2466        37
32 32    WY      20-24    Yes   3290        39
33 33    WY      25-29     No   3310        39
34 34    WY      40-44     No   3175        37
35 35    WY      25-29     No   2715        38
36 36    WY      25-29     No   3540        38
37 37    WY      25-29     No   3402        38
38 38    WY      25-29    Yes   3923        39
39 39    WY      20-24     No   3204        37
40 40    WY      15-19    Yes   2495        37
41 41    AK      20-24     No   4337        41
42 42    AK      20-24     No   2948        40
43 43    AK      30-34     No   3269        39
44 44    AK      20-24     No   3608        38
45 45    AK      30-34     No   4016        39
46 46    AK      25-29     No   2919        40
47 47    AK      20-24     No   2608        37
48 48    AK      40-44     No   4309        39
49 49    AK      20-24     No   3288        39
50 50    AK      25-29     No   3742        38
51 51    AK      15-19     No   4394        41
52 52    AK      20-24     No   2182        37
53 53    AK      25-29     No   4592        40
54 54    AK      20-24     No   3090        39
55 55    AK      30-34     No   3770        40
56 56    AK      20-24     No   3977        39
57 57    AK      25-29     No   3153        40
58 58    AK      25-29     No   3458        41
59 59    AK      15-19     No   3912        38
60 60    AK      20-24    Yes   2863        40
61 61    AK      35-39     No   3190        39
62 62    AK      25-29    Yes   3515        38
63 63    AK      25-29     No   3288        39
64 64    AK      15-19     No   3114        40
65 65    AK      30-34    Yes   3543        41
66 66    AK      20-24     No   3825        39
67 67    AK      25-29     No   3458        39
68 68    AK      30-34     No   3698        41
69 69    AK      20-24     No   3572        39
70 70    AK      30-34    Yes   2352        40
71 71    AK      20-24     No   3175        40
72 72    AK      25-29     No   3742        41
73 73    AK      20-24     No   3997        39
74 74    AK      25-29     No   2576        38
75 75    AK      30-34     No   3572        40
76 76    AK      35-39     No   3968        39
77 77    AK      20-24     No   4564        42
78 78    AK      20-24     No   4210        40
79 79    AK      25-29     No   3260        38
80 80    AK      20-24     No   3600        40
summary(Girls2004)
       ID        State   MothersAge Smoker       Weight    
 Min.   : 1.00   AK:40   15-19: 6   No :69   Min.   :2182  
 1st Qu.:20.75   WY:40   20-24:29   Yes:11   1st Qu.:3076  
 Median :40.50           25-29:22            Median :3331  
 Mean   :40.50           30-34:15            Mean   :3362  
 3rd Qu.:60.25           35-39: 6            3rd Qu.:3709  
 Max.   :80.00           40-44: 2            Max.   :4592  
   Gestation    
 Min.   :37.00  
 1st Qu.:38.00  
 Median :39.00  
 Mean   :39.14  
 3rd Qu.:40.00  
 Max.   :42.00  

2.1 Create exploratory plots and compare the distribution of weight between the babies born in the two states.

#Wyoming
EDA.hist(Girls2004$Weight[Girls2004$State == "WY"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EDA.qq(Girls2004$Weight[Girls2004$State == "WY"])

#Alaska
EDA.hist(Girls2004$Weight[Girls2004$State == "AK"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EDA.qq(Girls2004$Weight[Girls2004$State == "AK"])

SOLUTION: On average, the distribution of weight between the babies born in the two states is greater in Alaksa than in Wyoming.

2.2 Find a 95% \(t\) confidence interval for the difference in mean weights for girls born in these two states. Give a sentence interpreting this interval.

girls_t <- t.test(Girls2004$Weight[Girls2004$State == "AK"],Girls2004$Weight[Girls2004$State == "WY"])
girls_t

    Welch Two Sample t-test

data:  Girls2004$Weight[Girls2004$State == "AK"] and Girls2004$Weight[Girls2004$State == "WY"]
t = 2.7316, df = 71.007, p-value = 0.007946
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
  83.29395 533.60605
sample estimates:
mean of x mean of y 
  3516.35   3207.90 
girls_t$conf.int[1]
[1] 83.29395
girls_t$conf.int[2]
[1] 533.606

SOLUTION: We are 95% confident that the difference in weights between girls born in Wyoming and girls born in Alaska is within (83.29395, 533.606).

2.3 Create exploratory plots and compare the distribution of weights between babies born to nonsmokers and babies born to smokers.

#Smokers
EDA.hist(Girls2004$Weight[Girls2004$Smoker == "Yes"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EDA.qq(Girls2004$Weight[Girls2004$Smoker == "Yes"])

#Nonsmokers
EDA.hist(Girls2004$Weight[Girls2004$Smoker == "No"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EDA.qq(Girls2004$Weight[Girls2004$Smoker == "No"])

SOLUTION: On avereage, the weights of babies born for smokers is less than the weights of babies born for nonsmokers.

2.4 Find a 95% \(t\) confidence interval for the difference in mean weights between babies born to nonsmokers and smokers. Give a sentence interpreting this interval.

girls_t2 <- t.test(Girls2004$Weight[Girls2004$Smoker == "Yes"],Girls2004$Weight[Girls2004$Smoker == "No"])
girls_t2

    Welch Two Sample t-test

data:  Girls2004$Weight[Girls2004$Smoker == "Yes"] and Girls2004$Weight[Girls2004$Smoker == "No"]
t = -1.8552, df = 14.35, p-value = 0.08423
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -617.9197   44.0330
sample estimates:
mean of x mean of y 
 3114.636  3401.580 
girls_t2$conf.int[1]
[1] -617.9197
girls_t2$conf.int[2]
[1] 44.033

SOLUTION: We are 95% confident that the difference in weights between girls born to smokers and girls born to non-smokers is within (-617.9197, 44.033)

3 Import the FlightDelays data set into R. Although the data represent all flights for United Airlines and American Airlines in May and June 2009, assume for this exercise that these flights are a sample from all flights flown by the two airlines under similar conditions. We will compare the lengths of flight delays between the two airlines.

FlightDelays <- read.csv("http://www1.appstate.edu/~arnholta/Data/FlightDelays.csv")
summary(FlightDelays)
       ID       Carrier      FlightNo      Destination    DepartTime  
 Min.   :   1   AA:2906   Min.   :  71.0   BNA: 172    4-8am   : 699  
 1st Qu.:1008   UA:1123   1st Qu.: 371.0   DEN: 264    4-8pm   : 972  
 Median :2015             Median : 691.0   DFW: 918    8-Mid   : 257  
 Mean   :2015             Mean   : 827.1   IAD:  55    8-Noon  :1053  
 3rd Qu.:3022             3rd Qu.: 787.0   MIA: 610    Noon-4pm:1048  
 Max.   :4029             Max.   :2255.0   ORD:1785                   
                                           STL: 225                   
  Day       Month       FlightLength       Delay        Delayed30 
 Fri:637   June:2030   Min.   : 68.0   Min.   :-19.00   No :3432  
 Mon:630   May :1999   1st Qu.:155.0   1st Qu.: -6.00   Yes: 597  
 Sat:453               Median :163.0   Median : -3.00             
 Sun:551               Mean   :185.3   Mean   : 11.74             
 Thu:566               3rd Qu.:228.0   3rd Qu.:  5.00             
 Tue:628               Max.   :295.0   Max.   :693.00             
 Wed:564                                                          
head(FlightDelays)
  ID Carrier FlightNo Destination DepartTime Day Month FlightLength Delay
1  1      UA      403         DEN      4-8am Fri   May          281    -1
2  2      UA      405         DEN     8-Noon Fri   May          277   102
3  3      UA      409         DEN      4-8pm Fri   May          279     4
4  4      UA      511         ORD     8-Noon Fri   May          158    -2
5  5      UA      667         ORD      4-8am Fri   May          143    -3
6  6      UA      669         ORD      4-8am Fri   May          150     0
  Delayed30
1        No
2       Yes
3        No
4        No
5        No
6        No

3.1 Create exploratory plots of the lengths of delays for the two airlines.

#United Airlines
EDA.hist(FlightDelays$Delay[FlightDelays$Carrier == "UA"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EDA.qq(FlightDelays$Delay[FlightDelays$Carrier == "UA"])

#American Airlines
EDA.hist(FlightDelays$Delay[FlightDelays$Carrier == "AA"])
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EDA.qq(FlightDelays$Delay[FlightDelays$Carrier == "AA"])

3.2 Find a 95% \(t\) confidence interval for the difference in mean flight delays between the two airlines and interpret this interval.

flight_t <- t.test(FlightDelays$Delay[FlightDelays$Carrier == "UA"],FlightDelays$Delay[FlightDelays$Carrier == "AA"])
flight_t

    Welch Two Sample t-test

data:  FlightDelays$Delay[FlightDelays$Carrier == "UA"] and FlightDelays$Delay[FlightDelays$Carrier == "AA"]
t = 3.8255, df = 1843.8, p-value = 0.0001349
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 2.868194 8.903198
sample estimates:
mean of x mean of y 
 15.98308  10.09738 
flight_t$conf.int[1]
[1] 2.868194
flight_t$conf.int[2]
[1] 8.903198

SOLUTION: We are 95% confident that the difference in delay times between United Airlines and American Airlines is within (2.868194, 8.903198).

4 Run a simulation to see if the \(t\) ratio \(T = (\bar{X} -\mu)/(S/\sqrt{n})\) has a \(t\) distribution or even an approximate \(t\) distribution when the samples are drawn from a nonnormal distribution. Be sure to superimpose the appropriate \(t\) density curve over the density of your simulated \(T\). Try two different nonnormal distributions \(\left( Unif(a = 0, b = 1), Exp(\lambda = 1) \right)\) and remember to see if sample size makes a difference (use \(n = 15\) and \(n=500\)).

4.1 Class Notes:

\(T = \frac{\bar{X} - \mu}{\frac{s}{\sqrt{n}}}\) ~ \(t_{n-1}\) Assuming t distribution \(t_{n-1}\) is a normal distribution

sims <- 10^4
xbar <- numeric(sims)
SD <- numeric(sims)

#Example given in class,
## Start
Mu <- 100
sigma <- 10
n<- 15
## End

for (i in 1: sims)
{
  xn <- rnorm(n, Mu, sigma)
  xbar[i] <- mean(xn)
  SD[i] <- sd(xn)
}

T <- (xbar - Mu) - (SD / sqrt(n))
hist(T)

#alternative way with hist graphics of density historgram
hist(T, freq = FALSE)

# Need to add t distribution according to the 14 degrees of freedom
curve(dt(x,15))
curve(dt(x,15), -5, 5)

# Class notes
junk <- rt(10000, 15)
hist(junk)
hist(junk, freq = FALSE)
hist(junk, freq = FALSE, breaks = "Scott")
curve(dt(x, 15), add = TRUE)

SOULTION: Unbalanced sample sizes, pooled CI’s are not capturing true mean differenceat 95%. Balanced case, pooled CI does better.

5 One question is the 2002 General Social Survey asked participants whom they voted for in the 2000 election. Of the 980 women who voted, 459 voted for Bush. Of the 759 men who voted, 426 voted for Bush.

prop.test(459,980)$conf.int[1]
[1] 0.4368038
prop.test(459,980)$conf.int[2]
[1] 0.5001819

SOLUTION: We are 95% confident that the proportion of women who voted for Bush from the 2002 General Social Survey is within (0.4368038,0.5001819).

prop.test(426,759)$conf.int[1]
[1] 0.5250797
prop.test(426,759)$conf.int[2]
[1] 0.5968214

SOLUTION: We are 95% confident that the proportion of men who voted for Bush from the 2002 General Social Survey is is within (0.5250797, 0.5968214). The intervals for the proportion of men who voted for Bush and the proportion of women who voted for Bush do not overlap but since we tested the proportion of means seperately, we cannot conclude anything about their apparent difference. To be able to conclude something, we would need to test a difference of proportions explicitly.

# Your code here

SOLUTION:

Prob6 - A retail store wishes to conduct a marketing survey of its customers to see if customers would favor longer store hours. How many people should be in their sample if the marketers want their margin of error to be at most 3% with 95% confidence, assuming

library(PASWR2)

Attaching package: 'PASWR2'
The following objects are masked from 'package:PASWR':

    bino.gen, checking.plots, EPIDURAL, EURD, FCD, interval.plot,
    ksdist, normarea, nsize, ntester, oneway.plots, SBIQ, SDS4,
    SIGN.test, tsum.test, twoway.plots, z.test, zsum.test
nsize(b = .03, p = .5, type = "pi")

The required sample size (n) to estimate the population 
proportion of successes with a 0.95 confidence interval 
so that the margin of error is no more than 0.03 is 1068 . 
# Your code here

SOLUTION: A sample from 1068 people would lead to a 95% confidence and a 3% margin of error.

library(PASWR2)
nsize(b = .03,p = .65, type = "pi")

The required sample size (n) to estimate the population 
proportion of successes with a 0.95 confidence interval 
so that the margin of error is no more than 0.03 is 972 . 

SOLUTION: 972 people are required.

Prob7 - Suppose researchers wish to study the effectiveness of a new drug to alleviate hives due to math anxiety. Seven hundred math students are randomly assigned to take either this drug or a placebo. Suppose 34 of the 350 students who took the drug break out in hives compared to 56 of the 350 students who took the placebo.

prop.test(34, 350)

    1-sample proportions test with continuity correction

data:  34 out of 350, null probability 0.5
X-squared = 225.6, df = 1, p-value < 2.2e-16
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
 0.06913692 0.13429260
sample estimates:
         p 
0.09714286 

SOLUTION: With a confidence level of 95% the proportion will be between .0691 and .1342.

prop.test(56, 350)

    1-sample proportions test with continuity correction

data:  56 out of 350, null probability 0.5
X-squared = 160.48, df = 1, p-value < 2.2e-16
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
 0.1240384 0.2036158
sample estimates:
   p 
0.16 

SOLUTION: With a confidence level of 95% the proportion will be between .124 and .204

SOLUTION: This intervals overlap but in such a small portion that it will not be relevant when evaluating the drug.

i = c(350, 350)
t = c(34, 56)
prop.test(t, i)

    2-sample test for equality of proportions with continuity
    correction

data:  t out of i
X-squared = 5.623, df = 1, p-value = 0.01773
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.11508783 -0.01062645
sample estimates:
    prop 1     prop 2 
0.09714286 0.16000000 

SOLUTION: With a confidence level of 95% the proportion will be between -.115 and -.011

Prob8 - An article in the March 2003 New England Journal of Medicine describes a study to see if aspirin is effective in reducing the incidence of colorectal adenomas, a precursor to most colorectal cancers (Sandler et al. (2003)). Of 517 patients in the study, 259 were randomly assigned to receive aspirin and the remaining 258 received a placebo. One or more adenomas were found in 44 of the aspirin group and 70 in the placebo group. Find a 95% one-sided upper bound for the difference in proportions \((p_A - p_P)\) and interpret your interval.

# Your code here
g = c(44, 70)
t = c(259, 258)

prop.test(x = g, n = t, correct = FALSE, conf.level = .95, alternative = "greater")$conf
[1] -0.1609853  1.0000000
attr(,"conf.level")
[1] 0.95

SOLUTION: With a confidence level of 95% the proportion will be between -.161 and 1

Prob9 - The data set Bangladesh has measurements on water quality from 271 wells in Bangladsesh. There are two missing values in the chlorine variable. Use the following R code to remove these two observations.

> chlorine <- with(Bangladesh, Chlorine[!is.na(Chlorine)])

Bangladesh <- read_csv("http://www1.appstate.edu/~arnholta/Data/Bangladesh.csv")
Parsed with column specification:
cols(
  Arsenic = col_double(),
  Chlorine = col_double(),
  Cobalt = col_double()
)
chlorine = with(Bangladesh, Chlorine[!is.na(Chlorine)])
quantile(chlorine)
    0%    25%    50%    75%   100% 
   1.0    5.0   14.2   55.5 1550.0 
hist(chlorine)

SOLUTION: The distrution is skewed to the right

t.test(chlorine)

    One Sample t-test

data:  chlorine
t = 6.0979, df = 268, p-value = 3.736e-09
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
  52.87263 103.29539
sample estimates:
mean of x 
 78.08401 

SOLUTION: With a confidence level of 95% the proportion will be between 52.87 and 103.29

SOLUTION: The bootstrap confidence interval is between 77.75 and 78.26.

SOLUTION: We’re 95% confident that the mean is between 124.8 and 125.51.

Prob10 - The data set MnGroundwater has measurements on water quality of 895 randomly selected wells in Minnesota.

MnGroundwater <- read_csv("http://www1.appstate.edu/~arnholta/Data/MnGroundwater.csv")
Parsed with column specification:
cols(
  County = col_character(),
  Aquifer.Group = col_character(),
  Water.Level = col_integer(),
  Alkalinity = col_integer(),
  Aluminum = col_double(),
  Arsenic = col_double(),
  Chloride = col_integer(),
  Lead = col_double(),
  pH = col_double(),
  Basin.Name = col_character()
)
head(MnGroundwater)
# A tibble: 6 x 10
  County        Aquifer.Group Water.Level Alkalinity Aluminum Arsenic
   <chr>                <chr>       <int>      <int>    <dbl>   <dbl>
1 Aitkin surficial Quaternary          55     137000    0.059   1.810
2 Aitkin    buried Quaternary          30     214000    2.380   0.059
3 Aitkin    buried Quaternary          20     120000    0.410   1.440
4 Aitkin    buried Quaternary           3     283000  158.190   6.340
5 Aitkin    buried Quaternary           0     236000    0.059  10.170
6 Aitkin    buried Quaternary          30     229000    0.059   6.900
# ... with 4 more variables: Chloride <int>, Lead <dbl>, pH <dbl>,
#   Basin.Name <chr>
hist(MnGroundwater$Alkalinity)
qqnorm(MnGroundwater$Alkalinity)
mean(MnGroundwater$Alkalinity)
[1] 290682.7
sd(MnGroundwater$Alkalinity)
[1] 108334.3

SOLUTION:

t.test(MnGroundwater$Alkalinity)

    One Sample t-test

data:  MnGroundwater$Alkalinity
t = 80.272, df = 894, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 283575.6 297789.8
sample estimates:
mean of x 
 290682.7 

SOLUTION: We’re 95% confident that the mean alkalinity is between 283575.6 and 297789.8

SOLUTION: The bootstrap t confidence interval is more accurate and is the best method.

Prob11 Consider the babies born in Texas in 2004 (TXBirths2004). We will compare the weights of babies born to nonsmokers and smokers.

Texas <- read_csv("http://www1.appstate.edu/~arnholta/Data/TXBirths2004.csv")
Parsed with column specification:
cols(
  ID = col_integer(),
  MothersAge = col_character(),
  Smoker = col_character(),
  Gender = col_character(),
  Weight = col_integer(),
  Gestation = col_integer(),
  Number = col_integer(),
  Multiple = col_character()
)
head(Texas)
# A tibble: 6 x 8
     ID MothersAge Smoker Gender Weight Gestation Number Multiple
  <int>      <chr>  <chr>  <chr>  <int>     <int>  <int>    <chr>
1     1      20-24     No   Male   3033        39      1       No
2     2      20-24     No   Male   3232        40      1       No
3     3      25-29     No Female   3317        37      1       No
4     4      25-29     No Female   2560        36      1       No
5     5      15-19     No Female   2126        37      1       No
6     6      30-34     No Female   2948        38      1       No
smokers <- filter(Texas, Texas$Smoker == "Yes")
nonsmokers <- filter(Texas, Texas$Smoker == "No")
count(smokers)
# A tibble: 1 x 1
      n
  <int>
1    90
count(nonsmokers)
# A tibble: 1 x 1
      n
  <int>
1  1497

SOLUTION: There are 1497 nonsmokers & 90 smokers.

ggplot(data = smokers, aes(sample = Weight)) + stat_qq() + theme_bw() +
  ggtitle("Smokers") 

ggplot(data = nonsmokers, aes(sample = Weight)) + stat_qq() + theme_bw() + ggtitle("Non-Smokers")

smokersExp <- c(mean(Texas$Weight[Texas$Smoker == "Yes"]), sd(Texas$Weight[Texas$Smoker == "Yes"]))
smokersExp
[1] 3205.9889  504.2439
nonSmokersExp <- c(mean(Texas$Weight[Texas$Smoker == "No"]), sd(Texas$Weight[Texas$Smoker == "No"]))
nonSmokersExp
[1] 3287.4937  554.4829

SOLUTION: Smokers is normal shaped with a mean of 3205.9889 and a standard deviation of 504.2439. Nonsmokers is left skewed with a mean of 3287.4937 and a standard deviation of 554.4829.

SE <- sqrt(var(smokers$Weight)/length(smokers$Weight) + 
var(nonsmokers$Weight)/length(nonsmokers$Weight))            
thetaHat <- mean(smokers$Weight) - mean(nonsmokers$Weight)
x <- length(smokers$Weight)
y <- length(smokers$Weight)
N <- 10^4
TStar <- numeric(N)
DM <- numeric(N)
for(i in 1:N)
{
  xBS <- sample(smokers$Weight, x, replace=TRUE)
  yBS <- sample(nonsmokers$Weight, y, replace=TRUE)
  TStar[i] <- (mean(xBS) - mean(yBS) - thetaHat) /
    sqrt(var(xBS)/x + var(yBS)/y)
  DM[i] <- mean(xBS) - mean(yBS)
}
Boot <- thetaHat - quantile(TStar, c(.975, .025)) * SE
Boot
     97.5%       2.5% 
-187.27412   30.70641 
Boot2 <- quantile(DM, c(0.025, 0.975))
Boot2
      2.5%      97.5% 
-233.87889   74.95667 
t.test(smokers$Weight,nonsmokers$Weight)

    Welch Two Sample t-test

data:  smokers$Weight and nonsmokers$Weight
t = -1.4806, df = 102.38, p-value = 0.1418
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -190.69149   27.68196
sample estimates:
mean of x mean of y 
 3205.989  3287.494 

SOLUTION: Report the Bootstrap interval.

Boot <- thetaHat - quantile(TStar, .975) * SE
Boot
    97.5% 
-187.2741 
test <- t.test(smokers$Weight, nonsmokers$Weight, alternative="greater")
test

    Welch Two Sample t-test

data:  smokers$Weight and nonsmokers$Weight
t = -1.4806, df = 102.38, p-value = 0.9291
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 -172.8809       Inf
sample estimates:
mean of x mean of y 
 3205.989  3287.494 

SOLUTION: We are 95% sure that the difference in the mean is greater than -172.88094.