Lab 8

Name: James Tian

Section: 3

Date: 11/05/2013

Exercises

Load data & inference function:

# Pull in the course-supplied inference() function, then read the contributions data.
source("http://stat.duke.edu/~kkl13/courses/sta102F13/labs/inference.R")
cont <- read.csv(
  file = "http://stat.duke.edu/~kkl13/courses/sta102F13/labs/contributions.csv"
)

Exercise 1:

# Tabulate the number of contribution records per candidate.
table(cont[["cand_nm"]])

## 
##              Bachmann, Michele                   Cain, Herman 
##                             34                             64 
##                 Gingrich, Newt                  Huntsman, Jon 
##                            171                             16 
##             Johnson, Gary Earl           McCotter, Thaddeus G 
##                              8                              1 
##                  Obama, Barack                      Paul, Ron 
##                           7454                            445 
##              Pawlenty, Timothy                    Perry, Rick 
##                             15                             46 
## Roemer, Charles E. 'Buddy' III                   Romney, Mitt 
##                             14                           1579 
##                 Santorum, Rick 
##                            153

We have data for Michele Bachmann, Herman Cain, Newt Gingrich, Jon Huntsman, Gary Earl Johnson, Thaddeus G McCotter, Barack Obama, Ron Paul, Timothy Pawlenty, Rick Perry, Charles E. 'Buddy' Roemer III, Mitt Romney, and Rick Santorum.

Barack Obama received the highest number of contributions, while Thaddeus G McCotter received the fewest.

Exercise 2:

# Restrict to the four major Republican candidates.
rep_mjr <- subset(
  cont,
  cand_nm %in% c(
    "Romney, Mitt", "Paul, Ron", "Gingrich, Newt", "Santorum, Rick"
  )
)
# Keep primary-election contributions only, then drop any unused factor levels.
rep_mjr_pri <- subset(rep_mjr, election_tp == "P2012")
pri <- droplevels(rep_mjr_pri)
table(pri$cand_nm)

## 
## Gingrich, Newt      Paul, Ron   Romney, Mitt Santorum, Rick 
##            165            445            952            151

# Side-by-side boxplots of contribution amount, one box per candidate.
boxplot(pri$contb_receipt_amt ~ pri$cand_nm)

plot of chunk unnamed-chunk-3

Exercise 3:

# Total contribution amount for each candidate in the primary subset.
by(pri$contb_receipt_amt, pri$cand_nm, FUN = sum)

## pri$cand_nm: Gingrich, Newt
## [1] 23638
## -------------------------------------------------------- 
## pri$cand_nm: Paul, Ron
## [1] 67228
## -------------------------------------------------------- 
## pri$cand_nm: Romney, Mitt
## [1] 519044
## -------------------------------------------------------- 
## pri$cand_nm: Santorum, Rick
## [1] 31747

Mitt Romney had the highest total contribution

Exercise 4:

# Average contribution amount for each candidate in the primary subset.
by(pri$contb_receipt_amt, pri$cand_nm, FUN = mean)

## pri$cand_nm: Gingrich, Newt
## [1] 143.3
## -------------------------------------------------------- 
## pri$cand_nm: Paul, Ron
## [1] 151.1
## -------------------------------------------------------- 
## pri$cand_nm: Romney, Mitt
## [1] 545.2
## -------------------------------------------------------- 
## pri$cand_nm: Santorum, Rick
## [1] 210.2

Mitt Romney also had the highest average contribution

Exercise 5:

H0: avg_Gingrich = avg_Paul = avg_Romney = avg_Santorum. Ha: at least one mean is different.

Exercise 6:

The observations need to have been collected independently, be approximately normally distributed within each group, and have roughly equal variance across groups.

# Normal probability plot of Gingrich's contribution amounts, with reference line.
qqnorm(
  pri[pri$cand_nm == "Gingrich, Newt", "contb_receipt_amt"],
  main = "Gingrich"
)
qqline(pri[pri$cand_nm == "Gingrich, Newt", "contb_receipt_amt"])

plot of chunk unnamed-chunk-6


# Normal probability plot of Paul's contribution amounts, with reference line.
qqnorm(
  pri[pri$cand_nm == "Paul, Ron", "contb_receipt_amt"],
  main = "Paul"
)
qqline(pri[pri$cand_nm == "Paul, Ron", "contb_receipt_amt"])

plot of chunk unnamed-chunk-6


# Normal probability plot of Romney's contribution amounts, with reference line.
qqnorm(
  pri[pri$cand_nm == "Romney, Mitt", "contb_receipt_amt"],
  main = "Romney"
)
qqline(pri[pri$cand_nm == "Romney, Mitt", "contb_receipt_amt"])

plot of chunk unnamed-chunk-6


# Normal probability plot of Santorum's contribution amounts, with reference line.
qqnorm(
  pri[pri$cand_nm == "Santorum, Rick", "contb_receipt_amt"],
  main = "Santorum"
)
qqline(pri[pri$cand_nm == "Santorum, Rick", "contb_receipt_amt"])

plot of chunk unnamed-chunk-6

No, the conditions are not met. The strongly deviating tails in the normal probability plots for all four candidates indicate that the contributions are not normally distributed. The contributions may have been collected independently, but their variances do not appear to be equal either.

Exercise 7:

The modified significance level is 0.05/6 = 0.008333.

# ANOVA (with pairwise tests) of contribution amount across the primary candidates.
inference(
  data = pri$contb_receipt_amt,
  group = pri$cand_nm,
  est = "mean",
  type = "ht",
  method = "theoretical",
  alternative = "greater"
)

## Response variable: numerical, Explanatory variable: categorical
## ANOVA
## 
## Summary statistics:
## n_Gingrich, Newt = 165, mean_Gingrich, Newt = 143.3, sd_Gingrich, Newt = 432.8
## n_Paul, Ron = 445, mean_Paul, Ron = 151.1, sd_Paul, Ron = 277.3
## n_Romney, Mitt = 952, mean_Romney, Mitt = 545.2, sd_Romney, Mitt = 968.5
## n_Santorum, Rick = 151, mean_Santorum, Rick = 210.2, sd_Santorum, Rick = 411.1

## H_0: All means are equal.
## H_A: At least one mean is different.
## Analysis of Variance Table
## 
## Response: data
##             Df   Sum Sq  Mean Sq F value Pr(>F)
## group        3 6.29e+07 20951773    36.5 <2e-16
## Residuals 1709 9.82e+08   574777               
## 
## Pairwise tests: t tests with pooled SD 
##                Gingrich, Newt Paul, Ron Romney, Mitt
## Paul, Ron              0.9100        NA           NA
## Romney, Mitt           0.0000    0.0000           NA
## Santorum, Rick         0.4328    0.4074            0

plot of chunk unnamed-chunk-7

Of the pairwise tests, only the p-values corresponding to the differences between Romney and the rest of the candidates are below our significance value of 0.00833, so we conclude that his mean contributions are significantly different.

General Election

# General-election contributions only.
pres_temp1 <- subset(cont, election_tp == "G2012")
# Keep Obama, Romney, and Johnson.
pres_temp2 <- subset(
  pres_temp1,
  cand_nm %in% c("Obama, Barack", "Romney, Mitt", "Johnson, Gary Earl")
)
# Drop any factor levels that no longer occur.
pres <- droplevels(pres_temp2)

Exercise 8:

# ANOVA (with pairwise tests) of contribution amount across the three
# general-election candidates.
inference(
  data = pres$contb_receipt_amt,
  group = pres$cand_nm,
  est = "mean",
  type = "ht",
  method = "theoretical",
  alternative = "greater"
)

## Response variable: numerical, Explanatory variable: categorical
## Summary statistics:
## n_Johnson, Gary Earl = 6, mean_Johnson, Gary Earl = 230, sd_Johnson, Gary Earl = 226.1
## n_Obama, Barack = 2008, mean_Obama, Barack = 159.1, sd_Obama, Barack = 441.4
## n_Romney, Mitt = 627, mean_Romney, Mitt = 500.1, sd_Romney, Mitt = 795.2

## H_0: All means are equal.
## H_A: At least one mean is different.
## Analysis of Variance Table
## 
## Response: data
##             Df   Sum Sq  Mean Sq F value Pr(>F)
## group        2 5.56e+07 27782585    93.1 <2e-16
## Residuals 2638 7.87e+08   298380               
## 
## Pairwise tests: t tests with pooled SD 
##               Johnson, Gary Earl Obama, Barack
## Obama, Barack             0.7510            NA
## Romney, Mitt              0.2281             0

plot of chunk unnamed-chunk-9

At the 0.05/3=.01667 significance level, only the difference in mean contributions between Barack Obama and Mitt Romney is significant.

Exercise 9:

Gary Earl Johnson has only 6 contributions, which is too small a sample size to draw reliable conclusions from. The contributions for all candidates, especially Gary Earl Johnson, also aren't normally distributed.

Exercise 10:

# General-election contributions only.
pres_temp1 <- subset(cont, election_tp == "G2012")
# Keep just Obama and Romney (Johnson is excluded here).
pres_temp2 <- subset(
  pres_temp1,
  cand_nm %in% c("Obama, Barack", "Romney, Mitt")
)
# Drop any factor levels that no longer occur.
pres2 <- droplevels(pres_temp2)

Exercise 11:

# Total general-election contribution amount for each of the two candidates.
by(pres2$contb_receipt_amt, pres2$cand_nm, FUN = sum)

## pres2$cand_nm: Obama, Barack
## [1] 319497
## -------------------------------------------------------- 
## pres2$cand_nm: Romney, Mitt
## [1] 313580

# Average general-election contribution amount for each of the two candidates.
by(pres2$contb_receipt_amt, pres2$cand_nm, FUN = mean)

## pres2$cand_nm: Obama, Barack
## [1] 159.1
## -------------------------------------------------------- 
## pres2$cand_nm: Romney, Mitt
## [1] 500.1

Mitt Romney has the higher mean but Barack Obama has the higher total, because Barack Obama has a much larger number of contributions.

Exercise 12:

We should use a T test because we are comparing the differences in means between two samples, not between multiple samples (F test) nor between populations where we know the true standard deviation (Z test). However, the sample sizes are so large that there should not be significant differences between the T and Z tests.

Exercise 13:

# Two-sided test of H0: mu_Romney - mu_Obama = 0 at the 5% significance level.
# `group` is required here: without it inference() runs a single-mean test of
# mu = 0 on the pooled contribution amounts, which is not the comparison of
# interest. The output recorded below came from the call without `group` and
# must be regenerated by re-knitting.
# NOTE(review): `order` is assumed to be supported by the course's inference()
# and fixes the difference as Romney minus Obama; drop it if unsupported.
inference(data = pres2$contb_receipt_amt, group = pres2$cand_nm, est = "mean",
    siglevel = 0.05, type = "ht", method = "theoretical", null = 0,
    alternative = "twosided", order = c("Romney, Mitt", "Obama, Barack"))

## Single mean 
## Summary statistics:

## mean = 240.257 ;  sd = 565.5362 ;  n = 2635 
## H0: mu = 0 
## HA: mu != 0 
## Standard error = 11.02 
## Test statistic: Z = 21.808 
## p-value =  0

plot of chunk unnamed-chunk-12

Because our p-value is very small, far below our significance level of 0.05, we reject the null hypothesis that there is no difference between the true mean contribution to Mitt Romney and Barack Obama.

Exercise 14:

# 95% confidence interval for the difference in mean contribution,
# Romney minus Obama. `group` is required here: without it inference() returns
# an interval for the single pooled mean, not for the difference between the
# two candidates. The output recorded below came from the call without `group`
# and must be regenerated by re-knitting.
# NOTE(review): `order` is assumed to be supported by the course's inference();
# drop it if unsupported.
inference(data = pres2$contb_receipt_amt, group = pres2$cand_nm, est = "mean",
    siglevel = 0.05, type = "ci", method = "theoretical", null = 0,
    alternative = "twosided", order = c("Romney, Mitt", "Obama, Barack"))

## Single mean 
## Summary statistics:

plot of chunk unnamed-chunk-13

## mean = 240.257 ;  sd = 565.5362 ;  n = 2635 
## Standard error = 11.0172 
## 95 % Confidence interval = ( 218.6637 , 261.8503 )

We are 95% confident that the true mean difference between the contribution of Mitt Romney over Barack Obama is between $218.66 and $261.85.