Load Data
# Create the Tibble
ep <- tibble::tribble(~Party, ~`18-29`, ~`30-44`, ~`45-59`, ~`60+`, "Democrat", 86L,
72L, 73L, 71L, "Independent", 52L, 51L, 55L, 54L, "Republican", 61L, 74L, 70L,
73L)
# Add sums for cols and rows
ep$Totals <- rowSums(ep[, c(2:5)])
c.sums <- c("Totals", as.vector(colSums(ep[, c(2:6)], dims = 1)))
as.numeric(c.sums[2:6])## [1] 199 197 198 198 792
ep <- rbind(ep, c.sums)
# Apparently rbind converts all the values to characters, this next f(n) converts
# them back to numeric
ep <- transform(ep, `18-29` = as.numeric(`18-29`), `30-44` = as.numeric(`30-44`),
`45-59` = as.numeric(`45-59`), `60+` = as.numeric(`60+`))
# Fix the column headers
names(ep) <- c("Party", "18-29", "30-44", "45-59", "60+", "Totals")Use Google Sheets to Calculate Chi Squared by hand
# Use Google Sheets to Calculate the Chi Squared 'by hand'
library(googlesheets)
# Authorize a northeastern acct with the gs_auth() funvtion to have access to the
# sheet below
# gs_auth() ## register a google sheet object with the URL
gepURL <- gs_url("https://docs.google.com/spreadsheets/d/11AVqy2JoGA8YT8tSPTk3VJxEuxsuGBOFqqoNaAXx6SM/edit#gid=0")
# Next f(n) was used to write data to the sheet
# gep <- gep %>% gs_edit_cells(input = ep, col_names=T)
# read data from the sheet, store as gep data frame object
gep <- gs_read(gepURL, ws = 1)Verify your results using R to conduct the test.
1b
qchisq(0.95, 6)## [1] 12.59159
1 - pchisq(3.652908, 6)## [1] 0.7235272
chisq.test(ep[c(1:3), c(2:5)])##
## Pearson's Chi-squared test
##
## data: ep[c(1:3), c(2:5)]
## X-squared = 3.6529, df = 6, p-value = 0.7235
Now test for independence using ANOVA (an F test). Your three groups are Democrats, Independents, and Republicans. The average age for a Democrat is 43.3, for an Independent it’s 44.6, and for a Republican it’s 45.1. The standard deviations of each are D: 9.1, I: 9.2, R: 9.2. The overall mean age is 44.2. Do the F test by hand, again showing each step.
\(H_0:\mu_D=\mu_I=\mu_R\) The average age is the same across the affiliations. \(H_1:\neg(\mu_D=\mu_I=\mu_R)\) The average age is different across the affiliations. F-Test \[\begin{aligned} f_{stat}=\frac{\textrm{average variance between groups}}{\textrm{average variance within groups}} \\ \text{between groups}=\frac{n_{1}(\bar{y}_{1} - \bar{y})^{2}+ ... + n_{G}(\bar{y}_{G} - \bar{y})^{2} }{df=G-1} \\ \text{within groups}=\frac{(n_{1}-1)s_{1}^{2}+ ... + (n_{G}-1)s_{G}^{2} }{df=N-G}\\ \text{ where }N=\text{sum(n) in all},G=\text{# of Groups} \\ \text{compare }f_{stat} \text{ to } \text{qf}(cl,df_1,df_2) \\
\text{or compare }p_{value}=\text{1-pf}(f_{stat},df_1,df_2) \text{ to }\alpha\end{aligned}\] \(\text{between groups}=\frac{302(43.3 - 44.2)^{2}+212(44.6 - 44.2)^{2}+278(45.1 - 44.2)^{2} }{df=3-1}\)
\(\text{within groups}=\frac{(302-1)1.28^{2}+ (212-1)1.43^{2} + (278-1)1.1^{2} }{df=792-3}\)
2a - Computations with R
# Summary data input vectors
mu <- c(43.3, 44.6, 45.1)
sd <- c(9.1, 9.2, 9.2)
n <- gep$Totals[1:3]
# Function for anova test with input values
# Inputs:
# y - Vector of means
# mu - Mean of means, finds it from input values if not declared
# sd - Vector of standard deviations
# n - Vector of group totals
# Returns - f-statistic
anva <- function(y, mu = mean(y), s, n) {
wg.v <- vector("numeric")
bg.v <- vector("numeric")
for (i in 1:length(n)) {
# Between Group Variance
bgc <- n[i] * (y[i] - mu)^2
bg.v <- append(bg.v, bgc, after = length(bg.v))
i <- i + 1
}
bgvar <- sum(bg.v)/(length(n) - 1)
i <- 1
for (i in 1:length(n)) {
# within Group Variance
wgc <- (n[i] - 1) * s[i]^2
wg.v <- append(wg.v, wgc, after = length(wg.v))
i <- i + 1
}
wgvar <- sum(wg.v)/(sum(n) - length(n))
fs <- bgvar/wgvar
pv <- 1 - pf(fs, length(n) - 1, (sum(n) - length(n)))
output <- tibble::tribble(~Param, ~Value, "Fstat", fs, "pValue", pv)
return(output)
}
anva(mu, 44.2, sd, n)## # A tibble: 2 x 2
## Param Value
## <chr> <dbl>
## 1 Fstat 3.00040993
## 2 pValue 0.05033486
qf(0.95, 2, 789)## [1] 3.007136
Check your results in R using simulated data. Generate a simulated dataset by creating three vectors: Democrats, Republicans, and Independents. Each vector should be a list of ages, each with a length equal to the number of Democrats, Independents, and Republicans in the table above, and the appropriate mean and sd based on 2.a (use rnorm to generate the vectors). Combine all three into a single dataframe with two variables: age, and a factor that specifies D, I, or R. Then conduct an F test using R’s aov function on that data and compare the results to 2a. Do your results match 2a? If not, why not?
2b
# Create the data frames with rnorm
D <- data.frame(rnorm(n[1], mean = mu[1], sd[1]))
I <- data.frame(rnorm(n[2], mean = mu[2], sd[2]))
R <- data.frame(rnorm(n[3], mean = mu[3], sd[3]))
# I am unable to find a way to create a data frame with vectors of unequal length
# As a workaround, I'll use Google Sheets and then read the data back
# Create a new Worksheet named 2b gs_ws_new(gepURL,ws_title='2b')
# Put the dataframes in columns a b c respectively
# gs_edit_cells(gepURL,ws=2,input=D,anchor='A1')
# gs_edit_cells(gepURL,ws=2,input=I,anchor='B1')
# gs_edit_cells(gepURL,ws=2,input=R,anchor='C1')
ageParty <- gs_read(gepURL, ws = 2) ## Read the data back into R
cn <- c("D", "I", "R")
colnames(ageParty) <- cn
ageParty <- ageParty %>% gather(key = "Party", value = "Age") %>% filter(is.na(Age) ==
F)
aPaov <- aov(Age ~ Party, data = ageParty)
summary(aPaov)## Df Sum Sq Mean Sq F value Pr(>F)
## Party 2 767 383.5 4.4 0.0126 *
## Residuals 789 68754 87.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
2b - Possible causes for discrepancy
# Determine possible causes for the discrepancy between the two tests
De <- ageParty %>% filter(Party == "D")
Ie <- ageParty %>% filter(Party == "I")
Re <- ageParty %>% filter(Party == "R")
c(mean(De$Age), mean(Ie$Age), mean(Re$Age))## [1] 43.06208 45.33170 44.83149
c(mean(D[, 1]), mean(I[, 1]), mean(R[, 1])) #experimental means## [1] 43.59896 44.45389 44.54748
mu #means provided## [1] 43.3 44.6 45.1
c(sd(D[, 1]), sd(I[, 1]), sd(R[, 1])) #experimental sd## [1] 8.770424 9.994521 9.594601
sd #sd provided## [1] 9.1 9.2 9.2