Inference for Categorical Data in R

DataCamp: Statistics with R

Bonnie Cooper

library( dplyr )
library( ggplot2 )
library( gridExtra )
library( infer )
library( tidyr )
library( broom )
library( gapminder )
library( tidyverse )
library( benford.analysis )

Inference for a Single Parameter

The General Social Survey

load( file = '/home/bonzilla/Documents/ReadingLearningTinkering/DataCamp/Statistics_with_R/gss.RData' )
glimpse( gss )
## Rows: 50,346
## Columns: 28
## $ id       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
## $ year     <dbl> 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982, …
## $ age      <fct> 41, 49, 27, 24, 57, 29, 21, 68, 54, 80, 74, 30, 53, 39, 36, …
## $ class    <fct> WORKING CLASS, WORKING CLASS, MIDDLE CLASS, MIDDLE CLASS, LO…
## $ degree   <fct> LT HIGH SCHOOL, HIGH SCHOOL, HIGH SCHOOL, HIGH SCHOOL, LT HI…
## $ sex      <fct> MALE, FEMALE, FEMALE, FEMALE, MALE, MALE, FEMALE, MALE, FEMA…
## $ marital  <fct> MARRIED, MARRIED, NEVER MARRIED, NEVER MARRIED, NEVER MARRIE…
## $ race     <fct> WHITE, WHITE, WHITE, WHITE, WHITE, WHITE, WHITE, WHITE, WHIT…
## $ region   <fct> NEW ENGLAND, NEW ENGLAND, NEW ENGLAND, NEW ENGLAND, NEW ENGL…
## $ partyid  <fct> "STRONG DEMOCRAT", "STRONG DEMOCRAT", "IND,NEAR DEM", "IND,N…
## $ happy    <fct> PRETTY HAPPY, NOT TOO HAPPY, VERY HAPPY, PRETTY HAPPY, VERY …
## $ grass    <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ relig    <fct> CATHOLIC, CATHOLIC, CATHOLIC, CATHOLIC, CATHOLIC, CATHOLIC, …
## $ cappun2  <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ cappun   <fct> FAVOR, FAVOR, FAVOR, OPPOSE, OPPOSE, FAVOR, OPPOSE, FAVOR, F…
## $ finalter <fct> STAYED SAME, WORSE, BETTER, BETTER, STAYED SAME, BETTER, BET…
## $ protest3 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ natspac  <fct> ABOUT RIGHT, TOO MUCH, TOO LITTLE, TOO LITTLE, ABOUT RIGHT, …
## $ natarms  <fct> TOO LITTLE, TOO LITTLE, ABOUT RIGHT, TOO MUCH, TOO LITTLE, T…
## $ conclerg <fct> ONLY SOME, ONLY SOME, A GREAT DEAL, ONLY SOME, A GREAT DEAL,…
## $ confed   <fct> ONLY SOME, ONLY SOME, ONLY SOME, ONLY SOME, A GREAT DEAL, ON…
## $ conpress <fct> ONLY SOME, ONLY SOME, A GREAT DEAL, ONLY SOME, A GREAT DEAL,…
## $ conjudge <fct> HARDLY ANY, ONLY SOME, A GREAT DEAL, A GREAT DEAL, A GREAT D…
## $ consci   <fct> ONLY SOME, ONLY SOME, A GREAT DEAL, A GREAT DEAL, A GREAT DE…
## $ conlegis <fct> ONLY SOME, ONLY SOME, ONLY SOME, ONLY SOME, A GREAT DEAL, ON…
## $ zodiac   <fct> TAURUS, CAPRICORN, VIRGO, PISCES, CAPRICORN, LEO, LIBRA, CAN…
## $ oversamp <dbl> 1.235, 1.235, 1.235, 1.235, 1.235, 1.235, 1.235, 1.235, 1.23…
## $ postlife <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

Let’s visualize the distribution for the feature happy:

gss2016 <- filter( gss, year == 2016 )

p1 <- ggplot( gss2016, aes( x = happy ) ) +
  geom_bar()

gss2016 <- gss2016 %>%
  mutate( Happy = case_when( happy == "VERY HAPPY" ~ "HAPPY",
                             happy == "PRETTY HAPPY" ~ "HAPPY",
                             happy == "NOT TOO HAPPY" ~ "UNHAPPY" ) ) %>%
  drop_na( Happy )

p2 <- ggplot( gss2016, aes( x = Happy ) ) +
  geom_bar()

grid.arrange( p1, p2, ncol = 2 )

Find the proportion of HAPPY people:

p_hats <- gss2016 %>%
  group_by( Happy ) %>%
  summarise( count = n() ) %>%
  mutate( prop = count / sum( count ) )
## `summarise()` ungrouping output (override with `.groups` argument)
p_hats
## # A tibble: 2 x 3
##   Happy   count  prop
##   <chr>   <int> <dbl>
## 1 HAPPY    2407 0.842
## 2 UNHAPPY   452 0.158

Bootstrap

95% confidence interval: estimate by adding / subtracting \(2\cdot SE\) from \(\hat{p}\). Find the \(SE\) by bootstrapping with three infer verbs:

  • specify()
  • generate()
  • calculate()
boot <- gss2016 %>%
  specify( response = Happy,
           success = "HAPPY" ) %>%
  generate( reps = 500,
            type = "bootstrap" ) %>%
  calculate( stat = "prop" )

bootplot <- ggplot( boot, aes( x = stat ) ) +
  geom_density()

bootplot

Find the standard error, the standard deviation of the bootstrap distribution:

SE <- boot %>% summarize( sd( stat ) ) %>% pull()
SE
## [1] 0.007114349

Now we can find our confidence interval by adding / subtracting 2*SE from p_hat:

CI <- c( p_hats$prop[1] - 2*SE, p_hats$prop[1] + 2*SE)
CI
## [1] 0.8276741 0.8561315

We can be 95% confident that the proportion of happy Americans is somewhere between 82.8% and 85.6%.

bootplot + 
  geom_vline( xintercept = CI[1], color = 'red' ) +
  geom_vline( xintercept = CI[2], color = 'red' )

How much confidence did people have in the scientific community in 2016? Let’s take a look at this survey question…

# Plot distribution of consci
ggplot(gss2016, aes(x = consci )) +
  # Add a bar layer
  geom_bar()

# Compute proportion of high conf
p_hat <- gss2016 %>%
  group_by( consci ) %>%
  summarise( count = n() ) %>%
  mutate( prop = count / sum( count ) )
## `summarise()` ungrouping output (override with `.groups` argument)
p_hat
## # A tibble: 4 x 3
##   consci       count   prop
##   <fct>        <int>  <dbl>
## 1 A GREAT DEAL   790 0.276 
## 2 ONLY SOME      974 0.341 
## 3 HARDLY ANY     115 0.0402
## 4 <NA>           980 0.343
boot1 <- gss2016 %>%
  mutate( ConSci = case_when( consci == "A GREAT DEAL" ~ "High",
                              consci == "ONLY SOME" ~ "Low",
                              consci == "HARDLY ANY" ~ "Low" ) ) %>%
  drop_na( ConSci ) %>%
  specify(response = ConSci, success = "High") %>%
  generate(reps = 1, type = "bootstrap") 


boot1 %>%
  dplyr::summarize(prop_high = mean( ConSci == "High" )) %>%
  pull()
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] 0.4241618

Bootstrap many times and construct a CI:

# Create bootstrap distribution for proportion with High conf
boot_dist <- gss2016 %>%
  mutate( ConSci = case_when( consci == "A GREAT DEAL" ~ "High",
                              consci == "ONLY SOME" ~ "Low",
                              consci == "HARDLY ANY" ~ "Low" ) ) %>%
  drop_na( ConSci ) %>%
  # Specify the response and success
  specify(response = ConSci, success = "High") %>%
  # Generate 500 bootstrap reps
  generate(reps = 500, type = "bootstrap") %>%
  # Calculate proportions
  calculate(stat = "prop")
#boot_dist

# Compute estimate of SE
SE <- boot_dist %>%
  summarize(se = sd( stat )) %>%
  pull()
#SE

# Plot bootstrap distribution of stat
ggplot( boot_dist, aes( x = stat )) +
  # Add density layer
  geom_density() +
  geom_vline( xintercept = mean( boot_dist$stat ) - 2*SE, color = 'red' ) +
  geom_vline( xintercept = mean( boot_dist$stat ) + 2*SE, color = 'red' )

Interpreting a Confidence Interval

Interpretation: “We’re 95% confident that the true proportion is somewhere in this interval.” Equivalently, if we drew 100 samples and computed their estimates and CIs, about 95 of the intervals would contain the true value of the proportion.
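A small simulation sketch of that coverage interpretation (illustrative values of my own choosing, not course code): draw 100 samples from a population with a known proportion and count how many \(\hat{p} \pm 2 \cdot SE\) intervals capture the truth.

set.seed( 1 )
p_true <- 0.85
covers <- replicate( 100, {
  p_hat <- rbinom( 1, size = 500, prob = p_true ) / 500
  SE <- sqrt( p_hat * ( 1 - p_hat ) / 500 )
  ( p_true >= p_hat - 2 * SE ) & ( p_true <= p_hat + 2 * SE )
} )
mean( covers )  # should land near 0.95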

The width of a CI is affected by three things:

  • n, the sample size: the SE decreases as the sample size increases
  • \(\alpha\), the confidence level: higher confidence means a wider CI
  • p, the value of the parameter: SEs are highest when estimating proportions close to 0.5

SE, the standard error: the less data you have to make an estimate, the more uncertainty there will be in the value of that estimate. The sketch below illustrates both the sample-size and the parameter-value effects.
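A small illustrative sketch (my own, not from the course) using the approximation formula \(SE = \sqrt{p(1-p)/n}\):

se_grid <- expand.grid( p = c( 0.1, 0.3, 0.5 ), n = c( 100, 500, 2000 ) )
se_grid$se <- with( se_grid, sqrt( p * ( 1 - p ) / n ) )
se_grid  # SE peaks at p = 0.5 and shrinks as n grows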

The Approximation Shortcut

A shortcut that uses the normal distribution.

First, check whether the sample is sufficiently large:

p_hat <- gss2016 %>%
  mutate( Happy = case_when( happy == "VERY HAPPY" ~ "HAPPY",
                             happy == "PRETTY HAPPY" ~ "HAPPY",
                             happy == "NOT TOO HAPPY" ~ "UNHAPPY" ) ) %>%
  drop_na( Happy ) %>%
  summarize( mean( Happy == "HAPPY" ) ) %>%
  pull()

n <- nrow( gss2016 )

#these 2 criteria both need to be greater than 10:
c( n * p_hat, n*(1-p_hat))
## [1] 2407  452

Shortcut SE:

SE_approx <- sqrt( p_hat * ( 1 - p_hat ) / n )
SE_approx
## [1] 0.006823167

Does the normal distribution describe our data?

ggplot( boot, aes( x = stat ) ) +
          geom_density() +
  stat_function( fun = dnorm,
                 color = 'purple',
                 args = list( mean = p_hat,
                              sd = SE_approx ) )

The approximation method describes the data reasonably well when the normality criteria are met.
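As a cross-check (using the counts from above: 2407 HAPPY out of 2859 respondents), base R’s prop.test() should give a very similar 95% interval without any resampling:

prop.test( x = 2407, n = 2859 )$conf.int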

Proportions: Testing and Power

Hypothesis Test for a Proportion

  • Confidence Interval - captures how much uncertainty there is in the estimate; it is formed using the standard error.
  • Hypothesis Test - What estimates would you observe if the ground truth holds a particular value?
    • Using the infer library, we can use the hypothesize() function
    • the sampling distribution gives us the uncertainty in the data when the \(H_0\) is true

Demonstrate this with the survey data: Do half of Americans favor capital punishment?

gss2016 %>%
  drop_na( cappun ) %>%
  ggplot( aes( x = cappun ) ) +
  geom_bar()

p_hat <- gss2016 %>%
  drop_na( cappun ) %>%
  dplyr::summarize( mean( cappun == 'FAVOR' ) ) %>%
  pull()
p_hat 
## [1] 0.6048327

Is this data consistent with a world where only half of Americans favor the death penalty?
Look at the distribution of the statistic under the null hypothesis \(H_0\):

null <- gss2016 %>%
  drop_na( cappun ) %>%
  specify( response = cappun, success = 'FAVOR' ) %>%
  hypothesize( null = 'point', p = 0.5 ) %>%
  generate( reps = 500, type = 'simulate' ) %>%
  calculate( stat = 'prop' )

ggplot( null, aes( x = stat ) ) +
  geom_density() +
  geom_vline( xintercept = p_hat, color = 'red' )

The proportion is well above the null distribution. We can quantify this with the p-value, which gives the proportion of \(H_0\) values \(\geq\) the estimated proportion from the data, \(\hat{p}\):

null %>%
  summarise( mean( stat > p_hat ) ) %>%
  pull() * 2
## [1] 0

Hypothesis test:

  • Null hypothesis: a theory about the state of the world
  • Null distribution: the distribution of the test statistic assuming the null hypothesis is true
  • p-value: a measure of consistency between the null hypothesis and your observation
    • high p-value, \(\mbox{p-value }\gt\alpha\): observation is consistent with the null
    • low p-value, \(\mbox{p-value }\lt\alpha\): observation is not consistent with the null

Let’s try the same analysis with another feature. Do 75% of Americans believe in an afterlife?

gss2016_al <- gss2016 %>%
  drop_na( postlife )
# Using `gss2016`, plot postlife
ggplot( gss2016_al, aes( x = postlife ) ) +
  # Add bar layer
  geom_bar()

Calculate the estimate of the population proportion:

# Calculate and save proportion that believe
p_hat <- gss2016_al %>%
  dplyr::summarize(prop_yes = mean(  postlife == 'YES')) %>%
  pull()

# See the result
p_hat
## [1] 0.8068885

Our sample gives \(\hat{p} =\) 0.8068885, and we would like to test this result against the claim that 75% of Americans believe in an afterlife. This is interpreted as a point null hypothesis: the population proportion has the value 0.75.
Here we generate data under the null hypothesis:

One example simulation for p = 0.75:

# From previous step
sim1 <- gss2016_al %>%
  specify(response = postlife, success = "YES") %>%
  hypothesize(null = "point", p = 0.75) %>%
  generate(reps = 1, type = "simulate")

# Using sim1, plot postlife
ggplot( sim1, aes( x = postlife )) +
  # Add bar layer
  geom_bar()

Find the simulated proportion:

# Compute proportion that believe
sim1 %>%
  summarise(prop_yes = mean(postlife == "YES")) %>%
  pull()
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] 0.749226

Now that we’ve seen an example simulation, let’s repeat it many times so that we can build a distribution of expected values assuming the true proportion is 75% YES:

# Generate null distribution
null <- gss2016_al %>%
  specify(response = postlife, success = "YES") %>%
  hypothesize(null = "point", p = 0.75) %>%
  generate(reps = 1000, type = "simulate") %>%
  # Calculate proportions
  calculate(stat = 'prop')

Now to visualize the null distribution with our sample estimate, \(\hat{p}\)

# Visualize null distribution
ggplot( null, aes( x = stat )) +
  # Add density layer
  geom_density() +
  # Add line at observed
  geom_vline(xintercept = p_hat, color = "red")

Calculate the p-value:

null %>%
  summarize(
    # Compute the one-tailed p-value
    one_tailed_pval = mean(stat >= p_hat),
    # Compute the two-tailed p-value
    two_tailed_pval = 2 * one_tailed_pval
  ) %>%
  pull(two_tailed_pval)
## [1] 0

The videos fail to reject the null hypothesis for both of the features used above. However, the videos also subset to use only 150 records from the gss2016 dataframe. Here we’ll subset the same way to see if we get a similar result:

gss2016_al150 <- gss2016_al %>%
  sample_n( 150 )

# Calculate and save proportion that believe, in the 150-record subsample
p_hat <- gss2016_al150 %>%
  dplyr::summarize(prop_yes = mean(  postlife == 'YES')) %>%
  pull()

# Generate null distribution
null <- gss2016_al150 %>%
  specify(response = postlife, success = "YES") %>%
  hypothesize(null = "point", p = 0.75) %>%
  generate(reps = 1000, type = "simulate") %>%
  # Calculate proportions
  calculate(stat = 'prop')

# Visualize null distribution
ggplot( null, aes( x = stat )) +
  # Add density layer
  geom_density() +
  # Add line at observed
  geom_vline(xintercept = p_hat, color = "red")

null %>%
  summarize(
    # Compute the one-tailed p-value
    one_tailed_pval = mean(stat >= p_hat),
    # Compute the two-tailed p-value
    two_tailed_pval = 2 * one_tailed_pval
  ) %>%
  pull(two_tailed_pval)
## [1] 0.084

That’s similar to the result from the DataCamp videos. We could replicate the exact result by using the id values to subset the records from gss2016. I’m not sure why the DataCamp course uses less data than is available; maybe we’ll find out as the course progresses….

Intervals for Differences

A question involving two variables, e.g.: do men and women believe at different rates?

Let \(p\) be the proportion that believe in life after death

  • \(H_0 \mbox{ : } p_{female} - p_{male} = 0\)
  • \(H_A \mbox{ : } p_{female} - p_{male} \neq 0\)
p1 <- ggplot( gss2016, aes( x = sex, fill = postlife ) ) +
  geom_bar()
p2 <- ggplot( gss2016, aes( x = sex, fill = postlife ) ) +
  geom_bar( position = 'fill' )

grid.arrange( p1, p2, ncol = 2 )

Men and women appear to have different proportions, but is the difference significant?

Calculate the difference in proportions for the two sexes:

p_hats <- gss2016 %>%
  group_by( sex ) %>%
  summarise( mean( postlife == 'YES', na.rm = TRUE ) ) %>%
  pull()
## `summarise()` ungrouping output (override with `.groups` argument)
d_hat <- diff( p_hats )
d_hat
## [1] 0.1008359

Now to generate some \(H_0\) data:

  • \(H_0 \mbox{ : } p_{female} - p_{male} = 0\)
  • There is no association between belief in the afterlife and the sex of the subject
  • The variable postlife is independent of the variable sex

Here we generate the \(H_0\) distribution by permutation:

null <- gss2016 %>% 
  drop_na( c( postlife, sex ) ) %>%
  specify( postlife ~ sex, success = 'YES' ) %>%
  hypothesize( null = 'independence' ) %>%
  generate( reps = 500, type = 'permute' ) %>%
  calculate( stat = 'diff in props', order = c('FEMALE','MALE' ) )

Let’s visualize the null distribution:

ggplot( null, aes( x = stat ) ) +
  geom_density() +
  geom_vline( xintercept = d_hat, color = 'red' )
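To put a number on the picture, here is a two-tailed permutation p-value (my own addition; the course stops at the plot):

null %>%
  summarise( pval = mean( abs( stat ) >= abs( d_hat ) ) )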

These data suggest that there is a statistically significant difference between the sexes in belief in life after death. Furthermore, from the sign of the difference we can conclude that a higher proportion of females than males believe in life after death.

Let’s run through the same analysis with a different feature, cappun: is there a difference between males and females in the proportion who support the death penalty for people convicted of murder?

gss2016_cappun <- gss2016 %>%
  drop_na( cappun )
# Plot distribution of sex filled by cappun
ggplot(gss2016_cappun, aes(x = sex, fill = cappun)) +
  # Add bar layer
  geom_bar(position = "fill")

Find the sample difference of the two \(\hat{p}\)s:

# Compute two proportions
p_hats <- gss2016_cappun %>%
  # Group by sex
  group_by( sex ) %>%
  # Calculate proportion that FAVOR
  summarize(prop_favor = mean( cappun == 'FAVOR')) %>%
  pull()
## `summarise()` ungrouping output (override with `.groups` argument)
# Compute difference in proportions
d_hat <- diff( p_hats )

# See the result
d_hat
## [1] -0.08807244

NOTE: diff() subtracts in the order the groups appear, and group_by() orders the groups by the factor levels of sex. Here that yields the difference females - males.
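A tiny illustration of diff()’s behavior with toy numbers:

# diff() returns the second element minus the first
diff( c( 0.60, 0.55 ) )
## [1] -0.05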

Now to generate some \(H_0\) data for a null distribution:

# Create null distribution
null <- gss2016_cappun %>%
  # Specify the response and explanatory as well as the success
  specify(cappun ~ sex, success = "FAVOR") %>%
  # Set up null hypothesis
  hypothesize(null = "independence") %>%
  # Generate 500 reps by permutation
  generate(reps = 500, type = "permute") %>%
  # Calculate the statistics
  calculate(stat = 'diff in props', order = c("FEMALE", "MALE"))

# Visualize null
ggplot( null, aes( x= stat)) +
  # Add density layer
  geom_density() +
  # Add red vertical line at obs stat
  geom_vline(xintercept = d_hat, color = "red")

Calculate the p-value:

# Compute two-tailed p-value
null %>%
  summarise(
    one_tailed_pval = mean(stat <= d_hat),
    two_tailed_pval = one_tailed_pval * 2
  ) %>%
  pull(two_tailed_pval)
## [1] 0

The p-value is approximately 0; therefore, the observed difference is statistically significantly different from the null hypothesis value, and we can reject \(H_0\). Note: this is a different result than the one given in the course exercise, because the course subsets the data to 150 records whereas the full dataset is used for the above calculations. Again, I have no idea why the DataCamp course does this.

To illustrate the difference between hypothesis testing and confidence intervals, let’s take a look at how to bootstrap to find a CI for an estimate:

# Create the bootstrap distribution
boot <- gss2016_cappun %>%
  # Specify the variables and success
  specify( cappun ~ sex, success = "FAVOR") %>%
  # Generate 500 bootstrap reps
  generate( reps = 500, type = 'bootstrap' ) %>%
  # Calculate statistics
  calculate(stat = "diff in props", order = c("FEMALE", "MALE"))

    
# Compute the standard error
SE <- boot %>%
  summarize(se = sd( stat )) %>%
  pull()
  
# Form the CI (lower, upper)
c(d_hat - 2*SE, d_hat + 2*SE)
## [1] -0.12650088 -0.04964399

The CI does not span zero. Therefore, zero is not a plausible value; this agrees with the previous hypothesis test.

Statistical Errors

What is the probability that you will reject a true null hypothesis (a Type I error)?
What is the probability that you will fail to reject a false null hypothesis (a Type II error)? One way to explore this is to add a feature generated by a random fair coin toss and evaluate its independence from a real feature, as sketched below.
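Here is a minimal sketch of that idea, reusing the gss2016_cappun dataframe from above (the coinflip column is my own addition, not course code). Since the coin is independent of cappun by construction, at \(\alpha = 0.05\) roughly 5% of such tests should reject anyway: that is the Type I error rate.

set.seed( 42 )
gss_coin <- gss2016_cappun %>%
  mutate( coinflip = factor( sample( c( 'HEADS', 'TAILS' ), n(), replace = TRUE ) ) )

# observed difference in FAVOR rates between the two (meaningless) groups
d_coin <- gss_coin %>%
  group_by( coinflip ) %>%
  summarise( prop_favor = mean( cappun == 'FAVOR' ) ) %>%
  pull() %>%
  diff()

# permutation null and a two-tailed p-value
gss_coin %>%
  specify( cappun ~ coinflip, success = 'FAVOR' ) %>%
  hypothesize( null = 'independence' ) %>%
  generate( reps = 500, type = 'permute' ) %>%
  calculate( stat = 'diff in props', order = c( 'HEADS', 'TAILS' ) ) %>%
  summarise( pval = mean( abs( stat ) >= abs( d_coin ) ) )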

Comparing Many Parameters: Independence

Contingency Tables

gss2016 %>%
  select( partyid, natarms ) %>%
  glimpse()
## Rows: 2,859
## Columns: 2
## $ partyid <fct> "INDEPENDENT", "IND,NEAR DEM", "NOT STR REPUBLICAN", "NOT STR…
## $ natarms <fct> TOO MUCH, TOO MUCH, NA, TOO LITTLE, NA, TOO MUCH, TOO LITTLE,…
gss2016 %>%
  drop_na( partyid, natarms ) %>%
  ggplot( aes( x = partyid, fill = natarms ) ) +
  geom_bar( position = 'fill' ) +
  theme(axis.text.x=element_text(angle=45,hjust=1)) 

Let’s try to clean the data the way the course does: remap the partyid values to just four values.

gss2016_parties <- gss2016 %>%
  select( partyid, natarms ) %>%
  drop_na( partyid, natarms ) %>%
  mutate( party = case_when( partyid == "INDEPENDENT" ~ "Ind",
                            partyid == "IND,NEAR DEM" ~ "Ind",
                            partyid == "IND,NEAR REP" ~ "Ind",
                            partyid == 'NOT STR DEMOCRAT' ~ 'Dem',
                            partyid == 'STRONG DEMOCRAT' ~ 'Dem',
                            partyid == 'NOT STR REPUBLICAN' ~ 'Rep',
                            partyid == 'STRONG REPUBLICAN' ~ 'Rep',
                            partyid == 'OTHER PARTY' ~ 'Oth' ),
          party = as.factor( party )) 
gss2016_parties %>%
  ggplot( aes( x = party, fill = natarms ) ) +
  geom_bar(  ) +
  theme(axis.text.x=element_text(angle=45,hjust=1)) 

Well, that’s as close as I can come without details of how DataCamp processed the dataframe.
The course uses a smaller subset of the data; this code uses the whole set.

Now to visualize as a contingency table:

glimpse( gss2016_parties )
## Rows: 1,382
## Columns: 3
## $ partyid <fct> "INDEPENDENT", "IND,NEAR DEM", "NOT STR REPUBLICAN", "NOT STR…
## $ natarms <fct> TOO MUCH, TOO MUCH, TOO LITTLE, TOO MUCH, TOO LITTLE, ABOUT R…
## $ party   <fct> Ind, Ind, Rep, Dem, Rep, Ind, Dem, Rep, Ind, Dem, Ind, Ind, D…
tab <- gss2016_parties %>%
  select( natarms, party ) %>%
  table()
tab
##              party
## natarms       Dem Ind Oth Rep
##   TOO LITTLE  122 197   6 200
##   ABOUT RIGHT 168 220  10  88
##   TOO MUCH    156 165  16  34
colSums( tab )
## Dem Ind Oth Rep 
## 446 582  32 322

We can see that the Oth category is much smaller than the others.

We can shift back to a dataframe form for plotting with the tidy() function:

tab %>%
  tidy() %>%
  uncount( n ) %>%
  head()
## Warning: 'tidy.table' is deprecated.
## See help("Deprecated")
## # A tibble: 6 x 2
##   natarms    party
##   <chr>      <chr>
## 1 TOO LITTLE Dem  
## 2 TOO LITTLE Dem  
## 3 TOO LITTLE Dem  
## 4 TOO LITTLE Dem  
## 5 TOO LITTLE Dem  
## 6 TOO LITTLE Dem
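Since tidy() on a table is deprecated (note the warning above), one drop-in alternative is base R’s as.data.frame(), which also converts a table of counts into a long dataframe:

tab %>%
  as.data.frame() %>%   # columns: natarms, party, Freq
  uncount( Freq ) %>%
  head()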

Now do a similar visualization to look at how people from different political parties think about spending for space exploration:

# Subset data
gss_party <- gss2016 %>%
  select( partyid, natspac ) %>%
  drop_na( partyid, natspac ) %>%
  mutate( party = case_when( partyid == "INDEPENDENT" ~ "Ind",
                            partyid == "IND,NEAR DEM" ~ "Ind",
                            partyid == "IND,NEAR REP" ~ "Ind",
                            partyid == 'NOT STR DEMOCRAT' ~ 'Dem',
                            partyid == 'STRONG DEMOCRAT' ~ 'Dem',
                            partyid == 'NOT STR REPUBLICAN' ~ 'Rep',
                            partyid == 'STRONG REPUBLICAN' ~ 'Rep',
                            partyid == 'OTHER PARTY' ~ 'Oth' ),
          party = as.factor( party )) %>%
  # Filter out the "Oth"
  filter( party != 'Oth' )

# Visualize distribution 
partyspac <- gss_party %>%
  ggplot( aes( x = party, fill = natspac ) ) +
  # Add bar layer of counts
  geom_bar( position = 'fill') +
  geom_text(aes(label = ..count..), stat = "count", position = "fill")
partyspac

This plot shows the filled bar chart along with the contingency table counts.

Chi-Squared Test Statistic

Compare the two plots of opinions on funding for the military and for space exploration, by party affiliation:

partyarms <- gss2016_parties %>%
  filter( party != 'Oth' ) %>%
  ggplot( aes( x = party, fill = natarms ) ) +
  geom_bar( position = 'fill' ) +
  theme(axis.text.x=element_text(angle=45,hjust=1)) +
  geom_text(aes(label = ..count..), stat = "count", position = "fill")

grid.arrange( partyarms, partyspac, ncol = 2 )

How different are these distributions from a null distribution in which there is no association between party affiliation and opinions on spending? We need the expected counts under the assumption that the variables are independent of one another.

The \(\chi^2\) statistic summarizes the squared and normalized distances of the observed counts from the expected counts: \(\chi^2 = \sum \frac{(O - E)^2}{E}\).
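For reference, the expected counts under independence come straight from the table margins (row total times column total, divided by the grand total); chisq.test() on the tab object from above computes them:

# expected cell counts if natarms and party were independent
chisq.test( tab )$expected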

How do we find the null distribution of the statistic? Permutation:

#an example permutation
perm_1 <- gss2016_parties %>%
  filter( party != 'Oth' ) %>%
  specify(natarms ~ party) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1, type = "permute")
  
# Visualize permuted data
ggplot( perm_1, aes( x = party, fill = natarms ) ) +
  # Add bar layer
  geom_bar( position = 'fill' ) +
  geom_text(aes(label = ..count..), stat = "count", position = "fill")

Compute the \(\chi^2\) statistics:

chi_obs_arms <- gss2016_parties %>% chisq_stat( natarms ~ party )
gss_party %>% chisq_stat( natspac ~ party )
## X-squared 
##       NaN
# The NaN arises because the filtered-out 'Oth' level survives as an unused
# factor level of party; droplevels() before chisq_stat() would avoid it.
# Hard-code the course's value instead:
chi_obs_spac <- 1.32606

Let’s observe many more permutations:

# Create null
null_spac <- gss_party %>%
  specify(natspac ~ party) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 500, type = "permute") %>%
  calculate(stat = "Chisq")
## Warning: Explanatory variable has unused factor levels.
# Visualize null
ggplot( null_spac, aes( x = stat ) ) +
  # Add density layer
  geom_density() +
  # Add vertical line at obs stat
  geom_vline( xintercept = chi_obs_spac, color = 'red' ) +
  theme_classic()

# Create null that natarms and party are indep
null_arms <- gss2016_parties %>%
  specify(natarms ~ party) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 500, type = "permute") %>%
  calculate(stat = "Chisq")
  
# Visualize null
ggplot(null_arms, aes(x = stat)) +
  # Add density layer
  geom_density() +
  # Add vertical red line at obs stat
  geom_vline(xintercept = chi_obs_arms, color = "red")

The natarms statistic deviates way further from the null distribution compared to the natspac statistic. It seems that spending on the military is a partisan issue, while spending on space exploration is not.

Alternate Method: the Chi-Squared Distribution

An approximation to the null distribution of the \(\chi^2\) statistic (see the sketch after this list):

  • statistic: \(\chi^2\)
  • shape is determined by the degrees of freedom
  • \(df = (nrows - 1) \cdot (ncols - 1)\)
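A quick sketch (my own, with illustrative df values) of how the degrees of freedom set the shape of the \(\chi^2\) density:

ggplot( data.frame( x = c( 0, 15 ) ), aes( x = x ) ) +
  stat_function( fun = dchisq, args = list( df = 2 ), color = 'red' ) +
  stat_function( fun = dchisq, args = list( df = 4 ), color = 'blue' ) +
  stat_function( fun = dchisq, args = list( df = 8 ), color = 'darkgreen' )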

Visualize the permutation distribution with the approximation overlaid:

ggplot( null_spac, aes( x = stat ) ) +
  geom_density() +
  stat_function(
    fun = dchisq,
    args = list( df = 4 ),
    color = 'blue'
  ) +
  geom_vline( xintercept = chi_obs_spac, color = 'red' )

The two distributions are very close.

Calculate the p-value:

1 - pchisq( chi_obs_spac, df = 4 )
## [1] 0.8569388

When to use the \(\chi^2\) approximation (checked below for our table):

  • when the expected count is \(\geq\) 5 for each cell of the contingency table
  • when the degrees of freedom \(df \geq 2\)
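We can check the first condition on the natarms-by-party table from earlier, using the expected counts that chisq.test() computes:

# are all expected cell counts at least 5?
all( chisq.test( tab )$expected >= 5 )
## [1] TRUE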

Does this data set provide evidence of an association between happiness and geography?

# Visualize distribution of region and happy
ggplot( gss2016, aes( x = region, fill = happy ) ) +
  # Add bar layer of proportions
  geom_bar( position = 'fill' ) +
  theme(axis.text.x=element_text(angle=45,hjust=1))

# Calculate and save observed statistic
chi_obs <- gss2016 %>%
  chisq_stat( happy ~ region)

# See the result
chi_obs
## X-squared 
##  12.60899

Intervals for the Chi-Squared Distribution

They don’t exist. Don’t do them.

Comparing Many Parameters: Goodness of Fit

Case Study: election fraud

Benford’s law: the first-digit law.
In a free and fair election, the vote counts should follow Benford’s Law.
If the election was fraudulent, then the vote counts should not follow Benford’s Law.
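Benford’s Law says a leading digit \(d\) occurs with probability \(\log_{10}(1 + 1/d)\). A quick check of the probabilities (these are the same values assigned to p_benford further down):

d <- 1:9
round( log10( 1 + 1 / d ), 3 )
## [1] 0.301 0.176 0.125 0.097 0.079 0.067 0.058 0.051 0.046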

firstgap <- gapminder %>%
  filter( year == 2007 ) %>%
  select( country, pop ) %>%
  mutate( strpop = as.character( pop ),
          strpop = as.numeric( str_sub( strpop, 1, 1 ) ) )

head( firstgap )
## # A tibble: 6 x 3
##   country          pop strpop
##   <fct>          <int>  <dbl>
## 1 Afghanistan 31889923      3
## 2 Albania      3600523      3
## 3 Algeria     33333216      3
## 4 Angola      12420476      1
## 5 Argentina   40301927      4
## 6 Australia   20434176      2
ggplot( firstgap, aes( x = strpop ) ) +
  geom_bar() +
  ggtitle( 'Country populations' ) +
  xlab( 'first digit' ) +
  theme_classic()

Take a look at the voting data for an election in Iran:

url <- 'https://assets.datacamp.com/production/repositories/1703/datasets/a777b2366f4e576da5d58fda42f8337332acd3ae/iran.csv'
iran <- read.csv( url )
glimpse( iran )
## Rows: 366
## Columns: 9
## $ province         <chr> "East Azerbaijan", "East Azerbaijan", "East Azerbaij…
## $ city             <chr> "Azar Shahr", "Asko", "Ahar", "Bostan Abad", "Bonab"…
## $ ahmadinejad      <int> 37203, 32510, 47938, 38610, 36395, 435728, 20520, 12…
## $ rezai            <int> 453, 481, 568, 281, 485, 9830, 166, 55, 442, 391, 23…
## $ karrubi          <int> 138, 468, 173, 53, 190, 3513, 74, 46, 211, 126, 173,…
## $ mousavi          <int> 18312, 18799, 26220, 12603, 33695, 419983, 14340, 39…
## $ total_votes_cast <int> 56712, 52643, 75500, 51911, 71389, 876919, 35295, 16…
## $ voided_votes     <int> 606, 385, 601, 364, 624, 7865, 195, 102, 634, 661, 3…
## $ legitimate_votes <int> 56106, 52258, 74899, 51547, 70765, 869054, 35100, 16…
# Compute and save candidate totals
totals <- iran %>%
  summarise(ahmadinejad = sum( ahmadinejad ),
            rezai = sum( rezai),
            karrubi = sum( karrubi ),
            mousavi = sum(mousavi))

# Gather data
gathered_totals <- totals %>%
  gather(key = 'candidate', value = 'votes') %>%
  arrange( desc( votes ) )

# Inspect gathered totals
gathered_totals
##     candidate    votes
## 1 ahmadinejad 24515209
## 2     mousavi 13225330
## 3       rezai   659281
## 4     karrubi   328979
# Plot total votes for each candidate
ggplot( gathered_totals, aes( x = reorder( candidate, -votes ), y = votes )) +
  # Add col layer; geom_col() already uses the identity stat
  geom_col() +
  xlab( 'candidate' ) +
  theme_classic()

Now to break the votes down by province:

# Construct province-level dataset
province_totals <- iran %>%
  # Group by province
  group_by( province ) %>%
  # Sum up votes for top two candidates
  summarise( ahmadinejad = sum( ahmadinejad ), mousavi = sum( mousavi ) ) 
## `summarise()` ungrouping output (override with `.groups` argument)
# Filter for provinces won by #2
province_totals %>%
  filter(mousavi > ahmadinejad)
## # A tibble: 2 x 3
##   province               ahmadinejad mousavi
##   <chr>                        <int>   <int>
## 1 Sistan and Baluchestan      450269  507946
## 2 West Azerbaijan             623946  656508

A couple of provinces were won by Mousavi.

Goodness of Fit

Let’s take a look at the first digit of the total vote count in each city:

glimpse( iran )
## Rows: 366
## Columns: 9
## $ province         <chr> "East Azerbaijan", "East Azerbaijan", "East Azerbaij…
## $ city             <chr> "Azar Shahr", "Asko", "Ahar", "Bostan Abad", "Bonab"…
## $ ahmadinejad      <int> 37203, 32510, 47938, 38610, 36395, 435728, 20520, 12…
## $ rezai            <int> 453, 481, 568, 281, 485, 9830, 166, 55, 442, 391, 23…
## $ karrubi          <int> 138, 468, 173, 53, 190, 3513, 74, 46, 211, 126, 173,…
## $ mousavi          <int> 18312, 18799, 26220, 12603, 33695, 419983, 14340, 39…
## $ total_votes_cast <int> 56712, 52643, 75500, 51911, 71389, 876919, 35295, 16…
## $ voided_votes     <int> 606, 385, 601, 364, 624, 7865, 195, 102, 634, 661, 3…
## $ legitimate_votes <int> 56106, 52258, 74899, 51547, 70765, 869054, 35100, 16…
# Create first_digit variable
iran <- iran %>%
  mutate( first_digit = as.character( total_votes_cast ),
          first_digit = as.factor( str_sub( first_digit, 1, 1 ) ) )
  
# Check that the first digit was extracted correctly
iran %>%
  select( total_votes_cast, first_digit ) %>%
  head()
##   total_votes_cast first_digit
## 1            56712           5
## 2            52643           5
## 3            75500           7
## 4            51911           5
## 5            71389           7
## 6           876919           8
# Construct bar plot
ggplot( iran, aes( x = first_digit )) +
  # Add bar layer
  geom_bar() +
  theme_classic()

bfd_iran <- benford( iran$total_votes_cast)
plot( bfd_iran )

There are some deviations from Benford’s law (many of the data points are above the expected values given by the dashed red line).

bfd_iran
## 
## Benford object:
##  
## Data: iran$total_votes_cast 
## Number of observations used = 366 
## Number of obs. for second order = 365 
## First digits analysed = 2
## 
## Mantissa: 
## 
##    Statistic  Value
##         Mean  0.530
##          Var  0.074
##  Ex.Kurtosis -0.962
##     Skewness -0.223
## 
## 
## The 5 largest deviations: 
## 
##   digits absolute.diff
## 1     14          7.97
## 2     44          6.43
## 3     17          6.09
## 4     19          5.15
## 5     12          4.72
## 
## Stats:
## 
##  Pearson's Chi-squared test
## 
## data:  iran$total_votes_cast
## X-squared = 110.91, df = 89, p-value = 0.05784
## 
## 
##  Mantissa Arc Test
## 
## data:  iran$total_votes_cast
## L2 = 0.020173, df = 2, p-value = 0.0006216
## 
## Mean Absolute Deviation (MAD): 0.004998788
## MAD Conformity - Nigrini (2012): Nonconformity
## Distortion Factor: 3.948838
## 
## Remember: Real data will never conform perfectly to Benford's Law. You should not focus on p-values!

Let’s use the \(\chi^2\) test to evaluate the goodness of fit of Benford’s law to our data. Here’s an example of finding the \(\chi^2\) statistic to compare party affiliation against a uniform distribution:

gss2016_parties_noOth <- gss2016_parties %>%
  filter( party != 'Oth' ) %>%
  droplevels()

gss2016_parties_noOth %>%
  ggplot( aes( x = party )) +
  geom_bar() +
  geom_hline( yintercept =  1350/3, color = 'goldenrod', size = 2 )

tab <- gss2016_parties_noOth %>%
  select( party ) %>%
  table()
tab
## .
## Dem Ind Rep 
## 446 582 322
p_uniform <- c( Dem = 1/3, Ind = 1/3, Rep = 1/3 )
chisq.test( tab, p = p_uniform )$stat
## X-squared 
##  75.16444

How to test this? Try simulating the null hypothesis:

sims <- gss2016_parties_noOth %>%
  specify( response = party ) %>%
  hypothesise( null = 'point', p = p_uniform ) %>%
  generate( reps = 1, type = 'simulate' )
glimpse( sims )
## Rows: 1,350
## Columns: 2
## Groups: replicate [1]
## $ party     <fct> Rep, Rep, Dem, Dem, Dem, Rep, Dem, Dem, Dem, Dem, Ind, Ind,…
## $ replicate <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
ggplot( sims, aes( x = party) ) +
  geom_bar()

This plot looks a lot closer to the null. Now let’s simulate many more times to generate a null distribution:

p_benford <- c( `1` = 0.30103000, `2` = 0.17609126, `3` = 0.12493874,
                `4` = 0.09691001, `5` = 0.07918125, `6` = 0.06694679,
                `7` = 0.05799195, `8` = 0.05115252, `9` = 0.04575749 )

# Compute observed stat
chi_obs_stat <- iran %>%
  chisq_stat(response = first_digit, p = p_benford)
#chi_obs_stat

# Form null distribution
null <- iran %>%
  # Specify the response
  specify( response = first_digit ) %>%
  # Set up the null hypothesis
  hypothesize( null = 'point', p = p_benford ) %>%
  # Generate 500 reps
  generate( reps = 500, type = 'simulate' ) %>%
  # Calculate statistics
  calculate( stat = 'Chisq' )
# Compute degrees of freedom
degrees_of_freedom <- iran %>%
  # Pull out first_digit vector
  pull("first_digit") %>%
  nlevels() -1


# Plot both null dists
ggplot( null, aes( x = stat )) +
  # Add density layer
  geom_density() +
  # Add vertical line at obs stat
  geom_vline( xintercept = chi_obs_stat) +
  # Overlay chisq approx
  stat_function(fun = dchisq, args = list(df = degrees_of_freedom), color = "blue")

# Permutation p-value
null %>% summarize(pval = mean( stat > chi_obs_stat ))
## # A tibble: 1 x 1
##    pval
##   <dbl>
## 1 0.004
# Approximation p-value
1 - pchisq( chi_obs_stat, df = degrees_of_freedom )
##   X-squared 
## 0.006836368

The low p-value suggests that the results are inconsistent with Benford’s law.

And Now to the US

Let’s compare our analysis for the Iranian election with data from a US election

url <- 'https://assets.datacamp.com/production/repositories/1703/datasets/3e73a6c4432671bff5e6f05d340ac1ee41f2ba76/iowa.csv'
iowa <- read.csv( url, colClasses = c( 'factor', 'factor', 'factor', 'factor', 'numeric' ) )
glimpse( iowa )
## Rows: 1,386
## Columns: 5
## $ office    <fct> President/Vice President, President/Vice President, Preside…
## $ candidate <fct> Evan McMullin / Nathan Johnson, Under Votes, Gary Johnson /…
## $ party     <fct> Nominated by Petition, NA, Libertarian, NA, Socialism and L…
## $ county    <fct> Adair, Adair, Adair, Adair, Adair, Adair, Adair, Adair, Ada…
## $ votes     <dbl> 10, 32, 127, 5, 0, 10, 1133, 14, 3, 2461, 3848, 38, 5, 10, …

Let’s look at the Iowa vote totals for the Republican and Democratic presidential tickets, by county:

# Get Iowa county vote totals
iowa_county <- iowa %>%
  # Filter for rep/dem
  filter(candidate %in% c("Donald Trump / Mike Pence", "Hillary Clinton / Tim Kaine")) %>%
  # Group by county
  group_by( county ) %>%
  # Compute total votes in each county
  summarise(dem_rep_votes = sum( votes ))
## `summarise()` ungrouping output (override with `.groups` argument)
# See the result
iowa_county
## # A tibble: 99 x 2
##    county     dem_rep_votes
##    <fct>              <dbl>
##  1 Adair               3594
##  2 Adams               1960
##  3 Allamakee           6514
##  4 Appanoose           5847
##  5 Audubon             3216
##  6 Benton             12910
##  7 Black Hawk         59709
##  8 Boone              13025
##  9 Bremer             12564
## 10 Buchanan            9480
## # … with 89 more rows

Now add a column with the first digit for each vote total:

# Create first_digit variable
iowa_county <- iowa_county %>%
  mutate( first_digit = as.character( dem_rep_votes ),
          first_digit = as.factor( str_sub( first_digit, 1, 1 ) ) )

# Using iowa_county, plot first_digit
ggplot( iowa_county, aes( x = first_digit ) ) +
  # Add bar layer
  geom_bar()

Now to evaluate the deviation of the Iowa data from the Benford prediction with a \(\chi^2\) hypothesis test:

# Compute observed stat
chi_obs_stat <- iowa_county %>%
  chisq_stat(response = first_digit, p = p_benford)

# Form null distribution
null <- iowa_county %>%
  # Specify response
  specify( response = first_digit ) %>%
  # Set up null
  hypothesize( null = 'point', p = p_benford ) %>%
  # Generate 500 reps
  generate( reps = 5000, type = 'simulate' ) %>%
  # Calculate statistics
  calculate( stat = 'Chisq' )

# Visualize null stat
ggplot( null, aes( x = stat ) ) +
  # Add density layer
  geom_density() +
  # Add vertical line at observed stat
  geom_vline( xintercept = chi_obs_stat )

Calculate the p-value:

# Permutation p-value
null %>% summarize(pval = mean( stat > chi_obs_stat ))
## # A tibble: 1 x 1
##     pval
##    <dbl>
## 1 0.0018
# Compute degrees of freedom
degrees_of_freedom <- iowa_county %>%
  # Pull out first_digit vector
  pull("first_digit") %>%
  nlevels() -1

# Approximation p-value
1 - pchisq( chi_obs_stat, df = degrees_of_freedom )
##    X-squared 
## 0.0008829836

The low p-value indicates that if Benford’s Law did apply to vote totals in Iowa, there would be a very small probability of observing data this extreme or more so. Because the observed statistic is far into the tails of the null distribution, the data is inconsistent with the null hypothesis that Benford’s Law applies to vote totals in Iowa.

Election Fraud in Iran and Iowa: debrief

What went wrong? Maybe Benford’s Law doesn’t apply to vote totals, which may not span enough orders of magnitude, or maybe we simply didn’t have enough data.