summary

scrape, tidy, clean, analyze, and visualize table with untidy data representing distribution of income across differing degrees of belief in God.

methods

scrape table associated with below chart

source: belief in God by household income

source: belief in God by household income

# load csv approximation of table into memory
raw_data <- read.csv("belief_in_god_by_income.csv", sep = ",")#, check.names = FALSE)
raw_data
##   Income.distribution Believe.in.God..absolutely.certain
## 1    Less than $30000                                66%
## 2       $30000-$49999                                65%
## 3       $50000-$99999                                62%
## 4     $100000 or more                                53%
##   Believe.in.God..fairly.certain
## 1                            20%
## 2                            19%
## 3                            19%
## 4                            22%
##   Believe.in.God..not.too.not.at.all.certain Believe.in.God..don.t.know
## 1                                         5%                         1%
## 2                                         4%                         1%
## 3                                         5%                       < 1%
## 4                                         8%                       < 1%
##   Do.not.believe.in.God Other.don.t.know.if.they.believe.in.God
## 1                    7%                                      2%
## 2                    9%                                      2%
## 3                   11%                                      2%
## 4                   14%                                      3%
##   Sample.Size
## 1        8845
## 2        5920
## 3        8723
## 4        7002

initial observations on state of raw data:

update headers

names(raw_data) <- c("income_distribution", "most_certain", "fairly_certain", "not_certain", "don't_know", "certainly_not", "other", "sample_size")
raw_data
##   income_distribution most_certain fairly_certain not_certain don't_know
## 1    Less than $30000          66%            20%          5%         1%
## 2       $30000-$49999          65%            19%          4%         1%
## 3       $50000-$99999          62%            19%          5%       < 1%
## 4     $100000 or more          53%            22%          8%       < 1%
##   certainly_not other sample_size
## 1            7%    2%        8845
## 2            9%    2%        5920
## 3           11%    2%        8723
## 4           14%    3%        7002

transform percentage values into numeric counts

raw_data$"don't_know"<- sub("< 1%", ".5", raw_data$"don't_know") # replace "< 1" with arbitrary character
raw_data$"don't_know"<- sub("%", "", raw_data$"don't_know") # replace "%"
raw_data$"don't_know" <- as.numeric(raw_data$"don't_know")
raw_data$"most_certain"<- sub("%", "", raw_data$"most_certain") #remove "%"
raw_data$"most_certain" <- as.numeric(raw_data$"most_certain")
raw_data$"fairly_certain"<- sub("%", "", raw_data$"fairly_certain") # replace "%"
raw_data$"fairly_certain" <- as.numeric(raw_data$"fairly_certain")
raw_data$"not_certain"<- sub("%", "", raw_data$"not_certain") # replace "%"
raw_data$"not_certain"<- as.numeric(raw_data$"not_certain") 
raw_data$"certainly_not"<- sub("%", "", raw_data$"certainly_not") # replace "%"
raw_data$"certainly_not" <- as.numeric(raw_data$"certainly_not")
raw_data$"other"<- sub("%", "", raw_data$"other") # replace "%" 
raw_data$"other" <- as.numeric(raw_data$"other")
raw_data
##   income_distribution most_certain fairly_certain not_certain don't_know
## 1    Less than $30000           66             20           5        1.0
## 2       $30000-$49999           65             19           4        1.0
## 3       $50000-$99999           62             19           5        0.5
## 4     $100000 or more           53             22           8        0.5
##   certainly_not other sample_size
## 1             7     2        8845
## 2             9     2        5920
## 3            11     2        8723
## 4            14     3        7002
# divide relevant values by 100
divide_by_100 <- function(x) (x / 100) 
raw_data <- raw_data %>% mutate_at(
  c("most_certain", "fairly_certain", "not_certain", "don't_know", "certainly_not", "other"), 
  divide_by_100
  )

# multiple relevant values by sample size
raw_data$"most_certain" <- round(raw_data$"most_certain" * raw_data$"sample_size")
raw_data$"fairly_certain" <- round(raw_data$"fairly_certain" * raw_data$"sample_size")
raw_data$"not_certain" <- round(raw_data$"not_certain" * raw_data$"sample_size")
raw_data$"don't_know" <- round(raw_data$"don't_know" * raw_data$"sample_size")
raw_data$"certainly_not" <- round(raw_data$"certainly_not" * raw_data$"sample_size")
raw_data$"other" <- round(raw_data$"other" * raw_data$"sample_size")
raw_data
##   income_distribution most_certain fairly_certain not_certain don't_know
## 1    Less than $30000         5838           1769         442         88
## 2       $30000-$49999         3848           1125         237         59
## 3       $50000-$99999         5408           1657         436         44
## 4     $100000 or more         3711           1540         560         35
##   certainly_not other sample_size
## 1           619   177        8845
## 2           533   118        5920
## 3           960   174        8723
## 4           980   210        7002
wide.raw_data <- raw_data

create “yes”, “no”, “maybe” buckets

wide.raw_data$"yes" <- raw_data$"most_certain" + raw_data$"fairly_certain" + raw_data$"not_certain"
wide.raw_data$"no" <- raw_data$"certainly_not" 
wide.raw_data$"maybe" <- raw_data$"don't_know" + raw_data$"other"
wide.raw_data
##   income_distribution most_certain fairly_certain not_certain don't_know
## 1    Less than $30000         5838           1769         442         88
## 2       $30000-$49999         3848           1125         237         59
## 3       $50000-$99999         5408           1657         436         44
## 4     $100000 or more         3711           1540         560         35
##   certainly_not other sample_size  yes  no maybe
## 1           619   177        8845 8049 619   265
## 2           533   118        5920 5210 533   177
## 3           960   174        8723 7501 960   218
## 4           980   210        7002 5811 980   245

commence analysis of data

questions:

  • what is average certainty/uncertainty across income?
  • what is aggregate certainty/uncertainty across income?
  • what is aggregate certainty/uncertainty in total?
# calculate average certainty
avg.raw_data <- raw_data %>%
select(income_distribution, most_certain, fairly_certain, not_certain, `don't_know`, certainly_not, other, sample_size) %>%
group_by(income_distribution) %>%
summarise(yes = mean(most_certain, fairly_certain, not_certain), maybe = mean(`don't_know`, other), no = mean(certainly_not))

avg.raw_data
## # A tibble: 4 x 4
##   income_distribution   yes maybe    no
##   <fct>               <dbl> <dbl> <dbl>
## 1 $100000 or more      3711    35   980
## 2 $30000-$49999        3848    59   533
## 3 $50000-$99999        5408    44   960
## 4 Less than $30000     5838    88   619
# calculate aggregate certainty
agg.raw_data <- wide.raw_data %>%
select(income_distribution, yes, no, maybe) %>%
group_by(income_distribution) %>%
summarise(yes = sum(yes), maybe = sum(maybe), no = sum(no))

agg.raw_data
## # A tibble: 4 x 4
##   income_distribution   yes maybe    no
##   <fct>               <dbl> <dbl> <dbl>
## 1 $100000 or more      5811   245   980
## 2 $30000-$49999        5210   177   533
## 3 $50000-$99999        7501   218   960
## 4 Less than $30000     8049   265   619
# melt aggregate certainty table
m.wide.raw_data <- melt(agg.raw_data)
## Using income_distribution as id variables
names(m.wide.raw_data) <- c("income_dist", "certainty", "count")
m.wide.raw_data
##         income_dist certainty count
## 1   $100000 or more       yes  5811
## 2     $30000-$49999       yes  5210
## 3     $50000-$99999       yes  7501
## 4  Less than $30000       yes  8049
## 5   $100000 or more     maybe   245
## 6     $30000-$49999     maybe   177
## 7     $50000-$99999     maybe   218
## 8  Less than $30000     maybe   265
## 9   $100000 or more        no   980
## 10    $30000-$49999        no   533
## 11    $50000-$99999        no   960
## 12 Less than $30000        no   619
# melt raw certainty table
m.raw_data <- melt(raw_data)
## Using income_distribution as id variables
names(m.raw_data) <- c("income_dist", "certainty", "count")
m.raw_data
##         income_dist      certainty count
## 1  Less than $30000   most_certain  5838
## 2     $30000-$49999   most_certain  3848
## 3     $50000-$99999   most_certain  5408
## 4   $100000 or more   most_certain  3711
## 5  Less than $30000 fairly_certain  1769
## 6     $30000-$49999 fairly_certain  1125
## 7     $50000-$99999 fairly_certain  1657
## 8   $100000 or more fairly_certain  1540
## 9  Less than $30000    not_certain   442
## 10    $30000-$49999    not_certain   237
## 11    $50000-$99999    not_certain   436
## 12  $100000 or more    not_certain   560
## 13 Less than $30000     don't_know    88
## 14    $30000-$49999     don't_know    59
## 15    $50000-$99999     don't_know    44
## 16  $100000 or more     don't_know    35
## 17 Less than $30000  certainly_not   619
## 18    $30000-$49999  certainly_not   533
## 19    $50000-$99999  certainly_not   960
## 20  $100000 or more  certainly_not   980
## 21 Less than $30000          other   177
## 22    $30000-$49999          other   118
## 23    $50000-$99999          other   174
## 24  $100000 or more          other   210
## 25 Less than $30000    sample_size  8845
## 26    $30000-$49999    sample_size  5920
## 27    $50000-$99999    sample_size  8723
## 28  $100000 or more    sample_size  7002
# calculate all certainty regardless of income
all.raw_data <- agg.raw_data %>%
   select_if(is.numeric) %>%
   summarise_all(funs(sum))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
all.raw_data
## # A tibble: 1 x 3
##     yes maybe    no
##   <dbl> <dbl> <dbl>
## 1 26571   905  3092
ggplot(m.wide.raw_data, aes(fill=certainty, y=count, x=certainty)) +
  ggtitle("simplified distribution of certainty") +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_grid(.~income_dist) +
  geom_bar(position='dodge', stat="identity") +
  xlab('certainty by income') +
  ylab('count')

ggplot(m.raw_data, aes(fill=certainty, y=count, x=certainty)) +
  ggtitle("specific distribution of certainty") +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_grid(.~income_dist) +
  geom_bar(position='dodge', stat="identity") +
  theme(axis.text.x = element_text(angle = 90)) + # rotate x-axis labels
  xlab('certainty by income') +
  ylab('count')