summary

scrape, tidy, clean, analyze, and visualize table with untidy data representing distribution of income across differing degrees of belief in God.

methods

scrape table associated with below chart

source: belief in God by household income

# load csv approximation of table into memory
raw_data <- read.csv("belief_in_god_by_income.csv", sep = ",")#, check.names = FALSE)
raw_data

##   Income.distribution Believe.in.God..absolutely.certain
## 1    Less than $30000                                66%
## 2       $30000-$49999                                65%
## 3       $50000-$99999                                62%
## 4     $100000 or more                                53%
##   Believe.in.God..fairly.certain
## 1                            20%
## 2                            19%
## 3                            19%
## 4                            22%
##   Believe.in.God..not.too.not.at.all.certain Believe.in.God..don.t.know
## 1                                         5%                         1%
## 2                                         4%                         1%
## 3                                         5%                       < 1%
## 4                                         8%                       < 1%
##   Do.not.believe.in.God Other.don.t.know.if.they.believe.in.God
## 1                    7%                                      2%
## 2                    9%                                      2%
## 3                   11%                                      2%
## 4                   14%                                      3%
##   Sample.Size
## 1        8845
## 2        5920
## 3        8723
## 4        7002

initial observations on state of raw data:

can likely retain semantic value of column headers if change names to likert scale representation (or equivalent qualitative abstraction) of “intensity of belief”
values in certain columns, like “Believe in God; don’t know” contain non-numeric values; round those to .5% for illustration purposes
to obtain counts, should calculate fractional count per percentage value
for easier analysis, one could tidy the dataset such that there are three buckets of certainty as opposed to six
it is also possible to melt the dataset to produce a tall dataset for analysis as well

update headers

names(raw_data) <- c("income_distribution", "most_certain", "fairly_certain", "not_certain", "don't_know", "certainly_not", "other", "sample_size")
raw_data

##   income_distribution most_certain fairly_certain not_certain don't_know
## 1    Less than $30000          66%            20%          5%         1%
## 2       $30000-$49999          65%            19%          4%         1%
## 3       $50000-$99999          62%            19%          5%       < 1%
## 4     $100000 or more          53%            22%          8%       < 1%
##   certainly_not other sample_size
## 1            7%    2%        8845
## 2            9%    2%        5920
## 3           11%    2%        8723
## 4           14%    3%        7002

transform percentage values into numeric counts

raw_data$"don't_know"<- sub("< 1%", ".5", raw_data$"don't_know") # replace "< 1" with arbitrary character
raw_data$"don't_know"<- sub("%", "", raw_data$"don't_know") # replace "%"
raw_data$"don't_know" <- as.numeric(raw_data$"don't_know")
raw_data$"most_certain"<- sub("%", "", raw_data$"most_certain") #remove "%"
raw_data$"most_certain" <- as.numeric(raw_data$"most_certain")
raw_data$"fairly_certain"<- sub("%", "", raw_data$"fairly_certain") # replace "%"
raw_data$"fairly_certain" <- as.numeric(raw_data$"fairly_certain")
raw_data$"not_certain"<- sub("%", "", raw_data$"not_certain") # replace "%"
raw_data$"not_certain"<- as.numeric(raw_data$"not_certain") 
raw_data$"certainly_not"<- sub("%", "", raw_data$"certainly_not") # replace "%"
raw_data$"certainly_not" <- as.numeric(raw_data$"certainly_not")
raw_data$"other"<- sub("%", "", raw_data$"other") # replace "%" 
raw_data$"other" <- as.numeric(raw_data$"other")
raw_data

##   income_distribution most_certain fairly_certain not_certain don't_know
## 1    Less than $30000           66             20           5        1.0
## 2       $30000-$49999           65             19           4        1.0
## 3       $50000-$99999           62             19           5        0.5
## 4     $100000 or more           53             22           8        0.5
##   certainly_not other sample_size
## 1             7     2        8845
## 2             9     2        5920
## 3            11     2        8723
## 4            14     3        7002

# divide relevant values by 100
divide_by_100 <- function(x) (x / 100) 
raw_data <- raw_data %>% mutate_at(
  c("most_certain", "fairly_certain", "not_certain", "don't_know", "certainly_not", "other"), 
  divide_by_100
  )

# multiple relevant values by sample size
raw_data$"most_certain" <- round(raw_data$"most_certain" * raw_data$"sample_size")
raw_data$"fairly_certain" <- round(raw_data$"fairly_certain" * raw_data$"sample_size")
raw_data$"not_certain" <- round(raw_data$"not_certain" * raw_data$"sample_size")
raw_data$"don't_know" <- round(raw_data$"don't_know" * raw_data$"sample_size")
raw_data$"certainly_not" <- round(raw_data$"certainly_not" * raw_data$"sample_size")
raw_data$"other" <- round(raw_data$"other" * raw_data$"sample_size")
raw_data

##   income_distribution most_certain fairly_certain not_certain don't_know
## 1    Less than $30000         5838           1769         442         88
## 2       $30000-$49999         3848           1125         237         59
## 3       $50000-$99999         5408           1657         436         44
## 4     $100000 or more         3711           1540         560         35
##   certainly_not other sample_size
## 1           619   177        8845
## 2           533   118        5920
## 3           960   174        8723
## 4           980   210        7002

wide.raw_data <- raw_data

create “yes”, “no”, “maybe” buckets

wide.raw_data$"yes" <- raw_data$"most_certain" + raw_data$"fairly_certain" + raw_data$"not_certain"
wide.raw_data$"no" <- raw_data$"certainly_not" 
wide.raw_data$"maybe" <- raw_data$"don't_know" + raw_data$"other"
wide.raw_data

##   income_distribution most_certain fairly_certain not_certain don't_know
## 1    Less than $30000         5838           1769         442         88
## 2       $30000-$49999         3848           1125         237         59
## 3       $50000-$99999         5408           1657         436         44
## 4     $100000 or more         3711           1540         560         35
##   certainly_not other sample_size  yes  no maybe
## 1           619   177        8845 8049 619   265
## 2           533   118        5920 5210 533   177
## 3           960   174        8723 7501 960   218
## 4           980   210        7002 5811 980   245

commence analysis of data

questions:

what is average certainty/uncertainty across income?
what is aggregate certainty/uncertainty across income?
what is aggregate certainty/uncertainty in total?

# calculate average certainty
avg.raw_data <- raw_data %>%
select(income_distribution, most_certain, fairly_certain, not_certain, `don't_know`, certainly_not, other, sample_size) %>%
group_by(income_distribution) %>%
summarise(yes = mean(most_certain, fairly_certain, not_certain), maybe = mean(`don't_know`, other), no = mean(certainly_not))

avg.raw_data

## # A tibble: 4 x 4
##   income_distribution   yes maybe    no
##   <fct>               <dbl> <dbl> <dbl>
## 1 $100000 or more      3711    35   980
## 2 $30000-$49999        3848    59   533
## 3 $50000-$99999        5408    44   960
## 4 Less than $30000     5838    88   619

# calculate aggregate certainty
agg.raw_data <- wide.raw_data %>%
select(income_distribution, yes, no, maybe) %>%
group_by(income_distribution) %>%
summarise(yes = sum(yes), maybe = sum(maybe), no = sum(no))

agg.raw_data

## # A tibble: 4 x 4
##   income_distribution   yes maybe    no
##   <fct>               <dbl> <dbl> <dbl>
## 1 $100000 or more      5811   245   980
## 2 $30000-$49999        5210   177   533
## 3 $50000-$99999        7501   218   960
## 4 Less than $30000     8049   265   619

# melt aggregate certainty table
m.wide.raw_data <- melt(agg.raw_data)

## Using income_distribution as id variables

names(m.wide.raw_data) <- c("income_dist", "certainty", "count")
m.wide.raw_data

##         income_dist certainty count
## 1   $100000 or more       yes  5811
## 2     $30000-$49999       yes  5210
## 3     $50000-$99999       yes  7501
## 4  Less than $30000       yes  8049
## 5   $100000 or more     maybe   245
## 6     $30000-$49999     maybe   177
## 7     $50000-$99999     maybe   218
## 8  Less than $30000     maybe   265
## 9   $100000 or more        no   980
## 10    $30000-$49999        no   533
## 11    $50000-$99999        no   960
## 12 Less than $30000        no   619

# melt raw certainty table
m.raw_data <- melt(raw_data)

## Using income_distribution as id variables

names(m.raw_data) <- c("income_dist", "certainty", "count")
m.raw_data

##         income_dist      certainty count
## 1  Less than $30000   most_certain  5838
## 2     $30000-$49999   most_certain  3848
## 3     $50000-$99999   most_certain  5408
## 4   $100000 or more   most_certain  3711
## 5  Less than $30000 fairly_certain  1769
## 6     $30000-$49999 fairly_certain  1125
## 7     $50000-$99999 fairly_certain  1657
## 8   $100000 or more fairly_certain  1540
## 9  Less than $30000    not_certain   442
## 10    $30000-$49999    not_certain   237
## 11    $50000-$99999    not_certain   436
## 12  $100000 or more    not_certain   560
## 13 Less than $30000     don't_know    88
## 14    $30000-$49999     don't_know    59
## 15    $50000-$99999     don't_know    44
## 16  $100000 or more     don't_know    35
## 17 Less than $30000  certainly_not   619
## 18    $30000-$49999  certainly_not   533
## 19    $50000-$99999  certainly_not   960
## 20  $100000 or more  certainly_not   980
## 21 Less than $30000          other   177
## 22    $30000-$49999          other   118
## 23    $50000-$99999          other   174
## 24  $100000 or more          other   210
## 25 Less than $30000    sample_size  8845
## 26    $30000-$49999    sample_size  5920
## 27    $50000-$99999    sample_size  8723
## 28  $100000 or more    sample_size  7002

# calculate all certainty regardless of income
all.raw_data <- agg.raw_data %>%
   select_if(is.numeric) %>%
   summarise_all(funs(sum))

## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.

all.raw_data

## # A tibble: 1 x 3
##     yes maybe    no
##   <dbl> <dbl> <dbl>
## 1 26571   905  3092

ggplot(m.wide.raw_data, aes(fill=certainty, y=count, x=certainty)) +
  ggtitle("simplified distribution of certainty") +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_grid(.~income_dist) +
  geom_bar(position='dodge', stat="identity") +
  xlab('certainty by income') +
  ylab('count')

ggplot(m.raw_data, aes(fill=certainty, y=count, x=certainty)) +
  ggtitle("specific distribution of certainty") +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_grid(.~income_dist) +
  geom_bar(position='dodge', stat="identity") +
  theme(axis.text.x = element_text(angle = 90)) + # rotate x-axis labels
  xlab('certainty by income') +
  ylab('count')

Data607_Proj3

Joe_Rovalino, Tamiko Jenkins and Nicholas Chung

10/02/2019

summary

scrape, tidy, clean, analyze, and visualize table with untidy data representing distribution of income across differing degrees of belief in God.

methods

scrape table associated with below chart

initial observations on state of raw data:

update headers

transform percentage values into numeric counts

create “yes”, “no”, “maybe” buckets

commence analysis of data

questions: