source: belief in God by household income
# load csv approximation of table into memory
raw_data <- read.csv("belief_in_god_by_income.csv", sep = ",")#, check.names = FALSE)
raw_data
## Income.distribution Believe.in.God..absolutely.certain
## 1 Less than $30000 66%
## 2 $30000-$49999 65%
## 3 $50000-$99999 62%
## 4 $100000 or more 53%
## Believe.in.God..fairly.certain
## 1 20%
## 2 19%
## 3 19%
## 4 22%
## Believe.in.God..not.too.not.at.all.certain Believe.in.God..don.t.know
## 1 5% 1%
## 2 4% 1%
## 3 5% < 1%
## 4 8% < 1%
## Do.not.believe.in.God Other.don.t.know.if.they.believe.in.God
## 1 7% 2%
## 2 9% 2%
## 3 11% 2%
## 4 14% 3%
## Sample.Size
## 1 8845
## 2 5920
## 3 8723
## 4 7002
names(raw_data) <- c("income_distribution", "most_certain", "fairly_certain", "not_certain", "don't_know", "certainly_not", "other", "sample_size")
raw_data
## income_distribution most_certain fairly_certain not_certain don't_know
## 1 Less than $30000 66% 20% 5% 1%
## 2 $30000-$49999 65% 19% 4% 1%
## 3 $50000-$99999 62% 19% 5% < 1%
## 4 $100000 or more 53% 22% 8% < 1%
## certainly_not other sample_size
## 1 7% 2% 8845
## 2 9% 2% 5920
## 3 11% 2% 8723
## 4 14% 3% 7002
raw_data$"don't_know"<- sub("< 1%", ".5", raw_data$"don't_know") # replace "< 1" with arbitrary character
raw_data$"don't_know"<- sub("%", "", raw_data$"don't_know") # replace "%"
raw_data$"don't_know" <- as.numeric(raw_data$"don't_know")
raw_data$"most_certain"<- sub("%", "", raw_data$"most_certain") #remove "%"
raw_data$"most_certain" <- as.numeric(raw_data$"most_certain")
raw_data$"fairly_certain"<- sub("%", "", raw_data$"fairly_certain") # replace "%"
raw_data$"fairly_certain" <- as.numeric(raw_data$"fairly_certain")
raw_data$"not_certain"<- sub("%", "", raw_data$"not_certain") # replace "%"
raw_data$"not_certain"<- as.numeric(raw_data$"not_certain")
raw_data$"certainly_not"<- sub("%", "", raw_data$"certainly_not") # replace "%"
raw_data$"certainly_not" <- as.numeric(raw_data$"certainly_not")
raw_data$"other"<- sub("%", "", raw_data$"other") # replace "%"
raw_data$"other" <- as.numeric(raw_data$"other")
raw_data
## income_distribution most_certain fairly_certain not_certain don't_know
## 1 Less than $30000 66 20 5 1.0
## 2 $30000-$49999 65 19 4 1.0
## 3 $50000-$99999 62 19 5 0.5
## 4 $100000 or more 53 22 8 0.5
## certainly_not other sample_size
## 1 7 2 8845
## 2 9 2 5920
## 3 11 2 8723
## 4 14 3 7002
# divide relevant values by 100
divide_by_100 <- function(x) (x / 100)
raw_data <- raw_data %>% mutate_at(
c("most_certain", "fairly_certain", "not_certain", "don't_know", "certainly_not", "other"),
divide_by_100
)
# multiple relevant values by sample size
raw_data$"most_certain" <- round(raw_data$"most_certain" * raw_data$"sample_size")
raw_data$"fairly_certain" <- round(raw_data$"fairly_certain" * raw_data$"sample_size")
raw_data$"not_certain" <- round(raw_data$"not_certain" * raw_data$"sample_size")
raw_data$"don't_know" <- round(raw_data$"don't_know" * raw_data$"sample_size")
raw_data$"certainly_not" <- round(raw_data$"certainly_not" * raw_data$"sample_size")
raw_data$"other" <- round(raw_data$"other" * raw_data$"sample_size")
raw_data
## income_distribution most_certain fairly_certain not_certain don't_know
## 1 Less than $30000 5838 1769 442 88
## 2 $30000-$49999 3848 1125 237 59
## 3 $50000-$99999 5408 1657 436 44
## 4 $100000 or more 3711 1540 560 35
## certainly_not other sample_size
## 1 619 177 8845
## 2 533 118 5920
## 3 960 174 8723
## 4 980 210 7002
wide.raw_data <- raw_data
wide.raw_data$"yes" <- raw_data$"most_certain" + raw_data$"fairly_certain" + raw_data$"not_certain"
wide.raw_data$"no" <- raw_data$"certainly_not"
wide.raw_data$"maybe" <- raw_data$"don't_know" + raw_data$"other"
wide.raw_data
## income_distribution most_certain fairly_certain not_certain don't_know
## 1 Less than $30000 5838 1769 442 88
## 2 $30000-$49999 3848 1125 237 59
## 3 $50000-$99999 5408 1657 436 44
## 4 $100000 or more 3711 1540 560 35
## certainly_not other sample_size yes no maybe
## 1 619 177 8845 8049 619 265
## 2 533 118 5920 5210 533 177
## 3 960 174 8723 7501 960 218
## 4 980 210 7002 5811 980 245
# calculate average certainty
avg.raw_data <- raw_data %>%
select(income_distribution, most_certain, fairly_certain, not_certain, `don't_know`, certainly_not, other, sample_size) %>%
group_by(income_distribution) %>%
summarise(yes = mean(most_certain, fairly_certain, not_certain), maybe = mean(`don't_know`, other), no = mean(certainly_not))
avg.raw_data
## # A tibble: 4 x 4
## income_distribution yes maybe no
## <fct> <dbl> <dbl> <dbl>
## 1 $100000 or more 3711 35 980
## 2 $30000-$49999 3848 59 533
## 3 $50000-$99999 5408 44 960
## 4 Less than $30000 5838 88 619
# calculate aggregate certainty
agg.raw_data <- wide.raw_data %>%
select(income_distribution, yes, no, maybe) %>%
group_by(income_distribution) %>%
summarise(yes = sum(yes), maybe = sum(maybe), no = sum(no))
agg.raw_data
## # A tibble: 4 x 4
## income_distribution yes maybe no
## <fct> <dbl> <dbl> <dbl>
## 1 $100000 or more 5811 245 980
## 2 $30000-$49999 5210 177 533
## 3 $50000-$99999 7501 218 960
## 4 Less than $30000 8049 265 619
# melt aggregate certainty table
m.wide.raw_data <- melt(agg.raw_data)
## Using income_distribution as id variables
names(m.wide.raw_data) <- c("income_dist", "certainty", "count")
m.wide.raw_data
## income_dist certainty count
## 1 $100000 or more yes 5811
## 2 $30000-$49999 yes 5210
## 3 $50000-$99999 yes 7501
## 4 Less than $30000 yes 8049
## 5 $100000 or more maybe 245
## 6 $30000-$49999 maybe 177
## 7 $50000-$99999 maybe 218
## 8 Less than $30000 maybe 265
## 9 $100000 or more no 980
## 10 $30000-$49999 no 533
## 11 $50000-$99999 no 960
## 12 Less than $30000 no 619
# melt raw certainty table
m.raw_data <- melt(raw_data)
## Using income_distribution as id variables
names(m.raw_data) <- c("income_dist", "certainty", "count")
m.raw_data
## income_dist certainty count
## 1 Less than $30000 most_certain 5838
## 2 $30000-$49999 most_certain 3848
## 3 $50000-$99999 most_certain 5408
## 4 $100000 or more most_certain 3711
## 5 Less than $30000 fairly_certain 1769
## 6 $30000-$49999 fairly_certain 1125
## 7 $50000-$99999 fairly_certain 1657
## 8 $100000 or more fairly_certain 1540
## 9 Less than $30000 not_certain 442
## 10 $30000-$49999 not_certain 237
## 11 $50000-$99999 not_certain 436
## 12 $100000 or more not_certain 560
## 13 Less than $30000 don't_know 88
## 14 $30000-$49999 don't_know 59
## 15 $50000-$99999 don't_know 44
## 16 $100000 or more don't_know 35
## 17 Less than $30000 certainly_not 619
## 18 $30000-$49999 certainly_not 533
## 19 $50000-$99999 certainly_not 960
## 20 $100000 or more certainly_not 980
## 21 Less than $30000 other 177
## 22 $30000-$49999 other 118
## 23 $50000-$99999 other 174
## 24 $100000 or more other 210
## 25 Less than $30000 sample_size 8845
## 26 $30000-$49999 sample_size 5920
## 27 $50000-$99999 sample_size 8723
## 28 $100000 or more sample_size 7002
# calculate all certainty regardless of income
all.raw_data <- agg.raw_data %>%
select_if(is.numeric) %>%
summarise_all(funs(sum))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
all.raw_data
## # A tibble: 1 x 3
## yes maybe no
## <dbl> <dbl> <dbl>
## 1 26571 905 3092
ggplot(m.wide.raw_data, aes(fill=certainty, y=count, x=certainty)) +
ggtitle("simplified distribution of certainty") +
theme(plot.title = element_text(hjust = 0.5)) +
facet_grid(.~income_dist) +
geom_bar(position='dodge', stat="identity") +
xlab('certainty by income') +
ylab('count')
ggplot(m.raw_data, aes(fill=certainty, y=count, x=certainty)) +
ggtitle("specific distribution of certainty") +
theme(plot.title = element_text(hjust = 0.5)) +
facet_grid(.~income_dist) +
geom_bar(position='dodge', stat="identity") +
theme(axis.text.x = element_text(angle = 90)) + # rotate x-axis labels
xlab('certainty by income') +
ylab('count')