First I open the packages I need to use for data manipulation and visualization.
library(tidyverse)
library(ggpubr)
Now I get the data and give it a nice short name (dt)
dt = read_csv('dataviz-01.csv')
dt # look at the data
## # A tibble: 50,000 x 14
## userid eventType amountDonated timestamp category gender age
## <chr> <chr> <dbl> <dttm> <chr> <chr> <chr>
## 1 69f62~ viewedPr~ NA 2019-10-01 00:00:04 sports M 18-24
## 2 4459d~ viewedPr~ NA 2019-10-01 00:00:22 technol~ M 18-24
## 3 0db9e~ viewedPr~ NA 2019-10-01 00:00:32 environ~ M 55+
## 4 68195~ viewedPr~ NA 2019-10-01 00:00:38 technol~ M 18-24
## 5 9508a~ viewedPr~ NA 2019-10-01 00:00:51 sports F 25-34
## 6 d420e~ viewedPr~ NA 2019-10-01 00:01:04 fashion F 45-54
## 7 0e005~ viewedPr~ NA 2019-10-01 00:02:17 games F 35-44
## 8 c6f18~ viewedPr~ NA 2019-10-01 00:02:18 technol~ F 35-44
## 9 2eb99~ viewedPr~ NA 2019-10-01 00:02:25 fashion F 45-54
## 10 e040f~ viewedPr~ NA 2019-10-01 00:02:39 technol~ F 45-54
## # ... with 49,990 more rows, and 7 more variables: maritalStatus <chr>,
## # device <chr>, city <chr>, state <chr>, latitude <dbl>, longitude <dbl>,
## # zipCode <dbl>
First I want to do some basic exploratory analysis…
How many unique people are in the data?
dt %>%
count(userid) %>%
arrange(-n) %>%
mutate(id=1:n()) %>%
ggplot() +
aes(id,n) +
geom_col() +
scale_y_continuous(breaks=0:25)
What categories of projects do people contribute funds toward?
dt %>%
count(category)
## # A tibble: 5 x 2
## category n
## <chr> <int>
## 1 environment 9980
## 2 fashion 9937
## 3 games 9955
## 4 sports 10099
## 5 technology 10029
It looks like we have 5 categories: environment, fashion, games, sports, and tech. And each of the categories seems to get plenty of love.
Now, it might also be useful to look at the people that either only viewed a project vs. those who funded.
dt %>%
count(category, eventType)
## # A tibble: 10 x 3
## category eventType n
## <chr> <chr> <int>
## 1 environment fundedProject 4089
## 2 environment viewedProject 5891
## 3 fashion fundedProject 4073
## 4 fashion viewedProject 5864
## 5 games fundedProject 4173
## 6 games viewedProject 5782
## 7 sports fundedProject 4179
## 8 sports viewedProject 5920
## 9 technology fundedProject 4144
## 10 technology viewedProject 5885
So just over half the people who view projects don’t fund them.
When they do donate, how much do they contribute? Below I look at the distribution of payments by category.
dt %>%
filter(eventType == 'fundedProject') %>%
ggplot() +
aes(amountDonated) +
geom_histogram() +
facet_wrap(~category)
Now for the Big Show. Let’s take a look at who is most likely to contribute to Dennis’ bike project. I do this by grouping the data by category, gender, age, marital status, type of device used (iOS or Android), and location (city, state).
I then compute some summary statistics of money donated to be computed by groups (mean, SD, min., max., N). I restrict the data to people who contribute funds. I then narrow the data down do people who contribute to environmental projects, sports projects, and and tech. projects. Why? Because I think these are the people who would be most interested in helping Dennis build his eco-friendly bike.
dt %>%
filter(
amountDonated>0
) %>%
group_by(
category, gender, age, maritalStatus,
device, city, state
) %>%
summarize(
total = sum(amountDonated),
mean = mean(amountDonated),
sd = sd(amountDonated),
min = min(amountDonated),
max = max(amountDonated),
n = n()
) %>%
arrange(-total) %>%
filter(
category %in% c('sports','environment','technology')
) -> top_funders
top_funders
## # A tibble: 6,618 x 13
## # Groups: category, gender, age, maritalStatus, device, city [6,591]
## category gender age maritalStatus device city state total mean sd
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 environ~ F 18-24 married iOS Atla~ GA 5214 40.4 14.9
## 2 sports F 18-24 married iOS Atla~ GA 4958 39.0 14.8
## 3 technol~ M 18-24 married iOS Atla~ GA 4732 40.1 14.7
## 4 sports M 18-24 married iOS Atla~ GA 4578 40.9 15.9
## 5 technol~ F 18-24 married iOS Atla~ GA 4076 37.1 14.4
## 6 environ~ M 18-24 married iOS Atla~ GA 3346 38.9 16.5
## 7 environ~ F 18-24 married iOS Lake~ OR 2605 40.7 16.5
## 8 sports F 18-24 married iOS Lake~ OR 2523 44.3 13.3
## 9 environ~ F 18-24 single andro~ Atla~ GA 2522 39.4 13.6
## 10 technol~ F 18-24 married iOS Lake~ OR 2363 39.4 14.1
## # ... with 6,608 more rows, and 3 more variables: min <dbl>, max <dbl>, n <int>
Now, that’s a pretty big table. The below figure synthesizes the results, reporting the most relevant information for who is most likely to contribute funds to Dennis’ project.
extrafont::loadfonts(quiet=T,device='win') # for more fonts
top_funders %>%
group_by(
category
) %>%
summarize(
total = sum(total)
) %>%
mutate(
highlight = ifelse(total==max(total), "yes", "no" )
) %>%
ggplot() +
aes(reorder(category,total),total,
fill = highlight) +
geom_col(width = .5) +
labs(x='',y='Total $',
title = 'Category') +
scale_y_continuous(
labels=scales::comma_format()
) +
scale_fill_manual(
values=c("yes"="tomato", "no"="gray"),
guide = F
) +
theme_bw() +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
text = element_text(family = 'SimSun-ExtB'),
axis.ticks.y = element_blank(),
axis.text = element_text(color='black')
) +
coord_flip() -> p1
top_funders %>%
group_by(
gender
) %>%
summarize(
total = sum(total)
) %>%
mutate(
highlight = ifelse(total==max(total), "yes", "no" )
) %>%
mutate(
gender = c('Female','Male','Unknown')
) %>%
ggplot() +
aes(reorder(gender,total),total,fill=highlight) +
geom_col(width = .5) +
labs(x='',y='Total $',
title = 'Gender') +
scale_y_continuous(
labels=scales::comma_format()
) +
scale_fill_manual(
values=c("yes"="tomato", "no"="gray"),
guide = F
) +
theme_bw() +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
text = element_text(family = 'SimSun-ExtB'),
axis.ticks.y = element_blank(),
axis.text = element_text(color='black')
) +
coord_flip() -> p2
top_funders %>%
group_by(
age
) %>%
summarize(
total = sum(total)
) %>%
mutate(
highlight = ifelse(total==max(total), "yes", "no" )
) %>%
ggplot() +
aes(reorder(age,total),total,fill=highlight) +
geom_col(width = .5) +
labs(x='',y='Total $',
title = 'Age') +
scale_y_continuous(
labels=scales::comma_format()
) +
scale_fill_manual(
values=c("yes"="tomato", "no"="gray"),
guide = F
) +
theme_bw() +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
text = element_text(family = 'SimSun-ExtB'),
axis.ticks.y = element_blank(),
axis.text = element_text(color='black')
) +
coord_flip() -> p3
top_funders %>%
group_by(
device
) %>%
summarize(
total = sum(total)
) %>%
mutate(
highlight = ifelse(total==max(total), "yes", "no" )
) %>%
ggplot() +
aes(reorder(device,total),total,fill=highlight) +
geom_col(width = .5) +
labs(x='',y='Total $',
title = 'Device') +
scale_y_continuous(
labels=scales::comma_format()
) +
scale_fill_manual(
values=c("yes"="tomato", "no"="gray"),
guide = F
) +
theme_bw() +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
text = element_text(family = 'SimSun-ExtB'),
axis.ticks.y = element_blank(),
axis.text = element_text(color='black')
) +
coord_flip() -> p4
top_funders %>%
mutate(
location = paste(city,state,sep=', ')
) %>%
group_by(
location
) %>%
summarize(
total = sum(total)
) %>%
arrange(-total) %>%
.[1:10,] %>%
mutate(
highlight = ifelse(total==max(total), "yes", "no" )
) %>%
ggplot() +
aes(reorder(location,total),total,fill=highlight) +
geom_col(width = .5) +
labs(x='',y='Total $',
title = 'Top 10 Locations') +
scale_y_continuous(
labels=scales::comma_format()
) +
scale_fill_manual(
values=c("yes"="tomato", "no"="gray"),
guide = F
) +
theme_bw() +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
text = element_text(family = 'SimSun-ExtB'),
axis.ticks.y = element_blank(),
axis.text = element_text(color='black')
) +
coord_flip() -> p5
gridExtra::grid.arrange(
p1,p2,p3,p4,p5,
layout_matrix = rbind(
c(1,1,2,2),
c(5,5,3,3),
c(5,5,4,4)
)
) -> full_plot
That doesn’t look too bad. However, I’d like to add a title, and save the output.
annotate_figure(
full_plot,
top = text_grob(
"Who will be interested in Dennis' bicycle project?",
family = "SimSun-ExtB",
size = 18
)
) -> full_plot
ggsave(
plot = full_plot,
'plot1.png',
units = 'in',
height = 7,
width = 10
)
full_plot
Done! So it looks like the best people to target are people women, age 18-24, who like donating to sports projects, who live in Atlanta, GA, and who use an iOS device.