library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
Saving the URL into an object and then reading that object
# Address of the published Google Sheet, exported as CSV
class_data <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vRCGayKLOy-52gKmEoPOj3ZKnOQVtCiooSloiCr-i_ci27e4n1CMPL0Z9s6MeFX9oQuN9E-HCFJnWjD/pub?gid=1456715839&single=true&output=csv"
# Download the CSV and run clean_names() on it so that headers such as
# "First name" can be referred to as first_name; keep the result as "raw_data"
raw_data <- clean_names(read_csv(class_data))
## Rows: 138 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Timestamp, First name, Last name, Candy type, Box code
## dbl (6): Red, Green, Orange, Yellow, Blue, Brown
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# peek at the data: print raw_data to check the import and the cleaned column names
raw_data
Removing the timestamp, candy_type, and box_code columns
# Keep every column except the three that the analysis does not use
candy <- select(raw_data, -c(timestamp, candy_type, box_code))
# Show the first rows to confirm the columns are gone
head(candy)
Pivoting the data to be long
# Reshape to one row per person and color: the column names from red through
# brown move into "color" and their counts move into "candies"
candy_long <- candy |>
  pivot_longer(cols = red:brown, names_to = "color", values_to = "candies")
# Show the first rows of the long table
head(candy_long)
Finding the average number of candies for each color
# Average count per color (missing counts ignored), rounded to one decimal,
# with the color names capitalised for display
candy_avg <- candy_long |>
  group_by(color) |>
  summarize(avg_candies = round(mean(candies, na.rm = TRUE), 1)) |>
  mutate(color = str_to_title(color))
candy_avg
Building a bar chart of the average number of candies per color using ggplot
# Horizontal bar chart of the average number of candies per color
ggplot(
  candy_avg,
  aes(x = reorder(color, avg_candies), y = avg_candies) # colors ordered by their average
) +
  geom_col() + # one bar per color
  coord_flip() + # swap the axes so the bars run horizontally
  # write each average in white; hjust = 2 shifts the label back from the bar end
  geom_text(aes(label = avg_candies), hjust = 2, color = "white") +
  labs(
    title = "Average M&M Color Distribution",
    subtitle = str_wrap("Average number of candies for each M&M color in a package, from Reporting with Data students across Spring 2022-2023"),
    x = "M&M Colors",
    y = "Average Number of M&Ms",
    caption = "By Shezan Samanani"
  ) +
  theme_linedraw()
Exporting data for Datawrapper
# Save the averages table as a CSV file in the working directory
write_csv(candy_avg, "candy_avg.csv")
Practicing pivoting the data wider
# Spread the long table so each first/last name pair becomes its own column,
# leaving one row per color
pivot_wider(
  candy_long,
  names_from = c(first_name, last_name),
  values_from = candies
)
Using pivot_wider on my own
# Spread the long table back out so each color becomes its own column again
pivot_wider(candy_long, names_from = color, values_from = candies)
Finding who had the most and least candies in their bag
# Biggest bags: total the candies per person, keep totals of 58 or more
# (hard-coded cutoff), and list them from largest to smallest.
# summarize() removes only the last grouping level, so the result is still
# grouped by first_name.
candy_long |>
  group_by(first_name, last_name) |>
  summarize(total_candies = sum(candies)) |>
  filter(total_candies >= 58) |>
  arrange(desc(total_candies))
## `summarise()` has grouped output by 'first_name'. You can override using the
## `.groups` argument.
# Smallest bags: total the candies per person, keep totals of 53 or fewer
# (hard-coded cutoff), and list them from smallest to largest.
# summarize() removes only the last grouping level, so the result is still
# grouped by first_name.
candy_long |>
  group_by(first_name, last_name) |>
  summarize(total_candies = sum(candies)) |>
  filter(total_candies <= 53) |>
  arrange(total_candies)
## `summarise()` has grouped output by 'first_name'. You can override using the
## `.groups` argument.
Finding the average number of candies in each bag
# Total the candies in each person's bag, then summarise the totals; the
# "Mean" row under total_candies is the average number of candies per bag.
candy_long %>%
  group_by(first_name, last_name) %>%
  summarize(total_candies = sum(candies)) %>%
  # summary() on a data frame summarises every column and takes no column
  # argument: an extra one is matched to `maxsum`, and would fail with
  # "object not found" as soon as a factor column forced it to be evaluated.
  summary()
## `summarise()` has grouped output by 'first_name'. You can override using the
## `.groups` argument.
## first_name last_name total_candies
## Length:138 Length:138 Min. :51.00
## Class :character Class :character 1st Qu.:54.00
## Mode :character Mode :character Median :56.00
## Mean :55.94
## 3rd Qu.:57.00
## Max. :73.00
The average number of candies in each bag is 55.94 (the mean of total_candies in the summary above), which rounds to 56.