#Student participation:
# Keydy Sanchez
# Vanessa Wasveiler
#Load your libraries before getting started
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(lemon)
##
## Attaching package: 'lemon'
## The following object is masked from 'package:purrr':
##
## %||%
## The following objects are masked from 'package:ggplot2':
##
## CoordCartesian, element_render
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
# Read the cereal data set from the working directory; readr guesses the
# column types and reports them in the message echoed below.
cereal <- readr::read_csv(file = "cereal.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## mfr = col_character(),
## type = col_character(),
## calories = col_double(),
## protein = col_double(),
## fat = col_double(),
## sodium = col_double(),
## fiber = col_double(),
## carbo = col_double(),
## sugars = col_double(),
## potass = col_double(),
## vitamins = col_double(),
## shelf = col_double(),
## weight = col_double(),
## cups = col_double(),
## rating = col_double()
## )
# Open the imported table in the data viewer for a quick visual check.
tibble::view(cereal)
For this part, you need to make at least 4 graphs. For each graph, briefly describe in 1-2 sentences what is seen. You do not need to use all of the verbs for your actual graphs, but you need to produce outputs that include all of the verbs below (this could just be a print() statement after a pipe where you use the dplyr verb). You can have statements that combine the verbs together (which will likely happen).
select() starts_with() filter() arrange() distinct() slice() mutate() summarise()
# Bar chart of fat against sodium, restricted to cereals with fat >= 1.
# Rows are sorted by sodium first (demonstrates arrange()).
cereal %>%
  arrange(sodium) %>%
  filter(fat >= 1) %>%
  ggplot(mapping = aes(x = sodium, y = fat)) +
  geom_col() + # bar heights come straight from the fat column
  scale_y_continuous(labels = number_format()) +
  labs(x = "sodium", y = "fat")
#The graph above shows the relationship between sodium and fat. It is very noticeable that the cereal with the most fat also happens to be the one with more sodium.
# Line chart of vitamins against sugars.
# distinct() keeps only the first row for each vitamins value (after sorting
# vitamins in descending order), so the line connects one point per value.
cereal %>%
  select(vitamins, sugars) %>%
  arrange(desc(vitamins)) %>%
  distinct(vitamins, .keep_all = TRUE) %>%
  ggplot(mapping = aes(x = sugars, y = vitamins)) +
  geom_line() +
  scale_y_continuous(labels = number_format()) +
  labs(x = "sugars", y = "vitamins")
#The graph above shows the relationship between cereal vitamins and sugars. It is very noticeable that the cereals with higher amounts of vitamins have less sugar.
# Scatter plot of rating against protein for the 20 cereals with the most
# protein.
cereal %>%
  # slice() picks rows by position only, so the data must be ranked by protein
  # first; otherwise slice(1:20) would just return the first 20 rows of the file.
  # Ties in protein keep their original file order.
  arrange(desc(protein)) %>%
  slice(1:20) %>%
  ggplot(aes(x = protein, y = rating, color = protein)) +
  geom_point() +
  scale_x_continuous(labels = number_format()) +
  scale_y_continuous(labels = number_format()) +
  labs(x = "Protein", y = "Rating")
#The graph above shows the relationship between protein and rating; the cereals with higher ratings are also the cereals with higher protein amounts.
# Horizontal bar chart of how many cereals share each protein value, with the
# bars ordered by that count.
cereal %>%
  # Convert to a factor BEFORE grouping: changing a grouping column inside a
  # grouped frame builds the factor one group at a time and is rejected by
  # older dplyr versions.
  mutate(protein = as.factor(protein)) %>%
  group_by(protein) %>%
  # .groups = "drop" returns an ungrouped table and silences the
  # "summarise() ungrouping output" message.
  summarise(freq = n(), .groups = "drop") %>%
  ggplot(aes(x = reorder(protein, freq), y = freq, fill = protein)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Protein", y = "Frequency") +
  coord_flip()
#The graph above shows the amount of Proteins and the frequency in which those proteins show in the cereal.
# Dot plot of calories for (up to) the first 20 cereals whose name starts
# with "C".
cereal %>%
  # Row filter on the cereal name. starts_with() cannot do this: it is a
  # column-selection helper and matches column NAMES, not values.
  filter(startsWith(name, "C")) %>%
  head(20) %>%
  # Keep the name plus every column whose name begins with "c"
  # (starts_with() ignores case by default); calories is the one plotted.
  select(name, starts_with("c")) %>%
  ggplot(aes(x = calories, y = name)) +
  geom_point()
#The graph above shows the first 20 names that start with a C and the amount of calories for each.
Refer to the notes that show how to use the mdy functions and ymd functions. I provide the code on how to do this.
# Example dates in formats that are not yet Date objects.
# Plain ASCII double quotes are required: typographic quotes are not valid
# string delimiters in R and make the script fail to parse.
wrong_format_date1 <- "01-25-1999"
wrong_format_date2 <- 25012005 # stored as a number, not a string
wrong_format_date3 <- "2005-05-31"
# Three date strings, each written with a different field order.
random_string_date_1 <- "01-25-1999" # month-day-year
random_string_date_2 <- "25012005" # day-month-year, no separators
random_string_date_3 <- "2005-05-31" # year-month-day

# Before parsing, all three are plain character strings.
class(random_string_date_1)
## [1] "character"
class(random_string_date_2)
## [1] "character"
class(random_string_date_3)
## [1] "character"

# Parse each string with the lubridate helper that matches its field order.
random_date_1 <- lubridate::mdy(random_string_date_1)
random_date_2 <- lubridate::dmy(random_string_date_2)
random_date_3 <- lubridate::ymd(random_string_date_3)

# After parsing, all three are Date objects.
class(random_date_1)
## [1] "Date"
class(random_date_2)
## [1] "Date"
class(random_date_3)
## [1] "Date"

# Dates print in ISO year-month-day form regardless of the input layout.
print(c(random_date_1, random_date_2, random_date_3))
## [1] "1999-01-25" "2005-01-25" "2005-05-31"
#Manufacturer of Cereal
# Expand the one-letter manufacturer and type codes into readable names.
# recode() matches each WHOLE value exactly and leaves unlisted values as they
# are. Chained single-letter str_replace() calls are unsafe here because later
# patterns also hit text inserted by earlier ones (the "P" step rewrote
# "Ralston Purina" as "Ralston Posturina").
cereal_new <- cereal %>%
  mutate(
    mfr = recode(
      mfr,
      G = "General Mills",
      N = "Nabisco",
      Q = "Quaker Oats",
      K = "Kelloggs",
      R = "Ralston Purina",
      P = "Post",
      A = "American Home Food Products"
    ),
    type = recode(type, C = "Cold", H = "Hot")
  )

# Inspect the recoded table; kept separate so the assignment above does not
# depend on what the viewer returns.
view(cereal_new)