## -- Attaching packages -------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:reshape':
##
## rename, round_any
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
For this assignment, I will create an example: using one or more TidyVerse packages and any dataset from fivethirtyeight.com or Kaggle, I will build a programming sample “vignette” that demonstrates how to use one or more of the capabilities of the selected TidyVerse package with my selected dataset.
In light of the current situation across the world, I chose the Bob Ross “elements by episode” dataset.
# Location of the FiveThirtyEight "elements-by-episode" CSV for Bob Ross.
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/bob-ross/elements-by-episode.csv"

# Download and parse the CSV; column types are guessed by readr.
BobRoss <- readr::read_csv(file = url)
## Parsed with column specification:
## cols(
## .default = col_double(),
## EPISODE = col_character(),
## TITLE = col_character()
## )
## See spec(...) for full column specifications.
# Reshape the episode table loaded above from wide to long form: EPISODE and
# TITLE identify each row, every other column becomes a (variable, value) pair.
# The data is passed as a plain data.frame because read_csv() returns a tibble
# and reshape::melt() is written against base data.frame subsetting.
Bulkbobr <- reshape::melt(
  as.data.frame(BobRoss),
  id.vars = c("EPISODE", "TITLE")
)

# Keep only the rows flagged with value == 1, drop the flag column itself,
# and remove any duplicate episode/element rows.
Bob <- unique(
  subset(Bulkbobr, value == 1, select = c("EPISODE", "TITLE", "variable"))
)

# Sort by episode code so each episode's elements are listed together.
Bob <- Bob[order(Bob$EPISODE), ]

# rename() is masked by dplyr, reshape and plyr; the named-vector form used
# here is the plyr interface, so call it explicitly.
Bob <- plyr::rename(Bob, c("variable" = "object"))

head(Bob, 10)
## EPISODE TITLE object
## 2822 S01E01 "A WALK IN THE WOODS" BUSHES
## 6449 S01E01 "A WALK IN THE WOODS" DECIDUOUS
## 10882 S01E01 "A WALK IN THE WOODS" GRASS
## 19345 S01E01 "A WALK IN THE WOODS" RIVER
## 23375 S01E01 "A WALK IN THE WOODS" TREE
## 23778 S01E01 "A WALK IN THE WOODS" TREES
## 3226 S01E02 "MT. MCKINLEY" CABIN
## 5241 S01E02 "MT. MCKINLEY" CLOUDS
## 5644 S01E02 "MT. MCKINLEY" CONIFER
## 14913 S01E02 "MT. MCKINLEY" MOUNTAIN
# Take the first 10 rows of Bob2 as the data to plot.
# NOTE: arrange() is given no sort columns here, so the rows keep whatever
# order Bob2 already has before head() truncates them.
top10 <- Bob2 %>%
  dplyr::arrange() %>%
  head(n = 10)

# Bar chart of object frequency; bars are ordered by descending freq and the
# x-axis labels are rotated so the object names stay readable.
ggplot2::ggplot(
  data = top10,
  mapping = ggplot2::aes(x = reorder(object, -freq), y = freq)
) +
  ggplot2::geom_col(fill = "steelblue") +
  ggplot2::labs(
    title = "Most used objects in Bob Ross Paintings",
    x = "Objects",
    y = "Frequency"
  ) +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust = 1)
  )
My colleague Nilsa has done a great job providing us with the total frequency of each object across all seasons of Bob Ross’s show. As an extension, I will break down our analysis by season, in hopes of spotting trends in subject matter over the years.
We will begin by using the str_split function in the stringr package. This allows us to split the Episode column. We actually are not concerned with the episode number, so we could simply pull the first 3 characters (or more accurately, characters 2 and 3) from each entry in the episode column. For the purposes of this exercise, however, we will use this function.
After breaking the column into 2 new columns, we use cbind to join this new dataset into our “Bob” dataframe.
From here, we can use the group_by and summarize functions to calculate how often each object is used in each season.
I was unaware that this show had run for 31 seasons. For the purposes of this exercise, I elected to focus on objects that appeared more than 5 times in a season, and to visualize changes over time by looking at 3 consecutive seasons out of every 10 (seasons 1, 2, 3, 11, 12, 13, 21, 22, 23).
# Keep only the season/object rows whose count is greater than 5.
BobF2 <- BobF %>%
  dplyr::filter(count > 5)

# Restrict to three consecutive seasons per decade of seasons. The column
# named `1` holds the season code (e.g. "S01"); presumably it is the first
# piece of the split EPISODE value -- confirm against the chunk that builds BobF.
BobF3 <- BobF2 %>%
  dplyr::filter(
    `1` %in% c("S01", "S02", "S03", "S11", "S12", "S13", "S21", "S22", "S23")
  )

# One bar-chart panel per selected season. Bars are ordered by descending
# count (reorder() ranks objects across all panels, not within each panel).
# The title lists the seasons actually shown rather than "1-9".
ggplot2::ggplot(
  BobF3,
  ggplot2::aes(x = reorder(object, -count), y = count)
) +
  ggplot2::facet_wrap(~`1`) +
  ggplot2::geom_bar(stat = "identity", fill = "steelblue") +
  ggplot2::labs(x = "Objects", y = "Frequency") +
  ggplot2::ggtitle(
    "Most used objects in Bob Ross Paintings, by Season (1-3, 11-13, 21-23)"
  ) +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust = 1)
  )
The first thing that stands out is how “Tree” and “Trees” are noticeably absent from my colleague’s work. After studying her analysis, I found the mistake: she did not specify a field when using the arrange function to sort for the top 10 subjects. As a result, she selected the first 10 objects alphabetically and then ordered those 10 by count. I will fix this below.
Looking over my plots above, and assuming each season is the same length (which may or may not be accurate), we can see that there seems to be a more diverse set of recurring themes in Seasons 1-3 and 11-13. I would imagine that the later seasons have a more diverse set of themes overall, so it is less probable that any given theme would appear more than 5 times in a season.
# Sort Bob2 by descending frequency first, then keep the 10 most frequent
# objects.
top10CB <- Bob2 %>%
  dplyr::arrange(desc(freq)) %>%
  head(n = 10)

# Bar chart of the 10 most frequent objects, tallest bar first, with the
# x-axis labels rotated so the object names stay readable.
ggplot2::ggplot(
  data = top10CB,
  mapping = ggplot2::aes(x = reorder(object, -freq), y = freq)
) +
  ggplot2::geom_col(fill = "steelblue") +
  ggplot2::labs(
    title = "Most used objects in Bob Ross Paintings",
    x = "Objects",
    y = "Frequency"
  ) +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust = 1)
  )