Loads the libraries
library (dplyr)
library(tidyr)
library(lubridate)
library(stringr)
library (ggplot2)
This data looks at manga books from the New York Times Best Seller list from 2010 to 2017. The data is transformed through a series of steps for further analysis.
# Reads the complete NYT best-seller CSV (header row, comma separated);
# empty fields are treated as NA. Then previews the first rows as a table.
nytbestsellers <- read.csv(
  file = "https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/nytbestsellers.csv",
  na.strings = ""
)
nytbestsellers %>%
  head() %>%
  knitr::kable()
published_date | list_name | list_name_encoded | rank | isbn13 | isbn10 | title | author | description | amazon_product_url | price | weeks_on_list |
---|---|---|---|---|---|---|---|---|---|---|---|
2010-01-03 | Chapter Books | chapter-books | 1 | 9780316036245 | 0316036242 | WITCH AND WIZARD | James Patterson and Gabrielle Charbonnet | One of each, brother and sister, flex their newfound powers. | https://www.amazon.com/Witch-Wizard-James-Patterson/dp/0446562432?tag=NYTBS-20 | 17.99 | 1 |
2010-01-03 | Chapter Books | chapter-books | 2 | 9780439023481 | 0439023483 | THE HUNGER GAMES | Suzanne Collins | In a dystopian future, a girl fights for survival on live TV. | https://www.amazon.com/The-Hunger-Games-Suzanne-Collins-ebook/dp/B002MQYOFW?tag=NYTBS-20 | 17.99 | 67 |
2010-01-03 | Chapter Books | chapter-books | 3 | 9780439023498 | 0439023491 | CATCHING FIRE | Suzanne Collins | The protagonist of “The Hunger Games” returns. | https://www.amazon.com/Catching-Fire-Hunger-Games-Book/dp/0439023491?tag=NYTBS-20 | 17.99 | 16 |
2010-01-03 | Chapter Books | chapter-books | 4 | 9780763644109 | 0763644102 | THE MAGICIAN’S ELEPHANT | Kate DiCamillo | An orphan in search of his sister follows a fortuneteller’s mysterious instructions. | https://www.amazon.com/The-Magicians-Elephant-Kate-DiCamillo/dp/0763652989?tag=NYTBS-20 | 16.99 | 15 |
2010-01-03 | Chapter Books | chapter-books | 5 | 9780385738934 | 0385738935 | FALLEN | Lauren Kate | Thwarted love among misfits at a Savannah, Ga., boarding school. | https://www.amazon.com/Fallen-Lauren-Kate/dp/0385739133?tag=NYTBS-20 | 17.99 | 2 |
2010-01-03 | Hardcover Advice | hardcover-advice | 1 | 9781904994503 | 1904994504 | GUINNESS WORLD RECORDS 2010 | edited Craig Glenday | Tallest, fastest, youngest, most. | https://www.amazon.com/Guinness-World-Records-2010-Decade/dp/1904994504?tag=NYTBS-20 | 28.95 | 14 |
# Keeps only the manga list and the subset of columns used downstream, which
# makes the large best-seller table more manageable.
nytmanga <- nytbestsellers %>%
  filter(list_name_encoded == "manga") %>%
  select(
    published_date, list_name_encoded, rank, title, author, price,
    weeks_on_list
  )

# Parses the character dates into Date values and adds pubfloordate, the first
# day of the publication year, so the data can be plotted per year.
nytmanga <- nytmanga %>%
  mutate(
    published_date = ymd(published_date),
    pubfloordate = floor_date(published_date, unit = "year")
  )

# Splits the title into the main title and the volume number.
# The period in "VOL." is escaped so it matches a literal dot rather than any
# character. Titles that do not follow the "<main title>, VOL. <n>" pattern
# keep their full title as main_title (instead of becoming NA) and get an NA
# volume number.
nytmanga <- nytmanga %>%
  mutate(
    main_title = coalesce(str_match(title, "(.*),\\sVOL\\.")[, 2], title),
    VolumeNo = str_match(title, "VOL\\.\\s{1,}(\\d+)")[, 2]
  )

# Splits the author column into a first and second author on " and ".
# fill = "right" pads single-author rows with NA without emitting a warning
# for every such row; the original author column is kept.
nytmanga <- nytmanga %>%
  separate(
    col = author,
    into = c("first_author", "second_author"),
    sep = " and ",
    remove = FALSE,
    fill = "right"
  )

knitr::kable(head(nytmanga))
published_date | list_name_encoded | rank | title | author | first_author | second_author | price | weeks_on_list | pubfloordate | main_title | VolumeNo |
---|---|---|---|---|---|---|---|---|---|---|---|
2010-01-03 | manga | 1 | NARUTO 46 | Masashi Kishimoto | Masashi Kishimoto | NA | 9.99 | 12 | 2010-01-01 | NARUTO 46 | NA |
2010-01-03 | manga | 2 | BLEACH, VOL. 29 | Tite Kubo | Tite Kubo | NA | 9.99 | 3 | 2010-01-01 | BLEACH | 29 |
2010-01-03 | manga | 3 | VAMPIRE KNIGHT, VOL. 8 | Matsuri Hino | Matsuri Hino | NA | 9.99 | 7 | 2010-01-01 | VAMPIRE KNIGHT | 8 |
2010-01-03 | manga | 4 | MAXIMUM RIDE, VOL. 2 | James Patterson and NaRae Lee | James Patterson | NaRae Lee | 10.99 | 8 | 2010-01-01 | MAXIMUM RIDE | 2 |
2010-01-03 | manga | 5 | MAXIMUM RIDE, VOL. 1 | James Patterson and NaRae Lee | James Patterson | NaRae Lee | 10.99 | 2 | 2010-01-01 | MAXIMUM RIDE | 1 |
2010-01-10 | manga | 1 | NARUTO 46 | Masashi Kishimoto | Masashi Kishimoto | NA | 9.99 | 13 | 2010-01-01 | NARUTO 46 | NA |
Between 2010 and 2017, the majority of manga books spent about 1.25 weeks on the NYT Best Seller list. The number of manga books on the best seller list for at least 1 week peaked in 2012.
# Bar chart counting manga entries by weeks on the list, with the x axis
# limited to 0-10 and one panel per publication-year floor date.
ggplot(data = nytmanga, mapping = aes(x = weeks_on_list)) +
  geom_bar() +
  scale_x_continuous(limits = c(0, 10)) +
  facet_wrap(vars(pubfloordate)) +
  labs(title = "Manga NYT Best Sellers Number of Weeks on List By Year 2010 - 2017")
This data looks at the global happiness score between 2015 and 2022. This information is spread across a number of CSV files (one per year) that required joining and transformation.
# Reads one happiness CSV per year (2015-2022) into its own data frame.
# Both empty strings and the literal "NA" are read as missing values.
happy2015 <- read.csv(
  file = "https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2015.csv",
  na.strings = c("", "NA")
)
happy2016 <- read.csv(
  file = "https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2016.csv",
  na.strings = c("", "NA")
)
happy2017 <- read.csv(
  file = "https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2017.csv",
  na.strings = c("", "NA")
)
happy2018 <- read.csv(
  file = "https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2018.csv",
  na.strings = c("", "NA")
)
happy2019 <- read.csv(
  file = "https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2019.csv",
  na.strings = c("", "NA")
)
happy2020 <- read.csv(
  file = "https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2020.csv",
  na.strings = c("", "NA")
)
happy2021 <- read.csv(
  file = "https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2021.csv",
  na.strings = c("", "NA")
)
happy2022 <- read.csv(
  file = "https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2022.csv",
  na.strings = c("", "NA")
)

# Shows the first rows of the 2015 file as an example of the yearly data.
happy2015 %>%
  head() %>%
  knitr::kable()
Country | Region | Happiness.Rank | Happiness.Score | Standard.Error | Economy..GDP.per.Capita. | Family | Health..Life.Expectancy. | Freedom | Trust..Government.Corruption. | Generosity | Dystopia.Residual |
---|---|---|---|---|---|---|---|---|---|---|---|
Switzerland | Western Europe | 1 | 7.587 | 0.03411 | 1.39651 | 1.34951 | 0.94143 | 0.66557 | 0.41978 | 0.29678 | 2.51738 |
Iceland | Western Europe | 2 | 7.561 | 0.04884 | 1.30232 | 1.40223 | 0.94784 | 0.62877 | 0.14145 | 0.43630 | 2.70201 |
Denmark | Western Europe | 3 | 7.527 | 0.03328 | 1.32548 | 1.36058 | 0.87464 | 0.64938 | 0.48357 | 0.34139 | 2.49204 |
Norway | Western Europe | 4 | 7.522 | 0.03880 | 1.45900 | 1.33095 | 0.88521 | 0.66973 | 0.36503 | 0.34699 | 2.46531 |
Canada | North America | 5 | 7.427 | 0.03553 | 1.32629 | 1.32261 | 0.90563 | 0.63297 | 0.32957 | 0.45811 | 2.45176 |
Finland | Western Europe | 6 | 7.406 | 0.03140 | 1.29025 | 1.31826 | 0.88911 | 0.64169 | 0.41372 | 0.23351 | 2.61955 |
# Builds one wide table with a happiness score column per year, keyed by
# country. Each yearly file is first reduced to its country and score columns
# (renamed to Country and Happiness.Score.<year>) and then full-joined, so a
# country missing from some years is kept with NA for those years.
globalhappy <- happy2015 %>%
  select(Country, Happiness.Score.2015 = Happiness.Score) %>%
  # 2016 and 2017 use the same column names as 2015.
  full_join(
    select(happy2016, Country, Happiness.Score.2016 = Happiness.Score),
    by = "Country"
  ) %>%
  full_join(
    select(happy2017, Country, Happiness.Score.2017 = Happiness.Score),
    by = "Country"
  ) %>%
  # 2018 and 2019 name the columns Country.or.region and Score.
  full_join(
    select(happy2018, Country = Country.or.region, Happiness.Score.2018 = Score),
    by = "Country"
  ) %>%
  full_join(
    select(happy2019, Country = Country.or.region, Happiness.Score.2019 = Score),
    by = "Country"
  ) %>%
  # 2020 and 2021 name the columns Country.name and Ladder.score.
  full_join(
    select(happy2020, Country = Country.name, Happiness.Score.2020 = Ladder.score),
    by = "Country"
  ) %>%
  full_join(
    select(happy2021, Country = Country.name, Happiness.Score.2021 = Ladder.score),
    by = "Country"
  ) %>%
  # 2022 names the columns Country and Happiness.score (lower-case "s").
  full_join(
    select(happy2022, Country, Happiness.Score.2022 = Happiness.score),
    by = "Country"
  )
# In 2022 the happiness score is written with a comma instead of a decimal
# point. The comma is replaced by a decimal point before converting to numeric,
# which puts the values on the same scale as the previous years regardless of
# how many digits follow the comma (stripping the comma and dividing by 1000
# is only correct for exactly three decimal digits).
globalhappy <- globalhappy %>%
  mutate(
    Happiness.Score.2022 = as.numeric(
      sub(",", ".", Happiness.Score.2022, fixed = TRUE)
    )
  )
# Adds AvgHappiness, the row-wise mean of the eight yearly score columns with
# missing years ignored. A row with no score at all yields NaN, which is.na()
# also flags, so such rows are dropped. Then previews the result.
globalhappy <- globalhappy %>%
  mutate(
    AvgHappiness = rowMeans(
      globalhappy[paste0("Happiness.Score.", 2015:2022)],
      na.rm = TRUE
    )
  ) %>%
  filter(!is.na(AvgHappiness))
globalhappy %>%
  head() %>%
  knitr::kable()
Country | Happiness.Score.2015 | Happiness.Score.2016 | Happiness.Score.2017 | Happiness.Score.2018 | Happiness.Score.2019 | Happiness.Score.2020 | Happiness.Score.2021 | Happiness.Score.2022 | AvgHappiness |
---|---|---|---|---|---|---|---|---|---|
Switzerland | 7.587 | 7.509 | 7.494 | 7.487 | 7.480 | 7.5599 | 7.571 | 7.512 | 7.524987 |
Iceland | 7.561 | 7.501 | 7.504 | 7.495 | 7.494 | 7.5045 | 7.554 | 7.557 | 7.521312 |
Denmark | 7.527 | 7.526 | 7.522 | 7.555 | 7.600 | 7.6456 | 7.620 | 7.636 | 7.578950 |
Norway | 7.522 | 7.498 | 7.537 | 7.594 | 7.554 | 7.4880 | 7.392 | 7.365 | 7.493750 |
Canada | 7.427 | 7.404 | 7.316 | 7.328 | 7.278 | 7.2321 | 7.103 | 7.025 | 7.264138 |
Finland | 7.406 | 7.413 | 7.469 | 7.632 | 7.769 | 7.8087 | 7.842 | 7.821 | 7.645087 |
Average global happiness scores between 2015 and 2022 appear to be normally distributed. To confirm this, a density plot is used to check that the data falls below the normal curve, and a QQ plot is used to check that the points fall near the line.
# Mean and standard deviation of the per-country average happiness score;
# both are printed and later used as the parameters of the normal curve.
happymean <- with(globalhappy, mean(AvgHappiness))
happysd <- with(globalhappy, sd(AvgHappiness))
print(happymean)
## [1] 5.366749
print(happysd)
## [1] 1.079698
# Density distribution plot for the average happiness score: a histogram on
# the density scale with a normal curve (mean = happymean, sd = happysd)
# overlaid for comparison.
ggplot(globalhappy, aes(x = AvgHappiness)) +
  geom_blank() +
  # after_stat(density) replaces the deprecated ..density.. notation.
  geom_histogram(aes(y = after_stat(density))) +
  # stat_function() takes the extra arguments for `fun` as a list.
  stat_function(
    fun = dnorm,
    args = list(mean = happymean, sd = happysd),
    colour = "tomato"
  )
# QQ plot to evaluate whether the average happiness scores are normally
# distributed: sample quantiles are drawn as points and geom_qq_line() adds
# the reference line they should follow under normality.
ggplot(globalhappy, aes(sample = AvgHappiness)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("Global Average Happiness QQ Distribution Line")