Prepare Environment

Loads the libraries

library (dplyr)
library(tidyr)
library(lubridate)
library(stringr)
library (ggplot2)

NY Times Bestsellers for Manga Books

This data looks at manga books from the New York Times Best Seller list from 2010 to 2017. The data is transformed through a series of steps for further analysis.

# Load the CSV of all NYT bestsellers from GitHub; empty fields are read as NA.
bestsellers_url <- "https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/nytbestsellers.csv"
nytbestsellers <- read.csv(
  file = bestsellers_url,
  header = TRUE,
  sep = ",",
  fill = TRUE,
  na.strings = ""
)

# Preview the first rows as a formatted table.
nytbestsellers %>%
  head() %>%
  knitr::kable()
published_date list_name list_name_encoded rank isbn13 isbn10 title author description amazon_product_url price weeks_on_list
2010-01-03 Chapter Books chapter-books 1 9780316036245 0316036242 WITCH AND WIZARD James Patterson and Gabrielle Charbonnet One of each, brother and sister, flex their newfound powers. https://www.amazon.com/Witch-Wizard-James-Patterson/dp/0446562432?tag=NYTBS-20 17.99 1
2010-01-03 Chapter Books chapter-books 2 9780439023481 0439023483 THE HUNGER GAMES Suzanne Collins In a dystopian future, a girl fights for survival on live TV. https://www.amazon.com/The-Hunger-Games-Suzanne-Collins-ebook/dp/B002MQYOFW?tag=NYTBS-20 17.99 67
2010-01-03 Chapter Books chapter-books 3 9780439023498 0439023491 CATCHING FIRE Suzanne Collins The protagonist of “The Hunger Games” returns. https://www.amazon.com/Catching-Fire-Hunger-Games-Book/dp/0439023491?tag=NYTBS-20 17.99 16
2010-01-03 Chapter Books chapter-books 4 9780763644109 0763644102 THE MAGICIAN’S ELEPHANT Kate DiCamillo An orphan in search of his sister follows a fortuneteller’s mysterious instructions. https://www.amazon.com/The-Magicians-Elephant-Kate-DiCamillo/dp/0763652989?tag=NYTBS-20 16.99 15
2010-01-03 Chapter Books chapter-books 5 9780385738934 0385738935 FALLEN Lauren Kate Thwarted love among misfits at a Savannah, Ga., boarding school. https://www.amazon.com/Fallen-Lauren-Kate/dp/0385739133?tag=NYTBS-20 17.99 2
2010-01-03 Hardcover Advice hardcover-advice 1 9781904994503 1904994504 GUINNESS WORLD RECORDS 2010 edited Craig Glenday Tallest, fastest, youngest, most. https://www.amazon.com/Guinness-World-Records-2010-Decade/dp/1904994504?tag=NYTBS-20 28.95 14
# Keep only the rows of the manga list and the subset of columns used in the
# rest of the analysis, so the large bestsellers table becomes manageable.
manga_columns <- c(
  "published_date", "list_name_encoded", "rank", "title",
  "author", "price", "weeks_on_list"
)
manga_rows <- filter(nytbestsellers, list_name_encoded == "manga")
nytmanga <- select(manga_rows, all_of(manga_columns))


# Parse the character dates into real Date values, then add a second column
# holding each date floored to the start of its year so the data can be
# plotted per year.
nytmanga <- nytmanga %>%
  mutate(published_date = ymd(published_date)) %>%
  mutate(pubfloordate = floor_date(published_date, unit = "year"))


# Splits the book title into a main title and a volume number.
#
# main_title: the text before ", VOL." when the title follows that pattern
#             (e.g. "BLEACH, VOL. 29" -> "BLEACH"); otherwise the full title.
# VolumeNo:   the digits following "VOL." (kept as character); NA when the
#             title carries no "VOL." marker (e.g. "NARUTO 46").
#
# The dot in "VOL." is escaped so that it matches a literal period rather than
# any character, and coalesce() falls back to the original title whenever the
# ", VOL." pattern does not match, so main_title is never NA for a non-NA title.
nytmanga <- nytmanga %>%
  mutate(
    main_title = coalesce(str_match(title, "(.*),\\sVOL\\.")[, 2], title),
    VolumeNo = str_match(title, "VOL\\.\\s{1,}(\\d+)")[, 2]
  )

# Splits the author column into a first and second author if one was noted.
# The original author column is kept (remove = FALSE).
#   fill = "right":  single-author rows get NA in second_author without the
#                    "missing pieces" warning raised for every such row.
#   extra = "merge": if more than two names are joined by " and ", the
#                    remaining names stay in second_author instead of being
#                    silently dropped.
nytmanga <- nytmanga %>%
  separate(
    col = author,
    into = c("first_author", "second_author"),
    sep = " and ",
    remove = FALSE,
    extra = "merge",
    fill = "right"
  )

# Preview the transformed manga table.
knitr::kable(head(nytmanga))
published_date list_name_encoded rank title author first_author second_author price weeks_on_list pubfloordate main_title VolumeNo
2010-01-03 manga 1 NARUTO 46 Masashi Kishimoto Masashi Kishimoto NA 9.99 12 2010-01-01 NARUTO 46 NA
2010-01-03 manga 2 BLEACH, VOL. 29 Tite Kubo Tite Kubo NA 9.99 3 2010-01-01 BLEACH 29
2010-01-03 manga 3 VAMPIRE KNIGHT, VOL. 8 Matsuri Hino Matsuri Hino NA 9.99 7 2010-01-01 VAMPIRE KNIGHT 8
2010-01-03 manga 4 MAXIMUM RIDE, VOL. 2 James Patterson and NaRae Lee James Patterson NaRae Lee 10.99 8 2010-01-01 MAXIMUM RIDE 2
2010-01-03 manga 5 MAXIMUM RIDE, VOL. 1 James Patterson and NaRae Lee James Patterson NaRae Lee 10.99 2 2010-01-01 MAXIMUM RIDE 1
2010-01-10 manga 1 NARUTO 46 Masashi Kishimoto Masashi Kishimoto NA 9.99 13 2010-01-01 NARUTO 46 NA

Conclusion - Manga Best Sellers

Between 2010 and 2017, the majority of manga books spent about 1.25 weeks on the NYT Best Seller list. The number of manga books on the best seller list for at least 1 week peaked in 2012.

# Bar chart of the weeks-on-list counts, restricted to the 0-10 range on the
# x axis, with one panel per floored publication year.
manga_weeks_plot <- ggplot(data = nytmanga, mapping = aes(x = weeks_on_list)) +
  geom_bar() +
  scale_x_continuous(limits = c(0, 10)) +
  facet_wrap(~pubfloordate) +
  ggtitle("Manga NYT Best Sellers Number of Weeks on List By Year 2010 - 2017")
manga_weeks_plot

Global Happiness Scores 2015-2022

This data looks at the global happiness score between 2015 and 2022. This information is available in a number of CSV files (one per year) that require joining and transformation.

# Reads one yearly happiness CSV, treating empty strings and "NA" as missing.
read_happiness_csv <- function(csv_url) {
  read.csv(csv_url, na.strings = c("", "NA"))
}

# Import the happiness data for each year, 2015 through 2022.
happy2015 <- read_happiness_csv("https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2015.csv")
happy2016 <- read_happiness_csv("https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2016.csv")
happy2017 <- read_happiness_csv("https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2017.csv")
happy2018 <- read_happiness_csv("https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2018.csv")
happy2019 <- read_happiness_csv("https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2019.csv")
happy2020 <- read_happiness_csv("https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2020.csv")
happy2021 <- read_happiness_csv("https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2021.csv")
happy2022 <- read_happiness_csv("https://raw.githubusercontent.com/johnnydrodriguez/data607_project2/main/2022.csv")

# Show the first rows of one year as an example of the raw layout.
happy2015 %>%
  head() %>%
  knitr::kable()
Country Region Happiness.Rank Happiness.Score Standard.Error Economy..GDP.per.Capita. Family Health..Life.Expectancy. Freedom Trust..Government.Corruption. Generosity Dystopia.Residual
Switzerland Western Europe 1 7.587 0.03411 1.39651 1.34951 0.94143 0.66557 0.41978 0.29678 2.51738
Iceland Western Europe 2 7.561 0.04884 1.30232 1.40223 0.94784 0.62877 0.14145 0.43630 2.70201
Denmark Western Europe 3 7.527 0.03328 1.32548 1.36058 0.87464 0.64938 0.48357 0.34139 2.49204
Norway Western Europe 4 7.522 0.03880 1.45900 1.33095 0.88521 0.66973 0.36503 0.34699 2.46531
Canada North America 5 7.427 0.03553 1.32629 1.32261 0.90563 0.63297 0.32957 0.45811 2.45176
Finland Western Europe 6 7.406 0.03140 1.29025 1.31826 0.88911 0.64169 0.41372 0.23351 2.61955
# Build the global happiness data frame: one row per country and one happiness
# score column per year (only the score is kept from each yearly file).
#
# Each yearly table is first reduced to two columns, the country (renamed to
# `Country`) and that year's score (renamed to `Happiness.Score.<year>`),
# because the source column names change between years:
#   2015-2017: Country           / Happiness.Score
#   2018-2019: Country.or.region / Score
#   2020-2021: Country.name      / Ladder.score
#   2022:      Country           / Happiness.score
yearly_scores <- list(
  select(happy2015, Country, Happiness.Score.2015 = Happiness.Score),
  select(happy2016, Country, Happiness.Score.2016 = Happiness.Score),
  select(happy2017, Country, Happiness.Score.2017 = Happiness.Score),
  select(happy2018, Country = Country.or.region, Happiness.Score.2018 = Score),
  select(happy2019, Country = Country.or.region, Happiness.Score.2019 = Score),
  select(happy2020, Country = Country.name, Happiness.Score.2020 = Ladder.score),
  select(happy2021, Country = Country.name, Happiness.Score.2021 = Ladder.score),
  select(happy2022, Country, Happiness.Score.2022 = Happiness.score)
)

# Full-join the yearly tables one after another on Country, so a country that
# is missing from some years is kept and gets NA for those years.
# NOTE(review): countries are matched on the exact name string; confirm that
# spellings are consistent across the yearly files.
globalhappy <- Reduce(
  function(joined, next_year) full_join(joined, next_year, by = "Country"),
  yearly_scores
)

# In 2022, the happiness score is written with a comma instead of a decimal
# point. The comma is replaced by a decimal point and the value is converted
# to numeric, which puts it on the same scale as the previous years.
# Converting the separator (rather than stripping the comma and dividing by
# 1000) stays correct when a value does not have exactly three decimal digits.
globalhappy <- globalhappy %>%
  mutate(
    Happiness.Score.2022 = as.numeric(
      sub(",", ".", Happiness.Score.2022, fixed = TRUE)
    )
  )

# Compute each country's average happiness score across all yearly columns,
# ignoring missing years, then drop the rows whose average is missing (a
# country with no score in any year yields NaN).
score_columns <- c(
  "Happiness.Score.2015", "Happiness.Score.2016", "Happiness.Score.2017",
  "Happiness.Score.2018", "Happiness.Score.2019", "Happiness.Score.2020",
  "Happiness.Score.2021", "Happiness.Score.2022"
)
row_averages <- rowMeans(globalhappy[score_columns], na.rm = TRUE)
globalhappy <- globalhappy %>%
  mutate(AvgHappiness = row_averages) %>%
  filter(!is.na(AvgHappiness))

# Preview the combined table.
globalhappy %>%
  head() %>%
  knitr::kable()
Country Happiness.Score.2015 Happiness.Score.2016 Happiness.Score.2017 Happiness.Score.2018 Happiness.Score.2019 Happiness.Score.2020 Happiness.Score.2021 Happiness.Score.2022 AvgHappiness
Switzerland 7.587 7.509 7.494 7.487 7.480 7.5599 7.571 7.512 7.524987
Iceland 7.561 7.501 7.504 7.495 7.494 7.5045 7.554 7.557 7.521312
Denmark 7.527 7.526 7.522 7.555 7.600 7.6456 7.620 7.636 7.578950
Norway 7.522 7.498 7.537 7.594 7.554 7.4880 7.392 7.365 7.493750
Canada 7.427 7.404 7.316 7.328 7.278 7.2321 7.103 7.025 7.264138
Finland 7.406 7.413 7.469 7.632 7.769 7.8087 7.842 7.821 7.645087

Conclusion - Distribution of Global Happiness scores

Average global happiness scores between 2015 and 2022 appear to be normally distributed. To confirm this, we use a density plot, in which the data fall below the normal curve, and a QQ distribution line, in which all of the points appear to fall near the line.

# Mean and standard deviation of the per-country average happiness scores;
# both values are printed and reused for the normal curve on the density plot.
avg_happiness <- globalhappy[["AvgHappiness"]]
happymean <- mean(avg_happiness)
happysd <- sd(avg_happiness)
print(happymean)
## [1] 5.366749
print(happysd)
## [1] 1.079698
# Density distribution plot for the average happiness score: a histogram
# scaled to density with a normal curve (using the sample mean and standard
# deviation) overlaid for comparison.
# - after_stat(density) replaces the deprecated `..density..` notation.
# - bins = 30 is the ggplot2 default, stated explicitly to avoid the
#   "pick better value with binwidth" message.
# - args is passed as a list, the form stat_function() documents.
ggplot(globalhappy, aes(x = AvgHappiness)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30) +
  stat_function(
    fun = dnorm,
    args = list(mean = happymean, sd = happysd),
    colour = "tomato"
  )

# QQ distribution line to evaluate whether the average happiness scores are
# normally distributed. The black line traces the sample quantiles against
# the theoretical normal quantiles; geom_qq_line() adds the normal reference
# line, so departures from normality show up as gaps between the two lines.
ggplot(globalhappy, aes(sample = AvgHappiness)) +
  geom_line(stat = "qq") +
  geom_qq_line(colour = "tomato") +
  ggtitle("Global Average Happiness QQ Distribution Line")