Podstawowe operacje w R - część 5.

Wizualizacja danych

Angelika Hernik

2022-12-24

Wizualizacja danych z biblioteką ggplot2
- Wykresy zmiennej ilościowej
- Wykresy zmiennej jakościowej
- Wykresy dwóch zmiennych ilościowych
- Wykresy dwóch zmiennych jakościowych
- Wykresy zmiennej ilościowej vs zmiennej jakościowej
- Wykresy bąbelkowe (3x Challenge)
- Kilka wykresów na jednym panelu (2x Challenge).

Przydatne materiały:
- ggplot2 cheatsheet
- A. Kassambara - Guide to Create Beautiful Graphics in R.

Dane pochodzą ze strony https://flixgem.com/ (wersja zbioru danych z dnia 12 marca 2021). Dane zawierają informacje na temat 9425 filmów i seriali dostępnych na Netlix.

Wizualizacja danych z ggplot2

theme_set(theme_bw())

Podstawowe zasady tworzenia wykresów z ggplot2:
- do funkcji ggplot() podajemy ramkę danych oraz opcjonalnie osie x, y i parametry shape, color, fill, group
- dodajemy wykresy za pomocą funkcji zaczynających się geom_ lub stat_
- modyfikujemy wykresy dodając legendy, tytuły, znaczniki na osiach etc.

Do szybkich analiz można skorzystać z funkcji qplot(), która automatycznie dobiera typ wykresu do rodzaju danych.

Wykresy zmiennej ilościowej

dane %>%
  filter(Languages=="Polish") %>%
ggplot(aes(x = IMDb.Score)) +
  geom_histogram(
    aes(fill = Series.or.Movie)
    ,bins = 10
    ,color = 'black'
  ) +
  labs(title = 'Polskie filmy i seriale')

ggplot(dane, aes(x=IMDb.Score, fill=Series.or.Movie)) +
  geom_density(alpha=.25) +
  labs(title = 'Gęstość rozkładu ocen')

Wykresy zmiennej jakościowej

dane %>%
  filter(Runtime != '') %>%
  ggplot(aes(Runtime)) +
  geom_bar(aes(fill = Runtime))+
  scale_x_discrete(
    limits = c('< 30 minutes'
               ,'30-60 mins'
               ,'1-2 hour'
               ,'> 2 hrs')
    ,labels = function(x) str_wrap(x, width = 8)
  ) +
  theme(
    legend.position = 'none'
    ,axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
  ) +
  scale_fill_manual(values = brewer.pal(4, 'Accent'))

Oceny wg Series.or.Movie:

ggplot(dane, aes(x=factor(Series.or.Movie),y=IMDb.Score)) +
  geom_violin(scale="area")

## Warning: Removed 8 rows containing non-finite values (`stat_ydensity()`).

Wykresy dwóch zmiennych ilościowych

ggplot(dane, aes(x = IMDb.Score, y = IMDb.Votes)) +
  geom_point(aes(color = Series.or.Movie)) +
  geom_smooth(
    method = 'lm'
    ,se = FALSE
  ) +
  theme(legend.position = c(0.2, 0.8))

Wykresy dwóch zmiennych jakościowych

dane %>%
  filter(Runtime != '') %>%
  ggplot(aes(x = Series.or.Movie, y = Runtime)) +
  geom_jitter(aes(color = Runtime)) +
  theme(legend.position = 'none') +
  scale_y_discrete(
    limits = c('< 30 minutes'
               ,'30-60 mins'
               ,'1-2 hour'
               ,'> 2 hrs')
  ) +
  labs(x = '', y = '')

Wykresy zmiennej ilościowej vs zmiennej jakościowej

medians = dane %>%
  group_by(Series.or.Movie) %>%
  summarize(m = median(IMDb.Score, na.rm = TRUE))

ggplot(dane, aes(x = Series.or.Movie, y = IMDb.Score)) +
  geom_boxplot(
    aes(fill = Series.or.Movie)
    ,outlier.alpha = 0.25
  ) +
  geom_text(
    data = medians
    ,aes(x = Series.or.Movie, y = m, label = m)
    ,color = 'blue'
    ,hjust = 7
  ) +
  stat_boxplot(geom ='errorbar', position = 'dodge') +
  stat_summary(
    aes(ymax = ..y.., ymin = ..y..)
    ,fun = mean
    ,geom = 'errorbar'
    ,width = .75
    ,linetype = 'solid'
    ,position = 'dodge'
    ,color = 'white'
    ,size = 1
  ) +
  theme(legend.position = 'none')

Wykresy bąbelkowe

gatunki <- dane %>% 
  select(Genre) %>% 
  unlist() %>% 
  strsplit(',') %>%
  unlist() %>% 
  trimws() %>% 
  table() %>% 
  as.data.frame()

gatunki <- data.frame(
  Gatunek = gatunki$.
  ,`Count` = gatunki$Freq
  ,`IMDb Score Average` = NA
  ,`IMDb Votes Average` = NA
)

for (i in 1:nrow(gatunki)) {
  gatunki$IMDb.Score.Average[i] <- dane %>% 
    filter(str_detect(Genre, gatunki$Gatunek[i] %>%
                        as.character() %>% eval())) %>%
    select(IMDb.Score) %>% unlist() %>% mean(na.rm = TRUE)
  
  gatunki$IMDb.Votes.Average[i] <- dane %>% 
    filter(str_detect(Genre, gatunki$Gatunek[i] %>%
                        as.character() %>% eval())) %>%
    select(IMDb.Votes) %>% unlist() %>% mean(na.rm = TRUE)
}

ggplot(gatunki, aes(IMDb.Score.Average, IMDb.Votes.Average, label = Gatunek)) +
  geom_point(aes(color = Gatunek, size = Count)) +
  geom_text_repel() +
  theme(legend.position = 'none')

Kilka wykresów na jednym panelu

dane %>%
  filter(Runtime != '') %>%
  ggplot(aes(x = IMDb.Score)) +
  geom_histogram(
    aes(fill = Series.or.Movie)
    ,bins = 50
    ,color = 'black'
  ) +
  # facet_grid(.~ Series.or.Movie) +
  # facet_grid(Series.or.Movie ~ .) +
  # facet_grid(Runtime ~ Series.or.Movie, scales = 'free') +
  # facet_wrap(vars(Series.or.Movie), ncol = 2) +
  # facet_wrap(vars(Series.or.Movie), nrow = 2) +
  facet_wrap(vars(Series.or.Movie, Runtime), nrow = 2, scales = 'free') +
  theme(legend.position = 'none')

Zadanie domowe

Wykorzystaj FACETING - aby utworzyć grupę (grid - siatkę) wykresów na 1 ekranie raportu - np. histogramy IMDb.Score dla różnych kategorii widza (View.Rating):

dane %>%
  filter(View.Rating %in% c("PG-13", "R", "PG")) %>%
  ggplot(aes(x = IMDb.Score)) +
  geom_histogram(
    aes (fill = Series.or.Movie)
    ,bins = 50
    ,color = 'blue'
  ) +
  facet_grid(. ~ View.Rating)

Zadania dodatkowe - dla chętnych:

CHALLENGE 6: Stwórz analogiczny wykres pokazujący aktorów grających w najpopularniejszych produkcjach.

 aktorzy <- dane %>% 
  select(Actors) %>%
  top_n(25) %>%
  unlist() %>% 
  strsplit(',') %>%
  unlist() %>% 
  trimws() %>% 
  table() %>% 
  as.data.frame()

aktorzy <- data.frame(
  Aktor = aktorzy$.
  ,`Count` = aktorzy$Freq
  ,`IMDb Score Average` = NA
  ,`IMDb Votes Average` = NA
)

for (i in 1:nrow(aktorzy)) {
  aktorzy$IMDb.Score.Average[i] <- dane %>% 
    filter(str_detect(Actors, aktorzy$Aktor[i] %>%
                        as.character() %>% eval())) %>%
    select(IMDb.Score) %>% unlist() %>% mean(na.rm = TRUE)
  
  aktorzy$IMDb.Votes.Average[i] <- dane %>% 
    filter(str_detect(Actors, aktorzy$Aktor[i] %>%
                        as.character() %>% eval())) %>%
    select(IMDb.Votes) %>% unlist() %>% mean(na.rm = TRUE)
}

ggplot(aktorzy, aes(IMDb.Score.Average, IMDb.Votes.Average, label = Aktor)) +
  geom_point(aes(color = Aktor, size = Count)) +
  geom_text_repel() +
  theme(legend.position = 'none')

CHALLENGE 7: Stwórz analogiczny wykres pokazujący w jakich latach powstawały najpopularniejsze produkcje

lata <- dane %>% 
  select(Release.Date) %>%
  top_n(50) %>%
  unlist() %>% 
  strsplit(',') %>%
  unlist() %>% 
  trimws() %>% 
  table() %>% 
  as.data.frame()

lata <- data.frame(
  Rok = lata$.
  ,`Count` = lata$Freq
  ,`IMDb Score Average` = NA
  ,`IMDb Votes Average` = NA
)

for (i in 1:nrow(lata)) {
  lata$IMDb.Score.Average[i] <- dane %>% 
    filter(str_detect(Release.Date, lata$Rok[i] %>%
                        as.character() %>% eval())) %>%
    select(IMDb.Score) %>% unlist() %>% mean(na.rm = TRUE)
  
  lata$IMDb.Votes.Average[i] <- dane %>% 
    filter(str_detect(Release.Date, lata$Rok[i] %>%
                        as.character() %>% eval())) %>%
    select(IMDb.Votes) %>% unlist() %>% mean(na.rm = TRUE)
}

ggplot(lata, aes(IMDb.Score.Average, IMDb.Votes.Average, label = Rok)) +
  geom_point(aes(color = Rok, size = Count)) +
  geom_text_repel() +
  theme(legend.position = 'none')

CHALLENGE 8: Jakie są najpopularniejsze języki dostępne na Netflixie? Podpowiedź: wykres kolumnowy.

 języki <- dane %>% 
  select(Languages) %>%
  unlist() %>% 
  strsplit(',') %>%
  unlist() %>% 
  trimws() %>% 
  table() %>% 
  as.data.frame()

colnames(języki) <- c("Język", "Liczba")
języki <- języki %>%
  arrange(desc(Liczba)) %>%
  head(10)

ggplot(języki, aes(x = reorder(Język, -Liczba), y = Liczba)) + 
  geom_col(aes(fill = Język)) +
    geom_text(aes(label = Liczba), vjust = 1.5)

CHALLENGE 9: Dla filmów i seriali stwórz wykresy ocen z poszczególnych portali (Hidden Gem, IMDb, Rotten Tomatoes, Metacritic). Podpowiedź: warto odpiwotować dane do formatu long.

if(!require('gridExtra')) install.packages('gridExtra')
library(gridExtra)

IMDB <- dane %>%
  ggplot(aes(x = IMDb.Score)) +
  geom_histogram(
    aes(fill = Series.or.Movie)
    ,bins = 50
    ,color = 'black'
  ) +
  facet_wrap(vars(Series.or.Movie), nrow = 2) +
  theme(legend.position = 'none')

RT <- dane %>%
  ggplot(aes(x = Rotten.Tomatoes.Score)) +
  geom_histogram(
    aes(fill = Series.or.Movie)
    ,bins = 50
    ,color = 'black'
  ) +
  facet_wrap(vars(Series.or.Movie), nrow = 2) +
  theme(legend.position = 'none')

HD <- dane %>%
  ggplot(aes(x = Hidden.Gem.Score)) +
  geom_histogram(
    aes(fill = Series.or.Movie)
    ,bins = 50
    ,color = 'black'
  ) +
  facet_wrap(vars(Series.or.Movie), nrow = 2) +
  theme(legend.position = 'none')

Meta <- dane %>%
  ggplot(aes(x = Metacritic.Score)) +
  geom_histogram(
    aes(fill = Series.or.Movie)
    ,bins = 50
    ,color = 'black'
  ) +
  facet_wrap(vars(Series.or.Movie), nrow = 2) +
  theme(legend.position = 'none')

grid.arrange(IMDB, RT, HD, Meta, nrow = 2)

p1 = ggplot(dane, aes(x = IMDb.Score, y = IMDb.Votes, color = Series.or.Movie)) +
  geom_point() +
  theme(legend.position = c(0.3, 0.8))

p2 = ggplot(gatunki, aes(IMDb.Score.Average, IMDb.Votes.Average, label = Gatunek)) +
  geom_point(aes(color = Gatunek, size = Count)) +
  geom_text_repel() +
  theme(legend.position = 'none')

grid.arrange(p1, p2, ncol = 2)

CHALLENGE 10: Jakie wytwórnie filmowe produkują najwięcej i jak się to zmieniało na przestrzeni ostatnich lat?

library(lubridate)
dane$Release.Date = as.Date(Release.Date, format = "%m/%d/%Y")

wytwórnie2020 <- dane %>%
  mutate(Year = year(Release.Date)) %>%
  drop_na(Year) %>%
  filter(Year == 2020) %>%
  select(Production.House) %>%
  unlist() %>% 
  strsplit(',') %>%
  unlist() %>% 
  trimws() %>% 
  table() %>% 
  as.data.frame()

colnames(wytwórnie2020) <- c("Wytwórnia", "Liczba")
wytwórnie2020 <- wytwórnie2020 %>%
  arrange(desc(Liczba)) %>%
  head(7)

wytwórnie2019 <- dane %>%
  mutate(Year = year(Release.Date)) %>%
  drop_na(Year) %>%
  filter(Year == 2019) %>%
  select(Production.House) %>%
  unlist() %>% 
  strsplit(',') %>%
  unlist() %>% 
  trimws() %>% 
  table() %>% 
  as.data.frame()

colnames(wytwórnie2019) <- c("Wytwórnia", "Liczba")
wytwórnie2019 <- wytwórnie2019 %>%
  arrange(desc(Liczba)) %>%
  head(7)

wytwórnie2018 <- dane %>%
  mutate(Year = year(Release.Date)) %>%
  drop_na(Year) %>%
  filter(Year == 2018) %>%
  select(Production.House) %>%
  unlist() %>% 
  strsplit(',') %>%
  unlist() %>% 
  trimws() %>% 
  table() %>% 
  as.data.frame()

colnames(wytwórnie2018) <- c("Wytwórnia", "Liczba")
wytwórnie2018 <- wytwórnie2018 %>%
  arrange(desc(Liczba)) %>%
  head(7)

wytwórnie2017 <- dane %>%
  mutate(Year = year(Release.Date)) %>%
  drop_na(Year) %>%
  filter(Year == 2017) %>%
  select(Production.House) %>%
  unlist() %>% 
  strsplit(',') %>%
  unlist() %>% 
  trimws() %>% 
  table() %>% 
  as.data.frame()

colnames(wytwórnie2017) <- c("Wytwórnia", "Liczba")
wytwórnie2017 <- wytwórnie2017 %>%
  arrange(desc(Liczba)) %>%
  head(7)

w2020 <- ggplot(wytwórnie2020, aes(x = reorder(Wytwórnia, -Liczba), y = Liczba)) + 
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  geom_text(aes(label = Liczba), vjust = 1.5) +
  labs(title = "Top 2020") +
  xlab("Wytwórnia")

w2019 <- ggplot(wytwórnie2019, aes(x = reorder(Wytwórnia, -Liczba), y = Liczba)) + 
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  geom_text(aes(label = Liczba), vjust = 1.5) +
  labs(title = "Top 2019") +
  xlab("Wytwórnia")

w2018 <- ggplot(wytwórnie2018, aes(x = reorder(Wytwórnia, -Liczba), y = Liczba)) + 
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  geom_text(aes(label = Liczba), vjust = 1.5) +
  labs(title = "Top 2018") +
  xlab("Wytwórnia")

w2017 <- ggplot(wytwórnie2017, aes(x = reorder(Wytwórnia, -Liczba), y = Liczba)) + 
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  geom_text(aes(label = Liczba), vjust = 1.5) +
  labs(title = "Top 2017") +
  xlab("Wytwórnia")

grid.arrange(w2020, w2019, w2018, w2017, nrow = 2, ncol = 2)