EDA UFCG: Análise de série

Carrega a biblioteca tidyverse

## ── Attaching packages ──────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ─────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Lê o arquivo CSV

# Load the IMDB episode ratings from the CSV file (relative path).
imdb_series <- read_csv(file = "../data/series_from_imdb.csv")

## Parsed with column specification:
## cols(
##   series_name = col_character(),
##   Episode = col_character(),
##   series_ep = col_double(),
##   season = col_double(),
##   season_ep = col_double(),
##   url = col_character(),
##   UserRating = col_double(),
##   UserVotes = col_double(),
##   r1 = col_double(),
##   r2 = col_double(),
##   r3 = col_double(),
##   r4 = col_double(),
##   r5 = col_double(),
##   r6 = col_double(),
##   r7 = col_double(),
##   r8 = col_double(),
##   r9 = col_double(),
##   r10 = col_double()
## )

# Display the loaded data.
imdb_series

Descobre todas as séries que têm “Crime” no nome, pois o nome em português da série é desconhecido

# List each distinct series title that contains "Crime".
imdb_series %>%
  distinct(series_name) %>%
  filter(str_detect(series_name, "Crime"))

Achou-se o nome da série: “Crimes do Colarinho Branco”, cujo título original é “White Collar”. Em seguida, filtra-se o conjunto de dados para obter os episódios dessa série

# Keep only the White Collar episodes (stored under the Portuguese title).
white_collar <- filter(imdb_series, series_name == "Crimes do Colarinho Branco")

white_collar

Qual o episódio mais popular?

# Episode(s) whose rating equals the highest rating in the series;
# all tied episodes are kept.
most_popular <- filter(white_collar, UserRating == max(UserRating))

most_popular

O episódio mais popular foi no final ou no começo?

# Show the episode with the highest overall episode number (the last one).
filter(white_collar, series_ep == max(series_ep))

O episódio mais popular foi aquele que também recebeu a maior porcentagem de votos 10?

# TRUE when at least one row has both the largest r10 value and the
# largest UserRating in the whole table.
nrow(filter(imdb_series, r10 == max(r10) & UserRating == max(UserRating))) > 0

## [1] TRUE

Transformando em uma função e aplicando-a ao imdb_series

#' Check whether the top-rated row also has the largest share of 10 votes
#'
#' @param dataset A data frame with numeric columns `r10` and `UserRating`.
#' @return `TRUE` if at least one row attains both `max(r10)` and
#'   `max(UserRating)`, otherwise `FALSE`.
rating_percentage_match <- function(dataset) {
  matches <- filter(
    dataset,
    r10 == max(r10) & UserRating == max(UserRating)
  )
  nrow(matches) > 0
}

rating_percentage_match(imdb_series)

## [1] TRUE

Todas as séries cujo melhor episódio tem nota maior que 9.5

# Compute the best episode rating of each series, keep the series whose
# best rating is above 9.5, and return their titles sorted.
imdb_series %>%
  group_by(series_name) %>%
  summarise(best_rating = max(UserRating)) %>%
  filter(best_rating > 9.5) %>%
  pull(series_name) %>%
  sort()

##  [1] "A Sete Palmos"                             
##  [2] "Angel: O Caça-Vampiros"                    
##  [3] "Atlanta"                                   
##  [4] "Avatar: A Lenda de Aang"                   
##  [5] "Avatar: A Lenda de Korra"                  
##  [6] "Babylon 5"                                 
##  [7] "Banshee"                                   
##  [8] "Better Call Saul"                          
##  [9] "Billions"                                  
## [10] "Black Sails"                               
## [11] "BoJack Horseman"                           
## [12] "Bosch"                                     
## [13] "Buffy: A Caça-Vampiros"                    
## [14] "Caçadores de Sombras"                      
## [15] "Chuck"                                     
## [16] "Clube do Terror"                           
## [17] "Código Geass"                              
## [18] "Community"                                 
## [19] "Demolidor"                                 
## [20] "Desperate Housewives"                      
## [21] "Doctor Who"                                
## [22] "Dr. House"                                 
## [23] "Família Soprano"                           
## [24] "Flash"                                     
## [25] "Frasier"                                   
## [26] "Friends"                                   
## [27] "Game of Thrones"                           
## [28] "Glee: Em Busca da Fama"                    
## [29] "Gravity Falls: Um Verão de Mistérios"      
## [30] "Hannibal"                                  
## [31] "Hércules"                                  
## [32] "Homeland"                                  
## [33] "Homicide: Life on the Street"              
## [34] "House of Cards"                            
## [35] "Hunter x Hunter"                           
## [36] "I Love Lucy"                               
## [37] "It's Always Sunny in Philadelphia"         
## [38] "Lances da Vida"                            
## [39] "Lei & Ordem: Unidade de Vítimas Especiais" 
## [40] "Longmire: O Xerife"                        
## [41] "Mad Men: Inventando Verdades"              
## [42] "Nathan for You"                            
## [43] "One Piece: Wan pîsu"                       
## [44] "Only Fools and Horses...."                 
## [45] "Os 100"                                    
## [46] "Parenthood: Uma História de Família"       
## [47] "Parks and Recreation"                      
## [48] "Peaky Blinders: Sangue, Apostas e Navalhas"
## [49] "Please Like Me"                            
## [50] "Samurai Jack"                              
## [51] "Sarabhai vs Sarabhai"                      
## [52] "Scrubs"                                    
## [53] "Seinfeld"                                  
## [54] "Shameless"                                 
## [55] "Sherlock"                                  
## [56] "Sobrenatural"                              
## [57] "Star vs. As Forças do Mal"                 
## [58] "Star Wars Rebels"                          
## [59] "Star Wars: The Clone Wars"                 
## [60] "Steven Universo"                           
## [61] "Suits"                                     
## [62] "The Americans"                             
## [63] "The Good Wife"                             
## [64] "The Last Kingdom"                          
## [65] "The Walking Dead"                          
## [66] "Travelers"                                 
## [67] "TURN"                                      
## [68] "Um Maluco no Pedaço"                       
## [69] "Uma Família Perdida no Meio do Nada"       
## [70] "Vida de Escritório"                        
## [71] "Wentworth"                                 
## [72] "West Wing: Nos Bastidores do Poder"        
## [73] "Westworld"                                 
## [74] "Yeh Meri Family"

Série Os Simpsons

# Keep only the episodes of The Simpsons (stored under the Portuguese title).
simpsons <- filter(imdb_series, series_name == "Os Simpsons")

simpsons

# NOTE(review): sapply(mean) averages each remaining column over all rows, so
# this yields one overall mean for `season` and one for `UserRating`, not a
# mean per season. select() re-adds the grouping column `season`, and sapply()
# does not use the grouping. A per-season mean would need summarise() after
# group_by() -- confirm the intent.
simpsons %>% 
  group_by(season) %>% 
  select(UserRating) %>% 
  sapply(mean)

## Adding missing grouping variables: `season`

##     season UserRating 
##    28.3875     6.7600

# Open the documentation page for sapply().
help(sapply)

# Distinct series titles that contain "mica".
imdb_series %>%
  filter(str_detect(series_name, "mica")) %>%
  distinct(series_name)

#' Find the rows of `imdb_series` whose series title matches a pattern
#'
#' Reads the global `imdb_series` table.
#'
#' @param trecho_titulo Pattern passed to `grepl()` and matched against
#'   `series_name`.
#' @return The matching rows, grouped by `series_name`, with duplicated rows
#'   removed.
procurar_serie <- function(trecho_titulo) {
  encontradas <- filter(imdb_series, grepl(trecho_titulo, series_name))
  unique(group_by(encontradas, series_name))
}

procurar_serie("Des")

# Average user rating of White Collar for each season.
summarise(group_by(white_collar, season), media = mean(UserRating))

# Pick the White Collar episode whose overall episode number is 10.
episodio <- filter(
  imdb_series,
  series_name == "Crimes do Colarinho Branco" & series_ep == 10
)

episodio

# Names of the rating-distribution columns, in order: "r1", "r2", ..., "r10".
vetor_ratings <- paste0("r", 1:10)

#' Reshape the rating-distribution columns of an episode into long format
#'
#' Only the columns `r1` ... `r10` are reshaped. The episode title column is
#' left out on purpose: gathering it together with the numeric columns would
#' coerce every percentage to character and add a spurious "Episode" row.
#'
#' @param episodio A data frame of episodes containing the columns `r1` to
#'   `r10`.
#' @return A tibble with one row per rating column and two columns: `Rating`
#'   (the column name, "r1" to "r10") and `Percentage` (the numeric value
#'   stored in that column).
ratings_episodio <- function(episodio) {
  episodio %>%
    # Select the rating columns explicitly instead of dropping the others.
    select(num_range("r", 1:10)) %>%
    pivot_longer(
      cols = everything(),
      names_to = "Rating",
      values_to = "Percentage"
    )
}

# Plot the value of each rating column for the episode, with the x axis
# ordered r1..r10. `levels` is spelled out in full: the abbreviated `level`
# only worked through partial argument matching.
ratings_episodio(episodio) %>%
  ggplot(
    mapping = aes(
      x = factor(Rating, levels = vetor_ratings),
      y = Percentage
    )
  ) +
  geom_point()

#' Get every episode of one series
#'
#' Reads the global `imdb_series` table.
#'
#' @param nome_serie Series title, compared for exact equality with
#'   `series_name`.
#' @return The rows of `imdb_series` whose `series_name` equals `nome_serie`.
pega_serie <- function(nome_serie) {
  filter(imdb_series, series_name == nome_serie)
}

#' Get the user ratings of every episode of a series
#'
#' Despite its name, this returns the ratings themselves, not their mean;
#' the caller is expected to aggregate them.
#'
#' @param nome_serie Series title, forwarded to `pega_serie()`.
#' @return A data frame with the single column `UserRating`.
pega_media <- function(nome_serie) {
  episodios <- pega_serie(nome_serie)
  select(episodios, UserRating)
}

# Mean user rating of Game of Thrones. vapply() with numeric(1) guarantees a
# named numeric vector, whereas sapply()'s return type depends on its input.
pega_media("Game of Thrones") %>%
  vapply(mean, numeric(1))

## UserRating 
##    9.11194

EDA UFCG: Análise de série - White collar