Import data

# excel file
data <- read_excel("../00_data/Summer Movies.xlsx")
data
## # A tibble: 905 × 10
##    tconst   title_type primary_title original_title  year runtime_minutes genres
##    <chr>    <chr>      <chr>         <chr>          <dbl> <chr>           <chr> 
##  1 tt00114… movie      Midsummer Ma… Midsummer Mad…  1920 60              Drama 
##  2 tt00267… movie      A Midsummer … A Midsummer N…  1935 133             Comed…
##  3 tt00338… movie      The Teachers… Magistrarna p…  1941 86              Comedy
##  4 tt00373… movie      Summer Storm  Summer Storm    1944 106             Crime…
##  5 tt00384… movie      Centennial S… Centennial Su…  1946 102             Histo…
##  6 tt00387… tvMovie    A Midsummer … A Midsummer N…  1946 150             Drama…
##  7 tt00393… movie      One Swallow … En fluga gör …  1947 88              Comedy
##  8 tt00408… movie      Summer Holid… Summer Holiday  1948 93              Music…
##  9 tt00415… movie      In the Good … In the Good O…  1949 102             Comed…
## 10 tt00429… movie      Bountiful Su… Shchedroe leto  1951 87              Comed…
## # ℹ 895 more rows
## # ℹ 3 more variables: simple_title <chr>, average_rating <dbl>, num_votes <dbl>

Apply the following dplyr verbs to your data

Filter rows

filter(data, genres == "Drama", year == 1)
## # A tibble: 0 × 10
## # ℹ 10 variables: tconst <chr>, title_type <chr>, primary_title <chr>,
## #   original_title <chr>, year <dbl>, runtime_minutes <chr>, genres <chr>,
## #   simple_title <chr>, average_rating <dbl>, num_votes <dbl>

Arrange rows

arrange(data, desc(genres), desc(year))
## # A tibble: 905 × 10
##    tconst   title_type primary_title original_title  year runtime_minutes genres
##    <chr>    <chr>      <chr>         <chr>          <dbl> <chr>           <chr> 
##  1 tt04860… movie      Summer Love   Summer Love     2006 94              Weste…
##  2 tt32201… movie      Manson: Summ… Manson: Summe…  2024 NA              Thril…
##  3 tt87741… video      'Hocus Pocus… 'Hocus Pocus'…  2018 80              Talk-…
##  4 tt12915… video      The Best of … The Best of W…  2020 232             Sport 
##  5 tt23611… video      ECW Super Su… ECW Super Sum…  1993 NA              Sport 
##  6 tt73004… video      SummerSlam's… SummerSlam's …  1992 60              Sport 
##  7 tt13745… video      Saint Lauren… Saint Laurent…  2020 8               Short 
##  8 tt83967… video      1982: One Am… 1982: One Ama…  2017 28              Short 
##  9 tt28685… video      Zac Posen: Z… Zac Posen: Z …  2010 NA              Short 
## 10 tt28686… video      Jill Stuart … Jill Stuart S…  2010 8               Short 
## # ℹ 895 more rows
## # ℹ 3 more variables: simple_title <chr>, average_rating <dbl>, num_votes <dbl>

Select columns

select(data, year:genres)
## # A tibble: 905 × 3
##     year runtime_minutes genres                
##    <dbl> <chr>           <chr>                 
##  1  1920 60              Drama                 
##  2  1935 133             Comedy,Fantasy,Romance
##  3  1941 86              Comedy                
##  4  1944 106             Crime,Drama,Film-Noir 
##  5  1946 102             History,Music,Romance 
##  6  1946 150             Drama,Fantasy         
##  7  1947 88              Comedy                
##  8  1948 93              Musical               
##  9  1949 102             Comedy,Musical,Romance
## 10  1951 87              Comedy,Drama,Musical  
## # ℹ 895 more rows
select(data, year, genres, contains("Drama"))
## # A tibble: 905 × 2
##     year genres                
##    <dbl> <chr>                 
##  1  1920 Drama                 
##  2  1935 Comedy,Fantasy,Romance
##  3  1941 Comedy                
##  4  1944 Crime,Drama,Film-Noir 
##  5  1946 History,Music,Romance 
##  6  1946 Drama,Fantasy         
##  7  1947 Comedy                
##  8  1948 Musical               
##  9  1949 Comedy,Musical,Romance
## 10  1951 Comedy,Drama,Musical  
## # ℹ 895 more rows

Add columns

mutate(data,
       rating = average_rating) %>%
    
    #Select year rating
    select(year, rating)
## # A tibble: 905 × 2
##     year rating
##    <dbl>  <dbl>
##  1  1920    7.4
##  2  1935    6.8
##  3  1941    5.5
##  4  1944    6.6
##  5  1946    6.1
##  6  1946    7.5
##  7  1947    5.8
##  8  1948    5.7
##  9  1949    7.1
## 10  1951    5.7
## # ℹ 895 more rows

Summarize by groups

data  %>%
    separate_rows(genres)%>%
    group_by(genres) %>%
    summarise(Moving_Rating = mean(average_rating, na.rm = TRUE))
## # A tibble: 30 × 2
##    genres      Moving_Rating
##    <chr>               <dbl>
##  1 Action               5.94
##  2 Adventure            6.11
##  3 Animation            6.48
##  4 Biography            6.86
##  5 Comedy               6.14
##  6 Crime                6.11
##  7 Documentary          7.31
##  8 Drama                6.29
##  9 Family               6.05
## 10 Fantasy              6.2 
## # ℹ 20 more rows