Import data
# excel file
data <- read_excel("../00_data/Summer Movies.xlsx")
data
## # A tibble: 905 × 10
## tconst title_type primary_title original_title year runtime_minutes genres
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 tt00114… movie Midsummer Ma… Midsummer Mad… 1920 60 Drama
## 2 tt00267… movie A Midsummer … A Midsummer N… 1935 133 Comed…
## 3 tt00338… movie The Teachers… Magistrarna p… 1941 86 Comedy
## 4 tt00373… movie Summer Storm Summer Storm 1944 106 Crime…
## 5 tt00384… movie Centennial S… Centennial Su… 1946 102 Histo…
## 6 tt00387… tvMovie A Midsummer … A Midsummer N… 1946 150 Drama…
## 7 tt00393… movie One Swallow … En fluga gör … 1947 88 Comedy
## 8 tt00408… movie Summer Holid… Summer Holiday 1948 93 Music…
## 9 tt00415… movie In the Good … In the Good O… 1949 102 Comed…
## 10 tt00429… movie Bountiful Su… Shchedroe leto 1951 87 Comed…
## # ℹ 895 more rows
## # ℹ 3 more variables: simple_title <chr>, average_rating <dbl>, num_votes <dbl>
Apply the following dplyr verbs to your data
Filter rows
filter(data, genres == "Drama", year == 1)
## # A tibble: 0 × 10
## # ℹ 10 variables: tconst <chr>, title_type <chr>, primary_title <chr>,
## # original_title <chr>, year <dbl>, runtime_minutes <chr>, genres <chr>,
## # simple_title <chr>, average_rating <dbl>, num_votes <dbl>
Arrange rows
arrange(data, desc(genres), desc(year))
## # A tibble: 905 × 10
## tconst title_type primary_title original_title year runtime_minutes genres
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 tt04860… movie Summer Love Summer Love 2006 94 Weste…
## 2 tt32201… movie Manson: Summ… Manson: Summe… 2024 NA Thril…
## 3 tt87741… video 'Hocus Pocus… 'Hocus Pocus'… 2018 80 Talk-…
## 4 tt12915… video The Best of … The Best of W… 2020 232 Sport
## 5 tt23611… video ECW Super Su… ECW Super Sum… 1993 NA Sport
## 6 tt73004… video SummerSlam's… SummerSlam's … 1992 60 Sport
## 7 tt13745… video Saint Lauren… Saint Laurent… 2020 8 Short
## 8 tt83967… video 1982: One Am… 1982: One Ama… 2017 28 Short
## 9 tt28685… video Zac Posen: Z… Zac Posen: Z … 2010 NA Short
## 10 tt28686… video Jill Stuart … Jill Stuart S… 2010 8 Short
## # ℹ 895 more rows
## # ℹ 3 more variables: simple_title <chr>, average_rating <dbl>, num_votes <dbl>
Select columns
select(data, year:genres)
## # A tibble: 905 × 3
## year runtime_minutes genres
## <dbl> <chr> <chr>
## 1 1920 60 Drama
## 2 1935 133 Comedy,Fantasy,Romance
## 3 1941 86 Comedy
## 4 1944 106 Crime,Drama,Film-Noir
## 5 1946 102 History,Music,Romance
## 6 1946 150 Drama,Fantasy
## 7 1947 88 Comedy
## 8 1948 93 Musical
## 9 1949 102 Comedy,Musical,Romance
## 10 1951 87 Comedy,Drama,Musical
## # ℹ 895 more rows
select(data, year, genres, contains("Drama"))
## # A tibble: 905 × 2
## year genres
## <dbl> <chr>
## 1 1920 Drama
## 2 1935 Comedy,Fantasy,Romance
## 3 1941 Comedy
## 4 1944 Crime,Drama,Film-Noir
## 5 1946 History,Music,Romance
## 6 1946 Drama,Fantasy
## 7 1947 Comedy
## 8 1948 Musical
## 9 1949 Comedy,Musical,Romance
## 10 1951 Comedy,Drama,Musical
## # ℹ 895 more rows
Add columns
mutate(data,
rating = average_rating) %>%
#Select year rating
select(year, rating)
## # A tibble: 905 × 2
## year rating
## <dbl> <dbl>
## 1 1920 7.4
## 2 1935 6.8
## 3 1941 5.5
## 4 1944 6.6
## 5 1946 6.1
## 6 1946 7.5
## 7 1947 5.8
## 8 1948 5.7
## 9 1949 7.1
## 10 1951 5.7
## # ℹ 895 more rows
Summarize by groups
data %>%
separate_rows(genres)%>%
group_by(genres) %>%
summarise(Moving_Rating = mean(average_rating, na.rm = TRUE))
## # A tibble: 30 × 2
## genres Moving_Rating
## <chr> <dbl>
## 1 Action 5.94
## 2 Adventure 6.11
## 3 Animation 6.48
## 4 Biography 6.86
## 5 Comedy 6.14
## 6 Crime 6.11
## 7 Documentary 7.31
## 8 Drama 6.29
## 9 Family 6.05
## 10 Fantasy 6.2
## # ℹ 20 more rows