Goal: Build a regression model to predict the average movie rating
Click [here for the data](https://github.com/rfordatascience/tidytuesday/tree/master/data/2022/2022-11-01)
Import Data
# Read the TidyTuesday (2022-11-01) horror movies CSV directly from GitHub.
horror_movies <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv"
)

# Print a per-column overview of the raw data.
skimr::skim(horror_movies)
Data summary

|                          |               |
|:-------------------------|:--------------|
| Name                     | horror_movies |
| Number of rows           | 32540         |
| Number of columns        | 20            |
| _______________________  |               |
| Column type frequency:   |               |
| character                | 10            |
| Date                     | 1             |
| logical                  | 1             |
| numeric                  | 8             |
| ________________________ |               |
| Group variables          | None          |
Variable type: character

| skim_variable     | n_missing | complete_rate | min |  max | empty | n_unique | whitespace |
|:------------------|----------:|--------------:|----:|-----:|------:|---------:|-----------:|
| original_title    |         0 |          1.00 |   1 |  191 |     0 |    30296 |          0 |
| title             |         0 |          1.00 |   1 |  191 |     0 |    29563 |          0 |
| original_language |         0 |          1.00 |   2 |    2 |     0 |       97 |          0 |
| overview          |      1286 |          0.96 |   1 | 1000 |     0 |    31020 |          0 |
| tagline           |     19835 |          0.39 |   1 |  237 |     0 |    12513 |          0 |
| poster_path       |      4474 |          0.86 |  30 |   32 |     0 |    28048 |          0 |
| status            |         0 |          1.00 |   7 |   15 |     0 |        4 |          0 |
| backdrop_path     |     18995 |          0.42 |  29 |   32 |     0 |    13536 |          0 |
| genre_names       |         0 |          1.00 |   6 |  144 |     0 |      772 |          0 |
| collection_name   |     30234 |          0.07 |   4 |   56 |     0 |      815 |          0 |
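Variable type: Date

| skim_variable | n_missing | complete_rate | min        | max        | median     | n_unique |
|:--------------|----------:|--------------:|:-----------|:-----------|:-----------|---------:|
| release_date  |         0 |             1 | 1950-01-01 | 2022-12-31 | 2012-12-09 |    10999 |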
Variable type: logical
Variable type: numeric

| skim_variable | n_missing | complete_rate |       mean |          sd |  p0 |      p25 |       p50 |       p75 |         p100 | hist  |
|:--------------|----------:|--------------:|-----------:|------------:|----:|---------:|----------:|----------:|-------------:|:------|
| id            |         0 |          1.00 |  445910.83 |   305744.67 |  17 | 146494.8 | 426521.00 | 707534.00 |   1033095.00 | ▇▆▆▅▅ |
| popularity    |         0 |          1.00 |       4.01 |       37.51 |   0 |      0.6 |      0.84 |      2.24 |      5088.58 | ▇▁▁▁▁ |
| vote_count    |         0 |          1.00 |      62.69 |      420.89 |   0 |      0.0 |      2.00 |     11.00 |     16900.00 | ▇▁▁▁▁ |
| vote_average  |         0 |          1.00 |       3.34 |        2.88 |   0 |      0.0 |      4.00 |      5.70 |        10.00 | ▇▂▆▃▁ |
| budget        |         0 |          1.00 |  543126.59 |  4542667.81 |   0 |      0.0 |      0.00 |      0.00 | 200000000.00 | ▇▁▁▁▁ |
| revenue       |         0 |          1.00 | 1349746.73 | 14430479.15 |   0 |      0.0 |      0.00 |      0.00 | 701842551.00 | ▇▁▁▁▁ |
| runtime       |         0 |          1.00 |      62.14 |       41.00 |   0 |     14.0 |     80.00 |     91.00 |       683.00 | ▇▁▁▁▁ |
| collection    |     30234 |          0.07 |  481534.88 |   324498.16 | 656 | 155421.0 | 471259.00 | 759067.25 |   1033032.00 | ▇▅▅▅▅ |
Clean Data
# Drop every column that is not needed for the analysis, which leaves
# original_title, popularity, vote_count and vote_average for this dataset,
# then discard any row that still contains a missing value.
data <- horror_movies %>%
  # Remove unnecessary variables (each column is listed exactly once)
  select(
    -id, -title, -original_language, -overview, -tagline, -release_date,
    -poster_path, -budget, -revenue, -runtime, -status, -adult,
    -backdrop_path, -genre_names, -collection, -collection_name
  ) %>%
  na.omit()

# Print a per-column overview of the cleaned data.
skimr::skim(data)
Data summary

|                          |       |
|:-------------------------|:------|
| Name                     | data  |
| Number of rows           | 32540 |
| Number of columns        | 4     |
| _______________________  |       |
| Column type frequency:   |       |
| character                | 1     |
| numeric                  | 3     |
| ________________________ |       |
| Group variables          | None  |
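Variable type: character

| skim_variable  | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|:---------------|----------:|--------------:|----:|----:|------:|---------:|-----------:|
| original_title |         0 |             1 |   1 | 191 |     0 |    30296 |          0 |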
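Variable type: numeric

| skim_variable | n_missing | complete_rate |  mean |     sd |  p0 | p25 |  p50 |   p75 |     p100 | hist  |
|:--------------|----------:|--------------:|------:|-------:|----:|----:|-----:|------:|---------:|:------|
| popularity    |         0 |             1 |  4.01 |  37.51 |   0 | 0.6 | 0.84 |  2.24 |  5088.58 | ▇▁▁▁▁ |
| vote_count    |         0 |             1 | 62.69 | 420.89 |   0 | 0.0 | 2.00 | 11.00 | 16900.00 | ▇▁▁▁▁ |
| vote_average  |         0 |             1 |  3.34 |   2.88 |   0 | 0.0 | 4.00 |  5.70 |    10.00 | ▇▂▆▃▁ |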
Explore Data
# Scatter plot of vote_average against vote_count, one point per row of data.
ggplot(data = data, mapping = aes(x = vote_count, y = vote_average)) +
  geom_point()

# Collapse data to one row per distinct (vote_count, vote_average) pair.
# NOTE(review): vote_average is itself a grouping key, so mean_group always
# equals vote_average here; group by vote_count alone if the average rating
# per vote count was the intent.
data2 <- data %>%
  group_by(vote_count, vote_average) %>%
  # Spell out dplyr's default grouping behaviour: the result stays grouped by
  # vote_count, but the "regrouping output" message is no longer printed.
  summarise(mean_group = mean(vote_average), .groups = "drop_last")

# Plot mean_group against vote_count. Colour encodes vote_count and the point
# shape encodes binned vote_average. The former `label` and `group` aesthetics
# were dropped because geom_point() does not draw anything from them.
data2 %>%
  ggplot(aes(
    x = vote_count, y = mean_group,
    color = vote_count, shape = vote_average
  )) +
  scale_shape_binned() +
  geom_point()
