The MovieLens data set records movie ratings together with each movie's title, genres, and release year, the ID of the user who gave the rating, and a timestamp. It has over 100,000 observations and 7 variables, many of which are quantitative. I am trying to see how movie ratings change throughout the years.
# One-time setup, if needed: install.packages("dslabs")  # Data Science Labs
library(dslabs)
## Warning: package 'dslabs' was built under R version 4.2.3
# List the data sets that ship with dslabs, then the files in the package's
# "script" directory.
data(package = "dslabs")
list.files(system.file("script", package = "dslabs"))
## [1] "make-admissions.R"
## [2] "make-brca.R"
## [3] "make-brexit_polls.R"
## [4] "make-death_prob.R"
## [5] "make-divorce_margarine.R"
## [6] "make-gapminder-rdas.R"
## [7] "make-greenhouse_gases.R"
## [8] "make-historic_co2.R"
## [9] "make-mnist_27.R"
## [10] "make-movielens.R"
## [11] "make-murders-rda.R"
## [12] "make-na_example-rda.R"
## [13] "make-nyc_regents_scores.R"
## [14] "make-olive.R"
## [15] "make-outlier_example.R"
## [16] "make-polls_2008.R"
## [17] "make-polls_us_election_2016.R"
## [18] "make-reported_heights-rda.R"
## [19] "make-research_funding_rates.R"
## [20] "make-stars.R"
## [21] "make-temp_carbon.R"
## [22] "make-tissue-gene-expression.R"
## [23] "make-trump_tweets.R"
## [24] "make-weekly_us_contagious_diseases.R"
## [25] "save-gapminder-example-csv.R"
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
# Read the ratings CSV without calling setwd(), so the session's working
# directory is left untouched. Set the DATA110_DIR environment variable to
# point at a different folder; when it is unset the original course folder
# is used.
data_dir <- Sys.getenv(
  "DATA110_DIR",
  unset = "C:/Users/amani/OneDrive/Desktop/Data110"
)
movielens <- read_csv(file.path(data_dir, "movielens.csv"))
## Rows: 100004 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): title, genres
## dbl (5): movieId, year, userId, rating, timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Base canvas for the year-vs-rating plots: year on the x-axis, rating on the
# y-axis. Each label describes the aesthetic it is attached to, and the title
# states what the chart shows.
movielens_chart <- ggplot(movielens, aes(x = year, y = rating)) +
  xlab("Year it was out") +
  ylab("Rating of the movie") +
  ggtitle("Movie ratings throughout the years") +
  # Minimal theme with a 14 pt serif base font.
  theme_minimal(base_size = 14, base_family = "serif") +
  # Fill the whole plot background with purple.
  theme(plot.background = element_rect(fill = "purple"))
# Plain scatter: add a default point layer to the canvas and render it.
movielens_chart + geom_point()
## Warning: Removed 7 rows containing missing values (`geom_point()`).
# Scatter with small, semi-transparent points plus a straight-line (lm) fit
# and no confidence band. The fit line is yellow because the plot background
# is filled purple; a purple line would blend into it and be unreadable.
movielens_chart +
  geom_point(size = 1, alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "yellow")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 7 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 7 rows containing missing values (`geom_point()`).
# Keep only the rows whose `genres` value is exactly one of these four labels.
# NOTE(review): `%in%` is an exact string match, so a row whose `genres` holds
# any other text (e.g. a combined label) is dropped — confirm that is intended.
genres_of_interest <- c("Action", "Comedy", "Drama", "Romance")
movielens_subset <- filter(movielens, genres %in% genres_of_interest)
# Genre-coloured scatter of the filtered subset with a thin, dashed, black
# linear fit on a pink plot background. The fit line's thickness is set with
# `linewidth`, since ggplot2 3.4.0 deprecated `size` for line geoms.
# NOTE(review): rating is on x and year on y here, the reverse of the first
# chart — confirm this orientation is intended.
movielens_chart <- ggplot(movielens_subset, aes(x = rating, y = year)) +
  geom_point(aes(color = genres), size = 1, alpha = 0.5) +
  geom_smooth(
    method = "lm", se = FALSE, color = "black",
    linetype = 2, linewidth = 0.3
  ) +
  theme(plot.background = element_rect(fill = "pink"))
# Render the chart.
movielens_chart
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 5 rows containing missing values (`geom_point()`).