Import Your Data

In the following code chunk, import your data.

#### Use read_csv() or another function

#### Make sure your data is converted into a tibble. 

#### For demonstration purposes, this example uses the `title.ratings.tsv`, `title.basics.tsv`, `name.basics.tsv`, and `title.principals.tsv` files.


# Load the per-title ratings table. The literal string \N is treated as a
# missing value and the quote character is set to the empty string.
df_ratings <- read_tsv(
  file = "title.ratings.tsv",
  na = "\\N",
  quote = ""
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   tconst = col_character(),
##   averageRating = col_double(),
##   numVotes = col_double()
## )
# Load the title metadata table.
# endYear is pinned to double: when left to type guessing, readr infers
# col_logical() from the leading rows (where endYear is missing), and every
# real year that appears later (e.g. 1947) is reported as a parsing failure
# and lost. All remaining columns are still guessed as before.
df_basics <- read_tsv(
  "title.basics.tsv",
  na = "\\N",
  quote = "",
  col_types = cols(endYear = col_double())
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   tconst = col_character(),
##   titleType = col_character(),
##   primaryTitle = col_character(),
##   originalTitle = col_character(),
##   isAdult = col_double(),
##   startYear = col_double(),
##   endYear = col_logical(),
##   runtimeMinutes = col_double(),
##   genres = col_character()
## )
## Warning: 75427 parsing failures.
##   row     col           expected actual               file
## 34983 endYear 1/0/T/F/TRUE/FALSE   1947 'title.basics.tsv'
## 35184 endYear 1/0/T/F/TRUE/FALSE   1945 'title.basics.tsv'
## 37613 endYear 1/0/T/F/TRUE/FALSE   1955 'title.basics.tsv'
## 38447 endYear 1/0/T/F/TRUE/FALSE   1949 'title.basics.tsv'
## 38448 endYear 1/0/T/F/TRUE/FALSE   1949 'title.basics.tsv'
## ..... ....... .................. ...... ..................
## See problems(...) for more details.
# Attach title metadata to every rated title. The join key is spelled out so
# the join cannot silently change if the two tables ever share another
# column name.
df_ratings <- df_ratings %>%
  left_join(df_basics, by = "tconst")
## Joining, by = "tconst"
# People table reduced to actors/actresses: keep rows whose primaryProfession
# mentions "actor" or "actress", and only the id, name and birth-year columns.
df_actors <- read_tsv("name.basics.tsv", na = "\\N", quote = "")
df_actors <- df_actors %>%
  filter(str_detect(primaryProfession, "actor|actress")) %>%
  select(nconst, primaryName, birthYear)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   nconst = col_character(),
##   primaryName = col_character(),
##   birthYear = col_double(),
##   deathYear = col_double(),
##   primaryProfession = col_character(),
##   knownForTitles = col_character()
## )
# For every title, keep the actor/actress credit with the smallest `ordering`
# value. Only the title id, ordering, person id and category are retained.
# The result is ungrouped so the per-title grouping does not leak into the
# joins that follow.
df_principals <- read_tsv("title.principals.tsv", na = "\\N", quote = "") %>%
  filter(str_detect(category, "actor|actress")) %>%
  select(tconst, ordering, nconst, category) %>%
  group_by(tconst) %>%
  filter(ordering == min(ordering)) %>%
  ungroup()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   tconst = col_character(),
##   ordering = col_double(),
##   nconst = col_character(),
##   category = col_character(),
##   job = col_character(),
##   characters = col_character()
## )
# Add each credited person's name and birth year, joining explicitly on the
# person id.
df_principals <- df_principals %>%
  left_join(df_actors, by = "nconst")
## Joining, by = "nconst"
# Add the selected acting credit (and that person's details) to each rated
# title, joining explicitly on the title id.
df_ratings <- df_ratings %>%
  left_join(df_principals, by = "tconst")
## Joining, by = "tconst"
# Keep movies that have a known birth year for the joined credit and at least
# 10 votes, then compute age_lead as the start year minus the birth year.
df_ratings_movies <- df_ratings %>%
  filter(titleType == "movie") %>%
  filter(!is.na(birthYear)) %>%
  filter(numVotes >= 10) %>%
  mutate(age_lead = startYear - birthYear)

# Quartiles of age_lead (25th, 50th and 75th percentiles) for each start year.
# TRUE is written out because T is an ordinary variable that can be
# reassigned; .groups = "drop" states the ungrouped result explicitly.
df_actor_ages <- df_ratings_movies %>%
  group_by(startYear) %>%
  summarize(
    low_age = quantile(age_lead, 0.25, na.rm = TRUE),
    med_age = quantile(age_lead, 0.50, na.rm = TRUE),
    high_age = quantile(age_lead, 0.75, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` ungrouping output (override with `.groups` argument)
# Quartiles of age_lead (25th, 50th and 75th percentiles) for each
# combination of start year and credit category.
# TRUE is written out because T is an ordinary variable that can be
# reassigned; .groups = "drop_last" keeps the result grouped by startYear,
# which is what summarize() did implicitly before.
df_actor_ages_lead <- df_ratings_movies %>%
  group_by(startYear, category) %>%
  summarize(
    low_age = quantile(age_lead, 0.25, na.rm = TRUE),
    med_age = quantile(age_lead, 0.50, na.rm = TRUE),
    high_age = quantile(age_lead, 0.75, na.rm = TRUE),
    .groups = "drop_last"
  )
## `summarise()` regrouping output by 'startYear' (override with `.groups` argument)
# Number each person's movies in start-year order: nth_lead is the 1-based
# position of a row within its nconst group once rows are sorted by startYear.
# The result remains grouped by nconst.
df_ratings_movies_nth <- df_ratings_movies %>%
  arrange(startYear) %>%
  group_by(nconst) %>%
  mutate(nth_lead = row_number())

Dataset_Visualization

Part 1

  1. We visualize the correlation between the average ratings
  2. and the number of votes, together with their respective counts
  3. Visualized using a heatmap
  4. Dataset Employed: df_ratings

Part 2

  1. We visualize the correlation between the average ratings
  2. and the start year (the year in which the movie was released), together with their respective counts
  3. Visualized using a heatmap
  4. Dataset Employed: df_ratings
## Warning: Removed 22 rows containing non-finite values (stat_bin2d).
## Warning: Removed 22 rows containing non-finite values (stat_smooth).

Part 3

  1. We visualize the median age of the actors and actresses in movies released after 1920
  2. The average age of male actors is a bit higher than that of female actors, and the gap and its magnitude change over time