Data Review

Import Your Data

In the following code chunk, import your data.

#### Use read_csv() or another function

#### Make sure your data is converted into a tibble. 

#### This analysis uses the IMDb dataset files (title.ratings, title.basics, name.basics, title.principals).


# Load the ratings table. The literal text \N is read as a missing value and
# quote = "" means quote characters get no special treatment.
df_ratings <- read_tsv(
  "title.ratings.tsv",
  na = "\\N",
  quote = ""
)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   tconst = col_character(),
##   averageRating = col_double(),
##   numVotes = col_double()
## )

# Load the title metadata. The literal text \N is read as a missing value.
# endYear is declared as a double explicitly: when its type is guessed it comes
# out as logical, so every real year fails to parse and is replaced by NA.
# Columns not named in cols() are still guessed.
df_basics <- read_tsv(
  "title.basics.tsv",
  na = "\\N",
  quote = "",
  col_types = cols(endYear = col_double())
)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   tconst = col_character(),
##   titleType = col_character(),
##   primaryTitle = col_character(),
##   originalTitle = col_character(),
##   isAdult = col_double(),
##   startYear = col_double(),
##   endYear = col_logical(),
##   runtimeMinutes = col_double(),
##   genres = col_character()
## )

## Warning: 75427 parsing failures.
##   row     col           expected actual               file
## 34983 endYear 1/0/T/F/TRUE/FALSE   1947 'title.basics.tsv'
## 35184 endYear 1/0/T/F/TRUE/FALSE   1945 'title.basics.tsv'
## 37613 endYear 1/0/T/F/TRUE/FALSE   1955 'title.basics.tsv'
## 38447 endYear 1/0/T/F/TRUE/FALSE   1949 'title.basics.tsv'
## 38448 endYear 1/0/T/F/TRUE/FALSE   1949 'title.basics.tsv'
## ..... ....... .................. ...... ..................
## See problems(...) for more details.

# Attach the title metadata to every rated title. The key is named explicitly
# so the join cannot silently start matching on other same-named columns.
df_ratings <- df_ratings %>%
  left_join(df_basics, by = "tconst")

## Joining, by = "tconst"

# Read the people table and keep only rows whose primaryProfession mentions
# "actor" or "actress", retaining the identifier, name and birth year.
df_actors <- read_tsv(
  "name.basics.tsv",
  na = "\\N",
  quote = ""
) %>%
  filter(str_detect(primaryProfession, "actor|actress")) %>%
  select(nconst, primaryName, birthYear)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   nconst = col_character(),
##   primaryName = col_character(),
##   birthYear = col_double(),
##   deathYear = col_double(),
##   primaryProfession = col_character(),
##   knownForTitles = col_character()
## )

# Read the principal cast table, keep acting credits only, and for each title
# retain the credit(s) with the smallest `ordering` value.
df_principals <- read_tsv(
  "title.principals.tsv",
  na = "\\N",
  quote = ""
) %>%
  filter(str_detect(category, "actor|actress")) %>%
  select(tconst, ordering, nconst, category) %>%
  group_by(tconst) %>%
  filter(ordering == min(ordering)) %>%
  # Drop the per-title grouping so it does not leak into later steps.
  ungroup()

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   tconst = col_character(),
##   ordering = col_double(),
##   nconst = col_character(),
##   category = col_character(),
##   job = col_character(),
##   characters = col_character()
## )

# Add each credited person's name and birth year. The key is named explicitly
# rather than inferred from shared column names.
df_principals <- df_principals %>%
  left_join(df_actors, by = "nconst")

## Joining, by = "nconst"

# Attach the selected acting credit to each rated title. The key is named
# explicitly rather than inferred from shared column names.
df_ratings <- df_ratings %>%
  left_join(df_principals, by = "tconst")

## Joining, by = "tconst"

# Keep titles of type "movie" that have a known birth year for the joined
# performer and at least 10 votes, then compute that performer's age in the
# title's start year.
df_ratings_movies <- df_ratings %>%
  filter(
    titleType == "movie",
    !is.na(birthYear),
    numVotes >= 10
  ) %>%
  mutate(age_lead = startYear - birthYear)

# Quartiles of age_lead per start year. TRUE is spelled out because T is an
# ordinary variable that can be reassigned. .groups = "drop" states the
# ungrouped result explicitly.
df_actor_ages <- df_ratings_movies %>%
  group_by(startYear) %>%
  summarize(
    low_age = quantile(age_lead, 0.25, na.rm = TRUE),
    med_age = quantile(age_lead, 0.50, na.rm = TRUE),
    high_age = quantile(age_lead, 0.75, na.rm = TRUE),
    .groups = "drop"
  )

## `summarise()` ungrouping output (override with `.groups` argument)

# Quartiles of age_lead per start year and credit category. TRUE is spelled out
# because T is an ordinary variable that can be reassigned. .groups =
# "drop_last" keeps the result grouped by startYear, as the default would, but
# states it explicitly.
df_actor_ages_lead <- df_ratings_movies %>%
  group_by(startYear, category) %>%
  summarize(
    low_age = quantile(age_lead, 0.25, na.rm = TRUE),
    med_age = quantile(age_lead, 0.50, na.rm = TRUE),
    high_age = quantile(age_lead, 0.75, na.rm = TRUE),
    .groups = "drop_last"
  )

## `summarise()` regrouping output by 'startYear' (override with `.groups` argument)

# Number each person's rows in start-year order: sort the whole table by
# startYear, then count rows within each nconst. The result stays grouped by
# nconst.
df_ratings_movies_nth <- df_ratings_movies %>%
  arrange(startYear) %>%
  group_by(nconst) %>%
  mutate(nth_lead = row_number())

Dataset_Visualization

Part 1

We visualize the correlation between the average rating
and the number of votes, along with their respective counts.
Visualized using a heatmap.
Dataset Employed: df_ratings

Part 2

We visualize the correlation between the average rating
and the start year (the year in which the movie was released), along with their respective counts.
Visualized using a heatmap.
Dataset Employed: df_ratings

## Warning: Removed 22 rows containing non-finite values (stat_bin2d).

## Warning: Removed 22 rows containing non-finite values (stat_smooth).

Part 3

We visualize the median age of the actors and actresses in movies released after 1920.
The average age of male actors is a bit higher than that of female actors, and the gap and its magnitude change over time.

Data Review

Ankur Bohra

11/10/2020

Import Your Data

Dataset_Visualization

Part 1

Part 2

Part 3