Required packages

library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(outliers)

We are preprocessing two datasets containing movie names and movie actors names from International Movie Database(IMDB). Movie_Actors.csv contains a list of all actors . A movie with imdbID can have multiple Actors as well as no actors. Movie_Movies.csv contains the list of all the movies with their respective IMDB ID. MOvies_Movies Dataset consists of 178687 observations of 7 variables and the Movie_Actors Dataset consists of 143869 observations and 3 variables.

Now initially we joined both the datasets based on the primary key which is Imdbid.Now upon joining both the datasets we applied the TIDY principles and cleaned the data and omit the null values then to further scan and understand the ratings in the given data set we plotted the box plots and then eliminated the outliers and then upon studying the boxplots an histograms of the data we came to a conclusion that it is a right-skewed curve now we used several transformation functions to finally normalise te right skewed curve thereby using most of the data preprocessing techniques.

Data

#Importing the movie names Dataset:
movie_names <- read_csv("D:/Movie_Names.csv")
## Parsed with column specification:
## cols(
##   Awards = col_character(),
##   Country = col_character(),
##   DVD = col_character(),
##   Director = col_character(),
##   Language = col_character(),
##   Plot = col_character(),
##   Poster = col_character(),
##   Production = col_character(),
##   Rated = col_character(),
##   Released = col_character(),
##   Runtime = col_character(),
##   Title = col_character(),
##   Type = col_character(),
##   Website = col_character(),
##   Year = col_double(),
##   imdbID = col_character(),
##   imdbRating = col_double(),
##   imdbVotes = col_number()
## )
## Warning: 4 parsing failures.
##    row  col               expected  actual                 file
## 174977 Year no trailing characters –2005 'D:/Movie_Names.csv'
## 175430 Year no trailing characters –2011 'D:/Movie_Names.csv'
## 176138 Year no trailing characters –2013 'D:/Movie_Names.csv'
## 176359 Year no trailing characters –2016 'D:/Movie_Names.csv'
#Preview of the Movie names Dataset:
head(movie_names)
tail(movie_names)
#Importing the Movie Actors Dataset:
movie_actors<-read_csv("D:/Movie_Actors.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   Actors = col_character(),
##   imdbID = col_character(),
##   Gender = col_character()
## )
#Preview of the Movie Actors Dataset:
head(movie_actors)
#Inner Join of Movies Dataset with Movie_Actors Dataset with IMDB_ID as the key:
movie_names_actors <- inner_join(movie_names, movie_actors, by = "imdbID")

#Preview of the dataframe after joining:
head(movie_names_actors)
#Checking the Data Types of different variables of the new joined Dataset:
str(movie_names_actors)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 143869 obs. of  21 variables:
##  $ Awards    : chr  NA NA NA NA ...
##  $ Country   : chr  "USA" "USA" "USA" "Sri Lanka" ...
##  $ DVD       : chr  NA NA NA NA ...
##  $ Director  : chr  "Rose Cummings" "Dimitri Buchowetzki" "Julia Hechler" "Udara Siriruwan" ...
##  $ Language  : chr  "English" NA "English" "Sinhalese" ...
##  $ Plot      : chr  "Rachel constantly hears her baby cry from the bedroom, but David, her husband, won't let her go to the baby. Th"| __truncated__ NA "A Gift introduces Samuel Green, Washington State's first poet laureate, whose poetry inspired The Only Time We "| __truncated__ NA ...
##  $ Poster    : chr  NA NA NA NA ...
##  $ Production: chr  NA NA NA NA ...
##  $ Rated     : chr  NA NA NA NA ...
##  $ Released  : chr  "26-Apr-12" "27-Mar-26" "27-May-13" "20-Mar-14" ...
##  $ Runtime   : chr  "20 min" "50 min" "2 min" "23 min" ...
##  $ Title     : chr  "Baby's Breath" "The Crown of Lies" "A Gift" "Journey" ...
##  $ Type      : chr  "movie" "movie" "movie" "movie" ...
##  $ Website   : chr  NA NA NA NA ...
##  $ Year      : num  2012 1926 2013 2014 1915 ...
##  $ imdbID    : chr  "tt2268369" "tt0016750" "tt3405286" "tt3816698" ...
##  $ imdbRating: num  NA NA NA NA NA NA 6.9 NA NA NA ...
##  $ imdbVotes : num  NA NA NA NA NA NA 35 NA NA NA ...
##  $ X1        : num  0 2 3 4 7 8 9 10 11 12 ...
##  $ Actors    : chr  "Cass Barbera" "Charles A. Post" "Samuel Green" "Surin Mendis" ...
##  $ Gender    : chr  "male" "male" "male" "male" ...

Understand

#Changing Certain Variables to Factors:

movie_names_actors$Gender <- as.factor(movie_names_actors$Gender)

movie_names_actors$Rated <- as.factor(movie_names_actors$Rated)

#Changing variables to Date Format:

movie_names_actors$Released <- as.Date(movie_names_actors$Released, format = "%d-%b-%y")

movie_names_actors$DVD <- as.Date(movie_names_actors$DVD, format = "%d-%b-%y")

#To view the Data Frame Classes:
str(movie_names_actors)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 143869 obs. of  21 variables:
##  $ Awards    : chr  NA NA NA NA ...
##  $ Country   : chr  "USA" "USA" "USA" "Sri Lanka" ...
##  $ DVD       : Date, format: NA NA ...
##  $ Director  : chr  "Rose Cummings" "Dimitri Buchowetzki" "Julia Hechler" "Udara Siriruwan" ...
##  $ Language  : chr  "English" NA "English" "Sinhalese" ...
##  $ Plot      : chr  "Rachel constantly hears her baby cry from the bedroom, but David, her husband, won't let her go to the baby. Th"| __truncated__ NA "A Gift introduces Samuel Green, Washington State's first poet laureate, whose poetry inspired The Only Time We "| __truncated__ NA ...
##  $ Poster    : chr  NA NA NA NA ...
##  $ Production: chr  NA NA NA NA ...
##  $ Rated     : Factor w/ 80 levels "-12","(Banned)",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Released  : Date, format: "2012-04-26" "2026-03-27" ...
##  $ Runtime   : chr  "20 min" "50 min" "2 min" "23 min" ...
##  $ Title     : chr  "Baby's Breath" "The Crown of Lies" "A Gift" "Journey" ...
##  $ Type      : chr  "movie" "movie" "movie" "movie" ...
##  $ Website   : chr  NA NA NA NA ...
##  $ Year      : num  2012 1926 2013 2014 1915 ...
##  $ imdbID    : chr  "tt2268369" "tt0016750" "tt3405286" "tt3816698" ...
##  $ imdbRating: num  NA NA NA NA NA NA 6.9 NA NA NA ...
##  $ imdbVotes : num  NA NA NA NA NA NA 35 NA NA NA ...
##  $ X1        : num  0 2 3 4 7 8 9 10 11 12 ...
##  $ Actors    : chr  "Cass Barbera" "Charles A. Post" "Samuel Green" "Surin Mendis" ...
##  $ Gender    : Factor w/ 1 level "male": 1 1 1 1 1 1 1 1 1 1 ...

Tidy & Manipulate Data I

#A new Data Frame with required attributes: 
movie_names_actors <- movie_names_actors[,c("imdbID","Title","Actors","Production","Released","Rated","Runtime","DVD","imdbRating","Gender")]

str(movie_names_actors)
## Classes 'tbl_df', 'tbl' and 'data.frame':    143869 obs. of  10 variables:
##  $ imdbID    : chr  "tt2268369" "tt0016750" "tt3405286" "tt3816698" ...
##  $ Title     : chr  "Baby's Breath" "The Crown of Lies" "A Gift" "Journey" ...
##  $ Actors    : chr  "Cass Barbera" "Charles A. Post" "Samuel Green" "Surin Mendis" ...
##  $ Production: chr  NA NA NA NA ...
##  $ Released  : Date, format: "2012-04-26" "2026-03-27" ...
##  $ Rated     : Factor w/ 80 levels "-12","(Banned)",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Runtime   : chr  "20 min" "50 min" "2 min" "23 min" ...
##  $ DVD       : Date, format: NA NA ...
##  $ imdbRating: num  NA NA NA NA NA NA 6.9 NA NA NA ...
##  $ Gender    : Factor w/ 1 level "male": 1 1 1 1 1 1 1 1 1 1 ...
#Specifying labels for Gender  for the dataset:
levels(movie_names_actors$Gender)<-c("female","male")

Tidy & Manipulate Data II

movie_names_actors_updated<-movie_names_actors%>%mutate(Diff_DVD_Released=movie_names_actors$DVD-movie_names_actors$Released)

#Preview of the updated dataset:
head(movie_names_actors_updated$Diff_DVD_Released)
## Time differences in days
## [1] NA NA NA NA NA NA
tail(movie_names_actors_updated$Diff_DVD_Released)
## Time differences in days
## [1] 760 123 258 160 129 185

Scan I

#Checking column wise if there are any null values:

colSums(is.na(movie_names_actors_updated))
##            imdbID             Title            Actors        Production 
##                 0                 0                 0            127853 
##          Released             Rated           Runtime               DVD 
##             45173            120559             42243            128344 
##        imdbRating            Gender Diff_DVD_Released 
##             86252                 0            129585
#Removing all the NA's using NA omit:
movie_with_out_na<-na.omit(movie_names_actors_updated)

#Re-checking all the columns for any NA's :
colSums(is.na(movie_with_out_na))
##            imdbID             Title            Actors        Production 
##                 0                 0                 0                 0 
##          Released             Rated           Runtime               DVD 
##                 0                 0                 0                 0 
##        imdbRating            Gender Diff_DVD_Released 
##                 0                 0                 0

Scan II

#Crating a Boxplot for IMDB Rating  after omiting NA VAlues:
movie_with_out_na$imdbRating%>%  boxplot(main="IMDB Rating", 
                               ylab="Rating", col = "grey")

z_imdb <- movie_with_out_na$imdbRating %>%  scores(type = "z")

#Summary Statistics
z_imdb %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -4.3721 -0.5339  0.1485  0.0000  0.7456  2.7073
movie_with_out_na_outlier <- movie_with_out_na[- which(abs(z_imdb) >3 ),]

movie_with_out_na_outlier$imdbRating%>%  boxplot(main="IMDB Rating", 
                                         ylab="Rating", col = "grey")

Transform

#Creating a histogram based on IMDB Rating:
hist(movie_with_out_na_outlier$imdbRating,
     main = "Histogram of IMDB Rating", xlab = "IMDB Rating")


movie_with_out_na_outlier_transpose <- 
  movie_with_out_na_outlier %>% 
  mutate(imdbRating = (imdbRating)^-2)

#Normalised Histogram:
hist(movie_with_out_na_outlier$imdbRating,
     main = "Histogram of IMDB Rating", xlab = "IMDB Rating")