library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(outliers)
We are preprocessing two datasets containing movie names and movie actors names from International Movie Database(IMDB). Movie_Actors.csv contains a list of all actors . A movie with imdbID can have multiple Actors as well as no actors. Movie_Movies.csv contains the list of all the movies with their respective IMDB ID. MOvies_Movies Dataset consists of 178687 observations of 7 variables and the Movie_Actors Dataset consists of 143869 observations and 3 variables.
Now initially we joined both the datasets based on the primary key which is Imdbid.Now upon joining both the datasets we applied the TIDY principles and cleaned the data and omit the null values then to further scan and understand the ratings in the given data set we plotted the box plots and then eliminated the outliers and then upon studying the boxplots an histograms of the data we came to a conclusion that it is a right-skewed curve now we used several transformation functions to finally normalise te right skewed curve thereby using most of the data preprocessing techniques.
We are dealing with two Datasets and we took the data sets from the following website, .The exact reference is stated below in the reference section.
#Importing the movie names Dataset:
movie_names <- read_csv("D:/Movie_Names.csv")
## Parsed with column specification:
## cols(
## Awards = col_character(),
## Country = col_character(),
## DVD = col_character(),
## Director = col_character(),
## Language = col_character(),
## Plot = col_character(),
## Poster = col_character(),
## Production = col_character(),
## Rated = col_character(),
## Released = col_character(),
## Runtime = col_character(),
## Title = col_character(),
## Type = col_character(),
## Website = col_character(),
## Year = col_double(),
## imdbID = col_character(),
## imdbRating = col_double(),
## imdbVotes = col_number()
## )
## Warning: 4 parsing failures.
## row col expected actual file
## 174977 Year no trailing characters –2005 'D:/Movie_Names.csv'
## 175430 Year no trailing characters –2011 'D:/Movie_Names.csv'
## 176138 Year no trailing characters –2013 'D:/Movie_Names.csv'
## 176359 Year no trailing characters –2016 'D:/Movie_Names.csv'
#Preview of the Movie names Dataset:
head(movie_names)
tail(movie_names)
#Importing the Movie Actors Dataset:
movie_actors<-read_csv("D:/Movie_Actors.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## Actors = col_character(),
## imdbID = col_character(),
## Gender = col_character()
## )
#Preview of the Movie Actors Dataset:
head(movie_actors)
#Inner Join of Movies Dataset with Movie_Actors Dataset with IMDB_ID as the key:
movie_names_actors <- inner_join(movie_names, movie_actors, by = "imdbID")
#Preview of the dataframe after joining:
head(movie_names_actors)
#Checking the Data Types of different variables of the new joined Dataset:
str(movie_names_actors)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 143869 obs. of 21 variables:
## $ Awards : chr NA NA NA NA ...
## $ Country : chr "USA" "USA" "USA" "Sri Lanka" ...
## $ DVD : chr NA NA NA NA ...
## $ Director : chr "Rose Cummings" "Dimitri Buchowetzki" "Julia Hechler" "Udara Siriruwan" ...
## $ Language : chr "English" NA "English" "Sinhalese" ...
## $ Plot : chr "Rachel constantly hears her baby cry from the bedroom, but David, her husband, won't let her go to the baby. Th"| __truncated__ NA "A Gift introduces Samuel Green, Washington State's first poet laureate, whose poetry inspired The Only Time We "| __truncated__ NA ...
## $ Poster : chr NA NA NA NA ...
## $ Production: chr NA NA NA NA ...
## $ Rated : chr NA NA NA NA ...
## $ Released : chr "26-Apr-12" "27-Mar-26" "27-May-13" "20-Mar-14" ...
## $ Runtime : chr "20 min" "50 min" "2 min" "23 min" ...
## $ Title : chr "Baby's Breath" "The Crown of Lies" "A Gift" "Journey" ...
## $ Type : chr "movie" "movie" "movie" "movie" ...
## $ Website : chr NA NA NA NA ...
## $ Year : num 2012 1926 2013 2014 1915 ...
## $ imdbID : chr "tt2268369" "tt0016750" "tt3405286" "tt3816698" ...
## $ imdbRating: num NA NA NA NA NA NA 6.9 NA NA NA ...
## $ imdbVotes : num NA NA NA NA NA NA 35 NA NA NA ...
## $ X1 : num 0 2 3 4 7 8 9 10 11 12 ...
## $ Actors : chr "Cass Barbera" "Charles A. Post" "Samuel Green" "Surin Mendis" ...
## $ Gender : chr "male" "male" "male" "male" ...
Now here we are converting the Gender and Rated Columns from the new inner joined Dataset to Factors as they are previously of the type Character.
Now we know that the columns Released and Date refer to the Released Dates of movie and DVD respectively . Thereby converting both the columns from character to Date Datatype resectively.
Now this Dataset has Satisfied the minimum requirements from 2-4 as the new data frame contains Factors, Variables of Data Type Date ,Characters and Numbers as well.
#Changing Certain Variables to Factors:
movie_names_actors$Gender <- as.factor(movie_names_actors$Gender)
movie_names_actors$Rated <- as.factor(movie_names_actors$Rated)
#Changing variables to Date Format:
movie_names_actors$Released <- as.Date(movie_names_actors$Released, format = "%d-%b-%y")
movie_names_actors$DVD <- as.Date(movie_names_actors$DVD, format = "%d-%b-%y")
#To view the Data Frame Classes:
str(movie_names_actors)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 143869 obs. of 21 variables:
## $ Awards : chr NA NA NA NA ...
## $ Country : chr "USA" "USA" "USA" "Sri Lanka" ...
## $ DVD : Date, format: NA NA ...
## $ Director : chr "Rose Cummings" "Dimitri Buchowetzki" "Julia Hechler" "Udara Siriruwan" ...
## $ Language : chr "English" NA "English" "Sinhalese" ...
## $ Plot : chr "Rachel constantly hears her baby cry from the bedroom, but David, her husband, won't let her go to the baby. Th"| __truncated__ NA "A Gift introduces Samuel Green, Washington State's first poet laureate, whose poetry inspired The Only Time We "| __truncated__ NA ...
## $ Poster : chr NA NA NA NA ...
## $ Production: chr NA NA NA NA ...
## $ Rated : Factor w/ 80 levels "-12","(Banned)",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Released : Date, format: "2012-04-26" "2026-03-27" ...
## $ Runtime : chr "20 min" "50 min" "2 min" "23 min" ...
## $ Title : chr "Baby's Breath" "The Crown of Lies" "A Gift" "Journey" ...
## $ Type : chr "movie" "movie" "movie" "movie" ...
## $ Website : chr NA NA NA NA ...
## $ Year : num 2012 1926 2013 2014 1915 ...
## $ imdbID : chr "tt2268369" "tt0016750" "tt3405286" "tt3816698" ...
## $ imdbRating: num NA NA NA NA NA NA 6.9 NA NA NA ...
## $ imdbVotes : num NA NA NA NA NA NA 35 NA NA NA ...
## $ X1 : num 0 2 3 4 7 8 9 10 11 12 ...
## $ Actors : chr "Cass Barbera" "Charles A. Post" "Samuel Green" "Surin Mendis" ...
## $ Gender : Factor w/ 1 level "male": 1 1 1 1 1 1 1 1 1 1 ...
Here in the new Data frame we have many attributes but as dicussed above we only need a combination of 10 attributes from both the Dataset , Thereby selecting and making a subset of only the required attributes into movie_names_actors dataframe.
Also we are Specifying the levels of the Attribute Gender.
Now in the new Data frame all the observations form rows and all the variables form the attributes so there is no need to switch between rows and columns.
#A new Data Frame with required attributes:
movie_names_actors <- movie_names_actors[,c("imdbID","Title","Actors","Production","Released","Rated","Runtime","DVD","imdbRating","Gender")]
str(movie_names_actors)
## Classes 'tbl_df', 'tbl' and 'data.frame': 143869 obs. of 10 variables:
## $ imdbID : chr "tt2268369" "tt0016750" "tt3405286" "tt3816698" ...
## $ Title : chr "Baby's Breath" "The Crown of Lies" "A Gift" "Journey" ...
## $ Actors : chr "Cass Barbera" "Charles A. Post" "Samuel Green" "Surin Mendis" ...
## $ Production: chr NA NA NA NA ...
## $ Released : Date, format: "2012-04-26" "2026-03-27" ...
## $ Rated : Factor w/ 80 levels "-12","(Banned)",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Runtime : chr "20 min" "50 min" "2 min" "23 min" ...
## $ DVD : Date, format: NA NA ...
## $ imdbRating: num NA NA NA NA NA NA 6.9 NA NA NA ...
## $ Gender : Factor w/ 1 level "male": 1 1 1 1 1 1 1 1 1 1 ...
#Specifying labels for Gender for the dataset:
levels(movie_names_actors$Gender)<-c("female","male")
Now As said above the new data frame has the required 10 variables and is Tidy.
Now as per requirement 6 using the mutate() function we are calculating the days each movie took to release the DVD from the exact movie release date and the code is as follows.
movie_names_actors_updated<-movie_names_actors%>%mutate(Diff_DVD_Released=movie_names_actors$DVD-movie_names_actors$Released)
#Preview of the updated dataset:
head(movie_names_actors_updated$Diff_DVD_Released)
## Time differences in days
## [1] NA NA NA NA NA NA
tail(movie_names_actors_updated$Diff_DVD_Released)
## Time differences in days
## [1] 760 123 258 160 129 185
Now as per requirement 7 we are checking the NA values present in each column using the below given code.
As the Movies Dataset is a generic dataset and we are dealing with vast number of movies throughout different languages and cultures andsince the total number of NA’s is less than 5% we will omit the NA values as shown in the below code.
#Checking column wise if there are any null values:
colSums(is.na(movie_names_actors_updated))
## imdbID Title Actors Production
## 0 0 0 127853
## Released Rated Runtime DVD
## 45173 120559 42243 128344
## imdbRating Gender Diff_DVD_Released
## 86252 0 129585
#Removing all the NA's using NA omit:
movie_with_out_na<-na.omit(movie_names_actors_updated)
#Re-checking all the columns for any NA's :
colSums(is.na(movie_with_out_na))
## imdbID Title Actors Production
## 0 0 0 0
## Released Rated Runtime DVD
## 0 0 0 0
## imdbRating Gender Diff_DVD_Released
## 0 0 0
Now we will plot a boxplot of Movie rating and the boxplot is as follows. we are trying to find the how the movie ratings are distributed the curve seems to be a right skewed curve as the mean tendsto be on the right also we can clearly see the minimum and maximum ratings and the range through which the movies are rated.
Now upon doing this we are able to see that there are certain outliers and using the below code we will be removing the outliers.
#Crating a Boxplot for IMDB Rating after omiting NA VAlues:
movie_with_out_na$imdbRating%>% boxplot(main="IMDB Rating",
ylab="Rating", col = "grey")
z_imdb <- movie_with_out_na$imdbRating %>% scores(type = "z")
#Summary Statistics
z_imdb %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.3721 -0.5339 0.1485 0.0000 0.7456 2.7073
movie_with_out_na_outlier <- movie_with_out_na[- which(abs(z_imdb) >3 ),]
movie_with_out_na_outlier$imdbRating%>% boxplot(main="IMDB Rating",
ylab="Rating", col = "grey")
#Creating a histogram based on IMDB Rating:
hist(movie_with_out_na_outlier$imdbRating,
main = "Histogram of IMDB Rating", xlab = "IMDB Rating")
movie_with_out_na_outlier_transpose <-
movie_with_out_na_outlier %>%
mutate(imdbRating = (imdbRating)^-2)
#Normalised Histogram:
hist(movie_with_out_na_outlier$imdbRating,
main = "Histogram of IMDB Rating", xlab = "IMDB Rating")