Movie Rating Analysis

This is an R Markdown document.

library(data.table); library(knitr)

A brief analysis has been performed on the data procured and documented as an R Markdown document.

Loading and processing Dataset

 # Parse a "::"-delimited .dat file into a data frame of character columns
 # named by `col_names` (one row per line of the file).
 load_dat <- function(path, col_names) {
   fields <- strsplit(readLines(path), "::")
   parsed <- as.data.frame(do.call("rbind", fields), stringsAsFactors = FALSE)
   names(parsed) <- col_names
   parsed
 }

 # Movies table
 movie <- load_dat("movies.dat", c("MovieID", "Title", "Genres"))

 # Ratings table
 ratings <- load_dat("ratings.dat", c("UserID", "MovieID", "Rating", "Timestamp"))

 # Users table
 users <- load_dat("users.dat", c("UserID", "Gender", "Age", "Occupation", "Zip-code"))

 # Preview the first rows of the movies table
 head(movie)

##   MovieID                              Title                       Genres
## 1       1                   Toy Story (1995)  Animation|Children's|Comedy
## 2       2                     Jumanji (1995) Adventure|Children's|Fantasy
## 3       3            Grumpier Old Men (1995)               Comedy|Romance
## 4       4           Waiting to Exhale (1995)                 Comedy|Drama
## 5       5 Father of the Bride Part II (1995)                       Comedy
## 6       6                        Heat (1995)        Action|Crime|Thriller

 # Preview the first rows of the users table
 head(users)

##   UserID Gender Age Occupation Zip-code
## 1      1      F   1         10    48067
## 2      2      M  56         16    70072
## 3      3      M  25         15    55117
## 4      4      M  45          7    02460
## 5      5      M  25         20    55455
## 6      6      F  50          9    55117

 # Preview the first rows of the ratings table
 head(ratings)

##   UserID MovieID Rating Timestamp
## 1      1    1193      5 978300760
## 2      1     661      3 978302109
## 3      1     914      3 978301968
## 4      1    3408      4 978300275
## 5      1    2355      5 978824291
## 6      1    1197      3 978302268

Top ten most viewed movies with their names (in ascending or descending order)

This can be interpreted as the 10 movies which received the most votes by the users.

# Count how many ratings each movie received: table() tallies rows per MovieID
 top <- setNames(
   as.data.frame(table(ratings$MovieID)),
   c("MovieID", "No.-of-Users-Voted")
 )
 head(top)

##   MovieID No.-of-Users-Voted
## 1       1               2077
## 2      10                888
## 3     100                128
## 4    1000                 20
## 5    1002                  8
## 6    1003                121

 # Attach the movie table's columns (Title, Genres) to each vote count
 top <- merge(top, movie, by = "MovieID")

 # Rank movies from most-voted to least-voted
 vote_rank <- order(top[["No.-of-Users-Voted"]], decreasing = TRUE)
 top <- top[vote_rank, ]

 # Keep the ten most-voted movies
 top10 <- head(top, n = 10)

 # Show the first three columns (MovieID, vote count, Title) of those ten
 print(top10[1:3])

##      MovieID No.-of-Users-Voted
## 1917    2858               3428
## 1641     260               2991
## 192     1196               2990
## 209     1210               2883
## 3183     480               2672
## 1026    2028               2653
## 3302     589               2649
## 1611    2571               2590
## 273     1270               2583
## 3307     593               2578
##                                                      Title
## 1917                                American Beauty (1999)
## 1641             Star Wars: Episode IV - A New Hope (1977)
## 192  Star Wars: Episode V - The Empire Strikes Back (1980)
## 209      Star Wars: Episode VI - Return of the Jedi (1983)
## 3183                                  Jurassic Park (1993)
## 1026                            Saving Private Ryan (1998)
## 3302                     Terminator 2: Judgment Day (1991)
## 1611                                    Matrix, The (1999)
## 273                              Back to the Future (1985)
## 3307                      Silence of the Lambs, The (1991)

 # Ten most-rated movies sorted by Title in ascending (alphabetical) order;
 # columns 2 and 3 are swapped so Title comes right after MovieID
 List <- top10[order(top10$Title), c(1, 3, 2, 4)]

The 10 most viewed movies, ordered by title, are:

##                                                      Title
## 1917                                American Beauty (1999)
## 273                              Back to the Future (1985)
## 3183                                  Jurassic Park (1993)
## 1611                                    Matrix, The (1999)
## 1026                            Saving Private Ryan (1998)
## 3307                      Silence of the Lambs, The (1991)
## 1641             Star Wars: Episode IV - A New Hope (1977)
## 192  Star Wars: Episode V - The Empire Strikes Back (1980)
## 209      Star Wars: Episode VI - Return of the Jedi (1983)
## 3302                     Terminator 2: Judgment Day (1991)
##      No.-of-Users-Voted
## 1917               3428
## 273                2583
## 3183               2672
## 1611               2590
## 1026               2653
## 3307               2578
## 1641               2991
## 192                2990
## 209                2883
## 3302               2649

Top twenty rated movies (Condition : The movie should be rated/viewed by at least 40 users)

Since we already have the “No. of Users Voted” column, we can eliminate the movies that were rated by fewer than 40 users.

 # Keep only the movies that received at least 40 ratings
 enough_votes <- which(top[["No.-of-Users-Voted"]] >= 40)
 top2 <- top[enough_votes, ]

 # Pull in the individual ratings for those movies
 top2 <- merge(top2, ratings, by = "MovieID")

 # Rating is stored as text at this point; make it numeric before averaging
 top3 <- transform(top2, Rating = as.numeric(Rating))

 # Mean rating per movie
 kk <- aggregate(Rating ~ MovieID, data = top3, FUN = mean)

 head(kk)

##   MovieID   Rating
## 1       1 4.146846
## 2      10 3.540541
## 3     100 3.062500
## 4    1003 2.942149
## 5    1004 2.663366
## 6    1005 2.373239

 # kk holds only MovieID and the mean Rating, so join it back onto the first
 # three columns of top to recover the vote count and Title
 kk1 <- merge(top[, 1:3], kk, by = "MovieID")

 # Highest mean rating first
 rating_rank <- order(kk1$Rating, decreasing = TRUE)
 kk1 <- kk1[rating_rank, ]

 # The 20 best-rated movies, in rating order
 Top20 <- head(kk1, n = 20)

 # The same 20 movies, re-sorted alphabetically by Title
 List20 <- Top20[order(Top20$Title), ]

The top 20 rated movies, ordered by title, are:

##      MovieID No.-of-Users-Voted
## 2605     912               1669
## 2510     745                657
## 1869    3435                551
## 2513     750               1367
## 2568     858               2223
## 120     1178                230
## 138     1198               2514
## 2597     904               1050
## 1470    2905                 69
## 2377     527               2304
## 750     2019                628
## 1690     318               2227
## 1234     260               2991
## 2616     922                470
## 153     1212                480
## 148     1207                928
## 2351      50               1783
## 2496     720                438
## 2468     670                 56
## 107     1148                882
##                                                                            Title
## 2605                                                           Casablanca (1942)
## 2510                                                       Close Shave, A (1995)
## 1869                                                     Double Indemnity (1944)
## 2513 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
## 2568                                                       Godfather, The (1972)
## 120                                                        Paths of Glory (1957)
## 138                                               Raiders of the Lost Ark (1981)
## 2597                                                          Rear Window (1954)
## 1470                                                              Sanjuro (1962)
## 2377                                                     Schindler's List (1993)
## 750          Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
## 1690                                            Shawshank Redemption, The (1994)
## 1234                                   Star Wars: Episode IV - A New Hope (1977)
## 2616                               Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
## 153                                                        Third Man, The (1949)
## 148                                                 To Kill a Mockingbird (1962)
## 2351                                                  Usual Suspects, The (1995)
## 2496                      Wallace & Gromit: The Best of Aardman Animation (1996)
## 2468                                      World of Apu, The (Apur Sansar) (1959)
## 107                                                   Wrong Trousers, The (1993)
##        Rating
## 2605 4.412822
## 2510 4.520548
## 1869 4.415608
## 2513 4.449890
## 2568 4.524966
## 120  4.473913
## 138  4.477725
## 2597 4.476190
## 1470 4.608696
## 2377 4.510417
## 750  4.560510
## 1690 4.554558
## 1234 4.453694
## 2616 4.491489
## 153  4.452083
## 148  4.425647
## 2351 4.517106
## 2496 4.426941
## 2468 4.410714
## 107  4.507937

Top twenty rated movies (calculated in the previous step) with the number of views in each of the following age groups

Age group :

Young (<20 years),
Young Adult (20-40 years),
Adult (> 40 years)

Let's break the votes down by the age group of the users who cast them.

  # Assign each user an age-group code that matches the stated buckets:
  #   1 = Young (< 20), 2 = Young Adult (20-40), 3 = Adult (> 40).
  # Explicit comparisons are used because findInterval(Age, c(0, 20, 40, 100))
  # places a user aged exactly 40 in the Adult bucket.
  # NOTE(review): if Age holds dataset bucket codes rather than exact ages,
  # confirm these cut-offs line up with the intended buckets.
  age_years <- as.numeric(users$Age)
  users$age.grp <- ifelse(age_years < 20, 1L, ifelse(age_years <= 40, 2L, 3L))

  # Pair every rating with the age group of the user who gave it; columns are
  # picked by name so the join does not depend on column positions
  gg <- merge(
    users[c("UserID", "age.grp")],
    ratings[c("UserID", "MovieID", "Rating")],
    by = "UserID"
  )

  # Human-readable label for each age-group code
  gg$Age.Group <- ifelse(gg$age.grp == 1, "Young",
                         ifelse(gg$age.grp == 2, "Young Adult", "Adult"))

  # Restrict to the movies in kk1 and attach each rating's age group
  nk <- merge(kk1, gg, by = "MovieID")

  # Cross-tabulate: one row per MovieID, one column per age group. Fixed
  # factor levels guarantee all three columns exist even if a group is empty.
  age_levels <- c("Adult", "Young", "Young Adult")
  dd <- as.data.frame.matrix(
    table(nk$MovieID, factor(nk$Age.Group, levels = age_levels))
  )

  # Total views across the three age groups
  dd$Total.Views <- dd$Adult + dd$Young + dd$`Young Adult`

  # Convert to a data.table, moving the row names (MovieIDs) into column "rn"
  setDT(dd, keep.rownames = TRUE)[]

##         rn Adult Young Young Adult Total.Views
##    1:    1   304   560        1213        2077
##    2:   10   122   260         506         888
##    3:  100    18    17          93         128
##    4: 1000     0     0           0           0
##    5: 1002     0     0           0           0
##   ---                                         
## 3702:  994   115    49         286         450
## 3703:  996    44    64         148         256
## 3704:  997     0     0           0           0
## 3705:  998     6    34          53          93
## 3706:  999    45    65         176         286

  # The MovieIDs sit in a column called "rn"; give that column its real name
  names(dd)[which(names(dd) == "rn")] <- "MovieID"

  # Add the per-age-group view counts to the top-20 list (merge() orders the
  # result by MovieID)
  NewList20 <- merge(List20, dd, by = "MovieID")

The top 20 rated movies, with their views broken down by age group (listed in MovieID order), are:

##    MovieID No.-of-Users-Voted
## 1     1148                882
## 2     1178                230
## 3     1198               2514
## 4     1207                928
## 5     1212                480
## 6     2019                628
## 7      260               2991
## 8     2905                 69
## 9      318               2227
## 10    3435                551
## 11      50               1783
## 12     527               2304
## 13     670                 56
## 14     720                438
## 15     745                657
## 16     750               1367
## 17     858               2223
## 18     904               1050
## 19     912               1669
## 20     922                470
##                                                                          Title
## 1                                                   Wrong Trousers, The (1993)
## 2                                                        Paths of Glory (1957)
## 3                                               Raiders of the Lost Ark (1981)
## 4                                                 To Kill a Mockingbird (1962)
## 5                                                        Third Man, The (1949)
## 6          Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
## 7                                    Star Wars: Episode IV - A New Hope (1977)
## 8                                                               Sanjuro (1962)
## 9                                             Shawshank Redemption, The (1994)
## 10                                                     Double Indemnity (1944)
## 11                                                  Usual Suspects, The (1995)
## 12                                                     Schindler's List (1993)
## 13                                      World of Apu, The (Apur Sansar) (1959)
## 14                      Wallace & Gromit: The Best of Aardman Animation (1996)
## 15                                                       Close Shave, A (1995)
## 16 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
## 17                                                       Godfather, The (1972)
## 18                                                          Rear Window (1954)
## 19                                                           Casablanca (1942)
## 20                               Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
##      Rating Adult Young Young Adult Total.Views
## 1  4.507937   119   221         542         882
## 2  4.473913    82    32         116         230
## 3  4.477725   475   541        1498        2514
## 4  4.425647   266   180         482         928
## 5  4.452083   160    60         260         480
## 6  4.560510   179    95         354         628
## 7  4.453694   574   663        1754        2991
## 8  4.608696    17    10          42          69
## 9  4.554558   439   510        1278        2227
## 10 4.415608   169    68         314         551
## 11 4.517106   260   434        1089        1783
## 12 4.510417   518   498        1288        2304
## 13 4.410714    20     5          31          56
## 14 4.426941    66   107         265         438
## 15 4.520548    71   159         427         657
## 16 4.449890   368   230         769        1367
## 17 4.524966   524   417        1282        2223
## 18 4.476190   275   140         635        1050
## 19 4.412822   461   261         947        1669
## 20 4.491489   141    57         272         470

Top ten critics (users who have given very low ratings; condition: the users should have rated at least 40 movies)

  # UserID and Rating are stored as text; convert both to numeric
  ratings <- transform(ratings, UserID = as.numeric(UserID))
  ratings <- transform(ratings, Rating = as.numeric(Rating))

  # Number of movies each user has rated
  jj <- as.data.frame(table(ratings$UserID))
  names(jj) <- c("UserID", "No.-of-Movies-Voted")

  # Keep only users who rated at least 40 movies, then pull in their ratings
  jj2 <- subset(jj, `No.-of-Movies-Voted` >= 40)
  jj2 <- merge(jj2, ratings, by = "UserID")

  # Mean rating per eligible user. This must aggregate over jj2 (the filtered
  # ratings); aggregating over `ratings` would ignore the 40-movie condition.
  # NOTE: re-knit the document so the printed table reflects the filter.
  New <- aggregate(Rating ~ UserID, data = jj2, FUN = mean)

  # jj's UserID is a factor produced by table(); restore numeric IDs
  New$UserID <- as.numeric(as.character(New$UserID))

  # Lowest mean rating first
  New <- New[order(New$Rating, decreasing = FALSE), ]

  # The ten eligible users with the lowest mean rating
  New10 <- head(New, n = 10)

Top ten critics who have given very low ratings are

##      UserID   Rating
## 3598   3598 1.015385
## 4486   4486 1.058824
## 2744   2744 1.304348
## 4539   4539 1.815126
## 5850   5850 1.844828
## 5334   5334 1.927273
## 4349   4349 1.962963
## 4636   4636 2.000000
## 5686   5686 2.045283
## 3209   3209 2.060870

The list shows the user IDs of the users who have given the lowest average ratings.