This is an R Markdown document.
library(data.table); library(knitr)
A brief analysis of the movie, rating and user data sets has been performed and is documented below as an R Markdown document.
# Read a "::"-delimited .dat file into a data frame of character columns.
#   path      - file to read, one record per line
#   col_names - column names, one per delimited field
#   sep       - field separator, matched as literal text (not as a regex)
# Returns a data frame with one row per non-blank line and columns named
# by col_names. Stops if any line does not split into exactly
# length(col_names) fields, because rbind() would otherwise recycle the
# short row and silently corrupt the table.
read_dat <- function(path, col_names, sep = "::") {
  lines <- readLines(path)
  # Blank lines carry no record; drop them explicitly
  lines <- lines[nzchar(lines)]
  fields <- strsplit(lines, sep, fixed = TRUE)

  bad <- which(lengths(fields) != length(col_names))
  if (length(bad) > 0) {
    stop(
      path, ": expected ", length(col_names), " fields per record; ",
      "first malformed record is number ", bad[1],
      call. = FALSE
    )
  }

  if (length(fields) == 0) {
    # Empty file: return a zero-row table that still has the right columns
    cells <- matrix(character(0), nrow = 0, ncol = length(col_names))
  } else {
    cells <- do.call("rbind", fields)
  }
  parsed <- as.data.frame(cells, stringsAsFactors = FALSE)
  names(parsed) <- col_names
  parsed
}

# Loading Movies.DAT file
movie <- read_dat("movies.dat", c("MovieID", "Title", "Genres"))
# Loading Ratings.DAT file
ratings <- read_dat(
  "ratings.dat",
  c("UserID", "MovieID", "Rating", "Timestamp")
)
# Loading Users.DAT file
users <- read_dat(
  "users.dat",
  c("UserID", "Gender", "Age", "Occupation", "Zip-code")
)
# Preview the first rows of each loaded data set, starting with the movies
head(movie, n = 6L)
## MovieID Title Genres
## 1 1 Toy Story (1995) Animation|Children's|Comedy
## 2 2 Jumanji (1995) Adventure|Children's|Fantasy
## 3 3 Grumpier Old Men (1995) Comedy|Romance
## 4 4 Waiting to Exhale (1995) Comedy|Drama
## 5 5 Father of the Bride Part II (1995) Comedy
## 6 6 Heat (1995) Action|Crime|Thriller
# Preview the first rows of the users table
head(users, n = 6L)
## UserID Gender Age Occupation Zip-code
## 1 1 F 1 10 48067
## 2 2 M 56 16 70072
## 3 3 M 25 15 55117
## 4 4 M 45 7 02460
## 5 5 M 25 20 55455
## 6 6 F 50 9 55117
# Preview the first rows of the ratings table
head(ratings, n = 6L)
## UserID MovieID Rating Timestamp
## 1 1 1193 5 978300760
## 2 1 661 3 978302109
## 3 1 914 3 978301968
## 4 1 3408 4 978300275
## 5 1 2355 5 978824291
## 6 1 1197 3 978302268
The ten most viewed movies are interpreted here as the ten movies that received the most votes (ratings) from users.
# Start by counting, for every movie, how many votes (ratings) it received
top <- setNames(
  as.data.frame(table(ratings$MovieID)),
  c("MovieID", "No.-of-Users-Voted")
)
head(top, n = 6L)
## MovieID No.-of-Users-Voted
## 1 1 2077
## 2 10 888
## 3 100 128
## 4 1000 20
## 5 1002 8
## 6 1003 121
# Attach the title and genres from the movie table to the vote counts
top <- merge(x = top, y = movie, by = "MovieID")
# Sort so that the movies with the most votes come first
top <- top[order(top[["No.-of-Users-Voted"]], decreasing = TRUE), ]
top10 <- head(top, n = 10L)
# The 10 most rated movies, in descending order of vote count
print(top10[c("MovieID", "No.-of-Users-Voted", "Title")])
## MovieID No.-of-Users-Voted
## 1917 2858 3428
## 1641 260 2991
## 192 1196 2990
## 209 1210 2883
## 3183 480 2672
## 1026 2028 2653
## 3302 589 2649
## 1611 2571 2590
## 273 1270 2583
## 3307 593 2578
## Title
## 1917 American Beauty (1999)
## 1641 Star Wars: Episode IV - A New Hope (1977)
## 192 Star Wars: Episode V - The Empire Strikes Back (1980)
## 209 Star Wars: Episode VI - Return of the Jedi (1983)
## 3183 Jurassic Park (1993)
## 1026 Saving Private Ryan (1998)
## 3302 Terminator 2: Judgment Day (1991)
## 1611 Matrix, The (1999)
## 273 Back to the Future (1985)
## 3307 Silence of the Lambs, The (1991)
# The same 10 most rated movies, now in ascending alphabetical order of
# title, with the Title column moved ahead of the vote count
List <- top10[
  order(top10[["Title"]]),
  c("MovieID", "Title", "No.-of-Users-Voted", "Genres")
]
The 10 most viewed movies, ordered by title, are:
## Title
## 1917 American Beauty (1999)
## 273 Back to the Future (1985)
## 3183 Jurassic Park (1993)
## 1611 Matrix, The (1999)
## 1026 Saving Private Ryan (1998)
## 3307 Silence of the Lambs, The (1991)
## 1641 Star Wars: Episode IV - A New Hope (1977)
## 192 Star Wars: Episode V - The Empire Strikes Back (1980)
## 209 Star Wars: Episode VI - Return of the Jedi (1983)
## 3302 Terminator 2: Judgment Day (1991)
## No.-of-Users-Voted
## 1917 3428
## 273 2583
## 3183 2672
## 1611 2590
## 1026 2653
## 3307 2578
## 1641 2991
## 192 2990
## 209 2883
## 3302 2649
Since we already have the “No. of Users Voted” column, we can eliminate the movies that were rated by fewer than 40 users.
# Drop the movies that were rated by fewer than 40 users
top2 <- subset(top, subset = `No.-of-Users-Voted` >= 40)
# Join the individual ratings onto the remaining movies
top2 <- merge(x = top2, y = ratings, by = "MovieID")
# Rating was read in as character, so make a numeric copy for averaging
top3 <- top2
top3[["Rating"]] <- as.numeric(top3[["Rating"]])
# Mean rating of each remaining movie
kk <- aggregate(Rating ~ MovieID, data = top3, FUN = mean)
head(kk, n = 6L)
## MovieID Rating
## 1 1 4.146846
## 2 10 3.540541
## 3 100 3.062500
## 4 1003 2.942149
## 5 1004 2.663366
## 6 1005 2.373239
# kk holds only MovieID and the mean rating, so join it back onto the
# vote-count table to recover each movie's title and number of voters
kk1 <- merge(
  x = top[, c("MovieID", "No.-of-Users-Voted", "Title")],
  y = kk,
  by = "MovieID"
)
# Highest mean rating first
kk1 <- kk1[order(kk1[["Rating"]], decreasing = TRUE), ]
# The 20 best rated movies, in rating order
Top20 <- head(kk1, n = 20L)
# The same 20 movies, in alphabetical order of title
List20 <- Top20[order(Top20[["Title"]]), ]
The top 20 rated movies, ordered by title, are:
## MovieID No.-of-Users-Voted
## 2605 912 1669
## 2510 745 657
## 1869 3435 551
## 2513 750 1367
## 2568 858 2223
## 120 1178 230
## 138 1198 2514
## 2597 904 1050
## 1470 2905 69
## 2377 527 2304
## 750 2019 628
## 1690 318 2227
## 1234 260 2991
## 2616 922 470
## 153 1212 480
## 148 1207 928
## 2351 50 1783
## 2496 720 438
## 2468 670 56
## 107 1148 882
## Title
## 2605 Casablanca (1942)
## 2510 Close Shave, A (1995)
## 1869 Double Indemnity (1944)
## 2513 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
## 2568 Godfather, The (1972)
## 120 Paths of Glory (1957)
## 138 Raiders of the Lost Ark (1981)
## 2597 Rear Window (1954)
## 1470 Sanjuro (1962)
## 2377 Schindler's List (1993)
## 750 Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
## 1690 Shawshank Redemption, The (1994)
## 1234 Star Wars: Episode IV - A New Hope (1977)
## 2616 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
## 153 Third Man, The (1949)
## 148 To Kill a Mockingbird (1962)
## 2351 Usual Suspects, The (1995)
## 2496 Wallace & Gromit: The Best of Aardman Animation (1996)
## 2468 World of Apu, The (Apur Sansar) (1959)
## 107 Wrong Trousers, The (1993)
## Rating
## 2605 4.412822
## 2510 4.520548
## 1869 4.415608
## 2513 4.449890
## 2568 4.524966
## 120 4.473913
## 138 4.477725
## 2597 4.476190
## 1470 4.608696
## 2377 4.510417
## 750 4.560510
## 1690 4.554558
## 1234 4.453694
## 2616 4.491489
## 153 4.452083
## 148 4.425647
## 2351 4.517106
## 2496 4.426941
## 2468 4.410714
## 107 4.507937
Age group :
Let's break the votes down by the age group of the users who voted.
# Bucket each user's age into an interval index using the break points
# 0, 20, 40 and 100
users[["age.grp"]] <- findInterval(users[["Age"]], vec = c(0, 20, 40, 100))
# Pair every rating with the age bucket of the user who gave it
gg <- merge(
  x = users[c("UserID", "age.grp")],
  y = ratings[c("UserID", "MovieID", "Rating")],
  by = "UserID"
)
# Turn the bucket index into a label: 1 is "Young", 2 is "Young Adult",
# and every other index is "Adult"
gg[["Age.Group"]] <- ifelse(
  gg[["age.grp"]] == 1,
  "Young",
  ifelse(gg[["age.grp"]] == 2, "Young Adult", "Adult")
)
# Restrict to the movies present in kk1, then count views per movie and
# age group
nk <- merge(x = kk1, y = gg, by = "MovieID")
dd <- as.data.frame.matrix(table(nk[["MovieID"]], nk[["Age.Group"]]))
# Total views of each movie across the three age groups
dd[["Total.Views"]] <- dd[["Adult"]] + dd[["Young"]] + dd[["Young Adult"]]
# Convert to a data.table, moving the row names into a column, and print
setDT(dd, keep.rownames = TRUE)[]
## rn Adult Young Young Adult Total.Views
## 1: 1 304 560 1213 2077
## 2: 10 122 260 506 888
## 3: 100 18 17 93 128
## 4: 1000 0 0 0 0
## 5: 1002 0 0 0 0
## ---
## 3702: 994 115 49 286 450
## 3703: 996 44 64 148 256
## 3704: 997 0 0 0 0
## 3705: 998 6 34 53 93
## 3706: 999 45 65 176 286
# The row-name column is called "rn"; rename it so it can be the join key
names(dd)[names(dd) == "rn"] <- "MovieID"
# Attach the per-age-group view counts to the top 20 rated movies
NewList20 <- merge(x = List20, y = dd, by = "MovieID")
The top 20 rated movies, with their view counts per age group (rows ordered by MovieID), are:
## MovieID No.-of-Users-Voted
## 1 1148 882
## 2 1178 230
## 3 1198 2514
## 4 1207 928
## 5 1212 480
## 6 2019 628
## 7 260 2991
## 8 2905 69
## 9 318 2227
## 10 3435 551
## 11 50 1783
## 12 527 2304
## 13 670 56
## 14 720 438
## 15 745 657
## 16 750 1367
## 17 858 2223
## 18 904 1050
## 19 912 1669
## 20 922 470
## Title
## 1 Wrong Trousers, The (1993)
## 2 Paths of Glory (1957)
## 3 Raiders of the Lost Ark (1981)
## 4 To Kill a Mockingbird (1962)
## 5 Third Man, The (1949)
## 6 Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
## 7 Star Wars: Episode IV - A New Hope (1977)
## 8 Sanjuro (1962)
## 9 Shawshank Redemption, The (1994)
## 10 Double Indemnity (1944)
## 11 Usual Suspects, The (1995)
## 12 Schindler's List (1993)
## 13 World of Apu, The (Apur Sansar) (1959)
## 14 Wallace & Gromit: The Best of Aardman Animation (1996)
## 15 Close Shave, A (1995)
## 16 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
## 17 Godfather, The (1972)
## 18 Rear Window (1954)
## 19 Casablanca (1942)
## 20 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
## Rating Adult Young Young Adult Total.Views
## 1 4.507937 119 221 542 882
## 2 4.473913 82 32 116 230
## 3 4.477725 475 541 1498 2514
## 4 4.425647 266 180 482 928
## 5 4.452083 160 60 260 480
## 6 4.560510 179 95 354 628
## 7 4.453694 574 663 1754 2991
## 8 4.608696 17 10 42 69
## 9 4.554558 439 510 1278 2227
## 10 4.415608 169 68 314 551
## 11 4.517106 260 434 1089 1783
## 12 4.510417 518 498 1288 2304
## 13 4.410714 20 5 31 56
## 14 4.426941 66 107 265 438
## 15 4.520548 71 159 427 657
## 16 4.449890 368 230 769 1367
## 17 4.524966 524 417 1282 2223
## 18 4.476190 275 140 635 1050
## 19 4.412822 461 261 947 1669
## 20 4.491489 141 57 272 470
# Convert the user identifier and rating columns from character to numeric
ratings <- transform(
  ratings,
  UserID = as.numeric(UserID),
  Rating = as.numeric(Rating)
)
# Count how many movies each user has rated
jj <- as.data.frame(table(ratings$UserID))
names(jj) <- c("UserID", "No.-of-Movies-Voted")
# Keep only the users who rated at least 40 movies, together with their
# individual ratings
jj2 <- subset(jj, `No.-of-Movies-Voted` >= 40)
jj2 <- merge(jj2, ratings, by = "UserID")
# Mean rating per qualifying user. The mean must be taken over jj2: taking
# it over the full ratings table would ignore the 40-movie threshold.
# NOTE(review): any table rendered from New10 before this fix was computed
# without the threshold and may no longer match; re-knit the document.
New <- aggregate(Rating ~ UserID, data = jj2, mean)
# The merge key can come back as a factor; restore a numeric UserID
New$UserID <- as.numeric(as.character(New$UserID))
# Lowest mean rating first
New <- New[order(New$Rating, decreasing = FALSE), ]
# The 10 critics with the lowest mean rating
New10 <- head(New, n = 10)
The ten critics with the lowest average ratings are:
## UserID Rating
## 3598 3598 1.015385
## 4486 4486 1.058824
## 2744 2744 1.304348
## 4539 4539 1.815126
## 5850 5850 1.844828
## 5334 5334 1.927273
## 4349 4349 1.962963
## 4636 4636 2.000000
## 5686 5686 2.045283
## 3209 3209 2.060870
The list shows the user IDs of the users with the lowest average ratings.