# Install and load the required packages
#install.packages("recommenderlab")
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(ggplot2) #Author DataFlair
library(data.table)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
# Load the movie catalogue and the user ratings from CSV into data frames.
# NOTE(review): both paths are absolute, so the script only runs where
# /cloud/project/Data exists -- presumably an RStudio Cloud project; confirm.
movie_data <- read.csv("/cloud/project/Data/movies.csv")
rating_data <- read.csv("/cloud/project/Data/ratings.csv")
# Show the structure (column names and types) of the movie table.
# The "##" lines throughout appear to be console output recorded from an
# earlier run; they are comments and are not re-generated by this script.
str(movie_data)
## 'data.frame': 10329 obs. of 3 variables:
## $ movieId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ title : chr "Toy Story (1995)" "Jumanji (1995)" "Grumpier Old Men (1995)" "Waiting to Exhale (1995)" ...
## $ genres : chr "Adventure|Animation|Children|Comedy|Fantasy" "Adventure|Children|Fantasy" "Comedy|Romance" "Comedy|Drama|Romance" ...
# Show the structure of the ratings table.
str(rating_data)
## 'data.frame': 105339 obs. of 4 variables:
## $ userId : int 1 1 1 1 1 1 1 1 1 1 ...
## $ movieId : int 16 24 32 47 50 110 150 161 165 204 ...
## $ rating : num 4 1.5 4 4 4 4 3 4 3 0.5 ...
## $ timestamp: int 1217897793 1217895807 1217896246 1217896556 1217896523 1217896150 1217895940 1217897864 1217897135 1217895786 ...
# Per-column summary statistics of the movie table.
summary(movie_data)
## movieId title genres
## Min. : 1 Length:10329 Length:10329
## 1st Qu.: 3240 Class :character Class :character
## Median : 7088 Mode :character Mode :character
## Mean : 31924
## 3rd Qu.: 59900
## Max. :149532
# Per-column summary statistics of the ratings table.
summary(rating_data)
## userId movieId rating timestamp
## Min. : 1.0 Min. : 1 Min. :0.500 Min. :8.286e+08
## 1st Qu.:192.0 1st Qu.: 1073 1st Qu.:3.000 1st Qu.:9.711e+08
## Median :383.0 Median : 2497 Median :3.500 Median :1.115e+09
## Mean :364.9 Mean : 13381 Mean :3.517 Mean :1.130e+09
## 3rd Qu.:557.0 3rd Qu.: 5991 3rd Qu.:4.000 3rd Qu.:1.275e+09
## Max. :668.0 Max. :149532 Max. :5.000 Max. :1.452e+09
# Split the pipe-delimited `genres` string of every movie into one column per
# genre position. Movies with fewer genres than the widest row get NA in the
# remaining columns (see the recorded str() output below).
movie_genre <- as.data.frame(movie_data$genres, stringsAsFactors = FALSE)
#View(movie_genre)
# tstrsplit (data.table) = strsplit + transpose: the string "a|b|c" becomes
# the three column values a, b, c. "[|]" is a character class, so the pipe is
# matched literally instead of acting as regex alternation.
movie_genre2 <- as.data.frame(
  tstrsplit(movie_genre[, 1], "[|]", type.convert = TRUE, names = TRUE),
  stringsAsFactors = FALSE
)
#View(movie_genre2)
str(movie_genre2)
## 'data.frame': 10329 obs. of 10 variables:
## $ V1 : chr "Adventure" "Adventure" "Comedy" "Comedy" ...
## $ V2 : chr "Animation" "Children" "Romance" "Drama" ...
## $ V3 : chr "Children" "Fantasy" NA "Romance" ...
## $ V4 : chr "Comedy" NA NA NA ...
## $ V5 : chr "Fantasy" NA NA NA ...
## $ V6 : chr NA NA NA NA ...
## $ V7 : chr NA NA NA NA ...
## $ V8 : chr NA NA NA NA ...
## $ V9 : chr NA NA NA NA ...
## $ V10: chr NA NA NA NA ...
#?tstrsplit
#movie_genre2$`10`[1:100]
# Rename the columns to their position ("1", "2", ...). The count comes from
# the data rather than a fixed 1:10, so a data set whose longest genre list is
# not exactly 10 entries still gets one valid name per column.
colnames(movie_genre2) <- seq_len(ncol(movie_genre2))
colnames(movie_genre2)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
# One-hot encode the genres: genre_mat2 ends up with one row per movie and one
# integer 0/1 column per entry of list_genre. Genre strings that are not in
# list_genre, and NA padding, set no flag.
list_genre <- c(
  "Action", "Adventure", "Animation", "Children",
  "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
  "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
  "Sci-Fi", "Thriller", "War", "Western"
) # 18 genres

# genre_mat1 layout: row 1 holds the genre names, rows 2..(n + 1) hold the
# flags of movies 1..n. Writing the names into row 1 turns the whole matrix
# into character ("0"/"1"); it is converted back to integer further down.
# The dimensions are derived from the data instead of a fixed 10330 x 18, so
# a movies.csv of a different size neither overflows the matrix nor leaves
# phantom all-zero rows.
movie_count <- nrow(movie_genre2)
genre_mat1 <- matrix(0, movie_count + 1, length(list_genre))
genre_mat1[1, ] <- list_genre
colnames(genre_mat1) <- list_genre
### Based on https://data-flair.training/blogs/data-science-r-movie-recommendation/
# For each genre position, look up every movie's genre in list_genre at once
# and set the matching (row, column) cells through a two-column index matrix.
# This replaces a cell-by-cell double loop over the data frame.
for (col in seq_len(ncol(movie_genre2))) {
  gen_col <- match(movie_genre2[[col]], list_genre) # NA when no such genre
  matched <- which(!is.na(gen_col))
  # "+ 1" skips the header row of genre_mat1.
  genre_mat1[cbind(matched + 1, gen_col[matched])] <- 1
}
# Drop the header row; drop = FALSE keeps a matrix even for a single movie.
genre_mat2 <- as.data.frame(
  genre_mat1[-1, , drop = FALSE],
  stringsAsFactors = FALSE
)
# Convert every column from character back to integer, keeping column names.
genre_mat2[] <- lapply(genre_mat2, as.integer)
str(genre_mat2)
## 'data.frame': 10329 obs. of 18 variables:
## $ Action : int 0 0 0 0 0 1 0 0 1 1 ...
## $ Adventure : int 1 1 0 0 0 0 0 1 0 1 ...
## $ Animation : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Children : int 1 1 0 0 0 0 0 1 0 0 ...
## $ Comedy : int 1 0 1 1 1 0 1 0 0 0 ...
## $ Crime : int 0 0 0 0 0 1 0 0 0 0 ...
## $ Documentary: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Drama : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Fantasy : int 1 1 0 0 0 0 0 0 0 0 ...
## $ Film-Noir : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Horror : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Musical : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Mystery : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Romance : int 0 0 1 1 0 0 1 0 0 0 ...
## $ Sci-Fi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Thriller : int 0 0 0 0 0 1 0 0 0 1 ...
## $ War : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Western : int 0 0 0 0 0 0 0 0 0 0 ...