Loading the Required Libraries

library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(stringr)
library(tidytext)
library(caret)
library(tm)
library(e1071)
library(caTools)

Loading the Data

ratings <- read.csv('C:\\DATA607\\FinalProject\\ml-20m\\ratings.csv')
head(ratings)
##   userId movieId rating  timestamp
## 1      1       2    3.5 1112486027
## 2      1      29    3.5 1112484676
## 3      1      32    3.5 1112484819
## 4      1      47    3.5 1112484727
## 5      1      50    3.5 1112484580
## 6      1     112    3.5 1094785740
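
For a file this size (the full ml-20m ratings table holds roughly 20 million rows), readr::read_csv() from the already-attached tidyverse is usually much faster than base read.csv(). A minimal sketch, assuming the same local path as above:

# Sketch: faster import with readr; returns a tibble rather than a data.frame.
ratings <- readr::read_csv('C:\\DATA607\\FinalProject\\ml-20m\\ratings.csv',
                           col_types = cols(userId = col_integer(),
                                            movieId = col_integer(),
                                            rating = col_double(),
                                            timestamp = col_double()))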

Missing data

ratings$rating = ifelse(is.na(ratings$rating),
                     ave(ratings$rating, FUN = function(x) mean(x, na.rm = TRUE)),
                     ratings$rating)
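
Before imputing, it is worth checking whether any ratings are actually missing; the published ml-20m ratings file normally contains none, in which case the ifelse() above simply returns the original values. A quick check:

# Sketch: count missing values in the rating column and per column overall.
sum(is.na(ratings$rating))
colSums(is.na(ratings))
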
g.scores <- read.csv('C:\\DATA607\\FinalProject\\ml-20m\\genome-scores.csv')
head(g.scores)
##   movieId tagId relevance
## 1       1     1   0.02500
## 2       1     2   0.02500
## 3       1     3   0.05775
## 4       1     4   0.09675
## 5       1     5   0.14675
## 6       1     6   0.21700
summary(ratings)
##      userId          movieId           rating        timestamp        
##  Min.   :     1   Min.   :     1   Min.   :0.500   Min.   :7.897e+08  
##  1st Qu.: 34395   1st Qu.:   902   1st Qu.:3.000   1st Qu.:9.668e+08  
##  Median : 69141   Median :  2167   Median :3.500   Median :1.104e+09  
##  Mean   : 69046   Mean   :  9042   Mean   :3.526   Mean   :1.101e+09  
##  3rd Qu.:103637   3rd Qu.:  4770   3rd Qu.:4.000   3rd Qu.:1.226e+09  
##  Max.   :138493   Max.   :131262   Max.   :5.000   Max.   :1.428e+09
rating <- select(ratings, userId, movieId, rating)
rating <- rating[1:1500,]
str(rating)
## 'data.frame':    1500 obs. of  3 variables:
##  $ userId : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ movieId: int  2 29 32 47 50 112 151 223 253 260 ...
##  $ rating : num  3.5 3.5 3.5 3.5 3.5 3.5 4 4 4 4 ...
scores <- select(g.scores, tagId, relevance)
scores <- scores[1:1000,]

K-Means Clustering

Using the elbow method to find the optimal number of clusters

dataset <- rating[,2:3]
set.seed(3)
rate = vector()
for (i in 1:10) {rate[i] = sum(kmeans(dataset, i)$withinss)}
plot(1:10,
     rate,
     type = 'b',
     main = paste('The Elbow Method'),
     xlab = 'Number of clusters',
     ylab = 'rating')
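
Because kmeans() starts from random centroids, the curve from a single run per k can be noisy. A minimal variant of the same loop, assuming the same dataset object, that repeats each fit over several random starts:

# Sketch: a more stable elbow curve using nstart = 25 restarts per k.
set.seed(3)
wss <- sapply(1:10, function(k) kmeans(dataset, centers = k, nstart = 25)$tot.withinss)
plot(1:10, wss, type = 'b',
     main = 'The Elbow Method (nstart = 25)',
     xlab = 'Number of clusters', ylab = 'Total within-cluster sum of squares')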

The elbow in the curve appears at 2 clusters, so two clusters are used below.

Fitting K-Means to the dataset

set.seed(29)
kmeans = kmeans(x = dataset, centers = 2)
y_kmeans = kmeans$cluster
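
A quick sanity check of the fitted clustering, as a sketch: the cluster sizes and the cluster centres in (movieId, rating) space.

# Sketch: inspect the K-Means fit.
table(y_kmeans)
kmeans$centers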

Visualising the clusters

library(cluster)
clusplot(dataset,
         y_kmeans,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels = 2,
         plotchar = FALSE,
         span = TRUE,
         main = paste('Clusters of movies rating'),
         xlab = 'movies id',
         ylab = 'Movie rating')

Support Vector Machine (SVM)

Setting up the dataset

dataset <- rating
head(dataset)
##   userId movieId rating
## 1      1       2    3.5
## 2      1      29    3.5
## 3      1      32    3.5
## 4      1      47    3.5
## 5      1      50    3.5
## 6      1     112    3.5

Encoding the target feature as factor

dataset$rating = factor(dataset$rating, levels = c(1, 1.5, 2, 2.5, 3, 3.5, 4 , 4.5, 5))
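
The nine rating levels are far from balanced (the summary above already shows that at least three quarters of the ratings are 3 or higher), which is worth keeping in mind when reading the confusion matrices below. A quick look at the class counts:

# Sketch: class distribution of the encoded target.
table(dataset$rating)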

Splitting the dataset into the Training set and Test set

set.seed(123)
split = sample.split(dataset$rating, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

Feature Scaling

# Centre and scale the two numeric predictors; the rating factor (column 3) is left unchanged.
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])
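
Scaling the test set with its own mean and standard deviation, as above, applies slightly different parameters to the two sets. A common alternative, sketched here with the already-loaded caret package, is to replace the two scale() calls with parameters estimated on the training set only:

# Sketch: centre/scale userId and movieId using training-set statistics for both sets.
pre_proc <- preProcess(training_set[-3], method = c('center', 'scale'))
training_set[-3] <- predict(pre_proc, training_set[-3])
test_set[-3] <- predict(pre_proc, test_set[-3])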

Fitting SVM to the Training set

classifier = svm(formula = rating ~ .,
                 data = training_set,
                 type = 'C-classification',
                 kernel = 'linear')
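
The cost parameter is left at its e1071 default (1) above. e1071's tune() can grid-search it with built-in cross-validation; a minimal sketch, assuming a small candidate grid:

# Sketch: grid search over the cost parameter of the linear SVM (may take a while).
tuned <- tune(svm, rating ~ ., data = training_set,
              type = 'C-classification', kernel = 'linear',
              ranges = list(cost = c(0.1, 1, 10)))
summary(tuned)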

Predicting the Test set results

y_pred = predict(classifier, newdata = test_set[-3])

Making the Confusion Matrix

cm = table(test_set[, 3], y_pred)
cm
##      y_pred
##         1 1.5   2 2.5   3 3.5   4 4.5   5
##   1     0   0   0   0   0   0   4   0   1
##   1.5   0   0   0   0   0   0   0   0   2
##   2     0   0   0   0   0   0  21   0   3
##   2.5   0   0   0   0   0   0   4   0   0
##   3     0   0   0   0   0   0  72   0   3
##   3.5   0   0   0   0   0   0  27   0   3
##   4     0   0   0   0   0   0 106   0   8
##   4.5   0   0   0   0   0   0  21   0   5
##   5     0   0   0   0   0   0  81   0   9
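
Almost every test row is predicted as 4 or 5, so the counts are concentrated in two columns. The overall test-set accuracy is the share of counts on the diagonal:

# Sketch: overall accuracy of the linear SVM on the test set.
sum(diag(cm)) / sum(cm)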

Visualising the Training set results

set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('userId', 'movieId')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3],
     main = 'SVM (Training set)',
     xlab = 'UserId', ylab = 'MovieId',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Visualising the Test set results

set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('userId', 'movieId')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3], main = 'SVM (Test set)',
     xlab = 'UserId', ylab = 'MovieId',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Applying k-Fold Cross-Validation to the SVM model

folds = createFolds(training_set$rating, k = 10)
cv = lapply(folds, function(x) {
  training_fold = training_set[-x, ]
  test_fold = training_set[x, ]
  classifier = svm(formula = rating ~ .,
                   data = training_fold,
                   type = 'C-classification',
                   kernel = 'radial')
  y_pred = predict(classifier, newdata = test_fold[-3])
  cm = table(test_fold[, 3], y_pred)
  accuracy = sum(diag(cm)) / sum(cm)   # share of correctly classified rows in the fold
  return(accuracy)
})
accuracy = mean(as.numeric(cv))
accuracy
## [1] 0.3651316
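
caret (already loaded) can run an equivalent 10-fold cross-validation in a single call; a sketch, assuming the kernlab package is installed, since caret's 'svmRadial' method depends on it:

# Sketch: 10-fold CV of a radial-kernel SVM via caret.
ctrl <- trainControl(method = 'cv', number = 10)
svm_cv <- train(rating ~ ., data = training_set,
                method = 'svmRadial', trControl = ctrl)
svm_cv$results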

K-Nearest Neighbors (K-NN)

Setting up the dataset

dataset <- rating

Fitting K-NN to the Training set and Predicting the Test set results

library(class)
y_pred = knn(train = training_set[, -3],
             test = test_set[, -3],
             cl = training_set[, 3],
             k = 5,
             prob = TRUE)

Making the Confusion Matrix

cm = table(test_set[, 3], y_pred)
cm
##      y_pred
##        1 1.5  2 2.5  3 3.5  4 4.5  5
##   1    0   0  0   1  2   0  1   0  1
##   1.5  0   0  0   0  0   0  0   0  2
##   2    0   0  1   0 10   0  4   5  4
##   2.5  0   0  0   0  0   1  2   1  0
##   3    0   0  3   0 29   4 22   2 15
##   3.5  0   0  1   0  0  12 10   1  6
##   4    0   0  4   1 22  12 48   7 20
##   4.5  0   0  0   1  0   3  1  11 10
##   5    0   0  3   0 15   2 22  12 36
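
caret's confusionMatrix() adds overall accuracy, kappa and per-class statistics on top of the raw table; a minimal sketch using the same predictions:

# Sketch: richer summary of the K-NN test-set predictions.
confusionMatrix(data = y_pred, reference = test_set[, 3])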

Visualising the Training set results

set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('userId', 'movieId')
y_grid = knn(train = training_set[, -3], test = grid_set, cl = training_set[, 3], k = 5)
plot(set[, -3],
     main = 'K-NN (Training set)',
     xlab = 'User Id', ylab = 'Movie Id',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Visualising the Test set results

set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('userId', 'movieId')
y_grid = knn(train = training_set[, -3], test = grid_set, cl = training_set[, 3], k = 5)
plot(set[, -3],
     main = 'K-NN (Test set)',
     xlab = 'User Id', ylab = 'Movie Id',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Applying k-Fold Cross-Validation to the K-NN model

folds = createFolds(training_set$rating, k = 10)
cv = lapply(folds, function(x) {
  training_fold = training_set[-x, ]
  test_fold = training_set[x, ]
  classifier = svm(formula = rating ~ .,
                   data = training_fold,
                   type = 'C-classification',
                   kernel = 'radial')
  y_pred = predict(classifier, newdata = test_fold[-3])
  cm = table(test_fold[, 3], y_pred)
  accuracy = sum(diag(cm)) / sum(cm)   # share of correctly classified rows in the fold
  return(accuracy)
})
accuracy = mean(as.numeric(cv))
accuracy
## [1] 0.3697651
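
The loop above re-fits the radial-kernel SVM inside each fold. To cross-validate the K-NN classifier itself, the same fold structure can be reused with class::knn; a minimal sketch, assuming k = 5 as in the fit above:

# Sketch: 10-fold cross-validation of K-NN (k = 5) on the training set.
folds_knn = createFolds(training_set$rating, k = 10)
cv_knn = lapply(folds_knn, function(x) {
  training_fold = training_set[-x, ]
  test_fold = training_set[x, ]
  y_pred = knn(train = training_fold[, -3],
               test = test_fold[, -3],
               cl = training_fold[, 3],
               k = 5)
  cm = table(test_fold[, 3], y_pred)
  sum(diag(cm)) / sum(cm)
})
mean(as.numeric(cv_knn))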

Comparing the Two Models

Here the K-NN model performs slightly better than the SVM under 10-fold cross-validation (36.98% vs. 36.51% accuracy).

Citation

Kirill Eremenko and others, Udemy.