Loading the Required Libraries

library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(stringr)
library(tidytext)
library(caret)
library(tm)
library(e1071)
library(caTools)

Loading the Data

ratings <- read.csv('C:\\DATA607\\FinalProject\\ml-20m\\ratings.csv')
head(ratings)
##   userId movieId rating  timestamp
## 1      1       2    3.5 1112486027
## 2      1      29    3.5 1112484676
## 3      1      32    3.5 1112484819
## 4      1      47    3.5 1112484727
## 5      1      50    3.5 1112484580
## 6      1     112    3.5 1094785740
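
For a file this size (the full ml-20m ratings table holds roughly 20 million rows), readr::read_csv() from the already-attached tidyverse is usually much faster than base read.csv(). A minimal sketch, assuming the same local path as above:

# Sketch: faster import with readr; returns a tibble rather than a data.frame.
ratings <- readr::read_csv('C:\\DATA607\\FinalProject\\ml-20m\\ratings.csv',
                           col_types = cols(userId = col_integer(),
                                            movieId = col_integer(),
                                            rating = col_double(),
                                            timestamp = col_double()))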

Missing data

ratings$rating = ifelse(is.na(ratings$rating),
                     ave(ratings$rating, FUN = function(x) mean(x, na.rm = TRUE)),
                     ratings$rating)
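
Before imputing, it is worth checking whether any ratings are actually missing; the published ml-20m ratings file normally contains none, in which case the ifelse() above simply returns the original values. A quick check:

# Sketch: count missing values in the rating column and per column overall.
sum(is.na(ratings$rating))
colSums(is.na(ratings))
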
g.scores <- read.csv('C:\\DATA607\\FinalProject\\ml-20m\\genome-scores.csv')
head(g.scores)
##   movieId tagId relevance
## 1       1     1   0.02500
## 2       1     2   0.02500
## 3       1     3   0.05775
## 4       1     4   0.09675
## 5       1     5   0.14675
## 6       1     6   0.21700
summary(ratings)
##      userId          movieId           rating        timestamp        
##  Min.   :     1   Min.   :     1   Min.   :0.500   Min.   :7.897e+08  
##  1st Qu.: 34395   1st Qu.:   902   1st Qu.:3.000   1st Qu.:9.668e+08  
##  Median : 69141   Median :  2167   Median :3.500   Median :1.104e+09  
##  Mean   : 69046   Mean   :  9042   Mean   :3.526   Mean   :1.101e+09  
##  3rd Qu.:103637   3rd Qu.:  4770   3rd Qu.:4.000   3rd Qu.:1.226e+09  
##  Max.   :138493   Max.   :131262   Max.   :5.000   Max.   :1.428e+09
rating <- select(ratings, userId, movieId, rating)
rating <- rating[1:1500,]
str(rating)
## 'data.frame':    1500 obs. of  3 variables:
##  $ userId : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ movieId: int  2 29 32 47 50 112 151 223 253 260 ...
##  $ rating : num  3.5 3.5 3.5 3.5 3.5 3.5 4 4 4 4 ...
scores <- select(g.scores, tagId, relevance)
scores <- scores[1:1000,]

K-Means Clustering

Using the elbow method to find the optimal number of clusters

dataset <- rating[,2:3]
set.seed(3)
rate = vector()
for (i in 1:10) {rate[i] = sum(kmeans(dataset, i)$withinss)}
plot(1:10,
     rate,
     type = 'b',
     main = paste('The Elbow Method'),
     xlab = 'Number of clusters',
     ylab = 'rating')
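
Because kmeans() starts from random centroids, the curve from a single run per k can be noisy. A minimal variant of the same loop, assuming the same dataset object, that repeats each fit over several random starts:

# Sketch: a more stable elbow curve using nstart = 25 restarts per k.
set.seed(3)
wss <- sapply(1:10, function(k) kmeans(dataset, centers = k, nstart = 25)$tot.withinss)
plot(1:10, wss, type = 'b',
     main = 'The Elbow Method (nstart = 25)',
     xlab = 'Number of clusters', ylab = 'Total within-cluster sum of squares')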

The elbow in the curve appears at 2 clusters, so two clusters are used below.

Fitting K-Means to the dataset

set.seed(29)
kmeans = kmeans(x = dataset, centers = 2)
y_kmeans = kmeans$cluster
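
A quick sanity check of the fitted clustering, as a sketch: the cluster sizes and the cluster centres in (movieId, rating) space.

# Sketch: inspect the K-Means fit.
table(y_kmeans)
kmeans$centers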

Visualising the clusters

library(cluster)
clusplot(dataset,
         y_kmeans,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels = 2,
         plotchar = FALSE,
         span = TRUE,
         main = paste('Clusters of movies rating'),
         xlab = 'movies id',
         ylab = 'Movie rating')

Support Vector Machine (SVM)

Setting up the dataset

dataset <- rating
head(dataset)
##   userId movieId rating
## 1      1       2    3.5
## 2      1      29    3.5
## 3      1      32    3.5
## 4      1      47    3.5
## 5      1      50    3.5
## 6      1     112    3.5

Encoding the target feature as factor

dataset$rating = factor(dataset$rating, levels = c(1, 1.5, 2, 2.5, 3, 3.5, 4 , 4.5, 5))
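
The nine rating levels are far from balanced (the summary above already shows that at least three quarters of the ratings are 3 or higher), which is worth keeping in mind when reading the confusion matrices below. A quick look at the class counts:

# Sketch: class distribution of the encoded target.
table(dataset$rating)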

Splitting the dataset into the Training set and Test set

set.seed(123)
split = sample.split(dataset$rating, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

Feature Scaling

# Centre and scale the two numeric predictors; the rating factor (column 3) is left unchanged.
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])
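
Scaling the test set with its own mean and standard deviation, as above, applies slightly different parameters to the two sets. A common alternative, sketched here with the already-loaded caret package, is to replace the two scale() calls with parameters estimated on the training set only:

# Sketch: centre/scale userId and movieId using training-set statistics for both sets.
pre_proc <- preProcess(training_set[-3], method = c('center', 'scale'))
training_set[-3] <- predict(pre_proc, training_set[-3])
test_set[-3] <- predict(pre_proc, test_set[-3])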

Fitting SVM to the Training set

classifier = svm(formula = rating ~ .,
                 data = training_set,
                 type = 'C-classification',
                 kernel = 'linear')
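
The cost parameter is left at its e1071 default (1) above. e1071's tune() can grid-search it with built-in cross-validation; a minimal sketch, assuming a small candidate grid:

# Sketch: grid search over the cost parameter of the linear SVM (may take a while).
tuned <- tune(svm, rating ~ ., data = training_set,
              type = 'C-classification', kernel = 'linear',
              ranges = list(cost = c(0.1, 1, 10)))
summary(tuned)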

Predicting the Test set results

y_pred = predict(classifier, newdata = test_set[-3])

Making the Confusion Matrix

cm = table(test_set[, 3], y_pred)
cm
##      y_pred
##         1 1.5   2 2.5   3 3.5   4 4.5   5
##   1     0   0   0   0   0   0   4   0   1
##   1.5   0   0   0   0   0   0   0   0   2
##   2     0   0   0   0   0   0  21   0   3
##   2.5   0   0   0   0   0   0   4   0   0
##   3     0   0   0   0   0   0  72   0   3
##   3.5   0   0   0   0   0   0  27   0   3
##   4     0   0   0   0   0   0 106   0   8
##   4.5   0   0   0   0   0   0  21   0   5
##   5     0   0   0   0   0   0  81   0   9
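
Almost every test row is predicted as 4 or 5, so the counts are concentrated in two columns. The overall test-set accuracy is the share of counts on the diagonal:

# Sketch: overall accuracy of the linear SVM on the test set.
sum(diag(cm)) / sum(cm)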

Visualising the Training set results

set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('userId', 'movieId')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3],
     main = 'SVM (Training set)',
     xlab = 'UserId', ylab = 'MovieId',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Visualising the Test set results

set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('userId', 'movieId')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3], main = 'SVM (Test set)',
     xlab = 'UserId', ylab = 'MovieId',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Applying k-Fold Cross-Validation to the SVM model

folds = createFolds(training_set$rating, k = 10)
cv = lapply(folds, function(x) {
  training_fold = training_set[-x, ]
  test_fold = training_set[x, ]
  classifier = svm(formula = rating ~ .,
                   data = training_fold,
                   type = 'C-classification',
                   kernel = 'radial')
  y_pred = predict(classifier, newdata = test_fold[-3])
  cm = table(test_fold[, 3], y_pred)
  accuracy = sum(diag(cm)) / sum(cm)   # share of correctly classified rows in the fold
  return(accuracy)
})
accuracy = mean(as.numeric(cv))
accuracy
## [1] 0.3651316
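
caret (already loaded) can run an equivalent 10-fold cross-validation in a single call; a sketch, assuming the kernlab package is installed, since caret's 'svmRadial' method depends on it:

# Sketch: 10-fold CV of a radial-kernel SVM via caret.
ctrl <- trainControl(method = 'cv', number = 10)
svm_cv <- train(rating ~ ., data = training_set,
                method = 'svmRadial', trControl = ctrl)
svm_cv$results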

K-Nearest Neighbors (K-NN)

Setting up the dataset

dataset <- rating

Fitting K-NN to the Training set and Predicting the Test set results

library(class)
y_pred = knn(train = training_set[, -3],
             test = test_set[, -3],
             cl = training_set[, 3],
             k = 5,
             prob = TRUE)

Making the Confusion Matrix

cm = table(test_set[, 3], y_pred)
cm
##      y_pred
##        1 1.5  2 2.5  3 3.5  4 4.5  5
##   1    0   0  0   1  2   0  1   0  1
##   1.5  0   0  0   0  0   0  0   0  2
##   2    0   0  1   0 10   0  4   5  4
##   2.5  0   0  0   0  0   1  2   1  0
##   3    0   0  3   0 29   4 22   2 15
##   3.5  0   0  1   0  0  12 10   1  6
##   4    0   0  4   1 22  12 48   7 20
##   4.5  0   0  0   1  0   3  1  11 10
##   5    0   0  3   0 15   2 22  12 36
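
caret's confusionMatrix() adds overall accuracy, kappa and per-class statistics on top of the raw table; a minimal sketch using the same predictions:

# Sketch: richer summary of the K-NN test-set predictions.
confusionMatrix(data = y_pred, reference = test_set[, 3])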

Visualising the Training set results

set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('userId', 'movieId')
y_grid = knn(train = training_set[, -3], test = grid_set, cl = training_set[, 3], k = 5)
plot(set[, -3],
     main = 'K-NN (Training set)',
     xlab = 'User Id', ylab = 'Movie Id',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Visualising the Test set results

set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('userId', 'movieId')
y_grid = knn(train = training_set[, -3], test = grid_set, cl = training_set[, 3], k = 5)
plot(set[, -3],
     main = 'K-NN (Test set)',
     xlab = 'User Id', ylab = 'Movie Id',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Applying k-Fold Cross-Validation to the K-NN model

folds = createFolds(training_set$rating, k = 10)
cv = lapply(folds, function(x) {
  training_fold = training_set[-x, ]
  test_fold = training_set[x, ]
  classifier = svm(formula = rating ~ .,
                   data = training_fold,
                   type = 'C-classification',
                   kernel = 'radial')
  y_pred = predict(classifier, newdata = test_fold[-3])
  cm = table(test_fold[, 3], y_pred)
  accuracy = sum(diag(cm)) / sum(cm)   # share of correctly classified rows in the fold
  return(accuracy)
})
accuracy = mean(as.numeric(cv))
accuracy
## [1] 0.3697651
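
The loop above re-fits the radial-kernel SVM inside each fold. To cross-validate the K-NN classifier itself, the same fold structure can be reused with class::knn; a minimal sketch, assuming k = 5 as in the fit above:

# Sketch: 10-fold cross-validation of K-NN (k = 5) on the training set.
folds_knn = createFolds(training_set$rating, k = 10)
cv_knn = lapply(folds_knn, function(x) {
  training_fold = training_set[-x, ]
  test_fold = training_set[x, ]
  y_pred = knn(train = training_fold[, -3],
               test = test_fold[, -3],
               cl = training_fold[, 3],
               k = 5)
  cm = table(test_fold[, 3], y_pred)
  sum(diag(cm)) / sum(cm)
})
mean(as.numeric(cv_knn))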

Comparing the Two Models

Here the K-NN model performs slightly better than the SVM under 10-fold cross-validation (36.98% vs. 36.51% accuracy).

Citation

Kirill Eremenko and others, Udemy.