library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
library(tidytext)
library(caret)
library(tm)
library(e1071)
library(caTools)
# Read the ratings CSV from its local folder and preview the first six rows.
ratings <- read.csv(file = 'C:\\DATA607\\FinalProject\\ml-20m\\ratings.csv')
head(ratings, n = 6L)
## userId movieId rating timestamp
## 1 1 2 3.5 1112486027
## 2 1 29 3.5 1112484676
## 3 1 32 3.5 1112484819
## 4 1 47 3.5 1112484727
## 5 1 50 3.5 1112484580
## 6 1 112 3.5 1094785740
# Mean imputation: every missing rating is overwritten with the average of
# the ratings that are present; observed ratings are left untouched.
ratings$rating[is.na(ratings$rating)] <- mean(ratings$rating, na.rm = TRUE)
# Read the genome-scores CSV (movieId / tagId / relevance) and preview it.
g.scores <- read.csv(file = 'C:\\DATA607\\FinalProject\\ml-20m\\genome-scores.csv')
head(g.scores, n = 6L)
## movieId tagId relevance
## 1 1 1 0.02500
## 2 1 2 0.02500
## 3 1 3 0.05775
## 4 1 4 0.09675
## 5 1 5 0.14675
## 6 1 6 0.21700
# Per-column five-number summary (plus mean) of the ratings table.
summary(object = ratings)
## userId movieId rating timestamp
## Min. : 1 Min. : 1 Min. :0.500 Min. :7.897e+08
## 1st Qu.: 34395 1st Qu.: 902 1st Qu.:3.000 1st Qu.:9.668e+08
## Median : 69141 Median : 2167 Median :3.500 Median :1.104e+09
## Mean : 69046 Mean : 9042 Mean :3.526 Mean :1.101e+09
## 3rd Qu.:103637 3rd Qu.: 4770 3rd Qu.:4.000 3rd Qu.:1.226e+09
## Max. :138493 Max. :131262 Max. :5.000 Max. :1.428e+09
# Working sample: the first 1500 rows, restricted to the three columns the
# models below use. str() confirms the resulting column types.
rating <- ratings[1:1500, c("userId", "movieId", "rating")]
str(rating)
## 'data.frame': 1500 obs. of 3 variables:
## $ userId : int 1 1 1 1 1 1 1 1 1 1 ...
## $ movieId: int 2 29 32 47 50 112 151 223 253 260 ...
## $ rating : num 3.5 3.5 3.5 3.5 3.5 3.5 4 4 4 4 ...
# Working sample of the genome scores: first 1000 rows, tag id and relevance.
scores <- g.scores[1:1000, c("tagId", "relevance")]
# Elbow method: run k-means for k = 1..10 on the movieId and rating columns
# and plot the total within-cluster sum of squares against k.
dataset <- rating[, 2:3]
set.seed(3)
# Preallocate the result vector instead of growing it inside the loop.
rate <- numeric(10)
for (i in seq_along(rate)) {
  # tot.withinss is kmeans()'s own sum of the per-cluster withinss values.
  rate[i] <- kmeans(dataset, i)$tot.withinss
}
# The y axis shows the within-cluster sum of squares, not a rating, so it is
# labelled accordingly.
plot(seq_along(rate),
     rate,
     type = 'b',
     main = 'The Elbow Method',
     xlab = 'Number of clusters',
     ylab = 'Within-cluster sum of squares')
The elbow method suggests 2 clusters.
# Fit the final two-cluster k-means model on `dataset` with a fixed seed and
# keep the cluster label assigned to every row.
set.seed(29)
kmeans <- kmeans(dataset, centers = 2)
y_kmeans <- kmeans$cluster

# Visualise the two clusters with cluster::clusplot().
library(cluster)
clusplot(
  dataset,
  y_kmeans,
  main = 'Clusters of movies rating',
  xlab = 'movies id',
  ylab = 'Movie rating',
  lines = 0,
  shade = TRUE,
  color = TRUE,
  labels = 2,
  plotchar = FALSE,
  span = TRUE
)
# Start the supervised part from the full three-column sample again
# (userId, movieId, rating) and preview it.
dataset <- rating
head(dataset, n = 6L)
## userId movieId rating
## 1 1 2 3.5
## 2 1 29 3.5
## 3 1 32 3.5
## 4 1 47 3.5
## 5 1 50 3.5
## 6 1 112 3.5
# Encode the target as a factor so the models below treat rating as a class.
# NOTE(review): 0.5 is not listed as a level although summary(ratings) reports
# a minimum of 0.5, so any 0.5 rating in this sample would become NA. Adding
# the level changes the size of the confusion matrices built later; confirm
# before changing.
dataset$rating <- factor(dataset$rating,
                         levels = c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5))

# 75 / 25 train-test split, stratified on the rating factor by sample.split().
set.seed(123)
split <- sample.split(dataset$rating, SplitRatio = 0.75)
training_set <- subset(dataset, split)
test_set <- subset(dataset, !split)

# Standardise the two predictor columns (everything except column 3) once.
# The statements were previously repeated, re-scaling already standardised
# data to no effect.
# NOTE(review): the test set is standardised with its own mean and sd rather
# than the training set's; kept unchanged so results stay comparable.
training_set[-3] <- scale(training_set[-3])
test_set[-3] <- scale(test_set[-3])
# Train a linear-kernel SVM classifier (rating against all other columns) on
# the training set, predict the test rows, and cross-tabulate the actual
# ratings (rows) against the predicted ratings (columns).
classifier <- svm(rating ~ .,
                  data = training_set,
                  kernel = 'linear',
                  type = 'C-classification')
y_pred <- predict(classifier, newdata = test_set[-3])
cm <- table(test_set[, 3], y_pred)
print(cm)
## y_pred
## 1 1.5 2 2.5 3 3.5 4 4.5 5
## 1 0 0 0 0 0 0 4 0 1
## 1.5 0 0 0 0 0 0 0 0 2
## 2 0 0 0 0 0 0 21 0 3
## 2.5 0 0 0 0 0 0 4 0 0
## 3 0 0 0 0 0 0 72 0 3
## 3.5 0 0 0 0 0 0 27 0 3
## 4 0 0 0 0 0 0 106 0 8
## 4.5 0 0 0 0 0 0 21 0 5
## 5 0 0 0 0 0 0 81 0 9
# Plot the SVM's predicted rating regions together with the training points.
# NOTE(review): nothing in this block calls ElemStatLearn; confirm before
# dropping the library() call.
library(ElemStatLearn)
set <- training_set
# Evaluation grid: observed range of each predictor widened by 1 on each side,
# sampled every 0.01.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('userId', 'movieId')
y_grid <- predict(classifier, newdata = grid_set)
# One colour per rating level. The previous `== 1` test was a two-class idiom
# that drew every rating other than "1" in a single colour.
class_cols <- rainbow(nlevels(set[, 3]))
region_cols <- adjustcolor(class_cols, alpha.f = 0.4)
plot(set[, -3],
     main = 'SVM (Training set)',
     xlab = 'UserId', ylab = 'MovieId',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# Grid points show the predicted class; filled circles show the actual class.
points(grid_set, pch = '.', col = region_cols[as.integer(y_grid)])
points(set[, -3], pch = 21, bg = class_cols[as.integer(set[, 3])])
legend('topright', legend = levels(set[, 3]), pch = 21, pt.bg = class_cols,
       title = 'rating', cex = 0.7)
# Plot the SVM's predicted rating regions together with the test points.
# NOTE(review): nothing in this block calls ElemStatLearn; confirm before
# dropping the library() call.
library(ElemStatLearn)
set <- test_set
# Evaluation grid: observed range of each predictor widened by 1 on each side,
# sampled every 0.01.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('userId', 'movieId')
y_grid <- predict(classifier, newdata = grid_set)
# One colour per rating level. The previous `== 1` test was a two-class idiom
# that drew every rating other than "1" in a single colour.
class_cols <- rainbow(nlevels(set[, 3]))
region_cols <- adjustcolor(class_cols, alpha.f = 0.4)
plot(set[, -3], main = 'SVM (Test set)',
     xlab = 'UserId', ylab = 'MovieId',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# Grid points show the predicted class; filled circles show the actual class.
points(grid_set, pch = '.', col = region_cols[as.integer(y_grid)])
points(set[, -3], pch = 21, bg = class_cols[as.integer(set[, 3])])
legend('topright', legend = levels(set[, 3]), pch = 21, pt.bg = class_cols,
       title = 'rating', cex = 0.7)
# 10-fold cross-validation of an SVM on the training set. Each fold is held
# out once; `cv` keeps the per-fold accuracies and `accuracy` is their mean.
folds <- createFolds(training_set$rating, k = 10)
cv <- lapply(folds, function(x) {
  # x holds the row indices of the held-out fold.
  training_fold <- training_set[-x, ]
  test_fold <- training_set[x, ]
  # NOTE(review): radial kernel here, whereas the classifier fitted earlier in
  # this file uses kernel = 'linear'.
  classifier <- svm(formula = rating ~ .,
                    data = training_fold,
                    type = 'C-classification',
                    kernel = 'radial')
  y_pred <- predict(classifier, newdata = test_fold[-3])
  cm <- table(test_fold[, 3], y_pred)
  # Correct predictions lie on the diagonal. sum(diag()) replaces a loop that
  # was hard-coded to exactly nine classes.
  sum(diag(cm)) / sum(cm)
})
accuracy <- mean(as.numeric(cv))
accuracy
## [1] 0.3651316
# K-nearest-neighbours (k = 5) on the same scaled train/test split: classify
# each test row from the two predictor columns, then cross-tabulate actual
# ratings (rows) against predicted ratings (columns).
dataset <- rating
library(class)
y_pred <- knn(
  train = training_set[, -3],
  test = test_set[, -3],
  cl = training_set[, 3],
  k = 5,
  prob = TRUE
)
cm <- table(test_set[, 3], y_pred)
print(cm)
## y_pred
## 1 1.5 2 2.5 3 3.5 4 4.5 5
## 1 0 0 0 1 2 0 1 0 1
## 1.5 0 0 0 0 0 0 0 0 2
## 2 0 0 1 0 10 0 4 5 4
## 2.5 0 0 0 0 0 1 2 1 0
## 3 0 0 3 0 29 4 22 2 15
## 3.5 0 0 1 0 0 12 10 1 6
## 4 0 0 4 1 22 12 48 7 20
## 4.5 0 0 0 1 0 3 1 11 10
## 5 0 0 3 0 15 2 22 12 36
# Plot the K-NN predicted rating regions together with the training points.
# NOTE(review): nothing in this block calls ElemStatLearn; confirm before
# dropping the library() call.
library(ElemStatLearn)
set <- training_set
# Evaluation grid: observed range of each predictor widened by 1 on each side,
# sampled every 0.01.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('userId', 'movieId')
y_grid <- knn(train = training_set[, -3], test = grid_set, cl = training_set[, 3], k = 5)
# One colour per rating level. The previous `== 1` test was a two-class idiom
# that drew every rating other than "1" in a single colour.
class_cols <- rainbow(nlevels(set[, 3]))
region_cols <- adjustcolor(class_cols, alpha.f = 0.4)
plot(set[, -3],
     main = 'K-NN (Training set)',
     xlab = 'User Id', ylab = 'Movie Id',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# Grid points show the predicted class; filled circles show the actual class.
points(grid_set, pch = '.', col = region_cols[as.integer(y_grid)])
points(set[, -3], pch = 21, bg = class_cols[as.integer(set[, 3])])
legend('topright', legend = levels(set[, 3]), pch = 21, pt.bg = class_cols,
       title = 'rating', cex = 0.7)
# Plot the K-NN predicted rating regions together with the test points.
# NOTE(review): nothing in this block calls ElemStatLearn; confirm before
# dropping the library() call.
library(ElemStatLearn)
set <- test_set
# Evaluation grid: observed range of each predictor widened by 1 on each side,
# sampled every 0.01.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('userId', 'movieId')
y_grid <- knn(train = training_set[, -3], test = grid_set, cl = training_set[, 3], k = 5)
# One colour per rating level. The previous `== 1` test was a two-class idiom
# that drew every rating other than "1" in a single colour.
class_cols <- rainbow(nlevels(set[, 3]))
region_cols <- adjustcolor(class_cols, alpha.f = 0.4)
plot(set[, -3],
     main = 'K-NN (Test set)',
     xlab = 'User Id', ylab = 'Movie Id',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# Grid points show the predicted class; filled circles show the actual class.
points(grid_set, pch = '.', col = region_cols[as.integer(y_grid)])
points(set[, -3], pch = 21, bg = class_cols[as.integer(set[, 3])])
legend('topright', legend = levels(set[, 3]), pch = 21, pt.bg = class_cols,
       title = 'rating', cex = 0.7)
# 10-fold cross-validation of the K-NN classifier (k = 5) on the training set.
# This block used to be a verbatim copy of the SVM cross-validation and fitted
# svm() in every fold, so the figure it produced was not a K-NN accuracy.
folds <- createFolds(training_set$rating, k = 10)
cv <- lapply(folds, function(x) {
  # x holds the row indices of the held-out fold.
  training_fold <- training_set[-x, ]
  test_fold <- training_set[x, ]
  # Same K-NN settings as the model fitted on the full training set.
  y_pred <- class::knn(train = training_fold[, -3],
                       test = test_fold[, -3],
                       cl = training_fold[, 3],
                       k = 5)
  cm <- table(test_fold[, 3], y_pred)
  # Correct predictions lie on the diagonal of the confusion matrix.
  sum(diag(cm)) / sum(cm)
})
accuracy <- mean(as.numeric(cv))
accuracy
# NOTE: the value recorded below came from the earlier SVM-based version of
# this block; re-run the script to obtain the K-NN accuracy.
## [1] 0.3697651
Here the K-NN model performs slightly better than the SVM (36.98% > 36.51%).
Citation
Kirill Eremenko and others, UDEMY