# Install any missing package, then attach it. `library()` is called
# unconditionally so that a package installed on this run is also loaded
# (the `if (!require(...)) install.packages(...)` form installs but never
# attaches), and so that a failed install stops the script with an error.
if (!requireNamespace("knitr", quietly = TRUE)) install.packages("knitr")
library(knitr)
if (!requireNamespace("tidyverse", quietly = TRUE)) install.packages("tidyverse")
library(tidyverse)
if (!requireNamespace("kableExtra", quietly = TRUE)) install.packages("kableExtra")
library(kableExtra)
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
library(dplyr)
Briefly describe the recommender system that you’re going to build out from a business perspective, e.g. “This system recommends data science books to readers.”
This system recommends movies that some family members have watched to the other members of the family.
Find a dataset, or build out your own toy dataset. As a minimum requirement for complexity, please include numeric ratings for at least five users, across at least five items, with some missing data.
For the dataset, I ran a survey within my family and asked each member to rate a set of movies that were picked by family members.
There are 6 members and 8 movies.
Load your data into (for example) an R or pandas dataframe, a Python dictionary or list of lists, (or another data structure of your choosing).
# Read the survey ratings (columns Movie_Name, User_Name, Ratings) from GitHub
results <- read.csv(
  file = "https://raw.githubusercontent.com/Vinayak234/Data612/master/movie_ratings.csv",
  header = TRUE
)
results %>% kable()%>%
kable_styling(bootstrap_options = "striped", full_width = F)

| Movie_Name | User_Name | Ratings |
|---|---|---|
| Company(2002) | Member1 | NA |
| Deewaar(1975) | Member1 | 4.5 |
| Dil Chahta Hai(2001) | Member1 | 4.0 |
| Gully Boy(2019) | Member1 | NA |
| Mr. India(1987) | Member1 | 3.0 |
| Mughal-e-Azam(1960) | Member1 | 4.5 |
| Sholay(1975) | Member1 | 5.0 |
| Swades(2004) | Member1 | 3.0 |
| Company(2002) | Member2 | NA |
| Deewaar(1975) | Member2 | 3.0 |
| Dil Chahta Hai(2001) | Member2 | NA |
| Gully Boy(2019) | Member2 | 2.5 |
| Mr. India(1987) | Member2 | 4.0 |
| Mughal-e-Azam(1960) | Member2 | 3.0 |
| Sholay(1975) | Member2 | 4.0 |
| Swades(2004) | Member2 | NA |
| Company(2002) | Member3 | 5.0 |
| Deewaar(1975) | Member3 | NA |
| Dil Chahta Hai(2001) | Member3 | 4.5 |
| Gully Boy(2019) | Member3 | 3.0 |
| Mr. India(1987) | Member3 | 4.5 |
| Mughal-e-Azam(1960) | Member3 | 4.0 |
| Sholay(1975) | Member3 | 4.0 |
| Swades(2004) | Member3 | 4.0 |
| Company(2002) | Member4 | 4.0 |
| Deewaar(1975) | Member4 | NA |
| Dil Chahta Hai(2001) | Member4 | 4.5 |
| Gully Boy(2019) | Member4 | 3.0 |
| Mr. India(1987) | Member4 | 3.0 |
| Mughal-e-Azam(1960) | Member4 | 3.5 |
| Sholay(1975) | Member4 | 4.0 |
| Swades(2004) | Member4 | 4.0 |
| Company(2002) | Member5 | 2.0 |
| Deewaar(1975) | Member5 | 4.0 |
| Dil Chahta Hai(2001) | Member5 | 4.5 |
| Gully Boy(2019) | Member5 | 2.0 |
| Mr. India(1987) | Member5 | 3.0 |
| Mughal-e-Azam(1960) | Member5 | 3.0 |
| Sholay(1975) | Member5 | 4.0 |
| Swades(2004) | Member5 | 3.0 |
| Company(2002) | Member6 | 4.5 |
| Deewaar(1975) | Member6 | NA |
| Dil Chahta Hai(2001) | Member6 | 4.5 |
| Gully Boy(2019) | Member6 | 4.0 |
| Mr. India(1987) | Member6 | 2.5 |
| Mughal-e-Azam(1960) | Member6 | NA |
| Sholay(1975) | Member6 | 4.0 |
| Swades(2004) | Member6 | 4.0 |
The dimensions of the results dataframe are (48, 3)
## 'data.frame': 48 obs. of 3 variables:
## $ Movie_Name: Factor w/ 8 levels "Company(2002)",..: 1 2 3 4 5 6 7 8 1 2 ...
## $ User_Name : Factor w/ 6 levels "Member1","Member2",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ Ratings : num NA 4.5 4 NA 3 4.5 5 3 NA 3 ...
## Movie_Name User_Name Ratings
## Company(2002) : 6 Member1:8 Min. :2.000
## Deewaar(1975) : 6 Member2:8 1st Qu.:3.000
## Dil Chahta Hai(2001): 6 Member3:8 Median :4.000
## Gully Boy(2019) : 6 Member4:8 Mean :3.705
## Mr. India(1987) : 6 Member5:8 3rd Qu.:4.250
## Mughal-e-Azam(1960) : 6 Member6:8 Max. :5.000
## (Other) :12 NA's :9
I use pivot_wider to convert the above long format into a table with 6 rows (one per family member) and 8 columns (one per movie).
#### Reshape the long ratings table into a user-item table:
#### one row per User_Name (moved into the row names), one column per Movie_Name
UI <- results %>%
  pivot_wider(names_from = Movie_Name, values_from = Ratings) %>%
  column_to_rownames("User_Name")
I will make a two-column matrix of (row, column) index pairs, which will facilitate extracting the desired test elements from the overall matrix.
### Hold out eight (row, column) cells of the user-item table as the test set.
### test_extractor is a two-column index matrix: each row addresses one cell.
test_rows <- c(1, 2, 3, 4, 5, 6, 5, 4)
test_cols <- c(3, 4, 6, 8, 5, 1, 2, 7)
test_extractor <- cbind(test_rows, test_cols)

### Work on a numeric matrix so that matrix indexing is well defined
### (matrix indexing directly on a data frame is discouraged).
UI_train <- as.matrix(UI)

### Test matrix: NA everywhere except the held-out cells. The cells are
### selected by position, not by value, so no other rating can leak in.
### This copy must be taken before the held-out cells are blanked below.
UI_test <- UI_train
UI_test[] <- NA
UI_test[test_extractor] <- UI_train[test_extractor]

### Training matrix: the full ratings with the held-out cells removed.
UI_train[test_extractor] <- NA
### display UI_train
UI_train %>% kable(caption = "TRAINING MATRIX")%>%
kable_styling(bootstrap_options = "striped", full_width = F)

| | Company(2002) | Deewaar(1975) | Dil Chahta Hai(2001) | Gully Boy(2019) | Mr. India(1987) | Mughal-e-Azam(1960) | Sholay(1975) | Swades(2004) |
|---|---|---|---|---|---|---|---|---|
| Member1 | NA | 4.5 | NA | NA | 3.0 | 4.5 | 5 | 3 |
| Member2 | NA | 3.0 | NA | NA | 4.0 | 3.0 | 4 | NA |
| Member3 | 5 | NA | 4.5 | 3 | 4.5 | NA | 4 | 4 |
| Member4 | 4 | NA | 4.5 | 3 | 3.0 | 3.5 | NA | NA |
| Member5 | 2 | NA | 4.5 | 2 | NA | 3.0 | 4 | 3 |
| Member6 | NA | NA | 4.5 | 4 | 2.5 | NA | 4 | 4 |
### display UI_test
UI_test %>% kable(caption = "TEST MATRIX")%>%
kable_styling(bootstrap_options = "striped", full_width = F)

| | Company(2002) | Deewaar(1975) | Dil Chahta Hai(2001) | Gully Boy(2019) | Mr. India(1987) | Mughal-e-Azam(1960) | Sholay(1975) | Swades(2004) |
|---|---|---|---|---|---|---|---|---|
| Member1 | NA | NA | 4 | NA | NA | NA | NA | NA |
| Member2 | NA | NA | NA | 2.5 | NA | NA | NA | NA |
| Member3 | NA | NA | NA | NA | NA | 4 | NA | NA |
| Member4 | NA | NA | NA | NA | NA | NA | 4 | 4 |
| Member5 | NA | 4 | NA | NA | 3 | NA | NA | NA |
| Member6 | 4.5 | NA | NA | NA | NA | NA | NA | NA |
### Raw average: the mean of every non-missing rating in the training matrix.
### mean_value is needed below and by the bias calculations.
mean_value <- mean(UI_train, na.rm = TRUE)
mean_value
## [1] 3.693548
### make a matrix with same rownames and colnames as UI_train, but replace the values
mean_rating <- UI_train
### empty brackets overwrite every cell while keeping dim and dimnames
mean_rating[] <- mean_value
mean_rating %>% kable(caption = "MEAN-RATING MATRIX")%>%
kable_styling(bootstrap_options = "striped", full_width = F)

| | Company(2002) | Deewaar(1975) | Dil Chahta Hai(2001) | Gully Boy(2019) | Mr. India(1987) | Mughal-e-Azam(1960) | Sholay(1975) | Swades(2004) |
|---|---|---|---|---|---|---|---|---|
| Member1 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 |
| Member2 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 |
| Member3 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 |
| Member4 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 |
| Member5 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 |
| Member6 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 | 3.693548 |
### Training RMSE of the raw-average predictor.
### Cells that are NA in UI_train drop out of the mean via na.rm = TRUE.
train_RMSE_raw <- sqrt(mean((UI_train - mean_rating)^2, na.rm = TRUE))
train_RMSE_raw
## [1] 0.8099922
### Test RMSE of the raw-average predictor (only the held-out cells are non-NA).
### test_RMSE_raw is needed later for the improvement calculation.
test_RMSE_raw <- sqrt(mean((UI_test - mean_rating)^2, na.rm = TRUE))
test_RMSE_raw
## [1] 0.6149689
### Movie bias: each movie's mean training rating (column mean, ignoring NA)
### minus the overall mean rating.
movie_bias <- colMeans(UI_train, na.rm = TRUE) - mean_value
movie_bias %>% t %>% t %>% kable(caption = "MOVIE BIAS")%>%
kable_styling(bootstrap_options = "striped", full_width = F)

| | |
|---|---|
| Company(2002) | -0.0268817 |
| Deewaar(1975) | 0.0564516 |
| Dil Chahta Hai(2001) | 0.8064516 |
| Gully Boy(2019) | -0.6935484 |
| Mr. India(1987) | -0.2935484 |
| Mughal-e-Azam(1960) | -0.1935484 |
| Sholay(1975) | 0.5064516 |
| Swades(2004) | -0.1935484 |
### User bias: each member's mean training rating (row mean, ignoring NA)
### minus the overall mean rating.
user_bias <- rowMeans(UI_train, na.rm = TRUE) - mean_value
user_bias %>% t %>% t %>% kable(caption = "USER BIAS")%>%
kable_styling(bootstrap_options = "striped", full_width = F)

| | |
|---|---|
| Member1 | 0.3064516 |
| Member2 | -0.1935484 |
| Member3 | 0.4731183 |
| Member4 | -0.0935484 |
| Member5 | -0.6102151 |
| Member6 | 0.1064516 |
### Baseline predictor: mean rating + movie bias + user bias for every
### (member, movie) cell, clamped to the valid rating range.
minrating <- 1
maxrating <- 5
### Start from the matrix of the mean_rating, add movie_bias down each column
### (MARGIN = 2), then user_bias across each row (MARGIN = 1). This keeps the
### same addition order as a cell-by-cell mean + movie_bias[c] + user_bias[r].
baseline_predictor <- sweep(mean_rating, 2, movie_bias, "+")
baseline_predictor <- sweep(baseline_predictor, 1, user_bias, "+")
### We have to ensure that the results are in the range [minrating, maxrating],
### which is why we have the pmin(pmax()) wrapper; both keep the matrix's
### dim and dimnames.
baseline_predictor <- pmin(pmax(baseline_predictor, minrating), maxrating)
baseline_predictor %>% kable()%>%
kable_styling(bootstrap_options = "striped", full_width = F)

| | Company(2002) | Deewaar(1975) | Dil Chahta Hai(2001) | Gully Boy(2019) | Mr. India(1987) | Mughal-e-Azam(1960) | Sholay(1975) | Swades(2004) |
|---|---|---|---|---|---|---|---|---|
| Member1 | 3.973118 | 4.056452 | 4.806452 | 3.306452 | 3.706452 | 3.806452 | 4.506452 | 3.806452 |
| Member2 | 3.473118 | 3.556452 | 4.306452 | 2.806452 | 3.206452 | 3.306452 | 4.006452 | 3.306452 |
| Member3 | 4.139785 | 4.223118 | 4.973118 | 3.473118 | 3.873118 | 3.973118 | 4.673118 | 3.973118 |
| Member4 | 3.573118 | 3.656452 | 4.406452 | 2.906452 | 3.306452 | 3.406452 | 4.106452 | 3.406452 |
| Member5 | 3.056452 | 3.139785 | 3.889785 | 2.389785 | 2.789785 | 2.889785 | 3.589785 | 2.889785 |
| Member6 | 3.773118 | 3.856452 | 4.606452 | 3.106452 | 3.506452 | 3.606452 | 4.306452 | 3.606452 |
### Training RMSE of the baseline predictor.
### Cells that are NA in UI_train drop out of the mean via na.rm = TRUE.
train_RMSE_baseline <- sqrt(
  mean((UI_train - baseline_predictor)^2, na.rm = TRUE)
)
train_RMSE_baseline
## [1] 0.5490571
### Test RMSE of the baseline predictor (only the held-out cells are non-NA).
test_RMSE_baseline <- sqrt(
  mean((UI_test - baseline_predictor)^2, na.rm = TRUE)
)
test_RMSE_baseline
## [1] 0.5501305
Let's calculate the percentage improvements based on the original (simple average) and baseline predictor (including bias) RMSE numbers for both the Test and Train data sets.
### improvement in TRAIN RMSE when moving from raw average to baseline predictor
### (fraction by which the RMSE shrinks: 1 - baseline / raw)
train_RMSE_improvement <- 1 - train_RMSE_baseline / train_RMSE_raw
train_RMSE_improvement
## [1] 0.3221452
### improvement in Test RMSE when moving from raw average to baseline predictor
test_RMSE_improvement <- 1 - test_RMSE_baseline / test_RMSE_raw
test_RMSE_improvement
## [1] 0.1054337
The training RMSE declined from 0.81 to 0.549, which is an improvement of 32.215 percent.
The testing RMSE declined from 0.615 to 0.55, which is an improvement of 10.543 percent.