This system recommends data science books to students in a CUNY class according to their ratings.
The dataset used here consists of book ratings given by students of the CUNY MSDA program. There are around 12 users and 9 books. Each user has rated some, but not necessarily all, of these books. This dataset will be used to make predictions using baseline prediction methods.
# Load the survey export; blank cells and "Did not read" both mean "no rating"
ratings_na <- read.csv(
  "data/book_ratings.csv",
  header = TRUE,
  sep = ",",
  na.strings = c("Did not read", "")
)
datatable(ratings_na, options = list(scrollX = "400px"))
# Give the columns temporary names (user id first, then books 1..9) and
# drop the id column so that only the ratings remain
ratings_na <- ratings_na %>%
  setNames(c("userId", 1:9)) %>%
  select(-userId)
## Warning in combine_vars(vars, ind_list): '.Random.seed' is not an integer
## vector but of type 'NULL', so ignored
# Keep an untouched copy of all ratings; it is scored against later on
full_data <- ratings_na
After analyzing the dataset, we can see that it is fairly dense, although each user is missing some ratings.
Because this is a small dataset, we allocate 30% of the rows (users) to the test dataset. The remaining rows form the training dataset.
# Fixed seed so the same rows are held out on every run
set.seed(10)
# Randomly pick 30% of the rows (rounded down) of the full data as the test set
test_rows <- sort(sample(nrow(ratings_na), floor(nrow(ratings_na) * .30)))
test <- ratings_na[test_rows, ]
# Everything that was not held out becomes the training set. The complement is
# selected with positive indices on purpose: `x[-integer(0), ]` returns zero
# rows, so negative indexing would empty the training set whenever no test
# rows are drawn (i.e. with fewer than 4 rows of data).
ratings_na <- ratings_na[setdiff(seq_len(nrow(ratings_na)), test_rows), ]
Calculate the raw average of the training dataset by converting it into a matrix and averaging all non-missing ratings.
# Average of every non-missing rating in the training set
train_raw_mean <- mean(as.matrix(ratings_na), na.rm = TRUE)
train_raw_mean
## [1] 4.064516
Once we have the raw average, we can use it as a constant prediction and calculate the RMSE for the training and test datasets.
# Root-mean-square error between observed ratings and a prediction.
#
# df:       data frame (or matrix) of observed ratings; NA marks "not rated".
# mean_val: the prediction to score - either a single number (e.g. the raw
#           average) or a data frame / matrix with the same shape as `df`.
#
# Returns a single number. Only cells where both the rating and the prediction
# are present contribute to the error. Missing ratings are deliberately NOT
# replaced by 0: a made-up rating of 0 would be scored as a real observation
# and inflate the error for every unrated cell.
# NOTE: RMSE values printed elsewhere in this document were produced with the
# earlier zero-fill approach and will change once the analysis is re-run.
rmse_calc <- function(df, mean_val) {
  actual <- as.matrix(df)
  # The prediction may be a data frame (per-cell values) or a plain number
  predicted <- if (is.data.frame(mean_val)) as.matrix(mean_val) else mean_val
  squared_error <- (actual - predicted)^2
  sqrt(mean(squared_error, na.rm = TRUE))
}
# RMSE of the raw-average prediction on the training dataset
rmse_calc(df = ratings_na, mean_val = train_raw_mean)
## [1] 3.269548
# RMSE of the same raw-average prediction on the test dataset
rmse_calc(df = test, mean_val = train_raw_mean)
## [1] 3.258201
To calculate the baseline predictor, we first need to calculate the biases. A bias is calculated for each user and for each book, as that user's (or book's) mean rating minus the raw average.
# Baseline predictor: raw mean + user bias + book bias for every user/book cell.
#
# df:       data frame of ratings, one row per user and one column per book;
#           NA marks a missing rating.
# raw_mean: the overall average the biases are measured against. Defaults to
#           the global training average `train_raw_mean`, so existing calls
#           with a single argument behave as before.
#
# Returns a data frame with the same number of rows and columns as `df`,
# columns named X1, X2, ... and default row names. A user or book with no
# ratings at all yields NaN predictions for that row / column.
baseline_predictor <- function(df, raw_mean = train_raw_mean) {
  # User bias: mean of each user's ratings minus the raw mean
  user_bias <- rowMeans(df, na.rm = TRUE) - raw_mean
  # Book bias: mean of each book's ratings minus the raw mean
  book_bias <- colMeans(df, na.rm = TRUE) - raw_mean
  # outer() fills the whole user x book grid at once instead of growing a
  # data frame with rbind() row by row; it also copes with zero-row input,
  # which a `1:nrow(df)` loop does not.
  predicted <- outer(raw_mean + user_bias, book_bias, "+")
  # Drop the dimnames inherited from `df` so row names are the default 1..n
  predicted <- as.data.frame(unname(predicted))
  names(predicted) <- paste0("X", seq_len(ncol(predicted)))
  predicted
}
Calculate the RMSE by comparing the baseline predictions with the original training, test and full datasets.
# Baseline-predictor RMSE on the training dataset
rmse_calc(
  df = ratings_na,
  mean_val = baseline_predictor(ratings_na)
)
## [1] 2.346029
# Baseline-predictor RMSE on the test dataset
rmse_calc(
  df = test,
  mean_val = baseline_predictor(test)
)
## [1] 3.028421
# Baseline-predictor RMSE on the full dataset
rmse_calc(
  df = full_data,
  mean_val = baseline_predictor(full_data)
)
## [1] 2.92825
Below is a summary of the baseline predictions for every user and book, limited to the rating range of 1 to 5.
# Baseline prediction for every user/book cell of the full dataset
prediction <- baseline_predictor(full_data)
# Clip predictions into the range 1..5; missing predictions are left untouched
prediction[!is.na(prediction) & prediction > 5] <- 5
prediction[!is.na(prediction) & prediction < 1] <- 1
# Replace the placeholder column names with the book titles
prediction <- setNames(
  prediction,
  c(
    "Signal and the Noise",
    "Data Science for Business",
    "Automated Data Collection with R",
    "R for Data Science",
    "Introduction to Statistical Learning",
    "Machine Learning with Scikit-Learn and Tensorflow",
    "Weapons of Math Destruction",
    "Programming Collective Intelligence",
    "R in Action"
  )
)
# Student names in row order; the variable name also becomes the header of the
# first column in the table below
names <- c(
  "David Stern", "Andy", "Walt", "Dan Fanelli", "James T", "Robert Sellers",
  "Tulasi ", "Logan", "Shyam BV", "Yun", "Kumudini", "Jason Joseph"
)
datatable(cbind(names, prediction), options = list(scrollX = "400px"))
After calculating the predictions, we rank the books that each user has not yet rated according to their predicted score.
# (row, col) position of every missing rating in the full dataset
indexes <- data.frame(which(is.na(full_data), arr.ind = TRUE))
# Placeholder score column; it is filled with the predicted rating afterwards
indexes["score"] <- 0
# Fetch one predicted rating from the global `prediction` table.
# x: vector whose first element is the row (user) position and whose second
#    element is the column (book) position; further elements are ignored.
cal_recommendation <- function(x) {
  user_row <- x[1]
  book_col <- x[2]
  prediction[user_row, book_col]
}
# Look up the predicted score for every unrated (row, col) position
indexes["score"] <- apply(indexes, 1, cal_recommendation)
# Positions without a usable prediction get a score of 0
indexes["score"][is.na(indexes["score"])] <- 0
# Lookup tables that translate row / column positions into names and titles
names_lookup <- data.frame(
  row = c(1:12),
  names = c(
    "David Stern", "Andy", "Walt", "Dan Fanelli", "James T", "Robert Sellers",
    "Tulasi ", "Logan", "Shyam BV", "Yun", "Kumudini", "Jason Joseph"
  )
)
book_lookup <- data.frame(
  col = c(1:9),
  books = c(
    "Signal and the Noise",
    "Data Science for Business",
    "Automated Data Collection with R",
    "R for Data Science",
    "Introduction to Statistical Learning",
    "Machine Learning with Scikit-Learn and Tensorflow",
    "Weapons of Math Destruction",
    "Programming Collective Intelligence",
    "R in Action"
  )
)
# Attach names and titles, then rank each user's unrated books from the
# highest to the lowest predicted score
full_recommendations <- indexes %>%
  inner_join(names_lookup) %>%
  data.frame() %>%
  inner_join(book_lookup) %>%
  arrange(row, -score) %>%
  group_by(row) %>%
  mutate(rank = row_number()) %>%
  select(-col)
## Joining, by = "row"
## Joining, by = "col"
datatable(full_recommendations, options = list(scrollY = "300px"))