分析ml-100k里面的rating分布情况

# Load the raw ml-100k rating records from u.data into a data frame.
# Later code reads column 2 as the item id and column 3 as the rating value.
u <- read.table(file = "C:/Users/Zero/Desktop/学校课程学习/推荐系统/ml-100k/u.data")
# Build a logical mask marking which elements of `vect` occur in `values`.
#
# Args:
#   vect:   vector whose elements are tested.
#   values: vector of values to look for.
#
# Returns:
#   A logical vector of length(vect): TRUE where the element equals at least
#   one entry of `values`, FALSE otherwise. The result never contains NA, so
#   it is always safe to use as a row index (an NA in a logical index would
#   otherwise produce spurious NA rows). An NA in `vect` is TRUE only when
#   `values` itself contains NA.
findIndexByValues <- function(vect, values) {
  # `%in%` is the vectorised set-membership test; it replaces an explicit
  # loop that OR-ed one `vect == j` comparison per value.
  vect %in% values
}
# Count how many rating records each distinct value in column 2 of `u` has,
# ordered from the most frequent to the least frequent.
res <- sort(table(u[, 2]), decreasing = TRUE)
# Draw a histogram of the ratings given to items whose rating count lies in
# the half-open interval [lower, upper).
#
# Args:
#   lower:   inclusive lower bound on an item's number of ratings.
#   upper:   exclusive upper bound on an item's number of ratings
#            (default Inf, i.e. no upper limit).
#   ratings: data frame of rating records; column 2 is used as the item id
#            and column 3 as the rating value. Defaults to the global `u`.
#   counts:  named table/vector of ratings-per-item whose names are item ids.
#            Defaults to the global `res`.
#
# Returns:
#   Invisibly, the object returned by hist(); or invisible NULL when no item
#   falls inside the requested interval (an empty titled panel is drawn in
#   that case so that a multi-panel layout stays aligned).
dwf <- function(lower, upper = Inf, ratings = u, counts = res) {
  panel_title <- paste("#Rating in [", lower, ",", upper, ")")

  # Item ids are stored as the names of the count table.
  in_range <- counts >= lower & counts < upper
  item_ids <- as.integer(names(counts[in_range]))

  # `%in%` never yields NA, so the row selection cannot introduce NA rows.
  selected_ratings <- ratings[ratings[, 2] %in% item_ids, 3]

  # hist() stops with an error on an empty vector; draw a placeholder panel
  # instead so the remaining panels of the figure are still produced.
  if (length(selected_ratings) == 0) {
    plot.new()
    title(main = panel_title, xlab = "no items in this range", cex.lab = 2)
    return(invisible(NULL))
  }

  hist(selected_ratings, breaks = 10, labels = TRUE, main = panel_title,
    xlab = paste("std=", round(sd(selected_ratings), 2),
      "mean=", round(mean(selected_ratings), 2)), cex.lab = 2)
}

下面的直方图显示了不同热门程度的物品的rating分布(热门程度即一个物品获得的rating的数量)

# Lay the six histograms out on a 2 x 3 grid, one panel per popularity band,
# going from the most-rated items down to the least-rated ones.
par(mfrow = c(2, 3))
# Each lower bound is paired with the upper bound at the same position;
# the first band has no upper limit.
invisible(Map(dwf,
  c(400, 300, 200, 100, 50, 0),
  c(Inf, 400, 300, 200, 100, 50)))

plot of chunk unnamed-chunk-2