library(magrittr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(dplyr)
# Read the class movie-ratings table (first column X holds the movie title,
# the remaining columns hold one rater each) and keep a short working alias.
movie.rating <- read.csv("C:\\DATA607\\Week912 Material\\Movie_Ratings.csv")
m.r <- movie.rating
# Quick-look helpers, left disabled:
# str(m.r)
# head(m.r)
# tail(m.r)

# Euclidean distance between Patrick.C and Bryan, restricted to the rows
# where both of them have a rating.
m.r %>%
  filter(!is.na(Patrick.C) & !is.na(Bryan)) %>%
  summarise(distance = sqrt(sum((Patrick.C - Bryan)^2)))
## distance
## 1 4.472136
# Euclidean distance between every pair of raters.
#
# dist() measures distances between the ROWS of its input, and the rows of
# m.x are movies. The neighbours reported below are raters (columns), so the
# table is transposed first. dist() already returns the Euclidean distance, so
# no further sqrt() is applied. Where ratings are missing, dist() uses the
# jointly observed movies and rescales the sum to the full number of movies.
m.x <- select(m.r, -X)
ed <- dist(t(m.x), upper = TRUE)

# Number of raters other than the first one (Patrick.C).
n.others <- ncol(m.x) - 1

# eds: lower-triangular table, column i holding the distances from rater i to
# raters i + 1, ..., last. Cells above the diagonal keep the placeholder 6
# used by the original code. The values are taken from the full symmetric
# matrix rather than by hand-computed offsets into the packed dist vector.
full.dist <- as.matrix(ed)
eds <- matrix(6, nrow = n.others, ncol = n.others)
below <- lower.tri(eds, diag = TRUE)
eds[below] <- full.dist[-1, -ncol(full.dist), drop = FALSE][below]

# The first n.others entries of a dist object are the distances from the
# first item (Patrick.C) to every other item, in column order.
e.dist <- ed[seq_len(n.others)]
Patrick.C <- data.frame(e.dist, neighbor = colnames(m.x)[-1])
# NOTE: the recorded output that follows predates this fix (it was produced
# from sqrt(dist(m.x)), i.e. distances between movies); re-knit to refresh it.
arrange(Patrick.C, e.dist)
## e.dist neighbor
## 1 2.080896 ben
## 2 2.340347 Chris
## 3 2.340347 Stephen
## 4 2.432299 Chris.1
## 5 2.514867 vanessa
## 6 2.514867 Gary
## 7 2.571843 Thomas
## 8 2.571843 Amy
## 9 2.610336 Bryan
## 10 2.610336 Matt
## 11 2.695335 Patrick.T
## 12 2.811707 Josh
## 13 2.827199 Erin
## 14 2.886751 greg
## 15 2.977256 Heather
## 16 3.010526 Jonathan
## 17 3.070520 aaron
## 18 3.117411 Katherine
## 19 3.162278 Zak
## 20 3.201086 Zwe
## 21 3.343702 brian
## 22 3.364407 Jessica
## 23 3.384736 Jeff
## 24 3.899603 Valerie
# Movies that ben or Chris rated 5, shown next to Patrick.C's own rating.
# which() keeps only the rows where the condition is TRUE; indexing with the
# raw logical vector turned every NA comparison (a movie neither of them
# rated) into an all-NA phantom row.
top.rated <- which(m.r$ben == 5 | m.r$Chris == 5)
pat.c <- m.r[top.rated, c("X", "Patrick.C", "ben", "Chris")]
# Of those, the movies Patrick.C has not rated yet.
filter(pat.c, is.na(Patrick.C))
## X Patrick.C ben Chris
## 1 Gladiator NA 5 3
## 2 Pulp Fiction NA 4 5
# Euclidean distance between two rating vectors.
#
# A, B: numeric vectors of ratings; positions where either value is NA are
#       left out of the sum.
# Returns: sqrt(sum((A - B)^2)) over the positions observed in both vectors,
#          or NA_real_ when there is no such position. Without that guard an
#          empty overlap summed to 0, which reads as "identical ratings".
squaroot <- function(A, B) {
  sq.diff <- (A - B)^2
  if (all(is.na(sq.diff))) {
    return(NA_real_)
  }
  sqrt(sum(sq.diff, na.rm = TRUE))
}
squaroot(m.r$Patrick.C, m.r$Heather)
## [1] 4.123106
# Full rater-by-rater distance matrix built with squaroot().
# Every column of m.r except the first (the movie title X) is a rater.
# Column a of euclidist holds the distances from rater a to every rater, so
# the matrix is symmetric with a zero diagonal. The original seeded each later
# column with the distance to m.r[, 3] instead of m.r[, 2], which corrupted
# the first row.
rater.cols <- seq_len(ncol(m.r))[-1]
n.raters <- length(rater.cols)
rater.names <- colnames(m.r)[rater.cols]

# Preallocate instead of growing a vector with c() inside the loops.
euclidist <- matrix(
  NA_real_,
  nrow = n.raters, ncol = n.raters,
  dimnames = list(rater.names, rater.names)
)
for (a in seq_len(n.raters)) {
  for (b in seq_len(n.raters)) {
    euclidist[b, a] <- squaroot(m.r[, rater.cols[a]], m.r[, rater.cols[b]])
  }
}

# `distance` is kept for compatibility: as before, it ends up holding the
# distances from the last rater to every rater.
distance <- euclidist[, n.raters]
# Switch to the large ratings file; from here on m.r refers to this table
# (one row per rating: userId, movieId, rating, timestamp).
m.r <- movie.rating <- read.csv("C:\\DATA607\\Project4\\ml-20m\\ratings.csv")
str(m.r)
## 'data.frame': 20000263 obs. of 4 variables:
## $ userId : int 1 1 1 1 1 1 1 1 1 1 ...
## $ movieId : int 2 29 32 47 50 112 151 223 253 260 ...
## $ rating : num 3.5 3.5 3.5 3.5 3.5 3.5 4 4 4 4 ...
## $ timestamp: int 1112486027 1112484676 1112484819 1112484727 1112484580 1094785740 1094785734 1112485573 1112484940 1112484826 ...
# Mean rating given by each user (one row per userId).
Ru.hat <- summarise(
  group_by(m.r, userId),
  Ru.hat = mean(rating, na.rm = TRUE)
)
str(Ru.hat)
## Classes 'tbl_df', 'tbl' and 'data.frame': 138493 obs. of 2 variables:
## $ userId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Ru.hat: num 3.74 4 4.12 3.57 4.27 ...
# Attach each user's mean to every one of that user's ratings.
m.r <- merge(x = m.r, y = Ru.hat, by = "userId")
Computing each rating's deviation from its user's mean rating, \(x-\bar x\), and the squared deviation, \((x-\bar x)^2\):
# D.Ru: rating minus the user's mean rating; DS.Ru: that deviation squared.
m.r <- mutate(
  m.r,
  D.Ru = rating - Ru.hat,
  DS.Ru = D.Ru^2
)
# Number of ratings per movie, one row per movieId.
mvi <- summarise(group_by(m.r, movieId), nm = n())

# Worked example on the first two movies listed in mvi: pair up the users who
# rated both and correlate their mean-centred ratings.
ma1 <- m.r[which(m.r$movieId == mvi$movieId[1]), ]
ma2 <- m.r[which(m.r$movieId == mvi$movieId[2]), ]
mer <- merge(x = ma1, y = ma2, by = "userId")
s <- with(mer, cor(D.Ru.x, D.Ru.y))
# Pairwise movie similarity: Pearson correlation of the mean-centred ratings
# (D.Ru) over the users who rated both movies.
#
# The original bounds `1:length(mvi-1)` and `i+1:length(mvi)` did not mean
# what they appear to: length() of a data frame is its column count (2 here),
# and `:` binds tighter than `+`, so `i+1:2` is c(i + 1, i + 2). The loop
# therefore visited only the pairs (1,2), (1,3), (2,3) and (2,4), and `s` also
# carried a duplicate of the (1,2) value. The loop below visits every
# unordered pair among the first n.compare movies exactly once.
#
# n.compare stays at 4, the movies the original loop actually touched; raise
# it deliberately, since the number of pairs grows quadratically.
n.compare <- min(nrow(mvi), 4L)

# Extract each movie's ratings once instead of re-scanning m.r for every pair.
movie.rows <- lapply(
  mvi$movieId[seq_len(n.compare)],
  function(id) m.r[which(m.r$movieId == id), ]
)

# Correlation of centred ratings for two movies; NA when fewer than two users
# rated both, because a correlation is undefined there.
centred.cor <- function(a, b) {
  both <- merge(a, b, by = "userId")
  if (nrow(both) < 2) {
    return(NA_real_)
  }
  cor(both$D.Ru.x, both$D.Ru.y)
}

# s holds one correlation per pair, in the order (1,2), (1,3), ..., (3,4).
s <- numeric(choose(n.compare, 2))
k <- 0L
for (i in seq_len(max(n.compare - 1L, 0L))) {
  for (j in seq(i + 1L, n.compare)) {
    k <- k + 1L
    s[k] <- centred.cor(movie.rows[[i]], movie.rows[[j]])
  }
}

# When the original loop finished, ma1 and ma2 held the 2nd and 4th movies in
# mvi. Select that pair explicitly so this check no longer depends on leftover
# loop variables.
ma1 <- subset(m.r, m.r$movieId == mvi$movieId[2])
ma2 <- subset(m.r, m.r$movieId == mvi$movieId[4])
mering <- merge(ma1, ma2, by = "userId")
cor(mering$D.Ru.x, mering$D.Ru.y)
## [1] 0.08631851
For example, movie 2 and movie 29 are similar. The movie.csv file lists every movie's name with its corresponding ID: Jumanji and The City of Lost Children are both children's adventure movies with fantasy elements.