Data607-Assignment2

Data table setup

We use three data tables one movies, one for raters and then a join table for ratings.

dbExecute(conn, "CREATE TABLE IF NOT EXISTS movies(
    movie_id  INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    movie_name VARCHAR NOT NULL
);")

## [1] 0

dbExecute(conn, "CREATE TABLE IF NOT EXISTS raters(
    rater_id  INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    rater_name VARCHAR NOT NULL
);")

## [1] 0

dbExecute(conn, "CREATE TABLE IF NOT EXISTS movieratings(
    rating_id  INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    rater_id INT NOT NULL REFERENCES raters(rater_id),
    movie_id INT NOT NULL REFERENCES movies(movie_id),
    rating decimal
);")

## [1] 0

Table setup

Movies

field	type
movie_id	int (auto)
movie_name	varchar

Raters

field	type
rater_id	int (auto)
rater_name	varchar

MovieRatings

field	type
rating_id	int (auto)
rater_id	int (fk)
movie_id	int (fk)
rating	decimal

get some names

We shall grab some name data from NYC and sample a number of them

names <- read.csv("https://data.cityofnewyork.us/api/views/25th-nujf/rows.csv?accessType=DOWNLOAD")
someNames <- sample(as.character(names[,4]),50)
head(someNames)

## [1] "Matthew" "Leon"    "Isla"    "IBRAHIM" "MAXWELL" "CHELSEA"

We then write them to the database, and read it back for the IDs.

dbBegin(conn)

## [1] TRUE

tblRaters <- as.data.frame(someNames)
names(tblRaters) <- c("rater_name")
dbExecute(conn,sqlAppendTable(conn, "raters", tblRaters,row.names = FALSE ))

## [1] 50

dbCommit(conn)

## [1] TRUE

tblRaters = dbReadTable(conn,"raters")
head(tblRaters)

##   rater_id rater_name
## 1        1    Matthew
## 2        2       Leon
## 3        3       Isla
## 4        4    IBRAHIM
## 5        5    MAXWELL
## 6        6    CHELSEA

Get some movies

We get some movies sample them and load them into a database table much like names

load(url("https://stat.duke.edu/~mc301/data/movies.Rdata"))
someMovies <- sample(as.character(movies$title),6)
head(someMovies)

## [1] "Man on Wire"       "Cop & 1/2"         "Heaven & Earth"   
## [4] "Battlefield Earth" "Dirty Love"        "Volcano"

dbBegin(conn)

## [1] TRUE

tblMovies <- as.data.frame(someMovies)
names(tblMovies) <- c("movie_name")
dbExecute(conn,sqlAppendTable(conn, "movies", tblMovies,row.names = FALSE ))

## [1] 6

dbCommit(conn)

## [1] TRUE

tblMovies = dbReadTable(conn,"movies")
kable(tblMovies)

movie_id	movie_name
1	Man on Wire
2	Cop & 1/2
3	Heaven & Earth
4	Battlefield Earth
5	Dirty Love
6	Volcano

Add some ratings

To generate some ratings we first cross apply movie_id and rater_id and then provide ratings up to 6. We then take any over 5 (our upper bound) to be nulls.

tblRatings<-expand.grid(tblMovies$movie_id,tblRaters$rater_id)
names(tblRatings) <- c("movie_id", "rater_id")
tblRatings$rating <-  sample(6, size = nrow(tblRatings), replace = TRUE)
tblRatings[which(tblRatings["rating"] > 5),]$rating <- NA
tail(tblRatings)

##     movie_id rater_id rating
## 295        1       50      1
## 296        2       50      1
## 297        3       50      5
## 298        4       50      5
## 299        5       50      5
## 300        6       50      2

We then store and fetch our ratings

dbBegin(conn)

## [1] TRUE

dbExecute(conn,sqlAppendTable(conn, "movieratings", tblRatings,row.names = FALSE ))

## [1] 300

dbCommit(conn)

## [1] TRUE

tblRatings = dbReadTable(conn,"movieratings")
kable(head(tblRatings))

rating_id	rater_id	movie_id	rating
1	1	1	2
2	1	2	1
3	1	3	3
4	1	4	2
5	1	5	NA
6	1	6	3

Joining and getting summary data

If we want we can get some summary data for our ratings but first we must join them.

tblAvg <- dbGetQuery(conn,"SELECT movie_name, avg(rating) as avgrating
   from movieratings INNER JOIN movies ON movieratings.movie_id = movies.movie_id  INNER JOIN raters ON movieratings.rater_id = raters.rater_id  GROUP BY movie_name ORDER BY movie_name")
kable(head(tblAvg))

movie_name	avgrating
Battlefield Earth	2.707317
Cop & 1/2	2.930233
Dirty Love	2.707317
Heaven & Earth	3.078947
Man on Wire	3.000000
Volcano	2.690476