1 Database setup and data upload

Data was directly downloaded to R environment from provided Google Drive link as a .csv file. Initial data cleaning of provided raw files was carried out in R before uploading the data to the server. In detail, records in BX_Books and BX_Book_Ratings tables that were obviously erroneous in respect to ISBN code were cleaned and if necessary, dropped.

Data has been uploaded to the server (Amazon AWS RDS MySQL instance) by using R package DBI.

Following code chuck depicts the steps taken in data downloading, cleaning and uploading to the server. Of note, following code chunk also outlines the user credentials (AWS MySQL instance’s public endpoint address, root user and root user password).

Downloading, cleaning and writing BX_Book_Ratings table to database:

## Writing BX_Book_Ratings cvs file to a database table in Amazon MySQL database instance
BX_Book_Ratings <- read.csv(sprintf("https://docs.google.com/uc?id=%s&export=download", 
                                    "1hPRzifGuNosveSProVY0qqNLOvTUL3z9"), 
                                    stringsAsFactors = FALSE, 
                                    na.strings = c("NULL"), sep = ";")

names(BX_Book_Ratings) <- c("User_ID", "ISBN", "Book_Rating")


## Data cleaning to remove invalid ISBN numbers

## Keep only numeric characters from ISBN string and keep only valid ISBN strings of length 10 (old system) or 13 (new system)
BX_Book_Ratings_cleaned <- BX_Book_Ratings
BX_Book_Ratings_cleaned[,"ISBN"] <-  gsub("[^0-9]", "", BX_Book_Ratings_cleaned[,"ISBN"]) 
BX_Book_Ratings_cleaned <- BX_Book_Ratings_cleaned[nchar(BX_Book_Ratings_cleaned[,"ISBN"]) %in% c(10,13),]


con <- DBI::dbConnect(RMySQL::MySQL(),    
                      host="assignmentno1instance.ci9a2b8vvtvc.us-east-2.rds.amazonaws.com",
                      port=3306,
                      dbname="Assignment1DB",
                      user="UserMait",
                      password="Test77Test77")

DBI::dbWriteTable(con, "BX_Book_Ratings", BX_Book_Ratings_cleaned, overwrite = TRUE, row.names = FALSE)
DBI::dbDisconnect(con)

Downloading and writing BX_Users table to database:

## Writing BX_Users cvs file to a database table in Amazon MySQL database instance
BX_Users <- read.csv(sprintf("https://docs.google.com/uc?id=%s&export=download", 
                                    "1skUfmoBA9wo8GnbES8dbVlbKtHN6mi4C"), 
                            stringsAsFactors = FALSE, 
                            na.strings = c("NULL"), sep = ";")

names(BX_Users) <- c("User_ID","Location","Age")

con <- DBI::dbConnect(RMySQL::MySQL(),    
                      host="assignmentno1instance.ci9a2b8vvtvc.us-east-2.rds.amazonaws.com",
                      port=3306,
                      dbname="Assignment1DB",
                      user="UserMait",
                      password="Test77Test77")

DBI::dbWriteTable(con, "BX_Users", BX_Users, overwrite = TRUE, row.names = FALSE)
DBI::dbDisconnect(con)

Downloading, cleaning and writing BX_Books table to database:

## Writing BX_Books cvs file to a database table in Amazon MySQL database instance
BX_Books <- read.csv(sprintf("https://docs.google.com/uc?id=%s&export=download", 
                             "1XaLXUfhS7G84VICIz-nhmWhTe154h3Re"), 
                     stringsAsFactors = FALSE, 
                     na.strings = c("NULL"), sep = ";")

names(BX_Books) <- c("ISBN","Book_Title","Book_Author","Year_Of_Publication","Publisher","Image_URL_S","Image_URL_M","Image_URL_L")

## Data cleaning to remove invalid ISBN numbers

## Keep only numeric characters from ISBN string and keep only valid ISBN strings of length 10 (old system) or 13 (new system)
BX_Books_cleaned <- BX_Books
BX_Books_cleaned[,"ISBN"] <-  gsub("[^0-9]", "", BX_Books_cleaned[,"ISBN"]) 
BX_Books_cleaned <- BX_Books_cleaned[nchar(BX_Books_cleaned[,"ISBN"]) %in% c(10,13),]


con <- DBI::dbConnect(RMySQL::MySQL(),    
                      host="assignmentno1instance.ci9a2b8vvtvc.us-east-2.rds.amazonaws.com",
                      port=3306,
                      dbname="Assignment1DB",
                      user="UserMait",
                      password="Test77Test77")

DBI::dbWriteTable(con, "BX_Books", BX_Books_cleaned, overwrite = TRUE, row.names = FALSE)
DBI::dbDisconnect(con)

2 Answers for questions

No1: How many users have made exactly 2 ratings?

con <- DBI::dbConnect(RMySQL::MySQL(),    
                      host="assignmentno1instance.ci9a2b8vvtvc.us-east-2.rds.amazonaws.com",
                      port=3306,
                      dbname="Assignment1DB",
                      user="UserMait",
                      password="Test77Test77")

users_with_two_ratings <-
  DBI::dbGetQuery(con,
'
SELECT COUNT(User_ID) AS N_ratings_equalling_two
FROM (SELECT User_ID, COUNT(Book_Rating) AS N_ratings_given 
FROM  BX_Book_Ratings 
GROUP BY User_ID 
HAVING N_ratings_given = 2) 
AS agregated_table
'
)
  
DBI::dbDisconnect(con)
## [1] TRUE

Answer: Number of persons having given exactly two ratings is 5215.

No2.1 and 2.4: Find the mean (2.1), and range (2.4) for number of ratings made for book

con <- DBI::dbConnect(RMySQL::MySQL(),    
                      host="assignmentno1instance.ci9a2b8vvtvc.us-east-2.rds.amazonaws.com",
                      port=3306,
                      dbname="Assignment1DB",
                      user="UserMait",
                      password="Test77Test77")

average_and_range_of_ratings <-
  DBI::dbGetQuery(con,
'
SELECT AVG(Book_Rating) AS AVG_unaggergated_rating, MIN(Book_Rating) AS MIN_unaggergated_rating, MAX(Book_Rating) AS MIN_unaggergated_rating
FROM BX_Book_Ratings
'
)

average_and_range_of_number_of_ratings <-
  DBI::dbGetQuery(con,
                  '
SELECT AVG(d.number_of_ratings_by_user) AS AVG_no_of_ratings_by_user, 
MIN(d.number_of_ratings_by_user) AS MIN_no_of_ratings_by_user, 
MAX(d.number_of_ratings_by_user) AS MAX_no_of_ratings_by_user 
FROM (SELECT User_ID, COUNT(Book_Rating) AS number_of_ratings_by_user
                  FROM BX_Book_Ratings
                  GROUP BY User_ID) d
'
  )

average_and_range_of_number_of_ratings_excluding_zeros <-
  DBI::dbGetQuery(con,
                  '
SELECT AVG(d.number_of_ratings_by_user) AS AVG_no_of_ratings_by_user, 
MIN(d.number_of_ratings_by_user) AS MIN_no_of_ratings_by_user, 
MAX(d.number_of_ratings_by_user) AS MAX_no_of_ratings_by_user 
FROM (SELECT User_ID, COUNT(Book_Rating) AS number_of_ratings_by_user
                  FROM BX_Book_Ratings
                  WHERE Book_Rating > 0
                  GROUP BY User_ID) d

'
  )

  
DBI::dbDisconnect(con)
## [1] TRUE

Answer: If I include to the analysis ratings that equal zero, therefore assuming those ratings to be valid entries, the average number of book ratings recorded in database table BX_Book_Ratings by single user is 10.6765 and the number of ratings given by single user ranges from 1 to 6891.

Alternatively, if I exclude ratings equalling zero, the average number of book ratings recorded in database table BX_Book_Ratings by single user results to 5.1517 and the number of ratings given by single user ranges from 1 to 1719.

No2.2 Find the mode (2.2) for number of ratings made for book

con <- DBI::dbConnect(RMySQL::MySQL(),    
                      host="assignmentno1instance.ci9a2b8vvtvc.us-east-2.rds.amazonaws.com",
                      port=3306,
                      dbname="Assignment1DB",
                      user="UserMait",
                      password="Test77Test77")

pattern_of_number_of_ratings_given_by_user <-
  DBI::dbGetQuery(con,
                  '
SELECT db1.ratings_by_given_user AS ratings_by_given_user, COUNT(*) AS number_of_times_given_number_of_ratings_is_present
FROM(SELECT COUNT(Book_Rating) AS ratings_by_given_user, User_ID
FROM BX_Book_Ratings
GROUP BY User_ID) AS db1
GROUP BY ratings_by_given_user
ORDER BY number_of_times_given_number_of_ratings_is_present DESC
LIMIT 10
'
  )

DBI::dbDisconnect(con)
## [1] TRUE
names(pattern_of_number_of_ratings_given_by_user) <- c("Ratings by given user", "Number of times given number of ratings is present")

knitr::kable(pattern_of_number_of_ratings_given_by_user,
             caption = "Table 1: Pattern of number of ratings given by user (TOP 10)") %>%

  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                            full_width = FALSE,
                            position = "c")
Table 1: Pattern of number of ratings given by user (TOP 10)
Ratings by given user Number of times given number of ratings is present
1 23519
2 5215
3 2585
4 1749
5 1293
6 928
7 671
8 541
9 454
10 407

Answer: The most common (mode) number of ratings given for book by user is 1 and this pattern is repeated in 2.351910^{4} cases.

No2.3 Find the median (2.3) for number of ratings made for book

con <- DBI::dbConnect(RMySQL::MySQL(),    
                      host="assignmentno1instance.ci9a2b8vvtvc.us-east-2.rds.amazonaws.com",
                      port=3306,
                      dbname="Assignment1DB",
                      user="UserMait",
                      password="Test77Test77")

median_of_number_of_ratings_given_by_user <-
  DBI::dbGetQuery(con,
                  '
SELECT ROUND(AVG(ratings_by_given_user), 1) AS median_result
FROM (SELECT db1.ratings_by_given_user, @rownum:=@rownum+1 AS `Row_number`, @Rows_in_total:=@rownum
  FROM (SELECT COUNT(Book_Rating) AS ratings_by_given_user, User_ID FROM BX_Book_Ratings GROUP BY User_ID) AS db1, (select @rownum:=0) AS start_index 
  WHERE db1.ratings_by_given_user IS NOT NULL
ORDER BY db1.ratings_by_given_user) AS db2
WHERE db2.Row_number IN(floor((@Rows_in_total+1)/2), floor((@Rows_in_total+2)/2))
                  '
  )

DBI::dbDisconnect(con)
## [1] TRUE

Answer: The median of number of ratings given by individual user across the users, who have given at least one book rating is 1

No3 Question of Pareto-like distribution of number of ratings given by users

con <- DBI::dbConnect(RMySQL::MySQL(),    
                      host="assignmentno1instance.ci9a2b8vvtvc.us-east-2.rds.amazonaws.com",
                      port=3306,
                      dbname="Assignment1DB",
                      user="UserMait",
                      password="Test77Test77")

SUM_of_ratings_given_by_TOP20_rater_based_on_activity_rank <-
  DBI::dbGetQuery(con,
                  '
SELECT ROUND(SUM(ratings_by_given_user), 1) AS SUM_of_ratings_in_TOP20_prec_most_active_raters
FROM (SELECT db1.ratings_by_given_user, @rownum:=@rownum+1 AS `Row_number`, @Rows_in_total:=@rownum
      FROM (SELECT COUNT(Book_Rating) AS ratings_by_given_user, User_ID FROM BX_Book_Ratings GROUP BY User_ID) AS db1, (select @rownum:=0) AS start_index 
      WHERE db1.ratings_by_given_user IS NOT NULL
      ORDER BY db1.ratings_by_given_user DESC) AS db2
WHERE db2.Row_number <= ROUND((@Rows_in_total* 0.2),1)
                  '
  )

SUM_of_ratings_given_by_BOTTOM80_rater_based_on_activity_rank <-
  DBI::dbGetQuery(con,
                  '
SELECT ROUND(SUM(ratings_by_given_user), 1) AS SUM_of_ratings_in_BOTTOM80_prec_most_active_raters
FROM (SELECT db1.ratings_by_given_user, @rownum:=@rownum+1 AS `Row_number`, @Rows_in_total:=@rownum
      FROM (SELECT COUNT(Book_Rating) AS ratings_by_given_user, User_ID FROM BX_Book_Ratings GROUP BY User_ID) AS db1, (select @rownum:=0) AS start_index 
      WHERE db1.ratings_by_given_user IS NOT NULL
      ORDER BY db1.ratings_by_given_user DESC) AS db2
WHERE db2.Row_number > ROUND((@Rows_in_total* 0.2),1)
                  '
  )


DBI::dbDisconnect(con)
## [1] TRUE
## Rough estimation whether Pareto-like pattern is present in current dataset.  
proportion_of_ratings_by_top_20_prec_most_active_users <- SUM_of_ratings_given_by_TOP20_rater_based_on_activity_rank/ (SUM_of_ratings_given_by_TOP20_rater_based_on_activity_rank+SUM_of_ratings_given_by_BOTTOM80_rater_based_on_activity_rank)

formatted_output <- paste((round(proportion_of_ratings_by_top_20_prec_most_active_users,2)*100),"%")

Answer: Based on the fact, that majority of ratings given to book originate from TOP 20% most active users, I would argue that by and large the Pareto distribution like pattern in data holds. In detail, top 20% most active users have given 88 % of all the book ratings.

3 Some plots to illustrate the book rating activity of individual users

con <- DBI::dbConnect(RMySQL::MySQL(),    
                      host="assignmentno1instance.ci9a2b8vvtvc.us-east-2.rds.amazonaws.com",
                      port=3306,
                      dbname="Assignment1DB",
                      user="UserMait",
                      password="Test77Test77")



number_of_ratings_by_user <-
  DBI::dbGetQuery(con,
                  '
                  SELECT User_ID, COUNT(Book_Rating) AS number_of_ratings_by_user
                  FROM BX_Book_Ratings
                  GROUP BY User_ID
                  '
  )

DBI::dbDisconnect(con)
## [1] TRUE
plot_1_data <- number_of_ratings_by_user[number_of_ratings_by_user[,"number_of_ratings_by_user"] <= 100,]
plot_1 <- ggplot(plot_1_data, aes(number_of_ratings_by_user)) +
  geom_histogram() +
  labs(title="Distribution of book rating activity in group of \nusers that have given less than or equal to 100 ratings", 
       subtitle="Expected valid records") +
  xlab("Number of book ratings by user") +
  ylab("Number of users at given book rating activity") +
  scale_x_continuous(breaks=seq(10, 100, 10), 
                     limits = c(1, 100))+
  scale_y_continuous(breaks=seq(0, 15000, 1000), 
                     limits = c(0, 15000))+
  theme_tufte()
  

plot_2_data <- number_of_ratings_by_user[number_of_ratings_by_user[,"number_of_ratings_by_user"] > 100,]
plot_2 <- ggplot(plot_2_data, aes(number_of_ratings_by_user)) +
  geom_histogram() +
  labs(title="Distribution of book rating activity in group of \nusers that have given more than 100 ratings", 
       subtitle="Records that need further investigation") +
  xlab("Number of book ratings by user") +
  ylab("Number of users at given book rating activity") +
  scale_x_continuous(breaks=seq(200, 2600, 200), 
                     limits = c(101, 2500))+
  scale_y_continuous(breaks=seq(0, 500, 100), 
                     limits = c(0, 500))+
  theme_tufte()

Following plots illustrate the book rating activity of individual users. Of note, I have divided the book raters into two groups. Firstly, raters that have given 1 to 100 ratings (for details, see: Plot NO1) and secondly, raters that have given over 100 individual book ratings (for details, see: Plot NO2). Reading and rating over 100 book may indicate an unlogic value that might need to be investigated further in order to achieve better data quality. I.e. cases where single user has rated more than 1000 books might indicate that book re-seller or bot might be the “over-active” book rater.

Plot NO1

Plot NO1

Plot NO2

Plot NO2

4 Limitations of data analysis performed

con <- DBI::dbConnect(RMySQL::MySQL(),    
                      host="assignmentno1instance.ci9a2b8vvtvc.us-east-2.rds.amazonaws.com",
                      port=3306,
                      dbname="Assignment1DB",
                      user="UserMait",
                      password="Test77Test77")


Ratings_merged_with_Users_table <-
  DBI::dbGetQuery(con,
                  '
SELECT u.User_ID,  u.Location, r.Number_of_ratings_for_a_book 
FROM BX_Users u 
                  LEFT JOIN (SELECT User_ID, COUNT(ISBN) AS Number_of_ratings_for_a_book 
                  FROM BX_Book_Ratings GROUP BY User_ID ORDER BY Number_of_ratings_for_a_book DESC) r  
                  ON u.User_ID = r.User_ID
                  '
  )


number_of_inactive_users <- sum(is.na(Ratings_merged_with_Users_table[,"Number_of_ratings_for_a_book"]))

DBI::dbDisconnect(con)
## [1] TRUE

To test the correspondence between primary key variable (User_ID) between tables BX_Book_Ratings and BX_Users, I have performed simple table join.

To be more precise, the number of book ratings per user is computed based on BX_Book_Ratings and this data is added to the BX_Users table based on the matchings between Use_ID columns. NULL values in BX_Users table in merged column indicate user that haven reviewed any book yet. This pattern is present for 117467 of unique users in BX_Users table. Therefore, data indicates that there exists sizable number of inactive users. Of note, previous analysis about average number of ratings given by user and median, mode and range of book ratings given does not take into account the subset of inactive users. This constitutes the main limitation of data analysis “logic” that I have employed in my HW assignment.

The second main limitation concerns the amount of zero valued book ratings in BX_Book_Ratings table, which can be NULL entries converted mistakenly to zero during database dump. Further, my ISBN formatting check-based data cleaning is quite crude and can be further improved.