Start Date: 4 April 2025

Report Date: 30 April 2025

Source: Psi Chi R

knitr::opts_chunk$set(echo = T,message = F,warning = F)

#setwd("C:/Users/alanh/Documents/R/Psi_Chi_R")

rm(list=ls())

setwd("~/R/Psi_Chi_R")

library(tidyverse)

#total for bottom row
sum_rows = function(x) {
  x = as.data.frame(x)
  sums = sapply(x,function(col) if (is.numeric(col)) sum(col, na.rm = T) else NA)
  sums = as.data.frame(t(sums))
  names(sums) = names(x)
  rbind(x, sums)
}

## right column for total
sum_cols = function(x) {
  x$Total = rowSums(x[sapply(x, is.numeric)], na.rm = T)
  x
}

#dollar format function
dollars = function(x) {
  paste0("$",format(x,big.mark= ",",scientific=F))
}

desc_stats = function(x){
  c(min = min(x,na.rm=T),
    median = median(x,na.rm=T),
    max = max(x,na.rm=T),
    mean = mean(x,na.rm=T),
    sd = sd(x,na.rm=T))
}

#numeric notations
options(scipen=9999)

Clean and EDA

data=read_csv('data.csv')

names(data) = make.names(colnames(data))

SmartEDA::ExpData(data,type=2) %>% 
  arrange(desc(Per_of_Missing))

Data set:

Level 1: Let’s examine the average rating column for normality and outliers.

Create a histogram, boxplot, and Q-Q plot. Please paste your graphs into the code/output you provide.

data1 = data %>% 
  mutate(DATE = mdy(publication_date))

data1 %>% 
  ggplot(aes(x=average_rating))+
  geom_histogram(bins=30)

data1 %>% 
  ggplot(aes(x=average_rating))+
  geom_boxplot()

data1 %>% 
  ggplot(aes(sample=average_rating))+
  stat_qq()+
  stat_qq_line()

Finally, run a Shapiro-Wilks test. Is the data normally distributed? Are there any outliers?

No, data is not normally distributed.

#shapiro.test(data1$average_rating)

ks.test(data1$average_rating, "pnorm", mean=mean(data1$average_rating), sd=sd(data1$average_rating))
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  data1$average_rating
## D = 0.081985, p-value < 0.00000000000000022
## alternative hypothesis: two-sided

Level 3: Is there any relationship between when a book was published and its average ratings?

No relationship detected.

#no relationship
data1 %>% 
  ggplot(aes(x=DATE,y=average_rating))+
  geom_point()

Level 4: Pick your favorite author (or any author of your choosing) in the dataset and plot the relationship between the number of ratings they receive (column J) and the average ratings for their book (column D). What kind of relationship do you see, if any?

Picking JRR Tolkien. And no relationship detected.

data2 = data1 %>% 
  filter(str_detect(authors,regex("tolkien",ignore_case = T)))

data2 %>% 
  ggplot(aes(x=ratings_count,y=average_rating))+
  geom_point()

#no normally distributed - run spearman or kendall
shapiro.test(data2$ratings_count)
## 
##  Shapiro-Wilk normality test
## 
## data:  data2$ratings_count
## W = 0.24031, p-value = 0.000000000000002428
shapiro.test(data2$average_rating)
## 
##  Shapiro-Wilk normality test
## 
## data:  data2$average_rating
## W = 0.93809, p-value = 0.007075
hist(data2$ratings_count)

hist(data2$average_rating)

cor.test(data2$ratings_count,data2$average_rating,method = "k")
## 
##  Kendall's rank correlation tau
## 
## data:  data2$ratings_count and data2$average_rating
## z = -1.87, p-value = 0.06149
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## -0.176176
LS0tDQp0aXRsZTogIlBzaSBDaGkgUiAtIEFwciAyMDI1Ig0KI2RhdGU6ICJgciBTeXMuRGF0ZSgpYCINCm91dHB1dDoNCiAgaHRtbF9kb2N1bWVudDoNCiAgICBsaWdodGJveDogdHJ1ZQ0KICAgIHRoZW1lOiByZWFkYWJsZQ0KICAgIGFsd2F5c19hbGxvd19odG1sOiB5ZXMNCiAgICBkZl9wcmludDogcGFnZWQNCiAgICB0b2M6IHllcw0KICAgIHRvY19mbG9hdDogeWVzDQogICAgbnVtYmVyX3NlY3Rpb25zOiBubw0KICAgIGFuY2hvcl9zZWN0aW9uczogVFJVRQ0KICAgIGNvZGVfZm9sZGluZzogaGlkZQ0KICAgIGNvZGVfZG93bmxvYWQ6IHRydWUNCi0tLQ0KDQpTdGFydCBEYXRlOiA0IEFwcmlsIDIwMjUNCg0KUmVwb3J0IERhdGU6IGByIGZvcm1hdChTeXMuRGF0ZSgpLCAnJWQgJUIgJVknKWANCg0KW1NvdXJjZTogUHNpIENoaSBSXShodHRwczovL29zZi5pby9qYWg1eS93aWtpL2hvbWUvKQ0KDQpgYGB7ciB3YXJuaW5nPUYsbWVzc2FnZT1GfQ0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBULG1lc3NhZ2UgPSBGLHdhcm5pbmcgPSBGKQ0KDQojc2V0d2QoIkM6L1VzZXJzL2FsYW5oL0RvY3VtZW50cy9SL1BzaV9DaGlfUiIpDQoNCnJtKGxpc3Q9bHMoKSkNCg0Kc2V0d2QoIn4vUi9Qc2lfQ2hpX1IiKQ0KDQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCg0KI3RvdGFsIGZvciBib3R0b20gcm93DQpzdW1fcm93cyA9IGZ1bmN0aW9uKHgpIHsNCiAgeCA9IGFzLmRhdGEuZnJhbWUoeCkNCiAgc3VtcyA9IHNhcHBseSh4LGZ1bmN0aW9uKGNvbCkgaWYgKGlzLm51bWVyaWMoY29sKSkgc3VtKGNvbCwgbmEucm0gPSBUKSBlbHNlIE5BKQ0KICBzdW1zID0gYXMuZGF0YS5mcmFtZSh0KHN1bXMpKQ0KICBuYW1lcyhzdW1zKSA9IG5hbWVzKHgpDQogIHJiaW5kKHgsIHN1bXMpDQp9DQoNCiMjIHJpZ2h0IGNvbHVtbiBmb3IgdG90YWwNCnN1bV9jb2xzID0gZnVuY3Rpb24oeCkgew0KICB4JFRvdGFsID0gcm93U3Vtcyh4W3NhcHBseSh4LCBpcy5udW1lcmljKV0sIG5hLnJtID0gVCkNCiAgeA0KfQ0KDQojZG9sbGFyIGZvcm1hdCBmdW5jdGlvbg0KZG9sbGFycyA9IGZ1bmN0aW9uKHgpIHsNCiAgcGFzdGUwKCIkIixmb3JtYXQoeCxiaWcubWFyaz0gIiwiLHNjaWVudGlmaWM9RikpDQp9DQoNCmRlc2Nfc3RhdHMgPSBmdW5jdGlvbih4KXsNCiAgYyhtaW4gPSBtaW4oeCxuYS5ybT1UKSwNCiAgICBtZWRpYW4gPSBtZWRpYW4oeCxuYS5ybT1UKSwNCiAgICBtYXggPSBtYXgoeCxuYS5ybT1UKSwNCiAgICBtZWFuID0gbWVhbih4LG5hLnJtPVQpLA0KICAgIHNkID0gc2QoeCxuYS5ybT1UKSkNCn0NCg0KI251bWVyaWMgbm90YXRpb25zDQpvcHRpb25zKHNjaXBlbj05OTk5KQ0KYGBgDQoNCiMjIENsZWFuIGFuZCBFREENCg0KYGBge3J9DQpkYXRhPXJlYWRfY3N2KCdkYXRhLmNzdicpDQoNCm5hbWVzKGRhdGEpID0gbWFrZS5uYW1lcyhjb2xuYW1lcyhkYXRhKSkNCg0KU21hcnRFREE6OkV4cERhdGEoZGF0YSx0eXBlPTIpICU+JSANCiAgYXJyYW5nZShkZXNjKFBlcl9vZl9NaXNzaW5nKSkNCmBgYA0KDQojIyMgRGF0YSBzZXQ6DQoNCmBgYHtyIGVjaG89Rn0NCmRvd25sb2FkdGhpczo6ZG93bmxvYWRfdGhpcyhkYXRhLG91dHB1dF9uYW1lID0gJzIwMjVBcHJfZGF0YScsb3V0cHV0X2V4dGVuc2lvbiA9ICcueGxzeCcpDQpgYGANCg0KIyMgTGV2ZWwgMTogTGV04oCZcyBleGFtaW5lIHRoZSBhdmVyYWdlIHJhdGluZyBjb2x1bW4gZm9yIG5vcm1hbGl0eSBhbmQgb3V0bGllcnMuIA0KDQojIyMgQ3JlYXRlIGEgaGlzdG9ncmFtLCBib3hwbG90LCBhbmQgUS1RIHBsb3QuIFBsZWFzZSBwYXN0ZSB5b3VyIGdyYXBocyBpbnRvIHRoZSBjb2RlL291dHB1dCB5b3UgcHJvdmlkZS4NCg0KYGBge3J9DQpkYXRhMSA9IGRhdGEgJT4lIA0KICBtdXRhdGUoREFURSA9IG1keShwdWJsaWNhdGlvbl9kYXRlKSkNCg0KZGF0YTEgJT4lIA0KICBnZ3Bsb3QoYWVzKHg9YXZlcmFnZV9yYXRpbmcpKSsNCiAgZ2VvbV9oaXN0b2dyYW0oYmlucz0zMCkNCg0KZGF0YTEgJT4lIA0KICBnZ3Bsb3QoYWVzKHg9YXZlcmFnZV9yYXRpbmcpKSsNCiAgZ2VvbV9ib3hwbG90KCkNCg0KZGF0YTEgJT4lIA0KICBnZ3Bsb3QoYWVzKHNhbXBsZT1hdmVyYWdlX3JhdGluZykpKw0KICBzdGF0X3FxKCkrDQogIHN0YXRfcXFfbGluZSgpDQpgYGANCg0KIyMjIEZpbmFsbHksIHJ1biBhIFNoYXBpcm8tV2lsa3MgdGVzdC4gSXMgdGhlIGRhdGEgbm9ybWFsbHkgZGlzdHJpYnV0ZWQ/IEFyZSB0aGVyZSBhbnkgb3V0bGllcnM/IA0KDQpObywgZGF0YSBpcyBub3Qgbm9ybWFsbHkgZGlzdHJpYnV0ZWQuDQpgYGB7cn0NCiNzaGFwaXJvLnRlc3QoZGF0YTEkYXZlcmFnZV9yYXRpbmcpDQoNCmtzLnRlc3QoZGF0YTEkYXZlcmFnZV9yYXRpbmcsICJwbm9ybSIsIG1lYW49bWVhbihkYXRhMSRhdmVyYWdlX3JhdGluZyksIHNkPXNkKGRhdGExJGF2ZXJhZ2VfcmF0aW5nKSkNCg0KYGBgDQoNCg0KIyMgTGV2ZWwgMjogVGFraW5nIGludG8gY29uc2lkZXJhdGlvbiBib3RoIGF2ZXJhZ2UgcmF0aW5nIEFORCB0aGUgbnVtYmVyIG9mIHJhdGluZ3MgYSBib29rIGhhcyByZWNlaXZlZCwgd2hhdCBhcmUgdGhlIHRvcCBmaXZlIG1vc3QgcG9wdWxhciBib29rcz8NCg0KYGBge3J9DQpkYXRhMV90b3A1PWRhdGExICU+JSANCiAgZ3JvdXBfYnkodGl0bGUscmF0aW5nc19jb3VudCxhdmVyYWdlX3JhdGluZykgJT4lDQogIHJlZnJhbWUocmF0aW5nc19jb3VudCkgJT4lIA0KICBhcnJhbmdlKGRlc2MocmF0aW5nc19jb3VudCksZGVzYyhhdmVyYWdlX3JhdGluZykpICU+JSANCiAgaGVhZCg1KQ0KDQpwcmludChkYXRhMV90b3A1KQ0KYGBgDQoNCiMjIExldmVsIDM6IElzIHRoZXJlIGFueSByZWxhdGlvbnNoaXAgYmV0d2VlbiB3aGVuIGEgYm9vayB3YXMgcHVibGlzaGVkIGFuZCBpdHMgYXZlcmFnZSByYXRpbmdzPyANCg0KTm8gcmVsYXRpb25zaGlwIGRldGVjdGVkLg0KYGBge3J9DQojbm8gcmVsYXRpb25zaGlwDQpkYXRhMSAlPiUgDQogIGdncGxvdChhZXMoeD1EQVRFLHk9YXZlcmFnZV9yYXRpbmcpKSsNCiAgZ2VvbV9wb2ludCgpDQoNCmBgYA0KDQojIyBMZXZlbCA0OiBQaWNrIHlvdXIgZmF2b3JpdGUgYXV0aG9yIChvciBhbnkgYXV0aG9yIG9mIHlvdXIgY2hvb3NpbmcpIGluIHRoZSBkYXRhc2V0IGFuZCBwbG90IHRoZSByZWxhdGlvbnNoaXAgYmV0d2VlbiB0aGUgbnVtYmVyIG9mIHJhdGluZ3MgdGhleSByZWNlaXZlIChjb2x1bW4gSikgYW5kIHRoZSBhdmVyYWdlIHJhdGluZ3MgZm9yIHRoZWlyIGJvb2sgKGNvbHVtbiBEKS4gV2hhdCBraW5kIG9mIHJlbGF0aW9uc2hpcCBkbyB5b3Ugc2VlLCBpZiBhbnk/DQoNClBpY2tpbmcgSlJSIFRvbGtpZW4uIEFuZCBubyByZWxhdGlvbnNoaXAgZGV0ZWN0ZWQuDQpgYGB7cn0NCmRhdGEyID0gZGF0YTEgJT4lIA0KICBmaWx0ZXIoc3RyX2RldGVjdChhdXRob3JzLHJlZ2V4KCJ0b2xraWVuIixpZ25vcmVfY2FzZSA9IFQpKSkNCg0KZGF0YTIgJT4lIA0KICBnZ3Bsb3QoYWVzKHg9cmF0aW5nc19jb3VudCx5PWF2ZXJhZ2VfcmF0aW5nKSkrDQogIGdlb21fcG9pbnQoKQ0KDQojbm8gbm9ybWFsbHkgZGlzdHJpYnV0ZWQgLSBydW4gc3BlYXJtYW4gb3Iga2VuZGFsbA0Kc2hhcGlyby50ZXN0KGRhdGEyJHJhdGluZ3NfY291bnQpDQoNCnNoYXBpcm8udGVzdChkYXRhMiRhdmVyYWdlX3JhdGluZykNCg0KaGlzdChkYXRhMiRyYXRpbmdzX2NvdW50KQ0KDQpoaXN0KGRhdGEyJGF2ZXJhZ2VfcmF0aW5nKQ0KDQpjb3IudGVzdChkYXRhMiRyYXRpbmdzX2NvdW50LGRhdGEyJGF2ZXJhZ2VfcmF0aW5nLG1ldGhvZCA9ICJrIikNCmBgYA0KDQpgYGB7ciBpbmNsdWRlPUZ9DQojYmVlcCB3aGVuIGRvbmUNCmlmIChyZXF1aXJlKCJiZWVwciIpKQ0KICBiZWVwcjo6YmVlcCgyKQ0KYGBg