By including this statement, we the authors of this work, verify that:
businesses <- read.csv("data files/businesses.csv")
reviews <- read.csv("data files/reviews.csv")
users <- read.csv("data files/users.csv")
library(DiagrammeR)
library(knitr)
library(kableExtra)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(maps)
grViz("
digraph relationships {
graph [layout = dot, rankdir = LR]
#Users box
users [shape = box, style = filled, color = lightblue,
label = 'Users\\n------------------\\nuser_id\\nname\\nreview_count\\naverage_stars\\nmember_since']
#Reviews box
reviews [shape = box, style = filled, color = lightgray,
label = 'Reviews\\n------------------\\nreview_id\\nuser_id\\nbusiness_id\\nstars\\ndate\\ntext']
#Business box
businesses [shape = box, style = filled, color = lightpink,
label = 'Businesses\\n------------------\\nbusiness_id\\nname\\ncity\\nstate\\nbusiness avg stars\\nreview_count\\ncategories\\nbusiness_group']
#Relationships
users -> reviews [label = 'user_id']
reviews -> businesses [label = 'business_id']
}
")
Figure 1 aims to provide a visual representation of the datasets included in the analysis, including variables and relationships. I used AI for assistance with the syntax as I hadn’t used the diagrammeR package before. I created a force directed graph which I had learnt about in Visual Analytics this semester. I created 3 boxes, each of which represented a dataset from the analysis, which included a title and the variable names underneath. I chose pastel colours based on my colour theme for the document. I represented the relationships by arrows and labelled them with the variable name.
Analyse review behaviour across individual groups.
#Convert member_since to date format, as it is set as a character string
users$member_since <- as.Date(users$member_since) #
#Create new column of the same size and set all values to NA, this will be used to divide into 3 groups later on
users$member_since_group <- rep(NA, nrow(users))
#Give a label to each user off of when they joined
for (i in 1:nrow(users)) {
if (is.na(users$member_since[i])) {
next # Skip to next one if date missing
} else if (users$member_since[i] < as.Date("2017-01-01")) {
#joined before 2017
users$member_since_group[i] <- "Veteran"
} else if (users$member_since[i] <= as.Date("2022-12-31")) {
#joined between 2017 and 2022
users$member_since_group[i] <- "Intermediate"
} else {
#Joined after 2022
users$member_since_group[i] <- "New"
}
}
#Merge users and review datasets by user_id
user_reviews_q1 <- merge(users, reviews, by = "user_id")
#Remove rows with missing data in the stars column
user_reviews_q1 <- user_reviews_q1[!is.na(user_reviews_q1$stars), ]
For 1.1, we did the following:
Convert member_since to date format to make it easier to work with (originally set as character string)
Created a new column that would hold group labels for each user
Created a loop to give each user a label based on their join date, as follows:
Veteran: joined before 2017
Intermediate: joined between 2017 and 2022
New: joined after 2022
Merged the ‘users’ and ‘reviews’ datasets by user_id so that we could efficiently analyse data from both tables at the same time. We needed this to be able to calculate review behaviour.
Remove any rows with missing data from the stars column of the new dataframe, since missing data could create false results in the analyse if it was included.
library(DiagrammeR)
grViz("
digraph merge_user_reviews_q1 {
graph [layout = dot, rankdir = LR]
# Users table node
users [shape = box, style = filled, fillcolor = lightblue, fontcolor = black,
label = 'users\\n------------------\\nuser_id\\nname\\nreview_count\\naverage_stars\\nmember_since\\nuser_group']
# Reviews table node
reviews [shape = box, style = filled, fillcolor = lightgray, fontcolor = black,
label = 'reviews\\n------------------\\nreview_id\\nuser_id\\nbusiness_id\\nstars\\ndate\\ntext']
# Merged result node with cleaned data
user_reviews_q1 [shape = box, style = filled, fillcolor = mediumpurple1, fontcolor = black,
label = 'user_reviews_q1\\n------------------\\nuser_id\\nname\\nreview_count\\naverage_stars\\nmember_since\\nuser_group\\nreview_id\\nbusiness_id\\nstars\\ndate\\ntext\\n\\n(NA rows in stars removed)']
# Arrows
users -> user_reviews_q1 [label = 'merge by user_id']
reviews -> user_reviews_q1 [label = 'merge by user_id']
}
")
Figure 1.1 follows the same logic as figure 0.1. However instead of showing all 3 original datasets, we showed the merge between ‘users’ and ‘reviews’ and included the new dataset ‘user_reviews_q1’ which was a result of the merge.
#Make a list of the 3 distinct groups within members_since_group.
member_since_group_list <- c("Veteran", "Intermediate", "New")
#Create empty vectors with length of 3. Later, each one will store the results for each group
num_users <- rep(0, length(member_since_group_list))
avg_stars <- rep(0, length(member_since_group_list))
avg_reviews_per_user <- rep(0, length(member_since_group_list))
#Make a loop to go through each group and assign a value for each summary statistic to the empty vector
for (i in 1:length(member_since_group_list)) {
#Select group one at a time
group <- member_since_group_list[i]
#Only look at the reviews for the group that is selected
group_data <- user_reviews_q1[user_reviews_q1$member_since_group == group, ]
#Get the all the users in the group, using 'unique' so users arent repeated
unique_users <- unique(group_data$user_id)
#Calculate the statistics and store in vectors from earlier
#how many users are in the group
num_users[i] <- length(unique_users)
#whats the average rating of all the users in the group
avg_stars[i] <- mean(group_data$stars, na.rm = T)
#how many reviews did each user write on average
avg_reviews_per_user[i] <- nrow(group_data) / length(unique_users)
}
For 1.2, we did the following
Made a list of each user group to be used in the loop
Created empty vectors to store the summary statistics
Created a for loop to go through each user group and calculate/store the summary statistics. We made sure to extract only unique users so that there werent duplicates.
Summary statistics used include:
Number of users in group
Average review stars for each group
Average reviews per user in each group
#Creating a dataframe to hold results for the user groups
results_table_q1 <- data.frame(
#Group name
Group = member_since_group_list,
#How many (unique) users in each group
Num_Users = num_users,
#Avg star rating for each group
Avg_Review_Stars = round(avg_stars, 2),
#Average number of reviews for each user in each group
Avg_Reviews_Per_User = round(avg_reviews_per_user, 2)
)
For 1.3 we did the following:
Made a data frame to store the statistics for each of the groups, so that its easy to visualise for the next step
Rounded the continuous variables to 2 decimal places.
#Put the results from 1.3 into the table in kable
results_table_q1 %>%
kable(
#Make the column names look nicer (capitalise and remvove underscores)
col.names = c("User Group", "Number of Users", "Average Review Stars", "Average Reviews per User"),
#Put all the text as right align
align = c("r", "r", "r", "r"),
#?????????????????
format = "html",
#continous variables as 2dp
digits = 2
) %>%
kable_styling(
#table is shorter rather than fit to page
full_width = FALSE,
#striped means row colours are alternate, condensed means the padding is reduced so it doesnt take up as much empty space
bootstrap_options = c("striped", "condensed")
) %>%
#Header row is white text, bold text and purple background
row_spec(0, bold = TRUE, color = "#ffffff", background = "#6a1b9a") %>%
#other rows are dark grey text
row_spec(1:nrow(results_table_q1), color = "#2c3e50") %>%
#add a border for all the rows, make the column width the same for all the columns
column_spec(1:4, border_right = TRUE, border_left = TRUE, width = "6em")
| User Group | Number of Users | Average Review Stars | Average Reviews per User |
|---|---|---|---|
| Veteran | 6519 | 2.99 | 5.59 |
| Intermediate | 22472 | 3.00 | 5.26 |
| New | 8312 | 3.01 | 5.42 |
For Figure 3 we used kable to create and format the table. AI was used to help with several aspects of formatting:
creating a cconsistent column width (width as column_spec)
making the table more compact (full_width = FALSE)
reducing empty space in knit document (“condensed” as bootstrap option)
alternating row colours (“striped” as bootstrap option)
We chose to use a purple and grey colour theme to match the main theme of the document
named colums and alligned text on the right
continous variables were rounded to 2 decimal places
colour theme was maintained
header with bold white text and on a purple background
the other rows had dark grey text
#Clean the data using tidyverse package
user_reviews_q1_clean <- user_reviews_q1 %>%
#remove rows where there is no star rating
filter(!is.na(stars), !is.na(member_since_group)) %>%
#remove rows that dont belong to a group
rename(Group = member_since_group)
#perform ANOVA
anova_model_q1 <- aov(stars ~ Group, data = user_reviews_q1_clean)
summary(anova_model_q1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Group 2 6 2.760 1.381 0.251
## Residuals 183049 365792 1.998
#Make boxplot to show distrubution of stars in each group
ggplot(user_reviews_q1_clean, aes(x = Group, y = stars, fill = Group)) +
geom_boxplot() +
#set colours to match document theme
scale_fill_manual(values = c(
"Veteran" = "#6a1b9a",
"Intermediate" = "#f57c00",
"New" = "#95a5a6"
)) +
#set lables and title
labs(
title = "Distribution of Review Stars by User Group",
x = "User Group",
y = "Review Stars",
fill = "User Group"
) +
#use the same theme as the other charts in the document
theme_minimal() +
theme(
#make title bold, bigger and purple
plot.title = element_text(face = "bold", size = 14, color = "#6a1b9a"),
#make labels bigger
axis.title = element_text(size = 12),
#hide legend becauyse its not necessary
legend.position = "none"
)
For Figure 1.2 we created a boxplot based on an ANOVA to test whether there was a statistically significant difference in average review stars between groups based on recency of joining. The code was based off of tutorials from COMP1014 (Thinking about Data) and AI was used to help with the colour theme.
Null hypothesis: Average review stars are the same in each
group
Alternate Hypothesis: At least one group has a different average review stars
Conclusion: A p value of 0.251 shows that with a significance level of 0.05, we fail to reject the null hypothesis and can confirm that the average review stars are the same for each group.
The visualisation confirms this finding, as any difference in box height, whisker height and median line are barely visible.
We split the users into 3 groups based on how long they had been using the platform
Veteran: Before 2017
Intermediate: Between 2017 and 2022
New: After 2022
We calculated several summary statistics for each group
Numbers of users
Average review stars
Average reviews per user
Looking at the results of Table 1. we can see the following insights:
‘Intermediate’ has over 3 times as many users as ‘veteran’, and almost 3 times as many users as ‘new’.
The average review stars vary by only 0.02 stars, unlikely to be a statistically significant difference.
There is more variation in average reviews per user. ’Veteran’s have the most reviews (5.59), ’intermediate’s have the lowest (5.26) and ’New’s have somewhere inbetween the others 2 groups (5.26).
#Make a new dataframe that merges businesses and reviews by the common variable 'business_id', using functions from tidyverse
#start pipeline with reviews. this will be left table for left join.
review_summary_q2 <- reviews %>%
#Add businesses table to left table
left_join(businesses, by = "business_id") %>%
#choosing which variables from businesses to add to reviews
select(review_id, user_id, business_id, stars, date, state)
For 2.1 we did the following:
Used tidyverse functions to perform left join between reviews and businesses
we did this so that later on we can use information from both tables that is relevant to the statistics we need to calculate
we chose to perform left_join instead of merge because we wanted to base the new table on ‘reviews’ because the reviews are the main focus of question 2.
Figure 2.1 follows the same logic as figure 0.1. However instead of showing all 3 original datasets, we showed the merge between ‘businesses’ and ‘reviews’ and included the new dataset ‘review_summary_q2’ which was a result of the merge.
#Use tidyverse library to make table
# Create summary table from the merged review + business data
stateSummary_q2 <- review_summary_q2 %>%
#remove anyhting with na in stars and state
filter(!is.na(stars) & !is.na(state)) %>%
#group together by state to calculate statistics for each state
group_by(state) %>%
#summarise for all states
summarise(
#average amount of stars in each review in each state
avgStars = mean(stars, na.rm = TRUE),
#numbers of reviews in each state
numReviews = n(),
#the numbver of unique userrs in each state who did a review
numUsers = n_distinct(user_id)
) %>%
#sort so that the highest numbers show first in the table
arrange(desc(avgStars))
For 2.2 we did the following:
used the merged dataset from 2.1
removed rows for analysis with missing data in stars and state
data grouped by state
calcualated summary statistics
average review stars for each state
number of reviews for each state
number of users for each state
sorted the table by highest average stars at the top
#range for average review stars
range_avg <- range(stateSummary_q2$avgStars, na.rm = TRUE)
range_diff <- diff(range_avg)
#mean for average review stars
mean_avg <- mean(stateSummary_q2$avgStars, na.rm = TRUE)
#print results
range_avg #minimum and maximum
## [1] 2.949411 3.042339
range_diff #difference between min and max
## [1] 0.09292819
mean_avg #mean
## [1] 2.999585
For 2.3 we calculated additional summary statistics to better understand the distribution of average review stars.
#Put the results from 2.2 into the table in kable
stateSummary_q2 %>%
kable(
#Make the column names look nicer (capitalise and remove underscores)
col.names = c("State", "Average Review Stars", "Number of Reviews", "Number of Users"),
#Put all the text as right align
align = c("r", "r", "r", "r"),
#use html output for styling
format = "html",
#continuous variables as 2dp
digits = 2
) %>%
kable_styling(
#table is shorter rather than fit to page
full_width = FALSE,
#striped means row colours alternate, condensed means less padding
bootstrap_options = c("striped", "condensed")
) %>%
#Header row is white text, bold text and purple background
row_spec(0, bold = TRUE, color = "#ffffff", background = "#6a1b9a") %>%
#other rows are dark grey text
row_spec(1:nrow(stateSummary_q2), color = "#2c3e50") %>%
#add a border for all the rows, make the column width the same for all the columns
column_spec(1:4, border_right = TRUE, border_left = TRUE, width = "6em")
| State | Average Review Stars | Number of Reviews | Number of Users |
|---|---|---|---|
| WI | 3.04 | 3283 | 3054 |
| WV | 3.04 | 3685 | 3416 |
| NE | 3.04 | 3202 | 2992 |
| AL | 3.03 | 3548 | 3295 |
| ND | 3.03 | 3792 | 3523 |
| TN | 3.03 | 3335 | 3105 |
| UT | 3.03 | 3173 | 2935 |
| VT | 3.03 | 3333 | 3095 |
| MD | 3.03 | 3351 | 3133 |
| MS | 3.02 | 3477 | 3240 |
| SC | 3.02 | 3621 | 3367 |
| TX | 3.02 | 3642 | 3390 |
| ID | 3.02 | 3336 | 3091 |
| DC | 3.02 | 3632 | 3381 |
| AR | 3.02 | 3484 | 3250 |
| NJ | 3.02 | 3460 | 3212 |
| LA | 3.01 | 3503 | 3256 |
| NY | 3.01 | 3460 | 3206 |
| ME | 3.01 | 3381 | 3157 |
| CT | 3.01 | 9197 | 8014 |
| GA | 3.01 | 3525 | 3254 |
| IN | 3.01 | 3036 | 2828 |
| OR | 3.00 | 3552 | 3300 |
| MI | 3.00 | 3354 | 3119 |
| KS | 3.00 | 3258 | 3052 |
| NM | 3.00 | 3659 | 3396 |
| MN | 3.00 | 3521 | 3309 |
| CA | 2.99 | 3534 | 3287 |
| MA | 2.99 | 3300 | 3068 |
| 2.99 | 5423 | 4956 | |
| PA | 2.99 | 3671 | 3419 |
| DE | 2.99 | 3401 | 3185 |
| AK | 2.99 | 3464 | 3207 |
| AZ | 2.99 | 3605 | 3363 |
| WA | 2.99 | 3758 | 3504 |
| NC | 2.99 | 3513 | 3264 |
| CO | 2.99 | 3464 | 3217 |
| MT | 2.98 | 3630 | 3380 |
| IA | 2.98 | 3506 | 3262 |
| OH | 2.98 | 3608 | 3369 |
| WY | 2.98 | 3350 | 3127 |
| IL | 2.98 | 3309 | 3060 |
| NH | 2.98 | 3341 | 3109 |
| HI | 2.98 | 3726 | 3450 |
| VA | 2.98 | 3499 | 3244 |
| OK | 2.97 | 3760 | 3478 |
| KY | 2.97 | 3466 | 3237 |
| SD | 2.97 | 3669 | 3417 |
| NV | 2.96 | 3099 | 2887 |
| RI | 2.96 | 3350 | 3126 |
| FL | 2.96 | 3353 | 3122 |
| MO | 2.95 | 3736 | 3504 |
Refer to table 1 description as the code is the same.
#Using packages ggplot2 and maps
#Load the relevant map outline
us_states <- map_data("state")
#Make sure that all states names from dataset match the names from the package (AI assistance)
stateSummary_q2$state_name <- tolower(state.name[match(stateSummary_q2$state, state.abb)])
#Left join to combine our data with the map data, making sure not to modify the map data (thats why we used left join)
map_data_q2 <- us_states %>%
left_join(stateSummary_q2, by = c("region" = "state_name"))
#AI assistance was used for some areas in formatting
#Get the map to be plot by average stars
ggplot(map_data_q2, aes(x = long, y = lat, group = group, fill = avgStars)) +
#draw borders
geom_polygon(color = "white") +
#keep the aspect ratio so that it isnt stretched out once knit
coord_fixed(1.3) +
#set gradient baseed on average stars, make the states with no data grey
scale_fill_gradient(
low = "#fceabb", high = "#f76b1c", na.value = "grey90",
#title for legend
name = "Avg Stars"
) +
labs(
#title
title = "Average Review Stars by State",
) +
#choose theme to match document theme
theme_minimal() +
theme(
#put legend on right hand side
legend.position = "right",
#hides the axis latitude/longitude, tick marks and labels
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
#make the title bold and purple to match theme
plot.title = element_text(color = "#6a1b9a", face = "bold", size = 14),
#make the legend title and labels purple
legend.title = element_text(color = "#6a1b9a", face = "bold"),
legend.text = element_text(color = "#6a1b9a")
)
For Figure 2.2 we used ggplot2 and maps packages to design this map. Knowledge of how to do this comes from units COMP2026 Visual Analytics and MEDI3007 Health Sciences Placement, which is why we chose to go with this method. AI was used to help with colour formatting.
For question 2 we looked at average review star ratings in each US state. Calculations include:
Average review stars per state
total reviews per state
Number of users (unique) per state who made a review
the findings were:
Wisconsin had the highest average review stars (3.04) , with 3,283 reviews and 3,054 users.
Missouri had the lowest average review stars (2.95), with 3,736 reviews and 3,504 users.
The range of average review stars across all states was quite small (2.95 to 3.04).
The mean average review stars was 3.00
#remove reviews where user_id is not available
new_reviews_q4 <- reviews %>%
filter(!is.na(user_id) & user_id != "")
#use left join to combine updated reviews with several variables from users. Use user_id as the relationship
new_table_q4 <- left_join(new_reviews_q4, users %>% select(user_id, name, member_since), by = "user_id")
For 3.1 we did the following:
Removed reviews where user_id wasn’t available
Used left join to combine reviews (with NA removed) users, joining with user_id.
#remove reviews where user_id is not available
new_reviews_q4 <- reviews %>%
filter(!is.na(user_id) & user_id != "")
#use left join to combine updated reviews with several variables from users. Use user_id as the relationship
new_table_q4 <- left_join(new_reviews_q4, users %>% select(user_id, name, member_since), by = "user_id")
#group by user id to find number of reviews per user
top_users_q3 <- new_table_q4 %>%
group_by(user_id) %>%
summarise(num_reviews = n(), .groups = "drop") %>%
#sort by review count highest to lowest
arrange(desc(num_reviews)) %>%
#only keep the top 10 users
slice(1:10)
#check how many users have 14 or more reviews
users_14_or_more <- new_table_q4 %>%
#group by user_id
group_by(user_id) %>%
#find total number of reviews for each user
summarise(num_reviews = n(), .groups = "drop") %>%
#only keep users with greater than or equal to 14 reviews
filter(num_reviews >= 14)
#count how many users have greater than or equal to 14 reviews
n_users_14_or_more <- nrow(users_14_or_more)
print(n_users_14_or_more)
## [1] 16
#calculate the average star rating for each of the top 10 users
avg_rating_top_users <- new_table_q4 %>%
semi_join(top_users_q3, by = "user_id") %>%
group_by(user_id) %>%
summarise(avg_star_rating = mean(stars, na.rm = TRUE), .groups = "drop")
#join top 10 user review counts with their name and average star rating
top_users_final <- top_users_q3 %>%
left_join(users %>% select(user_id, name), by = "user_id") %>%
#replace missing names with "Unknown"
mutate(name = ifelse(is.na(name), "Unknown", name)) %>%
#add average star rating from earlier
left_join(avg_rating_top_users, by = "user_id") %>%
#create a rank column to number them from 1 to 10
mutate(rank = row_number()) %>%
#select and arrange final columns for the table
select(rank, user_id, name, num_reviews, avg_star_rating)
For 3.2 we did the following:
Removed reviews if they had missing user_ids
Used left join to prioritise reviews (keep it unchanged except for new columns)
Found top 10 users and calculate average star rating
It looks like there are 16 users with greater than or equal to 14. We will choose the top10 by row order (default) even though its not ideal.
#Put the top 10 users into a styled table using kable
top_users_final %>%
kable(
#Make the column names look nicer (capitalise and remove underscores)
col.names = c("User Rank", "User ID", "Name", "Number of Reviews", "Average Star Rating"),
#Put all the text as right align
align = c("r", "r", "r", "r", "r"),
#use html output for styling
format = "html",
#continuous variables as 2dp
digits = 2
) %>%
kable_styling(
#table is shorter rather than fit to page
full_width = FALSE,
#striped means row colours alternate, condensed means less padding
bootstrap_options = c("striped", "hover", "condensed", "responsive")
) %>%
#Header row is white text, bold text and purple background
row_spec(0, bold = TRUE, color = "#ffffff", background = "#6a1b9a") %>%
#other rows are dark grey text
row_spec(1:nrow(top_users_final), color = "#2c3e50") %>%
#add a border for all the rows, make the column width the same for all the columns
column_spec(1:5, border_right = TRUE, border_left = TRUE, width = "6em")
| User Rank | User ID | Name | Number of Reviews | Average Star Rating |
|---|---|---|---|---|
| 1 | u_27070 | Rebecca | 18 | 2.83 |
| 2 | u_11551 | Christopher | 15 | 3.27 |
| 3 | u_6766 | Tracy | 15 | 3.27 |
| 4 | u_11229 | Benjamin | 14 | 3.07 |
| 5 | u_14899 | Jason | 14 | 2.57 |
| 6 | u_17629 | Andrew | 14 | 2.21 |
| 7 | u_22933 | Stephanie | 14 | 2.93 |
| 8 | u_23971 | Unknown | 14 | 2.36 |
| 9 | u_27907 | Jesse | 14 | 3.43 |
| 10 | u_29224 | Rebecca | 14 | 3.50 |
Refer to table 1 description as the code is the same.
#filter the original review table to only keep reviews from the top 10 users
top_user_reviews_named <- new_table_q4 %>%
#keep only rows where user_id is in the top_users_q3 list
semi_join(top_users_q3, by = "user_id") %>%
#replace any missing names with "Unknown"
mutate(name = ifelse(is.na(name), "Unknown", name))
#plot a boxplot showing the spread of star ratings for each top user
ggplot(top_user_reviews_named, aes(x = reorder(name, -stars), y = stars)) +
#draw the boxplots with custom colours
geom_boxplot(fill = "#d1c4e9", color = "#6a1b9a") +
#add title and axis labels
labs(
title = "Rating Distribution of Top 10 Users",
x = "User",
y = "Stars"
) +
#use a clean minimal theme
theme_minimal() +
#AI was utilised to help with elements of formatting beyond this point
#customise colours, fonts and layout
theme(
plot.background = element_rect(fill = "#f4f1ea", color = NA), #background behind plot
panel.background = element_rect(fill = "#f4f1ea", color = NA), #background behind axes
panel.grid.major = element_line(color = "#e0e0e0"), #gridlines
axis.text = element_text(color = "#2c3e50", size = 10), #axis numbers
axis.title = element_text(color = "#6a1b9a", size = 12), #axis titles
plot.title = element_text(color = "#6a1b9a", size = 14, face = "bold"), #plot title
axis.text.x = element_text(angle = 45, hjust = 1) #rotate user names
)
Findings from figure 3.1 are discussed in Question 3 Summary
# Calculate the average rating for all users (not just top 10)
overall_user_avg <- new_table_q4 %>%
group_by(user_id) %>%
summarise(user_avg_stars = mean(stars, na.rm = TRUE)) %>%
summarise(overall_avg = mean(user_avg_stars, na.rm = TRUE))
# Calculate the average of the top 10 users
mean_top10 <- mean(top_users_final$avg_star_rating, na.rm = TRUE)
# Print the results
overall_user_avg
## # A tibble: 1 × 1
## overall_avg
## <dbl>
## 1 3.00
mean_top10
## [1] 2.94381
For question 3 we looked at the top 10 users based on number of reviews. We ran into a problem here because there were 16 users with greater than or equal to 14 reviews, so we went by the order in which they were on the dataset. Also, one of the users ended up with the name ‘unknown’. The boxplot only had the top 9 users but wee didn’t have time to fix the code.
For each top 10 user we calculated:
Total amount of reviews
Average star ratings
We put the results in a table and into a boxplot.
Findings were as follows:
Jesse and Tracy have the highest median scores (4 stars), while the others all have median scores of 2 or 3 stars
Rebecca has the widest distribution of ratings, based on the length of the box
Stephanie is the only user who has outliers with 1 and 5 star ratings, based on the dots
Those with shorter boxes (like Stephanie and Jason) give the most consistent ratings on average
#Merge data from reviews (left table) and users using left_join
merged_q4 <- reviews %>%
#join by common variable user_id
left_join(users, by = "user_id") %>%
#remove rows with missing values
filter(!is.na(stars), !is.na(text), !is.na(member_since))
#split users based on if they joined before or after 2020
merged_q4 <- merged_q4 %>%
#add new column named user_group that will hold a label for all the users
mutate(
#if user joined prior to 2020
user_group = case_when(
as.numeric(substr(member_since, 1, 4)) < 2020 ~ "Before 2020",
#if user joined during of after 2020
as.numeric(substr(member_since, 1, 4)) >= 2020 ~ "After 2020"
)
)
#add column called review_length that contains number of chars in review
merged_q4 <- merged_q4 %>%
mutate(
#count characters
review_length = nchar(text)
)
#summarise by group
#create new table to story summary
summary_q4 <- merged_q4 %>%
#group by whether user group is before or after 2020
group_by(user_group) %>%
#calculations to do for each group
summarise(
#whats the average length of review and rm missing values
mean_length = mean(review_length, na.rm = TRUE),
#how many reviews are in the groups
num_of_reviews = n(),
#whats the average star rating and rm missing values
mean_star = mean(stars, na.rm = TRUE),
#number of unique users, got the function from google (i think its called a function)
num_users = n_distinct(user_id)
)
#Reorder columns to match custom table headers
summary_q4 <- summary_q4 %>%
select(user_group, mean_star, mean_length, num_of_reviews, num_users)
#Put the results from 4.3 into the table in kable
summary_q4 %>%
kable(
#Make the column names look nicer (capitalise and remove underscores)
col.names = c("User Group", "Mean Star Rating", "Mean Review Length", "Number of Reviews", "Number of Users"),
#Put all the text as right align
align = c("r", "r", "r", "r", "r"),
#use html output for styling
format = "html",
#continuous variables as 2dp
digits = 2
) %>%
kable_styling(
#table is shorter rather than fit to page
full_width = FALSE,
#striped means row colours alternate, condensed means less padding
bootstrap_options = c("striped", "condensed")
) %>%
#Header row is white text, bold text and purple background
row_spec(0, bold = TRUE, color = "#ffffff", background = "#6a1b9a") %>%
#other rows are dark grey text
row_spec(1:nrow(summary_q4), color = "#2c3e50") %>%
#add a border for all the rows, make the column width the same for all the columns
column_spec(1:5, border_right = TRUE, border_left = TRUE, width = "6em")
| User Group | Mean Star Rating | Mean Review Length | Number of Reviews | Number of Users |
|---|---|---|---|---|
| After 2020 | 3 | 58.98 | 99115 | 19670 |
| Before 2020 | 3 | 59.09 | 83937 | 17630 |
Refer to table 1.1 description as the code is the same.
#use ggplot to make bar chart to compare avg review length for both groups (before and after 2020 joining)
ggplot(summary_q4, aes(x = user_group, y = mean_length, fill = user_group)) +
#use review length as the length of the bars and set to white
geom_col(width = 0.6, color = "white") +
#add labels
labs(
title = "Average Review Length by User Group",
x = "User Group",
y = "Mean Review Length (Characters)"
) +
#use this theme to match the main theme of the document
theme_minimal() +
#customise text style and colours to match the main theme
theme(
#title is bold and purple
plot.title = element_text(color = "#6a1b9a", face = "bold", size = 14),
#axis labels are also purple and bold
axis.title = element_text(color = "#6a1b9a", face = "bold"),
#axis numbers are dark grey
axis.text = element_text(color = "#2c3e50"),
#remove legend
legend.position = "none"
) +
#use main theme colours for the bars
scale_fill_manual(values = c("Before 2020" = "#6a1b9a", "After 2020" = "#f76b1c"))
For question 4 we aimed to determine if there were major differences between the review behaviour of users who joined before vs after 2020.
Analysis included:
mean star rating
mean review length
number of reviews
Insights include: