# Always good practice to load tidyverse and dataset
library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.1
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
sales <- read_csv("sales2019.csv")
## Parsed with column specification:
## cols(
## date = col_character(),
## user_submitted_review = col_character(),
## title = col_character(),
## total_purchased = col_double(),
## customer_type = col_character()
## )
#Any other library that we need for this script?
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Determine How big is the dataset?
dim(sales)
## [1] 5000 5
# There are 5000 rows and 5 columns
# Determining column names
colnames(sales)
## [1] "date" "user_submitted_review" "title"
## [4] "total_purchased" "customer_type"
# Determining column types
for (col in colnames(sales)) {
typeof(sales[[col]]) %>% print
}
## [1] "character"
## [1] "character"
## [1] "character"
## [1] "double"
## [1] "character"
# Column Specification:
# cols (
# date = col_character(),
# user_submitted review = col_character()
# title = col_character,
# total_purchased = col_double(),
# customer_type = col_character()
# From looking at the dataset, user_submitted_review and total_purchased seem to have missing data.
# Also from looking at the dataset, we notice what the data columns seem to represent:
## date is date
## user_submitted_review is how users felt after reading.
## title is for title of the book
## total purchased is how many copies the customer bought
## customer_type represents whether it was an individual or business entity that purchased.
#Data Cleaning / Processing ## Dealing with missing data by either 1) removing columns/rows or 2) filling in, or imputation
complete_copy_sales <- sales %>%
filter(
!(is.na(user_submitted_review)) # note we are filtering user_submitted_review column as it has incomplete values
)
# determine new dimensions of filtered dataset
dim(complete_copy_sales)
## [1] 4115 5
# There are 4115 rows and 5 columns
# calculate the average number of books purchased on an order
purchase_mean <- complete_copy_sales %>%
filter(!is.na(total_purchased)) %>%
pull(total_purchased) %>%
mean
# Fill all of the missing values in total_purchased with the calculated mean
complete_copy_sales <- complete_copy_sales %>%
mutate(
complete_total_purchased = if_else(is.na(total_purchased),
purchase_mean,
total_purchased)
)
# Examine the unique sentences that are present in user_submitted_review
complete_copy_sales %>% pull(user_submitted_review) %>% unique
## [1] "it was okay"
## [2] "Awesome!"
## [3] "Hated it"
## [4] "Never read a better book"
## [5] "OK"
## [6] "The author's other books were better"
## [7] "A lot of material was not needed"
## [8] "Would not recommend"
## [9] "I learned a lot"
## unique values of user_submitted reviews are below
# [1] "it was okay" "Awesome!"
# [3] "Hated it" "Never read a better book"
# [5] "OK" "The author's other books were better"
# [7] "A lot of material was not needed" "Would not recommend"
# [9] "I learned a lot
# Detect specific words or phrases that help indicate if the review is positive or not
is_positive_review <- function(review)
positive_review = case_when(
str_detect(review, "okay") ~ TRUE,
str_detect(review, "Awesome") ~ TRUE,
str_detect(review, "Never") ~ TRUE,
str_detect(review, "OK") ~ TRUE,
str_detect(review, "a lot") ~ TRUE,
TRUE ~ FALSE # the review is negative
)
# Create a new column in the dataset that indicates whether or not the review in a given row is positive or not
complete_copy_sales <- complete_copy_sales %>%
mutate(
is_positive_review = unlist(map(user_submitted_review, is_positive_review))
)
complete_copy_sales <- complete_copy_sales %>%
mutate(
formatted_date = if_else(mdy(date) < ymd("2019/07/01"), "Pre", "Post")
)
# Using group_by() and summarize() to create a summary tibble
complete_copy_sales %>%
group_by(formatted_date) %>%
summarize(
sum_books_purchased = sum(complete_total_purchased)
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## formatted_date sum_books_purchased
## <chr> <dbl>
## 1 Post 8190.
## 2 Pre 8211.
# A tibble: 2 x 2
# formatted_date sum_books_purchased
# <chr> <dbl>
# 1 Post 8190.
# 2 Pre 8211.
# judge whether or not the program was actually effective in terms of increasing the number of book sold
complete_copy_sales %>%
group_by(formatted_date, title) %>%
summarize(
sum_books_purchased = sum(complete_total_purchased)
) %>%
arrange(title, formatted_date)
## `summarise()` regrouping output by 'formatted_date' (override with `.groups` argument)
## # A tibble: 12 x 3
## # Groups: formatted_date [2]
## formatted_date title sum_books_purchased
## <chr> <chr> <dbl>
## 1 Post Fundamentals of R For Beginners 2832.
## 2 Pre Fundamentals of R For Beginners 3093.
## 3 Post R For Dummies 2779.
## 4 Pre R For Dummies 2626.
## 5 Post R Made Easy 24
## 6 Pre R Made Easy 15
## 7 Post R vs Python: An Essay 1172.
## 8 Pre R vs Python: An Essay 1271.
## 9 Post Secrets Of R For Advanced Students 1154.
## 10 Pre Secrets Of R For Advanced Students 965.
## 11 Post Top 10 Mistakes R Beginners Make 228.
## 12 Pre Top 10 Mistakes R Beginners Make 241.
# A tibble: 12 x 3
# Groups: formatted_date [2]
# formatted_date title sum_books_purchased
# <chr> <chr> <dbl>
# 1 Post Fundamentals of R For Beginners 2832.
# 2 Pre Fundamentals of R For Beginners 3093.
# 3 Post R For Dummies 2779.
# 4 Pre R For Dummies 2626.
# 5 Post R Made Easy 24
# 6 Pre R Made Easy 15
# 7 Post R vs Python: An Essay 1172.
# 8 Pre R vs Python: An Essay 1271.
# 9 Post Secrets Of R For Advanced Students 1154.
# 10 Pre Secrets Of R For Advanced Students 965.
# 11 Post Top 10 Mistakes R Beginners Make 228.
# 12 Pre Top 10 Mistakes R Beginners Make 241.
## Books that improved in sales after the program were 'R For Dummies', 'R Made Easy', and 'Secrets Of R For Advanced Students'.
# Further judge program effectiveness in terms of sales by customer type
complete_copy_sales %>%
group_by(formatted_date, customer_type) %>%
summarize(
sum_books_purchased = sum(complete_total_purchased)
) %>%
arrange(customer_type, formatted_date)
## `summarise()` regrouping output by 'formatted_date' (override with `.groups` argument)
## # A tibble: 4 x 3
## # Groups: formatted_date [2]
## formatted_date customer_type sum_books_purchased
## <chr> <chr> <dbl>
## 1 Post Business 5742.
## 2 Pre Business 5612.
## 3 Post Individual 2448.
## 4 Pre Individual 2599.
# A tibble: 4 x 3
# Groups: formatted_date [2]
# formatted_date customer_type sum_books_purchased
# <chr> <chr> <dbl>
# 1 Post Business 5742.
# 2 Pre Business 5612.
# 3 Post Individual 2448.
# 4 Pre Individual 2599.
# Businesses actually purchased more books after the program while Individual sales decreased.
complete_copy_sales %>%
group_by(formatted_date) %>%
summarize(
total_positive_reviews = sum(is_positive_review)
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## formatted_date total_positive_reviews
## <chr> <int>
## 1 Post 1128
## 2 Pre 1134