# Always good practice to load tidyverse and dataset
library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.1
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
sales <- read_csv("sales2019.csv")
## Parsed with column specification:
## cols(
##   date = col_character(),
##   user_submitted_review = col_character(),
##   title = col_character(),
##   total_purchased = col_double(),
##   customer_type = col_character()
## )
#Any other library that we need for this script?
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Get acquainted with our Dataset

# Determine How big is the dataset?
dim(sales)
## [1] 5000    5
# There are 5000 rows and 5 columns
# Determining column names
colnames(sales)
## [1] "date"                  "user_submitted_review" "title"                
## [4] "total_purchased"       "customer_type"
# Determining column types
for (col in colnames(sales)) {
  typeof(sales[[col]]) %>% print
}
## [1] "character"
## [1] "character"
## [1] "character"
## [1] "double"
## [1] "character"
# Column Specification: 
# cols (
#   date = col_character(),
#   user_submitted review = col_character()
#   title = col_character,
#   total_purchased = col_double(),
#   customer_type = col_character()

# From looking at the dataset, user_submitted_review and total_purchased seem to have missing data.

# Also from looking at the dataset, we notice what the data columns seem to represent:
## date is date
## user_submitted_review is how users felt after reading.
## title is for title of the book
## total purchased is how many copies the customer bought
## customer_type represents whether it was an individual or business entity that purchased.

#Data Cleaning / Processing ## Dealing with missing data by either 1) removing columns/rows or 2) filling in, or imputation

complete_copy_sales <- sales %>%
  filter(
    !(is.na(user_submitted_review)) # note we are filtering user_submitted_review column as it has incomplete values
  )

# determine new dimensions of filtered dataset
dim(complete_copy_sales)
## [1] 4115    5
# There are 4115 rows and 5 columns
# calculate the average number of books purchased on an order
purchase_mean <- complete_copy_sales %>% 
  filter(!is.na(total_purchased)) %>% 
  pull(total_purchased) %>% 
  mean

# Fill all of the missing values in total_purchased with the calculated mean

complete_copy_sales <- complete_copy_sales %>%
  mutate(
    complete_total_purchased = if_else(is.na(total_purchased),
                                       purchase_mean,
                                       total_purchased)
    )

Classify reviews as either positive or negative

# Examine the unique sentences that are present in user_submitted_review

complete_copy_sales %>% pull(user_submitted_review) %>% unique
## [1] "it was okay"                         
## [2] "Awesome!"                            
## [3] "Hated it"                            
## [4] "Never read a better book"            
## [5] "OK"                                  
## [6] "The author's other books were better"
## [7] "A lot of material was not needed"    
## [8] "Would not recommend"                 
## [9] "I learned a lot"
## unique values of user_submitted reviews are below

# [1] "it was okay"                          "Awesome!"                            
# [3] "Hated it"                             "Never read a better book"            
# [5] "OK"                                   "The author's other books were better"
# [7] "A lot of material was not needed"     "Would not recommend"                 
# [9] "I learned a lot
# Detect specific words or phrases that help indicate if the review is positive or not

is_positive_review <- function(review)
  positive_review = case_when(
      str_detect(review, "okay") ~ TRUE,
      str_detect(review, "Awesome") ~ TRUE,
      str_detect(review, "Never") ~ TRUE,
      str_detect(review, "OK") ~ TRUE,
      str_detect(review, "a lot") ~ TRUE,
      TRUE ~ FALSE # the review is negative
      )
# Create a new column in the dataset that indicates whether or not the review in a given row is positive or not

complete_copy_sales <- complete_copy_sales %>% 
  mutate(
    is_positive_review = unlist(map(user_submitted_review, is_positive_review))
  )

Was the new book program effective in increasing book sales?

First is properly format all dates

Second is distinguish between sales before and after program start

complete_copy_sales <- complete_copy_sales %>% 
  mutate(
    formatted_date = if_else(mdy(date) < ymd("2019/07/01"), "Pre", "Post")
  )

# Using group_by() and summarize() to create a summary tibble
complete_copy_sales %>%
  group_by(formatted_date) %>%
  summarize(
    sum_books_purchased = sum(complete_total_purchased)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
##   formatted_date sum_books_purchased
##   <chr>                        <dbl>
## 1 Post                         8190.
## 2 Pre                          8211.
# A tibble: 2 x 2
#  formatted_date sum_books_purchased
#  <chr>                        <dbl>
# 1 Post                         8190.
# 2 Pre                          8211.

Examine Pre- and Post- sales by Book Title

# judge whether or not the program was actually effective in terms of increasing the number of book sold
complete_copy_sales %>%
  group_by(formatted_date, title) %>%
  summarize(
    sum_books_purchased = sum(complete_total_purchased)
  ) %>%
  arrange(title, formatted_date)
## `summarise()` regrouping output by 'formatted_date' (override with `.groups` argument)
## # A tibble: 12 x 3
## # Groups:   formatted_date [2]
##    formatted_date title                              sum_books_purchased
##    <chr>          <chr>                                            <dbl>
##  1 Post           Fundamentals of R For Beginners                  2832.
##  2 Pre            Fundamentals of R For Beginners                  3093.
##  3 Post           R For Dummies                                    2779.
##  4 Pre            R For Dummies                                    2626.
##  5 Post           R Made Easy                                        24 
##  6 Pre            R Made Easy                                        15 
##  7 Post           R vs Python: An Essay                            1172.
##  8 Pre            R vs Python: An Essay                            1271.
##  9 Post           Secrets Of R For Advanced Students               1154.
## 10 Pre            Secrets Of R For Advanced Students                965.
## 11 Post           Top 10 Mistakes R Beginners Make                  228.
## 12 Pre            Top 10 Mistakes R Beginners Make                  241.
# A tibble: 12 x 3
# Groups:   formatted_date [2]
#   formatted_date title                              sum_books_purchased
#   <chr>          <chr>                                            <dbl>
# 1 Post           Fundamentals of R For Beginners                  2832.
# 2 Pre            Fundamentals of R For Beginners                  3093.
# 3 Post           R For Dummies                                    2779.
# 4 Pre            R For Dummies                                    2626.
# 5 Post           R Made Easy                                        24 
# 6 Pre            R Made Easy                                        15 
# 7 Post           R vs Python: An Essay                            1172.
# 8 Pre            R vs Python: An Essay                            1271.
# 9 Post           Secrets Of R For Advanced Students               1154.
# 10 Pre            Secrets Of R For Advanced Students                965.
# 11 Post           Top 10 Mistakes R Beginners Make                  228.
# 12 Pre            Top 10 Mistakes R Beginners Make                  241.

## Books that improved in sales after the program were 'R For Dummies', 'R Made Easy', and 'Secrets Of R For Advanced Students'.

Examine Pre- and Post- sales by customer type

# Further judge program effectiveness in terms of sales by customer type
complete_copy_sales %>% 
  group_by(formatted_date, customer_type) %>% 
  summarize(
    sum_books_purchased = sum(complete_total_purchased)
  ) %>% 
  arrange(customer_type, formatted_date)
## `summarise()` regrouping output by 'formatted_date' (override with `.groups` argument)
## # A tibble: 4 x 3
## # Groups:   formatted_date [2]
##   formatted_date customer_type sum_books_purchased
##   <chr>          <chr>                       <dbl>
## 1 Post           Business                    5742.
## 2 Pre            Business                    5612.
## 3 Post           Individual                  2448.
## 4 Pre            Individual                  2599.
# A tibble: 4 x 3
# Groups:   formatted_date [2]
#  formatted_date customer_type sum_books_purchased
#  <chr>          <chr>                       <dbl>
# 1 Post           Business                    5742.
# 2 Pre            Business                    5612.
# 3 Post           Individual                  2448.
# 4 Pre            Individual                  2599.

# Businesses actually purchased more books after the program while Individual sales decreased.

Did review scores improve as a result of the program?

complete_copy_sales %>%
  group_by(formatted_date) %>%
  summarize(
    total_positive_reviews = sum(is_positive_review)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
##   formatted_date total_positive_reviews
##   <chr>                           <int>
## 1 Post                             1128
## 2 Pre                              1134

There was no significant change in the number of positive reviews as a result of the program. Technically, there was a decrease…