# Always good practice to load tidyverse and dataset
library(tidyverse)

## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.1
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

sales <- read_csv("sales2019.csv")

## Parsed with column specification:
## cols(
##   date = col_character(),
##   user_submitted_review = col_character(),
##   title = col_character(),
##   total_purchased = col_double(),
##   customer_type = col_character()
## )

#Any other library that we need for this script?
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Get acquainted with our Dataset

# Determine How big is the dataset?
dim(sales)

## [1] 5000    5

# There are 5000 rows and 5 columns

# Determining column names
colnames(sales)

## [1] "date"                  "user_submitted_review" "title"                
## [4] "total_purchased"       "customer_type"

# Determining column types
for (col in colnames(sales)) {
  typeof(sales[[col]]) %>% print
}

## [1] "character"
## [1] "character"
## [1] "character"
## [1] "double"
## [1] "character"

# Column Specification: 
# cols (
#   date = col_character(),
#   user_submitted review = col_character()
#   title = col_character,
#   total_purchased = col_double(),
#   customer_type = col_character()

# From looking at the dataset, user_submitted_review and total_purchased seem to have missing data.

# Also from looking at the dataset, we notice what the data columns seem to represent:
## date is date
## user_submitted_review is how users felt after reading.
## title is for title of the book
## total purchased is how many copies the customer bought
## customer_type represents whether it was an individual or business entity that purchased.

#Data Cleaning / Processing ## Dealing with missing data by either 1) removing columns/rows or 2) filling in, or imputation

complete_copy_sales <- sales %>%
  filter(
    !(is.na(user_submitted_review)) # note we are filtering user_submitted_review column as it has incomplete values
  )

# determine new dimensions of filtered dataset
dim(complete_copy_sales)

## [1] 4115    5

# There are 4115 rows and 5 columns

# calculate the average number of books purchased on an order
purchase_mean <- complete_copy_sales %>% 
  filter(!is.na(total_purchased)) %>% 
  pull(total_purchased) %>% 
  mean

# Fill all of the missing values in total_purchased with the calculated mean

complete_copy_sales <- complete_copy_sales %>%
  mutate(
    complete_total_purchased = if_else(is.na(total_purchased),
                                       purchase_mean,
                                       total_purchased)
    )

Classify reviews as either positive or negative

# Examine the unique sentences that are present in user_submitted_review

complete_copy_sales %>% pull(user_submitted_review) %>% unique

## [1] "it was okay"                         
## [2] "Awesome!"                            
## [3] "Hated it"                            
## [4] "Never read a better book"            
## [5] "OK"                                  
## [6] "The author's other books were better"
## [7] "A lot of material was not needed"    
## [8] "Would not recommend"                 
## [9] "I learned a lot"

## unique values of user_submitted reviews are below

# [1] "it was okay"                          "Awesome!"                            
# [3] "Hated it"                             "Never read a better book"            
# [5] "OK"                                   "The author's other books were better"
# [7] "A lot of material was not needed"     "Would not recommend"                 
# [9] "I learned a lot

# Detect specific words or phrases that help indicate if the review is positive or not

is_positive_review <- function(review)
  positive_review = case_when(
      str_detect(review, "okay") ~ TRUE,
      str_detect(review, "Awesome") ~ TRUE,
      str_detect(review, "Never") ~ TRUE,
      str_detect(review, "OK") ~ TRUE,
      str_detect(review, "a lot") ~ TRUE,
      TRUE ~ FALSE # the review is negative
      )

# Create a new column in the dataset that indicates whether or not the review in a given row is positive or not

complete_copy_sales <- complete_copy_sales %>% 
  mutate(
    is_positive_review = unlist(map(user_submitted_review, is_positive_review))
  )

Was the new book program effective in increasing book sales?

First is properly format all dates

Second is distinguish between sales before and after program start

complete_copy_sales <- complete_copy_sales %>% 
  mutate(
    formatted_date = if_else(mdy(date) < ymd("2019/07/01"), "Pre", "Post")
  )

# Using group_by() and summarize() to create a summary tibble
complete_copy_sales %>%
  group_by(formatted_date) %>%
  summarize(
    sum_books_purchased = sum(complete_total_purchased)
  )

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 2
##   formatted_date sum_books_purchased
##   <chr>                        <dbl>
## 1 Post                         8190.
## 2 Pre                          8211.

# A tibble: 2 x 2
#  formatted_date sum_books_purchased
#  <chr>                        <dbl>
# 1 Post                         8190.
# 2 Pre                          8211.

Examine Pre- and Post- sales by Book Title

# judge whether or not the program was actually effective in terms of increasing the number of book sold
complete_copy_sales %>%
  group_by(formatted_date, title) %>%
  summarize(
    sum_books_purchased = sum(complete_total_purchased)
  ) %>%
  arrange(title, formatted_date)

## `summarise()` regrouping output by 'formatted_date' (override with `.groups` argument)

## # A tibble: 12 x 3
## # Groups:   formatted_date [2]
##    formatted_date title                              sum_books_purchased
##    <chr>          <chr>                                            <dbl>
##  1 Post           Fundamentals of R For Beginners                  2832.
##  2 Pre            Fundamentals of R For Beginners                  3093.
##  3 Post           R For Dummies                                    2779.
##  4 Pre            R For Dummies                                    2626.
##  5 Post           R Made Easy                                        24 
##  6 Pre            R Made Easy                                        15 
##  7 Post           R vs Python: An Essay                            1172.
##  8 Pre            R vs Python: An Essay                            1271.
##  9 Post           Secrets Of R For Advanced Students               1154.
## 10 Pre            Secrets Of R For Advanced Students                965.
## 11 Post           Top 10 Mistakes R Beginners Make                  228.
## 12 Pre            Top 10 Mistakes R Beginners Make                  241.

# A tibble: 12 x 3
# Groups:   formatted_date [2]
#   formatted_date title                              sum_books_purchased
#   <chr>          <chr>                                            <dbl>
# 1 Post           Fundamentals of R For Beginners                  2832.
# 2 Pre            Fundamentals of R For Beginners                  3093.
# 3 Post           R For Dummies                                    2779.
# 4 Pre            R For Dummies                                    2626.
# 5 Post           R Made Easy                                        24 
# 6 Pre            R Made Easy                                        15 
# 7 Post           R vs Python: An Essay                            1172.
# 8 Pre            R vs Python: An Essay                            1271.
# 9 Post           Secrets Of R For Advanced Students               1154.
# 10 Pre            Secrets Of R For Advanced Students                965.
# 11 Post           Top 10 Mistakes R Beginners Make                  228.
# 12 Pre            Top 10 Mistakes R Beginners Make                  241.

## Books that improved in sales after the program were 'R For Dummies', 'R Made Easy', and 'Secrets Of R For Advanced Students'.

Examine Pre- and Post- sales by customer type

# Further judge program effectiveness in terms of sales by customer type
complete_copy_sales %>% 
  group_by(formatted_date, customer_type) %>% 
  summarize(
    sum_books_purchased = sum(complete_total_purchased)
  ) %>% 
  arrange(customer_type, formatted_date)

## `summarise()` regrouping output by 'formatted_date' (override with `.groups` argument)

## # A tibble: 4 x 3
## # Groups:   formatted_date [2]
##   formatted_date customer_type sum_books_purchased
##   <chr>          <chr>                       <dbl>
## 1 Post           Business                    5742.
## 2 Pre            Business                    5612.
## 3 Post           Individual                  2448.
## 4 Pre            Individual                  2599.

# A tibble: 4 x 3
# Groups:   formatted_date [2]
#  formatted_date customer_type sum_books_purchased
#  <chr>          <chr>                       <dbl>
# 1 Post           Business                    5742.
# 2 Pre            Business                    5612.
# 3 Post           Individual                  2448.
# 4 Pre            Individual                  2599.

# Businesses actually purchased more books after the program while Individual sales decreased.

Did review scores improve as a result of the program?

complete_copy_sales %>%
  group_by(formatted_date) %>%
  summarize(
    total_positive_reviews = sum(is_positive_review)
  )

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 2
##   formatted_date total_positive_reviews
##   <chr>                           <int>
## 1 Post                             1128
## 2 Pre                              1134

There was no significant change in the number of positive reviews as a result of the program. Technically, there was a decrease…

Guided Project: Creating An Efficient Data Analysis Workflow, Part 2

Earnest Salgado

8/6/2020

Get acquainted with our Dataset

Classify reviews as either positive or negative

Was the new book program effective in increasing book sales?

First is properly format all dates

Second is distinguish between sales before and after program start

Examine Pre- and Post- sales by Book Title

Examine Pre- and Post- sales by customer type

Did review scores improve as a result of the program?

There was no significant change in the number of positive reviews as a result of the program. Technically, there was a decrease…