# Always good practice to load tidyverse and dataset
library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.1
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the book reviews. The column types are declared explicitly (matching the
# specification readr previously guessed for this file) so the schema cannot
# silently change if the guessing heuristics or the leading rows of the file do.
reviews <- read_csv(
  "book_reviews.csv",
  col_types = cols(
    book = col_character(),
    review = col_character(),
    state = col_character(),
    price = col_double()
  )
)

# Get acquainted with our dataset

# How big is the dataset? dim() reports the row count followed by the column count.
dim(reviews)
## [1] 2000    4
# => 2000 rows and 4 columns

# Preview the first rows of the tibble
print(reviews)
## # A tibble: 2,000 x 4
##    book                               review    state      price
##    <chr>                              <chr>     <chr>      <dbl>
##  1 R Made Easy                        Excellent TX          20.0
##  2 R For Dummies                      Fair      NY          16.0
##  3 R Made Easy                        Excellent NY          20.0
##  4 R Made Easy                        Poor      FL          20.0
##  5 Secrets Of R For Advanced Students Great     Texas       50  
##  6 R Made Easy                        <NA>      California  20.0
##  7 R Made Easy                        Great     Florida     20.0
##  8 R Made Easy                        Poor      CA          20.0
##  9 Top 10 Mistakes R Beginners Make   Fair      CA          30.0
## 10 Secrets Of R For Advanced Students Fair      Texas       50  
## # ... with 1,990 more rows

# List the column names
names(reviews)
## [1] "book"   "review" "state"  "price"
# Determine the storage type of every column.
# Values computed inside a `for` body are not auto-printed, so the types are
# collected into one named character vector and printed explicitly.
print(vapply(reviews, typeof, character(1)))
# Show the distinct values found in each column, one column at a time
for (col_name in names(reviews)) {
  print("Unique values in the column:")
  print(col_name)
  distinct_values <- unique(reviews[[col_name]])
  print(distinct_values)
  print("")
}
## [1] "Unique values in the column:"
## [1] "book"
## [1] "R Made Easy"                        "R For Dummies"                     
## [3] "Secrets Of R For Advanced Students" "Top 10 Mistakes R Beginners Make"  
## [5] "Fundamentals of R For Beginners"   
## [1] ""
## [1] "Unique values in the column:"
## [1] "review"
## [1] "Excellent" "Fair"      "Poor"      "Great"     NA          "Good"     
## [1] ""
## [1] "Unique values in the column:"
## [1] "state"
## [1] "TX"         "NY"         "FL"         "Texas"      "California"
## [6] "Florida"    "CA"         "New York"  
## [1] ""
## [1] "Unique values in the column:"
## [1] "price"
## [1] 19.99 15.99 50.00 29.99 39.99
## [1] ""

# Data cleaning / processing: deal with missing data by either 1) removing the affected rows/columns or 2) filling in the missing values (imputation)

# Keep only the rows that have a review; `review` is the column with
# incomplete values.
Complete_Copy_reviews <- filter(reviews, !is.na(review))

# Check the dimensions of the filtered dataset
dim(Complete_Copy_reviews)
## [1] 1794    4
# => 1794 rows and 4 columns remain

# Continue cleaning the data by correcting inconsistent state labels

# Replace two-letter state abbreviations with the full state name so every
# state is spelled one way. Values without an entry in the lookup (including
# names that are already spelled out) are left untouched.
Complete_Copy_reviews <- Complete_Copy_reviews %>%
  mutate(
    state = coalesce(
      unname(
        c(
          TX = "Texas",
          NY = "New York",
          FL = "Florida",
          CA = "California"
        )[state]
      ),
      state
    )
  )

# Convert the review labels to numeric scores using mutate()

# Add a numeric score for each review label plus a flag for high reviews.
#   review_num     : 1 (Poor) to 5 (Excellent); labels not listed below map to NA
#   is_high_review : TRUE when the score is 4 or 5 ("Great" or "Excellent")
Complete_Copy_reviews <- Complete_Copy_reviews %>%
  mutate(
    review_num = case_when(
      review == "Poor" ~ 1,
      review == "Fair" ~ 2,
      review == "Good" ~ 3,
      review == "Great" ~ 4,
      review == "Excellent" ~ 5
    ),
    # The comparison is already a vectorised logical, so no if_else() wrapper
    # is needed to turn it into TRUE/FALSE.
    is_high_review = review_num >= 4
  )

# Quick analysis

# Summarise the reviews per state.
#   purchased   : number of rows (one row per reviewed purchase)
#   total_spent : sum of the `price` column for that state
# The question below is about money spent, so the table is ranked by
# total_spent rather than by the purchase count.
state_summary <- Complete_Copy_reviews %>%
  group_by(state) %>%
  summarize(
    purchased = n(),
    total_spent = sum(price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(-total_spent)

state_summary
# Purchase counts recorded on an earlier run, which ranked by count only:
##   New York 484, California 464, Texas 440, Florida 406
# Spend totals were not recorded then; re-run to see the total_spent column.

question <- "Which state spent the most on books?"

# Derive the answer from the summary instead of hard-coding it, so it always
# reflects the spend ranking (the first row wins if totals tie).
answer <- state_summary$state[1]