# Load the tidyverse, then read in the dataset
library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.1
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the book review data from the working directory; readr reports the
# column types it guessed while parsing.
reviews <- readr::read_csv("book_reviews.csv")
## Parsed with column specification:
## cols(
## book = col_character(),
## review = col_character(),
## state = col_character(),
## price = col_double()
## )
# How big is the dataset? dim() returns c(rows, columns).
reviews %>% dim()
## [1] 2000 4
# There are 2000 rows and 4 columns
# Preview the first rows of the tibble
print(reviews)
## # A tibble: 2,000 x 4
## book review state price
## <chr> <chr> <chr> <dbl>
## 1 R Made Easy Excellent TX 20.0
## 2 R For Dummies Fair NY 16.0
## 3 R Made Easy Excellent NY 20.0
## 4 R Made Easy Poor FL 20.0
## 5 Secrets Of R For Advanced Students Great Texas 50
## 6 R Made Easy <NA> California 20.0
## 7 R Made Easy Great Florida 20.0
## 8 R Made Easy Poor CA 20.0
## 9 Top 10 Mistakes R Beginners Make Fair CA 30.0
## 10 Secrets Of R For Advanced Students Fair Texas 50
## # ... with 1,990 more rows
# A tibble: 2,000 x 4
# book review state price
# <chr> <chr> <chr> <dbl>
# 1 R Made Easy Excellent TX 20.0
# 2 R For Dummies Fair NY 16.0
# 3 R Made Easy Excellent NY 20.0
# 4 R Made Easy Poor FL 20.0
# 5 Secrets Of R For Advanced Students Great Texas 50
# 6 R Made Easy NA California 20.0
# 7 R Made Easy Great Florida 20.0
# 8 R Made Easy Poor CA 20.0
# 9 Top 10 Mistakes R Beginners Make Fair CA 30.0
#10 Secrets Of R For Advanced Students Fair Texas 50
# ... with 1,990 more rows
# Determining column names
colnames(reviews)
## [1] "book" "review" "state" "price"
# Determining column types
# A bare typeof() call inside a for loop is evaluated but never auto-printed,
# so collect the storage type of every column into a named character vector
# and print it explicitly. vapply() also avoids a loop variable named `c`,
# which would shadow base::c.
column_types <- vapply(reviews, typeof, character(1))
print(column_types)
# List the distinct values present in each column, one column at a time
iwalk(reviews, function(column_values, column_name) {
  print("Unique values in the column:")
  print(column_name)
  print(unique(column_values))
  print("")
})
## [1] "Unique values in the column:"
## [1] "book"
## [1] "R Made Easy" "R For Dummies"
## [3] "Secrets Of R For Advanced Students" "Top 10 Mistakes R Beginners Make"
## [5] "Fundamentals of R For Beginners"
## [1] ""
## [1] "Unique values in the column:"
## [1] "review"
## [1] "Excellent" "Fair" "Poor" "Great" NA "Good"
## [1] ""
## [1] "Unique values in the column:"
## [1] "state"
## [1] "TX" "NY" "FL" "Texas" "California"
## [6] "Florida" "CA" "New York"
## [1] ""
## [1] "Unique values in the column:"
## [1] "price"
## [1] 19.99 15.99 50.00 29.99 39.99
## [1] ""
# Data cleaning / processing
# Missing data can be handled by 1) removing the affected rows/columns or
# 2) imputing values. Here the rows whose `review` is NA are removed.
Complete_Copy_reviews <- drop_na(reviews, review)
# Determine the dimensions of the filtered dataset
Complete_Copy_reviews %>% dim()
## [1] 1794 4
# There are 1794 rows and 4 columns
# Spell out two-letter state abbreviations so each state has a single label
state_full_names <- c(
  TX = "Texas",
  NY = "New York",
  FL = "Florida",
  CA = "California"
)
Complete_Copy_reviews <- Complete_Copy_reviews %>%
  mutate(
    # Look up the full name; values without a match in the table (i.e. not
    # one of the four abbreviations) fall back to the original value.
    state = coalesce(unname(state_full_names[state]), state)
  )
# Convert the text rating into a 1-5 numeric score and flag high ratings
Complete_Copy_reviews <- Complete_Copy_reviews %>%
  mutate(
    # Any rating not listed below becomes NA
    review_num = case_when(
      review == "Poor" ~ 1,
      review == "Fair" ~ 2,
      review == "Good" ~ 3,
      review == "Great" ~ 4,
      review == "Excellent" ~ 5
    ),
    # The comparison already yields a logical vector (TRUE / FALSE / NA),
    # so wrapping it in if_else(..., TRUE, FALSE) is unnecessary.
    is_high_review = review_num >= 4
  )
# Count the number of purchases (rows) per state, largest count first
Complete_Copy_reviews %>%
  group_by(state) %>%
  summarise(purchased = n()) %>%
  arrange(desc(purchased))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 2
## state purchased
## <chr> <int>
## 1 New York 484
## 2 California 464
## 3 Texas 440
## 4 Florida 406
question <- "Which state spent the most on books?"
# Money spent is the sum of book prices per state; the number of purchases
# alone does not measure spending. Derive the answer from the data instead of
# hard-coding it, so it stays correct if the dataset changes.
spend_by_state <- Complete_Copy_reviews %>%
  group_by(state) %>%
  summarize(total_spent = sum(price), .groups = "drop")
answer <- spend_by_state$state[which.max(spend_by_state$total_spent)]