# Load the bestsellers data set.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 read.csv() keeps text
# columns as character by default, but the later is.factor() column selection
# in this analysis needs Name, Author and Genre to be factors.
books_Df <- read.csv(
  "C:/Users/PC/Documents/R_4DS/Books/bestsellers with categories.csv",
  stringsAsFactors = TRUE
)
str(books_Df)
## 'data.frame':    550 obs. of  7 variables:
##  $ Name       : Factor w/ 351 levels "10-Day Green Smoothie Cleanse",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Author     : Factor w/ 248 levels "Abraham Verghese",..: 125 220 135 96 175 97 97 13 115 90 ...
##  $ User.Rating: num  4.7 4.6 4.7 4.7 4.8 4.4 4.7 4.7 4.7 4.6 ...
##  $ Reviews    : int  17350 2052 18979 21424 7665 12643 19735 19699 5983 23848 ...
##  $ Price      : int  8 22 15 6 12 11 30 15 3 8 ...
##  $ Year       : int  2016 2011 2018 2017 2019 2011 2014 2017 2018 2016 ...
##  $ Genre      : Factor w/ 2 levels "Fiction","Non Fiction": 2 1 2 1 2 1 1 1 2 1 ...
# Emit two newlines so the summary is visually separated from the str() output.
cat("\n\n")
# Per-column overview: level counts for factors, quartiles and mean for numerics.
summary(books_Df)
##                                                                            Name    
##  Publication Manual of the American Psychological Association, 6th Edition   : 10  
##  StrengthsFinder 2.0                                                         :  9  
##  Oh, the Places You'll Go!                                                   :  8  
##  The 7 Habits of Highly Effective People: Powerful Lessons in Personal Change:  7  
##  The Very Hungry Caterpillar                                                 :  7  
##  Jesus Calling: Enjoying Peace in His Presence (with Scripture References)   :  6  
##  (Other)                                                                     :503  
##                                 Author     User.Rating       Reviews     
##  Jeff Kinney                       : 12   Min.   :3.300   Min.   :   37  
##  Gary Chapman                      : 11   1st Qu.:4.500   1st Qu.: 4058  
##  Rick Riordan                      : 11   Median :4.700   Median : 8580  
##  Suzanne Collins                   : 11   Mean   :4.618   Mean   :11953  
##  American Psychological Association: 10   3rd Qu.:4.800   3rd Qu.:17253  
##  Dr. Seuss                         :  9   Max.   :4.900   Max.   :87841  
##  (Other)                           :486                                  
##      Price            Year              Genre    
##  Min.   :  0.0   Min.   :2009   Fiction    :240  
##  1st Qu.:  7.0   1st Qu.:2011   Non Fiction:310  
##  Median : 11.0   Median :2014                    
##  Mean   : 13.1   Mean   :2014                    
##  3rd Qu.: 16.0   3rd Qu.:2017                    
##  Max.   :105.0   Max.   :2019                    
## 
## Missing Data
# Number of missing values in each column.
vapply(books_Df, function(column) sum(is.na(column)), numeric(1))
##        Name      Author User.Rating     Reviews       Price        Year 
##           0           0           0           0           0           0 
##       Genre 
##           0

Data Distribution

# Move Price to the last column position.
books_Df <- select(books_Df, -Price, Price)

# Quartiles and mean of Price.
summary(books_Df[["Price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.0    11.0    13.1    16.0   105.0
# Histogram of Price using ggplot2's default binning.
ggplot(books_Df, aes(x = Price)) +
  geom_histogram(color = "white") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Base-graphics boxplot of the same column.
boxplot(books_Df[["Price"]])

# {r} # books_Df %>% # count(Name) %>% # mutate(pct = n / sum(Price), # pctlabel = paste0(round(pct * 100), "%")) # # # # # {r Top Book Sales} # # Fancy Cleveland plot for Top Sales # plotdata <- books_Df %>% # # # ggplot(plotdata, # aes(x=lifeExp, # y=reorder(country, lifeExp))) + # geom_point(color="blue", # size = 2) + # geom_segment(aes(x = 40, # xend = lifeExp, # y = reorder(country, lifeExp), # yend = reorder(country, lifeExp)), # color = "lightgrey") + # labs (x = "Life Expectancy (years)", # y = "", # title = "Life Expectancy by Country", # subtitle = "GapMinder data for Asia - 2007") + # theme_minimal() + # theme(panel.grid.major = element_blank(), # panel.grid.minor = element_blank()) #

# Keep only the numeric columns of the data set.
numericVars <- Filter(is.numeric, books_Df)

# Quartiles and mean of every numeric column.
summary(numericVars)
##   User.Rating       Reviews           Year          Price      
##  Min.   :3.300   Min.   :   37   Min.   :2009   Min.   :  0.0  
##  1st Qu.:4.500   1st Qu.: 4058   1st Qu.:2011   1st Qu.:  7.0  
##  Median :4.700   Median : 8580   Median :2014   Median : 11.0  
##  Mean   :4.618   Mean   :11953   Mean   :2014   Mean   : 13.1  
##  3rd Qu.:4.800   3rd Qu.:17253   3rd Qu.:2017   3rd Qu.: 16.0  
##  Max.   :4.900   Max.   :87841   Max.   :2019   Max.   :105.0
## Visual representation of Numerical data using Boxplots and Outliers
library(tidyr)
# Reshape to long form: one row per (variable, value) pair, so every numeric
# column can be drawn in its own facet.
numericVars <- numericVars %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "values"
  )

# One boxplot per variable; free scales because the columns have very
# different ranges.
# theme_minimal() is a complete theme, so it has to come BEFORE theme():
# applied afterwards it would discard the strip and text-size settings.
numericVars %>%
  ggplot() +
  geom_boxplot(aes(x = variable, y = values)) +
  facet_wrap(~variable, ncol = 6, scales = "free") +
  theme_minimal() +
  theme(strip.text.x = element_blank(), text = element_text(size = 9))

I find the difference between Reviews and User Rating interesting: Reviews typically start low and cluster there, only rarely going high, whereas User Ratings do the reverse, starting high and only rarely dropping - these rare occurrences are our outliers. One reason for this might be that User Ratings are bounded between 0 and 5, whereas the number of Reviews has no upper limit.

# Keep only the factor (categorical) columns of the data set.
factorVars <- Filter(is.factor, books_Df)

## Plot the level counts of every column in a data frame of factor variables.
##
## .data  data frame; one horizontal bar chart is printed per column.
##
## Called for its side effect (printing the plots); returns NULL invisibly.
auto_factor_plot <- function(.data) {
  for (col_name in names(.data)) {
    # Inside aes(), `.data` is ggplot2's data pronoun, so the column can be
    # looked up by its name held in `col_name` (replaces deprecated
    # aes_string()).
    # geom_bar() counts rows per level by default; geom_histogram() with
    # stat = "count" drew the same bars but warned about unused bin parameters.
    # NOTE(review): the title says "Percentage" but the bars show raw counts.
    level_counts <- ggplot(.data, aes(x = .data[[col_name]])) +
      geom_bar(alpha = .5, fill = "black") +
      ggtitle("Percentage of Market Data by Categorical Variables") +
      labs(x = col_name) +
      coord_flip() +
      theme_minimal()

    print(level_counts)
  }

  invisible(NULL)
}

## Only Genre is plotted: the other factor columns have too many levels to read.
auto_factor_plot(factorVars["Genre"])
## Warning: Ignoring unknown parameters: binwidth, bins, pad

Bi-Variate Relationships

Factor by Factor Variables

## Plot the joint distribution of two factor columns as a 100% stacked bar
## chart, labelling every segment with its share of the bar.
##
## .data        data frame containing both columns
## factorVar_1  bare column name; one bar per level (x axis)
## factorVar_2  bare column name; one coloured segment per level (fill)
##
## Prints the summary table and the chart; the chart is returned invisibly.
factor_factor_bi_plot <- function(.data, factorVar_1, factorVar_2) {
  # Column names as text, so the axis and legend titles describe the columns
  # that were actually passed in.
  x_title <- deparse(substitute(factorVar_1))
  fill_title <- deparse(substitute(factorVar_2))

  # Share of each factorVar_2 level within every factorVar_1 level.
  # "drop_last" keeps the factorVar_1 grouping so sum(n) is taken per bar;
  # the result is ungrouped afterwards so no grouping leaks out.
  plotdata <- .data %>%
    group_by({{ factorVar_1 }}, {{ factorVar_2 }}) %>%
    summarize(n = n(), .groups = "drop_last") %>%
    mutate(pct = n / sum(n),
           lbl = scales::percent(pct)) %>%
    ungroup()
  print(plotdata)

  # Segmented bar chart with a percentage label centred in each segment.
  segmented_bars <- ggplot(plotdata,
                           aes(x = factor({{ factorVar_1 }}),
                               y = pct,
                               fill = factor({{ factorVar_2 }}))) +
    geom_col(position = "fill") +
    scale_y_continuous(breaks = seq(0, 1, .2),
                       labels = scales::percent) +
    geom_text(aes(label = lbl),
              size = 3,
              position = position_stack(vjust = 0.5)) +
    scale_fill_brewer(palette = "Set2") +
    labs(y = "Percentage Distribution with Categorical Variables",
         fill = fill_title,
         x = x_title,
         title = "Books DataSet by Categories") +
    theme_minimal()

  print(segmented_bars)
}

## Example
# factor_factor_bi_plot(books_Df, Author, Genre ) ## Too much data!

Factor By Numeric Variables

# calculate the mean of a numeric variable for each level of a factor
library(dplyr)
# Bar chart of the mean of a numeric column within each level of a factor.
#
# .data      data frame containing both columns
# numVar     bare name of the numeric column to average
# factorVar  bare name of the grouping column (x axis)
#
# Prints the chart, with each bar labelled by its mean rounded to 2 decimals.
factor_num_bi_plot <- function(.data, numVar, factorVar) {
  by_level <- group_by(.data, {{ factorVar }})
  level_means <- summarize(by_level, mean_num = mean({{ numVar }}))

  # NOTE(review): this attaches scales to the search path for the rest of the
  # session as a side effect of calling the function; code run after this call
  # may rely on that, so it is kept here.
  library(scales)

  # For Price, the bar labels could use dollar(mean_num) instead of round().
  bar_labels <- geom_text(aes(label = round(mean_num, 2)), vjust = -0.25)

  bar_chart <- ggplot(level_means, aes(x = {{ factorVar }}, y = mean_num)) +
    geom_col() +
    bar_labels +
    labs(title = "Mean Numeric Variable by Factor", x = "", y = "") +
    theme_minimal()

  print(bar_chart)
}

## Sample Bi-Plot using a Numeric and a Factor Variable:
## mean Price for each Genre
factor_num_bi_plot(books_Df, Price, Genre)
## `summarise()` ungrouping output (override with `.groups` argument)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor

Quantitative / Numeric by Numeric 1

# Scatterplot of two numeric columns with a quadratic line of best fit.
#
# .data     data frame containing both columns
# numVar_1  bare column name mapped to the x axis
# numVar_2  bare column name mapped to the y axis
#
# Prints the plot.
num_num_bi_plot <- function(.data, numVar_1, numVar_2) {
  axes <- aes(x = {{ numVar_1 }}, y = {{ numVar_2 }})
  points_layer <- geom_point(color = "darkblue")
  # Linear model on a degree-2 polynomial of x, i.e. a quadratic trend.
  trend_layer <- geom_smooth(method = "lm",
                             formula = y ~ poly(x, 2),
                             color = "indianred3")

  scatter <- ggplot(.data, axes) +
    points_layer +
    trend_layer +
    theme_minimal()

  print(scatter)
}

# Price (y) against Reviews (x), with the quadratic fit drawn by num_num_bi_plot()
num_num_bi_plot(books_Df, Reviews, Price)

It is strange that the lower the number of Reviews, the higher the Price tends to be. The long regression line is due to the outliers we have in the Reviews data. To measure this relationship more accurately, we will now perform a pairwise correlation analysis.

Quantitative / Numeric by Numeric 2

# Numeric columns only, as input for the correlation matrix.
df_num <- Filter(is.numeric, books_Df)

# Pairwise correlations; the `use` option is spelled out in full rather than
# relying on abbreviation matching of "pairwise".
corr <- cor(df_num, use = "pairwise.complete.obs")
round(corr, 2)
##             User.Rating Reviews  Year Price
## User.Rating        1.00    0.00  0.24 -0.13
## Reviews            0.00    1.00  0.26 -0.11
## Year               0.24    0.26  1.00 -0.15
## Price             -0.13   -0.11 -0.15  1.00

Visualise Correlation

library(ggcorrplot)

# Lower-triangle correlation plot with the coefficients printed in each cell.
# The stray empty argument after `corr` (a doubled comma) has been removed.
ggcorrplot(corr,
           hc.order = TRUE,
           type = "lower",
           lab = TRUE)

Multi Variate Relationship

Linear regression analysis, where Price is the numeric dependent variable (Y).

# Fit a linear model of Price on every remaining column.
# Name and Author are dropped from the data before fitting.
books_lm <- lm(Price ~ ., 
                data = books_Df %>% select(-c(Name, Author)))

# Coefficient table and overall fit statistics for the model.
summary(books_lm)
## 
## Call:
## lm(formula = Price ~ ., data = books_Df %>% select(-c(Name, Author)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.687  -5.154  -2.088   2.380  89.698 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       9.280e+02  3.065e+02   3.028 0.002576 ** 
## User.Rating      -3.918e+00  2.069e+00  -1.893 0.058828 .  
## Reviews          -2.579e-05  4.182e-05  -0.617 0.537636    
## Year             -4.462e-01  1.535e-01  -2.907 0.003795 ** 
## GenreNon Fiction  3.672e+00  9.588e-01   3.830 0.000143 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.52 on 545 degrees of freedom
## Multiple R-squared:  0.06474,    Adjusted R-squared:  0.05788 
## F-statistic: 9.432 on 4 and 545 DF,  p-value: 2.233e-07
# conditional plot of Price vs. Year
library(ggplot2)
library(visreg)
visreg(books_lm, "Year", gg = TRUE)

# conditional plot of Price vs. Genre
# `labels` is written out in full and `dollar` is namespace-qualified, so this
# no longer depends on partial argument matching or on scales being attached.
visreg(books_lm, "Genre", gg = TRUE) +
  scale_y_continuous(labels = scales::dollar) +
  labs(title = "Relationship between Price and Genre",
       subtitle = "controlling for other possible predictors",
       caption = "source: Kaggle Books DATA",
       y = "Book Price",
       x = "Genre")

Non-Fiction books are priced slightly higher than Fiction books (about $3.67 on average, controlling for the other predictors).