Reading in a csv file

If I want to ‘filter’ and ‘select’ data, I have to run the command twice

filtered_data <- filter(sales_data, STATE == 'NY' & PRODUCTLINE == 'Classic Cars')

## notice the first argument is filtered_data
select_data <- select(filtered_data, QUANTITYORDERED, PRICEEACH)

## notice the first argument is select_data
arrange_data <- arrange(select_data, PRICEEACH) 

It’s kind of tedious to have to run the command twice, so we will use a concept called piping (%>%) Piping is sending the output of one function into the input of another. The output will be the first argument of the next function The same command above can be written like this:

piped_data <- sales_data %>%
  filter(STATE == 'NY' & PRODUCTLINE == 'Classic Cars') %>%
  select(QUANTITYORDERED, PRICEEACH) %>%
  arrange(PRICEEACH)
  1. mutate() function create your own columns using mutate Same as above but using piping
mutated_data <- mutate(sales_data, discounted = 0.95 * SALES)

## same as above but using piping
mutated_data <- sales_data %>%
  mutate(discounted = 0.95 * SALES)

YOUR TURN!

sales_data <- read.csv('sales_data_sample.csv') # Reread the data in case you made any changes to it
## Q1: what is the most common deal size (column name: DEALSIZE)?
summary(sales_data$DEALSIZE)
##  Large Medium  Small 
##    157   1384   1282
## Q2: what is the average quantity ordered? (HINT: Can also use mean function)
mean(sales_data$QUANTITYORDERED)
## [1] 35.09281
summary(sales_data$QUANTITYORDERED)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00   27.00   35.00   35.09   43.00   97.00
## Q3: create a new dataset called q3_data with 
##       - a new column called MSRP_REV which is equal to the MSRP * QUANTITYORDERED
##       - filtered to only have 'Large' sized deals 
##       - with only the selected columns ORDERNUMBER, QUANTITYORDERED, PRICEEACH, MSRP, SALES, MSRP_REV
##       - ordered in descending order by SALES
q3_data <- sales_data %>% 
  mutate(MSRP_REV = MSRP * QUANTITYORDERED) %>% 
  filter(DEALSIZE == 'Large') %>% 
  select(ORDERNUMBER, QUANTITYORDERED, PRICEEACH, MSRP, SALES, MSRP_REV) %>% 
  arrange(-SALES)
## 5. group_by() and summarise() function
##    group_by() and summarise() will help us solve questions such as, what are the total sales by country?
grouped_data <- group_by(sales_data, COUNTRY)
summarised_data <- summarise(grouped_data, total_sales = sum(SALES))

## same as above, But using piping
summarised_data <- sales_data %>%
  group_by(COUNTRY) %>%
  summarise(total_sales = sum(SALES))
## Instead of sum(), can also do max(), min(), mean(), n() for count, and others

## Q4: what is the average SALE by PRODUCTLINE?
summarised_data <- sales_data %>% 
  group_by(PRODUCTLINE) %>% 
  summarise(average_sales = mean(SALES))
## create a simple dot plot
ggplot(sales_data, aes(x = QUANTITYORDERED, y = SALES)) +
  geom_point() +
  theme_classic()

## change color
ggplot(sales_data, aes(x = QUANTITYORDERED, y = SALES)) +
  geom_point(aes(color = 'red')) +
  theme_classic()

## add labels and remove legend
ggplot(sales_data, aes(x = QUANTITYORDERED, y = SALES)) +
  geom_point(aes(color = 'red')) +
  labs(title = 'Sales and Quantity Ordered',
       y = 'Unit Price ($)',
       x = 'Quantity Ordered (Units)') + 
  theme(legend.position="none") +
  theme_classic()

## add regression line
ggplot(sales_data, aes(x = QUANTITYORDERED, y = SALES)) +
  geom_point(aes(color = 'red')) +
  labs(title = 'Sales and Quantity Ordered',
       y = 'Unit Price ($)',
       x = 'Quantity Ordered (Units)') + 
  theme(legend.position = "none")+
  geom_smooth(method = "lm") +
  theme_classic()

## bar charts
## will first create a grouped by and summarised dataset
status_data <- sales_data %>%
  group_by(STATUS) %>%
  summarise(total_count = n()) %>%
  select(STATUS, total_count)
## now we will create a bar chart
ggplot(status_data, aes(x = STATUS, y = total_count)) +
  geom_bar(stat = 'identity', color = 'red', fill = 'blue') +
  theme_classic()

## your turn!

## Q5: create a bar chart of the total sales by country with the following properties:
##     - x axis label: Product Line
##     - y axis label: Total Sales ($)
##     - title: Sales by Product Line
##     - Outline of bars: red
##     - Fill of bars: pink

## Will first create a grouped by and summarised dataset
status_data <- sales_data %>%
  group_by(PRODUCTLINE) %>%
  summarise(total_sales = sum(SALES))
## bar plot
ggplot(status_data, aes(x = PRODUCTLINE, y = total_sales)) +
  geom_bar(stat = 'identity', color = 'red', fill = 'pink') +
  labs(title = 'Sales by Product Line') +
  theme_classic() +
  coord_flip()

## Q6: using the ggplot2 cheat sheet, try constructing your own plot of choice!
## bar plot
ggplot(status_data, aes(x = PRODUCTLINE, y = total_sales)) +
  geom_bar(stat = 'identity', color = 'red', fill = 'yellow') +
  labs(title = 'Sales by Product Line') +
  theme_classic() +
  coord_polar()