# Load the libraries and the dataset
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# Read the January laptop sales CSV into a tibble.
# The path is relative; adjust it if the file lives elsewhere.
Laptopsales <- read_csv(file = "Datasets/LaptopSalesJan.csv")

## Rows: 7952 Columns: 15

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Date, CustomerPostcode, StorePostcode, IntegratedWireless, Bundled...
## dbl (10): Configuration, RetailPrice, ScreenSize, BatteryLife, RAM, Processo...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Pre-processing steps

# Preprocessing: derive date/time helper columns and turn the discrete
# numeric columns into factors so ggplot treats them as categories.
LaptopsalesJan <- Laptopsales %>%
  mutate(
    # Parse the character Date column as month/day/year hour:minute
    DateTime = mdy_hm(Date),
    # Labelled day of the week (ordered factor)
    DayofWeek = wday(DateTime, label = TRUE),
    # Calendar date without the time-of-day part
    DateVal = as.Date(DateTime),
    # Convert these columns to factors in place
    across(c(HDSize, ScreenSize, storeX), as.factor)
  )

# Q1. At what prices are the laptops selling? (Hint: use a histogram.) Use the chart to comment on the range of prices and the most common price range.

Ans: Between 400 and 500

# Q1 - Histogram of retail prices: 50 bins, x-axis ticks every 50 units
LaptopsalesJan %>%
  ggplot(aes(x = RetailPrice)) +
  geom_histogram(bins = 50, fill = "#006EA1", colour = "white") +
  labs(
    title = "Laptop Prices in January",
    x = "Price(Dollars)",
    y = "Units Sold"
  ) +
  scale_x_continuous(breaks = seq(from = 0, to = 1000, by = 50))

Question 2 - Part a: What were the dates when Acell had its minimum sales and maximum sales in terms of counts of laptops?

# Q2a - Dates with the lowest and the highest number of laptops sold.
# The question is about sales *counts*, so first count the rows (one row per
# laptop sold) for every calendar date, then keep only the dates whose daily
# count equals the overall minimum or maximum. Ties are all retained.
LaptopsalesJanSummary <- LaptopsalesJan %>%
  count(DateVal, name = "UnitsSold") %>%
  filter(UnitsSold == max(UnitsSold) | UnitsSold == min(UnitsSold))

# Plot the extreme days: x is the date, y is the number of units sold that day
ggplot(data = LaptopsalesJanSummary, aes(x = DateVal, y = UnitsSold)) +
  geom_point() +
  labs(
    title = "Days with minimum or maximum unit sales",
    x = "Date",
    y = "Units Sold"
  )

Q2 Part b: Overall, which HDSize was most popular throughout January, and which HDSize was least popular of the lot? (Hint: First summarize the data by grouping it based on HDSize and DateVal. To display all the HDSize values, use the options group = HDSize, color = HDSize.)

Ans: 300 was popular, 40 was not

# Q2b - Daily unit sales for every hard-drive size.
# count() on both columns yields one row per HDSize/DateVal pair with the
# number of sales in `n`, and returns an ungrouped tibble, so no separate
# group_by() (and no later ungroup()) is needed.
LaptopsalesJanSummaryHDSize <- LaptopsalesJan %>%
  count(HDSize, DateVal)

# One line per HDSize. Lines are used instead of columns because the `color`
# aesthetic only tints the outline of a bar, and stacked bars make it hard to
# compare the individual sizes day by day.
ggplot(
  data = LaptopsalesJanSummaryHDSize,
  aes(x = DateVal, y = n, group = HDSize, color = HDSize)
) +
  geom_line() +
  labs(
    title = "Daily laptop sales by HDSize",
    x = "Date",
    y = "Units Sold"
  )

Q3. Does retail price correlate with customer store distance? (Hint: Use scatter plots.) Use the chart to additionally answer: are there any outliers in the retail price or customer store distance? Would you include customer store distance as a predictor of store price?

Ans: No trend

# Q3 - Scatter plot of retail price against customer-to-store distance
LaptopsalesJan %>%
  ggplot(aes(x = CustomerStoreDistance, y = RetailPrice)) +
  geom_point()

Q4. Do the median values of the distance traveled by the customers vary based on the day of the week? Which are the days when you see maximum outliers? (Hint: use a boxplot.) Use the chart to additionally answer: are customers willing to travel more than 20K when the applications are bundled? (Hint: Check for the outliers, and use subplots.)

Ans: No trend across days of the week. If applications are bundled, people will travel 20000.

# Q4 - Boxplots of travel distance per weekday, one panel per
# BundledApplications value (panels laid out side by side in a single row)
LaptopsalesJan %>%
  ggplot(
    aes(
      x = DayofWeek,
      y = CustomerStoreDistance,
      fill = BundledApplications
    )
  ) +
  geom_boxplot() +
  facet_wrap(vars(BundledApplications), nrow = 1)

Q5. Which store(s) attract customers who are willing to travel more? Create a new variable using the case_when command to bin the store distance variable into the following categories – "Under 5000", "5001-10000", ">10000". (Hint: Use a stacked bar chart.) Each store postcode corresponds to a specific store. You can assume the store postcode is the store ID.

Ans: Under 5000

# Q5 - Number of sales per store and travel-distance bin.
# Every bin has an explicit condition, so a missing CustomerStoreDistance
# (if any exist) stays NA instead of being silently labelled ">10000" by a
# catch-all branch. The bins are stored as a factor with levels in ascending
# distance order so the legend and stacking follow that order rather than
# alphabetical order.
LaptopsalesJanConfStore <- LaptopsalesJan %>%
  mutate(
    StoreDistanceBin = case_when(
      CustomerStoreDistance <= 5000 ~ "Under 5000",
      CustomerStoreDistance <= 10000 ~ "5001-10000",
      CustomerStoreDistance > 10000 ~ ">10000"
    ),
    StoreDistanceBin = factor(
      StoreDistanceBin,
      levels = c("Under 5000", "5001-10000", ">10000")
    )
  ) %>%
  group_by(StorePostcode, StoreDistanceBin) %>%
  # .groups = "drop" returns an ungrouped tibble, so summarise() no longer
  # prints its "has grouped output" message
  summarize(Count = n(), .groups = "drop")

# Stacked bar chart: one bar per store (StorePostcode is used as the store
# ID), split by distance bin, so the stores can be compared with each other.
ggplot(
  LaptopsalesJanConfStore,
  aes(x = StorePostcode, y = Count, fill = StoreDistanceBin)
) +
  geom_col() +
  labs(
    title = "Sales per store by customer travel distance",
    x = "Store Postcode",
    y = "Units Sold",
    fill = "Distance bin"
  ) +
  # Rotate the postcode labels so they do not overlap
  theme(axis.text.x = element_text(angle = 90))

Once complete, knit the document as a Word document and upload it to Microsoft Teams. — End of Homework 2 —