# Load the libraries and the dataset
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Read the January laptop sales CSV (the path is relative to the working directory)
Laptopsales <- read_csv(file = "Datasets/LaptopSalesJan.csv")
## Rows: 7952 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Date, CustomerPostcode, StorePostcode, IntegratedWireless, Bundled...
## dbl (10): Configuration, RetailPrice, ScreenSize, BatteryLife, RAM, Processo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preprocessing: parse the timestamp, derive calendar fields from it, and
# convert the categorical columns to factors
LaptopsalesJan <- Laptopsales %>%
  mutate(
    DateTime = mdy_hm(Date), # Date is read as text in month/day/year hour:minute form
    DayofWeek = wday(DateTime, label = TRUE), # labelled day of the week
    DateVal = as.Date(DateTime), # calendar date without the time of day
    HDSize = as.factor(HDSize),
    ScreenSize = as.factor(ScreenSize),
    storeX = as.factor(storeX)
  )
# Q1. At what prices are the laptops selling? (Hint: use a histogram.) Use the chart to comment on the range of prices and the most common price range.
Ans: Between 400 and 500
# Q1 - histogram of retail prices with x-axis ticks every 50 units
LaptopsalesJan %>%
  ggplot(mapping = aes(x = RetailPrice)) +
  geom_histogram(bins = 50, fill = "#006EA1", color = "white") +
  scale_x_continuous(breaks = seq(from = 0, to = 1000, by = 50)) +
  labs(
    title = "Laptop Prices in January",
    x = "Price(Dollars)",
    y = "Units Sold"
  )
Q2 Part a. What were the dates when Acell had its minimum and maximum sales in terms of counts of laptops?
# Generate the summary data: count the laptops sold on each date, then keep
# only the dates with the lowest and the highest count (ties are kept).
# The question asks about sales counts, so rows are counted per date rather
# than filtered on RetailPrice.
LaptopsalesJanSummary <- LaptopsalesJan %>%
  count(DateVal, name = "UnitsSold") %>%
  filter(UnitsSold %in% range(UnitsSold))
# Plot the selected dates against the number of laptops sold on them
ggplot(data = LaptopsalesJanSummary, aes(x = DateVal, y = UnitsSold)) +
  geom_point() +
  labs(
    title = "Days with min or max sales counts",
    x = "Date",
    y = "Units Sold"
  )
Q2 Part b. Overall, which HDSize was most popular throughout January, and which HDSize was least popular of the lot? (Hint: First summarize the data by grouping it based on HDSize and DateVal. To display all the HDSize values, use the options group = HDSize, color = HDSize.)
Ans: HDSize 300 was the most popular; HDSize 40 was the least popular.
# Generate the summary data: number of laptops sold (column n) for every
# combination of HDSize and date. count() does the grouping itself and
# returns an ungrouped table, so no separate group_by() is needed.
LaptopsalesJanSummaryHDSize <- LaptopsalesJan %>%
  count(HDSize, DateVal)
# Generate the plot: one line per HDSize showing daily units sold.
# group/color are line aesthetics; on geom_col() the color mapping only
# tints the bar outlines, which makes the HDSize series hard to tell apart.
ggplot(
  data = LaptopsalesJanSummaryHDSize,
  aes(x = DateVal, y = n, group = HDSize, color = HDSize)
) +
  geom_line() +
  labs(
    title = "Daily Laptop Sales by HDSize",
    x = "Date",
    y = "Units Sold"
  )
Q3. Does retail price correlate with customer store distance? (Hint: use scatter plots.) Use the chart to additionally answer: are there any outliers in the retail price or customer store distance? Would you include customer store distance as a predictor of store price?
Ans: No trend
# Q3 - scatter plot of retail price against customer-to-store distance
LaptopsalesJan %>%
  ggplot(mapping = aes(x = CustomerStoreDistance, y = RetailPrice)) +
  geom_point()
Q4. Do the median values of the distance traveled by the customers vary based on the day of the week? Which are the days when you see maximum outliers (Hint - use Boxplot). Use the chart to additionally answer, are customers willing to travel more than 20K when the Applications are bundled.(Hint: Check for the outliers, and use subplots)
Ans: No trend across the days of the week. If applications are bundled, people will travel 20000.
# Q4 - box plots of customer-to-store distance per day of the week,
# drawn as side-by-side panels, one per BundledApplications value
LaptopsalesJan %>%
  ggplot(
    mapping = aes(
      x = DayofWeek,
      y = CustomerStoreDistance,
      fill = BundledApplications
    )
  ) +
  geom_boxplot() +
  facet_wrap(facets = ~BundledApplications, nrow = 1)
# Generate the summary data: bin every sale by customer-to-store distance,
# then count the sales per store postcode and distance bin.
LaptopsalesJanConfStore <- LaptopsalesJan %>%
  mutate(
    # cut() uses right-closed intervals by default, giving the bins
    # (-Inf, 5000], (5000, 10000] and (10000, Inf). The result is a factor
    # whose levels follow the bin order, so plots sort the bins by distance
    # rather than alphabetically, and a missing distance stays NA instead of
    # being counted in the ">10000" bin.
    StoreDistanceBin = cut(
      CustomerStoreDistance,
      breaks = c(-Inf, 5000, 10000, Inf),
      labels = c("Under 5000", "5001-10000", ">10000")
    )
  ) %>%
  # count() returns an ungrouped table, so no grouping leaks downstream
  count(StorePostcode, StoreDistanceBin, name = "Count")
# Generate the plot: bar chart of sale counts per distance bin. The summary
# has one row per store and bin, and geom_col() stacks those rows, so each
# bar's height is the total across stores. X-axis labels are rotated 90 degrees.
LaptopsalesJanConfStore %>%
  ggplot(mapping = aes(x = StoreDistanceBin, y = Count)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))
Once complete, knit the document as a Word document and upload it to Microsoft Teams. — End of Homework 2 —