library(dplyr)
library(ggplot2)
library(ggsci)
library(reshape2)
library(broom)
library(magrittr)
theme_set(theme_minimal())
set.seed(907)
Data retrievd from:
library(devtools) install_github(“seankross/lego”)
library(lego)
lego = legosets
First, determine themes to include
legoc = lego #cleaned versoin
legoc %<>% filter(Pieces > 1)
legoc %<>% filter(USD_MSRP > 1)
legoc$StarWars = F
legoc$StarWars[legoc$Theme == "Star Wars"] = T
Price Per brick by theme:
legoc %<>% mutate(PPB = USD_MSRP / Pieces)
#Drop sets that are more than 1$ per brick
legoc %<>% filter(PPB < 1)
#Drop Sets older than 1990
legoc %<>% filter(Year > 1989)
#drop sets with fewer than 25 parts
legoc %<>% filter(Pieces > 24)
#convert NA minifigs to zeros
legoc$Minifigures[is.na(legoc$Minifigures)] = 0
legoc %>%
ggplot(aes(x = reorder(Theme, PPB), y = PPB,
color = StarWars)) +
geom_boxplot()+
theme(axis.text.x = element_text(angle = 90, size = 5)) +
xlab("") + ylab("Price Per Brick") + scale_color_aaas() +
theme(legend.position = "none")
Let’s try regression?
lm1 = lm(USD_MSRP ~ Pieces + Minifigures + StarWars, data = legoc)
lm1 %>% summary()
##
## Call:
## lm(formula = USD_MSRP ~ Pieces + Minifigures + StarWars, data = legoc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -202.61 -7.25 -3.95 2.61 542.78
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5778917 0.4558441 14.430 <2e-16 ***
## Pieces 0.0837595 0.0008947 93.615 <2e-16 ***
## Minifigures 1.6466214 0.1359542 12.112 <2e-16 ***
## StarWarsTRUE -0.0168267 1.2331133 -0.014 0.989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.83 on 4189 degrees of freedom
## Multiple R-squared: 0.7178, Adjusted R-squared: 0.7176
## F-statistic: 3551 on 3 and 4189 DF, p-value: < 2.2e-16
lm1AIC = MASS::stepAIC(lm1)
## Start: AIC=25858.97
## USD_MSRP ~ Pieces + Minifigures + StarWars
##
## Df Sum of Sq RSS AIC
## - StarWars 1 0 1995568 25857
## <none> 1995568 25859
## - Minifigures 1 69881 2065449 26001
## - Pieces 1 4174873 6170441 30590
##
## Step: AIC=25856.97
## USD_MSRP ~ Pieces + Minifigures
##
## Df Sum of Sq RSS AIC
## <none> 1995568 25857
## - Minifigures 1 70738 2066306 26001
## - Pieces 1 4215712 6211280 30616
lm1AIC %>% summary()
##
## Call:
## lm(formula = USD_MSRP ~ Pieces + Minifigures, data = legoc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -202.60 -7.25 -3.94 2.61 542.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5772127 0.4530660 14.52 <2e-16 ***
## Pieces 0.0837583 0.0008903 94.08 <2e-16 ***
## Minifigures 1.6464152 0.1350953 12.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.82 on 4190 degrees of freedom
## Multiple R-squared: 0.7178, Adjusted R-squared: 0.7176
## F-statistic: 5328 on 2 and 4190 DF, p-value: < 2.2e-16
Now, let’s look at the ‘good deals’ in the Star Wars world?
legoc$pred = predict(lm1AIC)
legoc$UnderPrice = legoc$USD_MSRP - legoc$pred
SWD = legoc %>% filter(StarWars == TRUE)
subthemes = c("Episode I", "Episode II", "Episode III",
"Episode IV-VI",
"Episode V", "Episode VII",
"Rebels",
"The Old Republic",
"Yoda Chronicles")
SWD %>% filter(Subtheme %in% subthemes) %>%
ggplot(aes(x = USD_MSRP, y= pred, color = Subtheme)) +
geom_point() +
scale_y_continuous(labels = scales::dollar) +
scale_x_continuous(labels = scales::dollar)
hi = SWD %>% top_n(n = 5, wt = UnderPrice)
lo = SWD %>% top_n(n = 5, wt = -UnderPrice)
hi$Cost = "Over"
lo$Cost = "Under"
tot = rbind(hi, lo)
tot %>% ggplot(aes(x =
USD_MSRP, y = pred, label = Name, color = Cost)) +
geom_label(size = 2)