library(dplyr)
library(ggplot2)
library(ggsci)
library(reshape2)
library(broom)
library(magrittr)

theme_set(theme_minimal())

set.seed(907)

Data retrievd from:

library(devtools) install_github(“seankross/lego”)

library(lego)
lego = legosets

First, determine themes to include

legoc = lego #cleaned versoin
legoc %<>% filter(Pieces > 1)
legoc %<>% filter(USD_MSRP > 1)
legoc$StarWars = F
legoc$StarWars[legoc$Theme == "Star Wars"] = T

Price Per brick by theme:

legoc %<>% mutate(PPB = USD_MSRP / Pieces)

#Drop sets that are more than 1$ per brick

legoc %<>% filter(PPB < 1)

#Drop Sets older than 1990

legoc %<>% filter(Year > 1989)

#drop sets with fewer than 25 parts

legoc %<>% filter(Pieces > 24)

#convert NA minifigs to zeros

legoc$Minifigures[is.na(legoc$Minifigures)] = 0


legoc %>% 
  ggplot(aes(x = reorder(Theme, PPB), y = PPB, 
             color = StarWars)) + 
  geom_boxplot()+
  theme(axis.text.x = element_text(angle = 90, size = 5)) + 
  xlab("") + ylab("Price Per Brick") + scale_color_aaas() + 
  theme(legend.position = "none")

Let’s try regression?

lm1 = lm(USD_MSRP ~ Pieces + Minifigures + StarWars, data = legoc)

lm1 %>% summary()
## 
## Call:
## lm(formula = USD_MSRP ~ Pieces + Minifigures + StarWars, data = legoc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -202.61   -7.25   -3.95    2.61  542.78 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.5778917  0.4558441  14.430   <2e-16 ***
## Pieces        0.0837595  0.0008947  93.615   <2e-16 ***
## Minifigures   1.6466214  0.1359542  12.112   <2e-16 ***
## StarWarsTRUE -0.0168267  1.2331133  -0.014    0.989    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.83 on 4189 degrees of freedom
## Multiple R-squared:  0.7178, Adjusted R-squared:  0.7176 
## F-statistic:  3551 on 3 and 4189 DF,  p-value: < 2.2e-16
lm1AIC = MASS::stepAIC(lm1)
## Start:  AIC=25858.97
## USD_MSRP ~ Pieces + Minifigures + StarWars
## 
##               Df Sum of Sq     RSS   AIC
## - StarWars     1         0 1995568 25857
## <none>                     1995568 25859
## - Minifigures  1     69881 2065449 26001
## - Pieces       1   4174873 6170441 30590
## 
## Step:  AIC=25856.97
## USD_MSRP ~ Pieces + Minifigures
## 
##               Df Sum of Sq     RSS   AIC
## <none>                     1995568 25857
## - Minifigures  1     70738 2066306 26001
## - Pieces       1   4215712 6211280 30616
lm1AIC %>% summary()
## 
## Call:
## lm(formula = USD_MSRP ~ Pieces + Minifigures, data = legoc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -202.60   -7.25   -3.94    2.61  542.79 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.5772127  0.4530660   14.52   <2e-16 ***
## Pieces      0.0837583  0.0008903   94.08   <2e-16 ***
## Minifigures 1.6464152  0.1350953   12.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.82 on 4190 degrees of freedom
## Multiple R-squared:  0.7178, Adjusted R-squared:  0.7176 
## F-statistic:  5328 on 2 and 4190 DF,  p-value: < 2.2e-16

Now, let’s look at the ‘good deals’ in the Star Wars world?

legoc$pred = predict(lm1AIC)
legoc$UnderPrice = legoc$USD_MSRP - legoc$pred

SWD = legoc %>% filter(StarWars == TRUE)
subthemes = c("Episode I", "Episode II", "Episode III",
              "Episode IV-VI",
              "Episode V", "Episode VII",
              "Rebels",
              "The Old Republic",
              "Yoda Chronicles")

SWD %>% filter(Subtheme %in% subthemes) %>% 
  ggplot(aes(x = USD_MSRP, y= pred, color = Subtheme)) + 
  geom_point()  + 
  scale_y_continuous(labels = scales::dollar) + 
    scale_x_continuous(labels = scales::dollar)

hi = SWD %>% top_n(n = 5, wt = UnderPrice)
lo = SWD %>% top_n(n = 5, wt = -UnderPrice)

hi$Cost = "Over"
lo$Cost = "Under"

tot = rbind(hi, lo)

tot %>% ggplot(aes(x =
                     USD_MSRP, y = pred, label = Name, color = Cost)) + 
  geom_label(size = 2)