Visualizing Inventory by Make, Model Across DMAs

*Limiting this analysis to Ford makes and models

dir()
##  [1] "carsEventFactQuery.csv"           "carsRecordFactQuery.csv"         
##  [3] "dma_lat_long.csv"                 "dma_name_fix.csv"                
##  [5] "figure"                           "Ford-VDPs.csv"                   
##  [7] "FordData.csv"                     "inventory_count_DMA.html"        
##  [9] "inventory_count_DMA.md"           "inventory_count_DMA.Rmd"         
## [11] "make_model_count_DMA.html"        "make_model_count_DMA.md"         
## [13] "make_model_count_DMA.Rmd"         "make_model_count.html"           
## [15] "make_model_count.md"              "make_model_count.Rmd"            
## [17] "redacted_longlat.csv"             "time-VDPs.csv"                   
## [19] "usersByLocationQuery.csv"         "Vast Data.Rproj"                 
## [21] "vdpsByLocationQuery.csv"          "vdpsByMakeModelLocationQuery.csv"
cars = read.csv("vdpsByMakeModelLocationQuery.csv", header=TRUE, sep=",")
dim(cars)
## [1] 53812     6
str(cars)
## 'data.frame':    53812 obs. of  6 variables:
##  $ make    : Factor w/ 62 levels "Acura","Alfa Romeo",..: 20 24 30 8 24 30 30 30 20 45 ...
##  $ model   : Factor w/ 810 levels "1 Series","100",..: 283 83 775 15 174 342 775 775 283 92 ...
##  $ dma_name: Factor w/ 198 levels "ABILENE-SWEETWATER",..: 46 123 123 123 123 123 186 46 123 123 ...
##  $ vdpCount: Factor w/ 1125 levels " DC (HAGRSTWN)",..: 1038 980 954 902 834 782 1 765 759 702 ...
##  $ X       : Factor w/ 389 levels ""," AL)","1",..: 1 1 1 1 1 1 301 1 1 1 ...
##  $ X.1     : int  NA NA NA NA NA NA NA NA NA NA ...
library(sqldf)
get_ford = sqldf("SELECT * FROM cars WHERE make = 'Ford'")
head(get_ford)
##   make   model         dma_name vdpCount X X.1
## 1 Ford   F-150 DALLAS-FT. WORTH     8574    NA
## 2 Ford   F-150         NEW YORK     5332    NA
## 3 Ford Mustang      LOS ANGELES     4204    NA
## 4 Ford   F-150          HOUSTON     4099    NA
## 5 Ford   F-150      LOS ANGELES     3908    NA
## 6 Ford Mustang         NEW YORK     3803    NA
#create csv to clean data in excel
#write.csv(get_ford, file = "FordData.csv",row.names=FALSE)

#clean data
clean = read.csv("FordData.csv", header=TRUE, sep=",")

library(dplyr)
clean = select(clean, -c(X, X.1))

#finding NA rows, if any
clean[!complete.cases(clean),]
## [1] make     model    dma_name vdpCount
## <0 rows> (or 0-length row.names)
head(clean)
##   make       model                   dma_name vdpCount
## 1 Ford Thunderbird                  ST. LOUIS        1
## 2 Ford  Expedition BLUEFIELD-BECKLEY-OAK HILL        1
## 3 Ford   Freestyle            BILOXI-GULFPORT        1
## 4 Ford    Windstar                  FT. WAYNE        1
## 5 Ford    Freestar                  JONESBORO        1
## 6 Ford   Econoline                   SAVANNAH        1
tail(clean)
##      make          model   dma_name vdpCount
## 4192 Ford           Flex TRI-CITIES       12
## 4193 Ford      Econoline TRI-CITIES       10
## 4194 Ford Crown Victoria TRI-CITIES        4
## 4195 Ford     Expedition TRI-CITIES        4
## 4196 Ford  Expedition EL TRI-CITIES        2
## 4197 Ford       Freestar TRI-CITIES        1
newdata = clean[order(-clean$vdpCount),]
head(newdata)
##      make   model         dma_name vdpCount
## 3855 Ford   F-150 DALLAS-FT. WORTH     8574
## 3854 Ford   F-150         NEW YORK     5332
## 3853 Ford Mustang      LOS ANGELES     4204
## 3852 Ford   F-150          HOUSTON     4099
## 3851 Ford   F-150      LOS ANGELES     3908
## 3850 Ford Mustang         NEW YORK     3803

Looking at inventory totals in the entire market:

attach(newdata)

#histograms in plot matrix: x=model, y=vdpCount
#contingency tables
#automatic function for detecting outliers

#Total number of Fords in the market in January
ford_total = sqldf("SELECT SUM(vdpCount) FROM newdata")
ford_total
##   SUM(vdpCount)
## 1        441178
#calculate sum for each distinct model and plot
model_totals = sqldf("SELECT model, SUM(vdpCount) as total FROM newdata GROUP BY model ORDER BY SUM(vdpCount) DESC")
model_totals
##                  model  total
## 1                F-150 106695
## 2              Mustang  58457
## 3                F-250  37050
## 4             Explorer  30520
## 5               Escape  30440
## 6                Focus  23796
## 7               Ranger  23556
## 8               Fusion  19436
## 9                 Edge  16708
## 10               F-350  16578
## 11              Taurus  11285
## 12          Expedition   9797
## 13           Econoline   8218
## 14 Explorer Sport Trac   6720
## 15                Flex   5371
## 16              Fiesta   5185
## 17     Transit Connect   3870
## 18               F-450   3159
## 19             Transit   2996
## 20              Bronco   2920
## 21       Expedition EL   2564
## 22           Excursion   2488
## 23      Crown Victoria   2118
## 24         Thunderbird   1872
## 25            Windstar   1386
## 26              Escort   1173
## 27        Five Hundred   1156
## 28           Freestyle   1102
## 29               C-Max    943
## 30            Freestar    845
## 31               F-650    628
## 32                  GT    466
## 33               Tempo    438
## 34            Taurus X    367
## 35               F-550    274
## 36             Contour    261
## 37           Bronco II    118
## 38               Probe     67
## 39            Aerostar     57
## 40                 LTD     48
## 41               F-750     21
## 42        F-Super Duty     15
## 43               F-800     10
## 44               F-700      4
library(ggplot2)

#SETTING ENTIRE "THEME"/GRAPH SIZE
theme_set(theme_grey(base_size=18))


#flip coordinate axis, reorder from most wins to least
g = ggplot(model_totals, aes(x = reorder(model,total), y=total, fill=total))
g + geom_bar(stat = "identity") + scale_fill_continuous(low="light blue", high="dark green", limits=c(0,108000)) + coord_flip() + ggtitle("Total Inventory in Market by Model") + xlab("Models") + ylab("Total in Market") + theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica"), axis.title=element_text(face="bold", size=18, color="black"))

plot of chunk unnamed-chunk-2


Visualizing Distribution of Models by DMA

head(newdata)
##      make   model         dma_name vdpCount
## 3855 Ford   F-150 DALLAS-FT. WORTH     8574
## 3854 Ford   F-150         NEW YORK     5332
## 3853 Ford Mustang      LOS ANGELES     4204
## 3852 Ford   F-150          HOUSTON     4099
## 3851 Ford   F-150      LOS ANGELES     3908
## 3850 Ford Mustang         NEW YORK     3803
#fix(newdata)

#NY model count
ny_mod = sqldf("SELECT model, dma_name, vdpCount FROM newdata WHERE dma_name = 'NEW YORK'")
#ny_mod

#PHIL model count
ph_mod = sqldf("SELECT model, dma_name, vdpCount FROM newdata WHERE dma_name = 'PHILADELPHIA'")
#ph_mod

#boston model count
bos_mod = sqldf("SELECT model, dma_name, vdpCount FROM newdata WHERE dma_name = 'BOSTON (MANCHESTER)'")
#bos_mod

#dc model count
dc_mod = sqldf("SELECT model, dma_name, vdpCount FROM newdata WHERE dma_name = 'WASHINGTON'")
#dc_mod


#NY
ny = ggplot(ny_mod[1:10,], aes(x = reorder(model,vdpCount), y=vdpCount, fill=vdpCount))
plot1 = ny + geom_bar(stat = "identity") + scale_fill_continuous(low="light blue", high="dark blue") + coord_flip() + ggtitle("New York") + xlab("Models") + ylab("Inventory Totals") + theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica"), axis.title=element_text(face="bold", size=18, color="black")) + theme(legend.position="none")

#PHILADELPHIA
ph = ggplot(ph_mod[1:10,], aes(x = reorder(model,vdpCount), y=vdpCount, fill=vdpCount))
plot2 = ph + geom_bar(stat = "identity") + scale_fill_continuous(low="violet", high="violetred") + coord_flip() + ggtitle("Philadelphia") + xlab("Models") + ylab("Inventory Totals") + theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica"), axis.title=element_text(face="bold", size=18, color="black")) + theme(legend.position="none")

#BOSTON
bos = ggplot(bos_mod[1:10,], aes(x = reorder(model,vdpCount), y=vdpCount, fill=vdpCount))
plot3 = bos + geom_bar(stat = "identity")+ scale_fill_continuous(low="brown1", high="brown4") + coord_flip() + ggtitle("Boston") + xlab("Models") + ylab("Inventory Totals") + theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica"), axis.title=element_text(face="bold", size=18, color="black")) + theme(legend.position="none")

#WASHINGTON DC
dc = ggplot(dc_mod[1:10,], aes(x = reorder(model,vdpCount), y=vdpCount, fill=vdpCount))
plot4 = dc + geom_bar(stat = "identity")+ scale_fill_continuous(low="gold", high="yellowgreen") + coord_flip() + ggtitle("Washington DC") + xlab("Models") + ylab("Inventory Totals") + theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica"), axis.title=element_text(face="bold", size=18, color="black")) + theme(legend.position="none")

require(gridExtra)
grid.arrange(plot1, plot2, plot3, plot4, ncol=2)

plot of chunk unnamed-chunk-3


*Use Mustang inventory count in NY vs. Boston. NY = ~4K, Boston = ~500. It would be very hard to believe that demand for Ford Mustangs is THAT heterogeneous across NY and Boston markets…


Insights

Much like DMA size, there is also a large disparity in the total market's inventory count when broken down by model. Thus, we should prioritize shipping recommendations according to the inventory count for each model. The logic behind this is that the larger the inventory count for each model, the greater the financial implications of selling or auctioning off these cars at a less than optimal margin if demand is not sufficient at the model level.


Approach Moving Forward: Combining DMA Size & Inventory Count by Model Information

Moving forward we will prioritize shipping recommendations based on the size of the DMA and a measure of market inefficiency at the model level. Again, the financial implications of market inefficiency scale with the size of the market. Thus, our approach will quantify market inefficiency at the model level by assigning a ratio of supply/demand to each model in each DMA. If demand for a particular model is insufficient in one DMA, we will recommend that a certain number of vehicles of that model should be shipped to the largest DMA that most closely approixmates the inverse of that ratio (i.e., demand exceeds supply). Luckily, by focusing on these larger DMAs (markets), we will mostly likely not ecounter any substantial differences in the average price for a given model.

Constraints

These recommendations will be constrained by the cost of shipping “x” number of vehicles per mile. Thus, when deciding on where to ship a batch of cars, we will be constrained by a geographic radius, which is yet to be determined. A sample visualization of geographic constraints is provided below

#got city names corresponding to long, lat, dma_code data w/ Google Maps Package
#only 2500 API requests allowed per day - was limited by this
library(ggmap)

#new data to visualize geographic regions (constrained by shipping costs)
geo = read.csv("dma_lat_long.csv", header=TRUE, sep=",")
geo = select(geo, -vdpCount)
head(geo)
##   X make          model                dma_name        lon      lat
## 1 1 Ford Crown Victoria      ABILENE-SWEETWATER -100.12386 32.61875
## 2 2 Ford         Fiesta                  ALBANY  -73.75623 42.65258
## 3 3 Ford          F-450 ALBANY-SCHENECTADY-TROY  -73.87327 42.79180
## 4 4 Ford      Excursion    ALBUQUERQUE-SANTA FE -106.65618 35.07569
## 5 5 Ford  Expedition EL              ALEXANDRIA  -77.04692 38.80484
## 6 6 Ford          F-450                AMARILLO -101.83130 35.22200
dim(geo)
## [1] 195   6
#search for DMAs within a given radius, assign factor to each region
usa = map_data("state")

bc_bbox <- make_bbox(lat = lat, lon = long, data = usa)
bc_big <- get_map(location = bc_bbox, source = "google", maptype = "terrain")

order_lat_lon = geo[order(geo$lon, geo$lat), ]
dim(order_lat_lon)
## [1] 195   6
viz = order_lat_lon[153:195,]

ggmap(bc_big) + geom_point(data = viz, mapping = aes(x = lon, y = lat), alpha=0.5, size=6) + ggtitle("Visualizing Shipping Cost Constraints")

plot of chunk unnamed-chunk-4