*Limiting this analysis to Ford makes and models
dir()
## [1] "carsEventFactQuery.csv" "carsRecordFactQuery.csv"
## [3] "dma_lat_long.csv" "dma_name_fix.csv"
## [5] "figure" "Ford-VDPs.csv"
## [7] "FordData.csv" "inventory_count_DMA.html"
## [9] "inventory_count_DMA.md" "inventory_count_DMA.Rmd"
## [11] "make_model_count_DMA.html" "make_model_count_DMA.md"
## [13] "make_model_count_DMA.Rmd" "make_model_count.html"
## [15] "make_model_count.md" "make_model_count.Rmd"
## [17] "redacted_longlat.csv" "time-VDPs.csv"
## [19] "usersByLocationQuery.csv" "Vast Data.Rproj"
## [21] "vdpsByLocationQuery.csv" "vdpsByMakeModelLocationQuery.csv"
cars = read.csv("vdpsByMakeModelLocationQuery.csv", header=TRUE, sep=",")
dim(cars)
## [1] 53812 6
str(cars)
## 'data.frame': 53812 obs. of 6 variables:
## $ make : Factor w/ 62 levels "Acura","Alfa Romeo",..: 20 24 30 8 24 30 30 30 20 45 ...
## $ model : Factor w/ 810 levels "1 Series","100",..: 283 83 775 15 174 342 775 775 283 92 ...
## $ dma_name: Factor w/ 198 levels "ABILENE-SWEETWATER",..: 46 123 123 123 123 123 186 46 123 123 ...
## $ vdpCount: Factor w/ 1125 levels " DC (HAGRSTWN)",..: 1038 980 954 902 834 782 1 765 759 702 ...
## $ X : Factor w/ 389 levels ""," AL)","1",..: 1 1 1 1 1 1 301 1 1 1 ...
## $ X.1 : int NA NA NA NA NA NA NA NA NA NA ...
library(sqldf)
get_ford = sqldf("SELECT * FROM cars WHERE make = 'Ford'")
head(get_ford)
## make model dma_name vdpCount X X.1
## 1 Ford F-150 DALLAS-FT. WORTH 8574 NA
## 2 Ford F-150 NEW YORK 5332 NA
## 3 Ford Mustang LOS ANGELES 4204 NA
## 4 Ford F-150 HOUSTON 4099 NA
## 5 Ford F-150 LOS ANGELES 3908 NA
## 6 Ford Mustang NEW YORK 3803 NA
#create csv to clean data in excel
#write.csv(get_ford, file = "FordData.csv",row.names=FALSE)
#clean data
clean = read.csv("FordData.csv", header=TRUE, sep=",")
library(dplyr)
clean = select(clean, -c(X, X.1))
#finding NA rows, if any
clean[!complete.cases(clean),]
## [1] make model dma_name vdpCount
## <0 rows> (or 0-length row.names)
head(clean)
## make model dma_name vdpCount
## 1 Ford Thunderbird ST. LOUIS 1
## 2 Ford Expedition BLUEFIELD-BECKLEY-OAK HILL 1
## 3 Ford Freestyle BILOXI-GULFPORT 1
## 4 Ford Windstar FT. WAYNE 1
## 5 Ford Freestar JONESBORO 1
## 6 Ford Econoline SAVANNAH 1
tail(clean)
## make model dma_name vdpCount
## 4192 Ford Flex TRI-CITIES 12
## 4193 Ford Econoline TRI-CITIES 10
## 4194 Ford Crown Victoria TRI-CITIES 4
## 4195 Ford Expedition TRI-CITIES 4
## 4196 Ford Expedition EL TRI-CITIES 2
## 4197 Ford Freestar TRI-CITIES 1
newdata = clean[order(-clean$vdpCount),]
head(newdata)
## make model dma_name vdpCount
## 3855 Ford F-150 DALLAS-FT. WORTH 8574
## 3854 Ford F-150 NEW YORK 5332
## 3853 Ford Mustang LOS ANGELES 4204
## 3852 Ford F-150 HOUSTON 4099
## 3851 Ford F-150 LOS ANGELES 3908
## 3850 Ford Mustang NEW YORK 3803
attach(newdata)
#histograms in plot matrix: x=model, y=vdpCount
#contingency tables
#automatic function for detecting outliers
#Total number of Fords in the market in January
ford_total = sqldf("SELECT SUM(vdpCount) FROM newdata")
ford_total
## SUM(vdpCount)
## 1 441178
#calculate sum for each distinct model and plot
model_totals = sqldf("SELECT model, SUM(vdpCount) as total FROM newdata GROUP BY model ORDER BY SUM(vdpCount) DESC")
model_totals
## model total
## 1 F-150 106695
## 2 Mustang 58457
## 3 F-250 37050
## 4 Explorer 30520
## 5 Escape 30440
## 6 Focus 23796
## 7 Ranger 23556
## 8 Fusion 19436
## 9 Edge 16708
## 10 F-350 16578
## 11 Taurus 11285
## 12 Expedition 9797
## 13 Econoline 8218
## 14 Explorer Sport Trac 6720
## 15 Flex 5371
## 16 Fiesta 5185
## 17 Transit Connect 3870
## 18 F-450 3159
## 19 Transit 2996
## 20 Bronco 2920
## 21 Expedition EL 2564
## 22 Excursion 2488
## 23 Crown Victoria 2118
## 24 Thunderbird 1872
## 25 Windstar 1386
## 26 Escort 1173
## 27 Five Hundred 1156
## 28 Freestyle 1102
## 29 C-Max 943
## 30 Freestar 845
## 31 F-650 628
## 32 GT 466
## 33 Tempo 438
## 34 Taurus X 367
## 35 F-550 274
## 36 Contour 261
## 37 Bronco II 118
## 38 Probe 67
## 39 Aerostar 57
## 40 LTD 48
## 41 F-750 21
## 42 F-Super Duty 15
## 43 F-800 10
## 44 F-700 4
library(ggplot2)
#SETTING ENTIRE "THEME"/GRAPH SIZE
theme_set(theme_grey(base_size=18))
#flip coordinate axis, reorder from most wins to least
g = ggplot(model_totals, aes(x = reorder(model,total), y=total, fill=total))
g + geom_bar(stat = "identity") + scale_fill_continuous(low="light blue", high="dark green", limits=c(0,108000)) + coord_flip() + ggtitle("Total Inventory in Market by Model") + xlab("Models") + ylab("Total in Market") + theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica"), axis.title=element_text(face="bold", size=18, color="black"))
head(newdata)
## make model dma_name vdpCount
## 3855 Ford F-150 DALLAS-FT. WORTH 8574
## 3854 Ford F-150 NEW YORK 5332
## 3853 Ford Mustang LOS ANGELES 4204
## 3852 Ford F-150 HOUSTON 4099
## 3851 Ford F-150 LOS ANGELES 3908
## 3850 Ford Mustang NEW YORK 3803
#fix(newdata)
#NY model count
ny_mod = sqldf("SELECT model, dma_name, vdpCount FROM newdata WHERE dma_name = 'NEW YORK'")
#ny_mod
#PHIL model count
ph_mod = sqldf("SELECT model, dma_name, vdpCount FROM newdata WHERE dma_name = 'PHILADELPHIA'")
#ph_mod
#boston model count
bos_mod = sqldf("SELECT model, dma_name, vdpCount FROM newdata WHERE dma_name = 'BOSTON (MANCHESTER)'")
#bos_mod
#dc model count
dc_mod = sqldf("SELECT model, dma_name, vdpCount FROM newdata WHERE dma_name = 'WASHINGTON'")
#dc_mod
#NY
ny = ggplot(ny_mod[1:10,], aes(x = reorder(model,vdpCount), y=vdpCount, fill=vdpCount))
plot1 = ny + geom_bar(stat = "identity") + scale_fill_continuous(low="light blue", high="dark blue") + coord_flip() + ggtitle("New York") + xlab("Models") + ylab("Inventory Totals") + theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica"), axis.title=element_text(face="bold", size=18, color="black")) + theme(legend.position="none")
#PHILADELPHIA
ph = ggplot(ph_mod[1:10,], aes(x = reorder(model,vdpCount), y=vdpCount, fill=vdpCount))
plot2 = ph + geom_bar(stat = "identity") + scale_fill_continuous(low="violet", high="violetred") + coord_flip() + ggtitle("Philadelphia") + xlab("Models") + ylab("Inventory Totals") + theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica"), axis.title=element_text(face="bold", size=18, color="black")) + theme(legend.position="none")
#BOSTON
bos = ggplot(bos_mod[1:10,], aes(x = reorder(model,vdpCount), y=vdpCount, fill=vdpCount))
plot3 = bos + geom_bar(stat = "identity")+ scale_fill_continuous(low="brown1", high="brown4") + coord_flip() + ggtitle("Boston") + xlab("Models") + ylab("Inventory Totals") + theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica"), axis.title=element_text(face="bold", size=18, color="black")) + theme(legend.position="none")
#WASHINGTON DC
dc = ggplot(dc_mod[1:10,], aes(x = reorder(model,vdpCount), y=vdpCount, fill=vdpCount))
plot4 = dc + geom_bar(stat = "identity")+ scale_fill_continuous(low="gold", high="yellowgreen") + coord_flip() + ggtitle("Washington DC") + xlab("Models") + ylab("Inventory Totals") + theme(plot.title = element_text(size = 22, face = "bold", family = "Helvetica"), axis.title=element_text(face="bold", size=18, color="black")) + theme(legend.position="none")
require(gridExtra)
grid.arrange(plot1, plot2, plot3, plot4, ncol=2)
*Use Mustang inventory count in NY vs. Boston. NY = ~4K, Boston = ~500. It would be very hard to believe that demand for Ford Mustangs is THAT heterogeneous across NY and Boston markets…
Insights
Much like DMA size, there is also a large disparity in the total market's inventory count when broken down by model. Thus, we should prioritize shipping recommendations according to the inventory count for each model. The logic behind this is that the larger the inventory count for each model, the greater the financial implications of selling or auctioning off these cars at a less than optimal margin if demand is not sufficient at the model level.
Moving forward we will prioritize shipping recommendations based on the size of the DMA and a measure of market inefficiency at the model level. Again, the financial implications of market inefficiency scale with the size of the market. Thus, our approach will quantify market inefficiency at the model level by assigning a ratio of supply/demand to each model in each DMA. If demand for a particular model is insufficient in one DMA, we will recommend that a certain number of vehicles of that model should be shipped to the largest DMA that most closely approixmates the inverse of that ratio (i.e., demand exceeds supply). Luckily, by focusing on these larger DMAs (markets), we will mostly likely not ecounter any substantial differences in the average price for a given model.
These recommendations will be constrained by the cost of shipping “x” number of vehicles per mile. Thus, when deciding on where to ship a batch of cars, we will be constrained by a geographic radius, which is yet to be determined. A sample visualization of geographic constraints is provided below
#got city names corresponding to long, lat, dma_code data w/ Google Maps Package
#only 2500 API requests allowed per day - was limited by this
library(ggmap)
#new data to visualize geographic regions (constrained by shipping costs)
geo = read.csv("dma_lat_long.csv", header=TRUE, sep=",")
geo = select(geo, -vdpCount)
head(geo)
## X make model dma_name lon lat
## 1 1 Ford Crown Victoria ABILENE-SWEETWATER -100.12386 32.61875
## 2 2 Ford Fiesta ALBANY -73.75623 42.65258
## 3 3 Ford F-450 ALBANY-SCHENECTADY-TROY -73.87327 42.79180
## 4 4 Ford Excursion ALBUQUERQUE-SANTA FE -106.65618 35.07569
## 5 5 Ford Expedition EL ALEXANDRIA -77.04692 38.80484
## 6 6 Ford F-450 AMARILLO -101.83130 35.22200
dim(geo)
## [1] 195 6
#search for DMAs within a given radius, assign factor to each region
usa = map_data("state")
bc_bbox <- make_bbox(lat = lat, lon = long, data = usa)
bc_big <- get_map(location = bc_bbox, source = "google", maptype = "terrain")
order_lat_lon = geo[order(geo$lon, geo$lat), ]
dim(order_lat_lon)
## [1] 195 6
viz = order_lat_lon[153:195,]
ggmap(bc_big) + geom_point(data = viz, mapping = aes(x = lon, y = lat), alpha=0.5, size=6) + ggtitle("Visualizing Shipping Cost Constraints")