http://rpubs.com/pgbpro20/supplyVsDemand
Two-Part Assignment. Part One: Supply/Demand Analysis of an eCommerce Site
Part Two: Fun with Modeling
rm(list = ls()) #remove all objects from the environment
gc() #run garbage collection
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 362023 19.4 592000 31.7 460000 24.6
## Vcells 559830 4.3 1308461 10.0 786431 6.0
#The three lines below detach any loaded libraries not listed here that might cause conflicts.
#pkgs <- names(sessionInfo()$otherPkgs)
#pkgs <- paste('package:', pkgs, sep = "")
#lapply(pkgs, detach, character.only = TRUE, unload = TRUE, force = TRUE)
setwd("/home/rstudio/Dropbox/Assignment 1")
library('plyr') #plyr must be loaded before dplyr
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) #plotting
library(ggmap) #mapping
library(zipcode) #geocoding
library(gdata) #reading in xlsx
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
members <- read.csv(protectedUrl) #protectedUrl holds a private member-data URL, defined elsewhere and not shown
colnames(members)
## [1] "id" "member_type"
## [3] "username" "password"
## [5] "first_name" "zip_code"
## [7] "latitude" "longitude"
## [9] "address" "country"
## [11] "state" "city"
## [13] "sex" "email"
## [15] "domain" "registration_on"
## [17] "dated_on" "set_email_time"
## [19] "subscribe_important" "subscribe_partners"
## [21] "is_valid" "is_blocked"
## [23] "update_msg" "is_payment"
## [25] "certify_email_status" "admin_certify_status"
## [27] "remind_email" "is_logedin"
## [29] "is_new_member" "is_deactivated"
## [31] "certification_email_count" "num_emails_deactivate"
## [33] "last_email_deactivate"
dim(members)
## [1] 2115 33
cleansedMembersColumns <- select(members, id, member_type, username, zip_code,
latitude, longitude, state, sex, dated_on, is_valid, is_deactivated)
colnames(cleansedMembersColumns)
## [1] "id" "member_type" "username" "zip_code"
## [5] "latitude" "longitude" "state" "sex"
## [9] "dated_on" "is_valid" "is_deactivated"
dim(cleansedMembersColumns)
## [1] 2115 11
cleansedMembers <- filter(cleansedMembersColumns, is_valid == 1 & is_deactivated ==
0)
dim(cleansedMembers)
## [1] 1304 11
michiganMap <- get_map(location = "michigan", maptype = "roadmap", zoom = 7)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=michigan&zoom=7&size=640x640&scale=2&maptype=roadmap&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=michigan&sensor=false
ggmap(michiganMap, extent = "device") + geom_point(aes(x = longitude, y = latitude),
colour = "red", alpha = 0.1, size = 2, data = cleansedMembers)
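Overplotting at alpha = 0.1 only hints at density. A 2D kernel density overlay, in the style of the heatmap tutorial cited at the end, is one alternative; a sketch (not run here) on the same map and data:
# density overlay of member locations instead of raw points
ggmap(michiganMap, extent = "device") +
    stat_density2d(aes(x = longitude, y = latitude, fill = ..level.., alpha = ..level..),
        geom = "polygon", bins = 16, data = cleansedMembers) +
    scale_fill_gradient(low = "yellow", high = "red") +
    scale_alpha(range = c(0, 0.4), guide = FALSE)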
Already we have some interesting results. The concentration in the Grand Rapids area is entirely expected, as there has been one in-person campaign and several digital ad campaigns there.
However, the cluster in the Detroit/Ann Arbor area is unexpected: there have been no such campaigns there, so the growth in this region is entirely due to virality. Further study is needed; we should probably normalize these results against the statewide population to find our next “high leverage” area, where returns on advertising investment will be high.
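As a rough illustration of that normalization, assume a hypothetical lookup table popByZip with columns zip_code and population (e.g. built from census data; it is not part of this dataset). Members per 10,000 residents could then be computed like so:
# hypothetical sketch: popByZip is an assumed external census table, not shown here
membersByZip <- count(cleansedMembers, zip_code)
names(membersByZip) <- c("zip_code", "members")
membersByZip <- left_join(membersByZip, popByZip, by = "zip_code")
membersByZip$membersPer10k <- 10000 * membersByZip$members/membersByZip$population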
Now we want to target specific cities, which calls for some basic demand calculations.
Demand Calculation Basics
# I'm only interested in the state of Michigan at this point
stateOfInterest <- "Michigan"
stateOfInterestShort <- "MI"
data("zipcode")
# zip codes come in as characters but are stored in the members table as
# integers, so... wrangling. Note that as.integer drops leading zeros, which
# is safe for Michigan zips (48xxx/49xxx) but wouldn't be for, say, New England
zipcode$zip <- as.integer(zipcode$zip)
# the full zip code dataframe is causing slowdowns on my machine, so I
# filter it here
zipcode <- filter(zipcode, state == stateOfInterestShort)
zipcode <- select(zipcode, zip, city)
# add city name to every member
cleansedMembers <- left_join(cleansedMembers, zipcode, by = c(zip_code = "zip"))
suppliers <- filter(cleansedMembers, member_type == "Caregiver" & state == stateOfInterest)
consumers <- filter(cleansedMembers, member_type == "Patient" & state == stateOfInterest)
# All we want is a df that has city, suppliers (integer), consumers
# (integer)
countSuppliersByCity <- count(suppliers, city)
names(countSuppliersByCity) <- c("city", "Suppliers")
countConsumersByCity <- count(consumers, city)
names(countConsumersByCity) <- c("city", "Consumers")
countMembersByCity <- merge(countSuppliersByCity, countConsumersByCity, by.x = "city",
by.y = "city", all.x = TRUE, all.y = TRUE)
# replace NAs (cities missing from one side of the merge) with 0
countMembersByCity[is.na(countMembersByCity)] <- 0
# Now we compute our Competition Score: consumers per supplier (Inf when a
# city has consumers but no suppliers)
countMembersByCity$competitionScore <- countMembersByCity$Consumers/countMembersByCity$Suppliers
# let's add in a total count
countMembersByCity$totalCount <- countMembersByCity$Consumers + countMembersByCity$Suppliers
# Now we need to fold in our interpretation of the competition score. I know
# some nice ways to do this in PHP, but it's ugly here (see the case_when sketch below)
countMembersByCity <- within(countMembersByCity, {
competitionRank <- ifelse(totalCount < 8, "Not Enough Info", ifelse(competitionScore <=
3, "Not Enough Consumers", ifelse(competitionScore > 3 & competitionScore <
7, "Perfect", ifelse(competitionScore >= 7, "Not Enough Suppliers",
"error"))))
})
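The nested ifelse chain works, but if your dplyr is 0.5.0 or newer, case_when expresses the same scoring more readably. An equivalent sketch (not run here) with the same thresholds; case_when tries conditions in order, so the totalCount check wins first, just as above:
countMembersByCityAlt <- mutate(countMembersByCity, competitionRank = case_when(
    totalCount < 8 ~ "Not Enough Info",
    competitionScore <= 3 ~ "Not Enough Consumers",
    competitionScore < 7 ~ "Perfect",
    TRUE ~ "Not Enough Suppliers"))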
# Simplify results for display
countMembersByCity <- select(countMembersByCity, city, competitionScore, totalCount,
competitionRank)
##### A list of cities: Not Enough Consumers #####
filter(countMembersByCity, competitionRank == "Not Enough Consumers")
## city competitionScore totalCount competitionRank
## 1 Alpena 1.666667 8 Not Enough Consumers
## 2 Battle Creek 2.666667 11 Not Enough Consumers
## 3 Big Rapids 1.666667 8 Not Enough Consumers
## 4 Cadillac 2.600000 18 Not Enough Consumers
## 5 Detroit 1.000000 10 Not Enough Consumers
## 6 Howard City 0.600000 8 Not Enough Consumers
## 7 Hudsonville 3.000000 8 Not Enough Consumers
## 8 Iron Mountain 1.666667 8 Not Enough Consumers
## 9 Lansing 1.142857 15 Not Enough Consumers
## 10 Middleville 0.800000 9 Not Enough Consumers
## 11 Niles 1.500000 20 Not Enough Consumers
## 12 Portage 1.500000 10 Not Enough Consumers
## 13 Traverse City 3.000000 8 Not Enough Consumers
## 14 Utica 2.666667 11 Not Enough Consumers
## 15 Ypsilanti 1.800000 14 Not Enough Consumers
##### A list of cities: Not Enough Suppliers #####
filter(countMembersByCity, competitionRank == "Not Enough Suppliers")
## city competitionScore totalCount competitionRank
## 1 Allendale Inf 8 Not Enough Suppliers
## 2 Ann Arbor 8.000000 9 Not Enough Suppliers
## 3 Grand Haven 7.000000 8 Not Enough Suppliers
## 4 Grand Rapids 7.695652 200 Not Enough Suppliers
## 5 Grandville Inf 8 Not Enough Suppliers
## 6 Holland 9.000000 10 Not Enough Suppliers
## 7 Kalamazoo 8.000000 36 Not Enough Suppliers
## 8 Marquette 9.000000 10 Not Enough Suppliers
## 9 Midland 13.000000 14 Not Enough Suppliers
## 10 Mount Pleasant 13.000000 28 Not Enough Suppliers
## 11 Muskegon Inf 15 Not Enough Suppliers
## 12 Sturgis 7.000000 8 Not Enough Suppliers
## 13 Wyoming 8.000000 9 Not Enough Suppliers
##### A list of cities: Perfect #####
filter(countMembersByCity, competitionRank == "Perfect")
## city competitionScore totalCount competitionRank
## 1 Bay City 4.5 11 Perfect
## 2 Comstock Park 3.5 9 Perfect
## 3 Rockford 5.0 12 Perfect
## 4 Saginaw 3.4 22 Perfect
housing <- read.xls("landdata.xls", pattern = "STATE") #warning this takes a long time!
housingForWork <- housing #I don't want to screw up and have to rerun the line above
# One of the colnames is utter nonsense
names(housingForWork)[names(housingForWork) == "Land.Share..Pct."] <- "Land.Share.Pct"
head(housingForWork, 10)
## STATE Date Home.Value Structure.Cost Land.Value Land.Share.Pct
## 1 AL 1975Q1 $24,784 $23,545 $1,239 5.0%
## 2 AL 1975Q2 $25,102 $23,847 $1,255 5.0%
## 3 AL 1975Q3 $25,472 $24,199 $1,274 5.0%
## 4 AL 1975Q4 $25,903 $24,608 $1,295 5.0%
## 5 AL 1976Q1 $26,447 $25,125 $1,322 5.0%
## 6 AL 1976Q2 $27,059 $25,696 $1,363 5.0%
## 7 AL 1976Q3 $27,748 $26,135 $1,613 5.8%
## 8 AL 1976Q4 $28,532 $26,654 $1,878 6.6%
## 9 AL 1977Q1 $29,394 $27,217 $2,177 7.4%
## 10 AL 1977Q2 $30,306 $27,804 $2,502 8.3%
## Home.Price.Index Land.Price.Index
## 1 0.222 0
## 2 0.225 0
## 3 0.228 0
## 4 0.232 0
## 5 0.237 0
## 6 0.242 0
## 7 0.248 0
## 8 0.255 0
## 9 0.263 0
## 10 0.271 0
# I'm going to split year and quarter out of Date (Qx...really!!!!),
# e.g. "1975Q1" becomes Year 1975, Qrtr 1, Date 1975.25
housingForWork$Year <- as.numeric(substr(housingForWork$Date, 1, 4))
housingForWork$Qrtr <- as.numeric(substr(housingForWork$Date, 6, 6))
housingForWork$Date <- housingForWork$Year + housingForWork$Qrtr/4
# This uses regular expressions to clean up the data
housingForWork$Home.Value <- gsub("[$]", "", housingForWork$Home.Value)
housingForWork$Home.Value <- gsub("[,]", "", housingForWork$Home.Value)
housingForWork$Structure.Cost <- gsub("[$]", "", housingForWork$Structure.Cost)
housingForWork$Structure.Cost <- gsub("[,]", "", housingForWork$Structure.Cost)
housingForWork$Land.Value <- gsub("[$]", "", housingForWork$Land.Value)
housingForWork$Land.Value <- gsub("[,]", "", housingForWork$Land.Value)
housingForWork$Land.Share.Pct <- gsub("[%]", "", housingForWork$Land.Share.Pct)
# Now we set the classes for the data
# sapply(housingForWork, class) #saving this for later
housingForWork$Home.Value <- as.numeric(housingForWork$Home.Value)
housingForWork$Structure.Cost <- as.numeric(housingForWork$Structure.Cost)
housingForWork$Land.Value <- as.numeric(housingForWork$Land.Value)
housingForWork$Land.Share.Pct <- as.numeric(housingForWork$Land.Share.Pct)
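For what it's worth, the cleanup and conversion above can be compressed into one pass over the affected columns; an equivalent sketch (not re-run here, since the columns are already numeric at this point):
# strip $, comma, and % (all literal inside a character class) and convert in one pass
moneyCols <- c("Home.Value", "Structure.Cost", "Land.Value", "Land.Share.Pct")
housingForWork[moneyCols] <- lapply(housingForWork[moneyCols],
    function(x) as.numeric(gsub("[$,%]", "", x)))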
# Here's a silly histogram plot
hist(housingForWork$Home.Value)
head(housingForWork)
## STATE Date Home.Value Structure.Cost Land.Value Land.Share.Pct
## 1 AL 1975.25 24784 23545 1239 5
## 2 AL 1975.50 25102 23847 1255 5
## 3 AL 1975.75 25472 24199 1274 5
## 4 AL 1976.00 25903 24608 1295 5
## 5 AL 1976.25 26447 25125 1322 5
## 6 AL 1976.50 27059 25696 1363 5
## Home.Price.Index Land.Price.Index Year Qrtr
## 1 0.222 0 1975 1
## 2 0.225 0 1975 2
## 3 0.228 0 1975 3
## 4 0.232 0 1975 4
## 5 0.237 0 1976 1
## 6 0.242 0 1976 2
# Now the same data in ggplot2, this time as a scatterplot
# Can a data scientist afford a home in Georgia?
ggplot(subset(housingForWork, STATE %in% "GA"), aes(x = Date, y = Home.Value,
color = STATE)) + geom_point()
# Let's explore land value with a line plot
ggplot(subset(housingForWork, STATE %in% "GA"), aes(x = Date, y = Land.Value,
color = STATE)) + geom_line()
# Per the assignment I have to make a box plot; let's do it for 2015. Notice
# that this is actually a stacked bar chart (see the geom_boxplot sketch below)
year2015 <- filter(housingForWork, Year == 2015 & (STATE == "GA" | STATE ==
"AL" | STATE == "TN" | STATE == "SC"))
ggplot(year2015, aes(STATE, Home.Value)) + geom_bar(stat = "identity", fill = "blue")
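Note that geom_bar(stat = "identity") stacks the four quarterly 2015 values per state rather than drawing a box plot. For an actual box plot of the quarterly spread, geom_boxplot is the usual choice; a sketch on the same 2015 subset:
# quarterly spread of 2015 home values per state as a true box plot
ggplot(year2015, aes(STATE, Home.Value)) + geom_boxplot(fill = "blue")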
# Has Home value been going up since 2000?
yr2000AndUp <- filter(housingForWork, Year > 2000)
cor(yr2000AndUp$Date, yr2000AndUp$Home.Value)
## [1] 0.1984192
# These are more weakly correlated than I expected!
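One plausible reason: home values rose until roughly 2007, fell through the crash, and then recovered, so a single linear correlation over the whole period washes out, and pooling all states adds further spread. Splitting the correlation by state would start to separate those effects; a sketch using the dplyr functions already loaded:
# per-state correlation of Date with Home.Value since 2000
corByState <- summarise(group_by(yr2000AndUp, STATE), r = cor(Date, Home.Value))
arrange(corByState, r)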
# Ok, time to do a tiny bit of modeling just for fun. Let's do a linear model
# of land value from 1980 to 2010
yrXToY <- filter(housingForWork, Date >= 1980 & Date <= 2010 & STATE == "GA")
yrXToY <- select(yrXToY, Date, Land.Value)
m <- lm(Land.Value ~ Date, data = yrXToY) #regress land value on time, per the comment above
# Now let's check the residuals; ideally you should see no pattern in the
# following plot
scatter.smooth(residuals(m) ~ fitted(m))
# Whoops, I guess this is not a good linear model (not a huge surprise)
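Since land values tend to grow multiplicatively, refitting on the log scale is a natural next attempt; a sketch, assuming every Land.Value in this subset is positive:
# refit with a log-transformed response and re-check the residuals for curvature
m2 <- lm(log(Land.Value) ~ Date, data = yrXToY)
scatter.smooth(residuals(m2) ~ fitted(m2))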
Citations
Quick and Dirty Linear Modeling: https://www.r-bloggers.com/quick-and-dirty-notes-on-general-linear-mix-models/
Regular expressions in R: https://www.r-bloggers.com/regular-expressions-in-r-vs-rstudio/
Also a regex cheat sheet: http://www.rexegg.com/regex-quickstart.html#chars
R Markdown Cheat Sheet: https://www.rstudio.com/wp-content/uploads/2015/02/rmarkdown-cheatsheet.pdf
D. Kahle and H. Wickham. ggmap: Spatial Visualization with ggplot2. The R Journal, 5(1), 144-161. URL http://journal.r-project.org/archive/2013-1/kahle-wickham.pdf
Heatmap Instructions: http://www.geo.ut.ee/aasa/LOOM02331/heatmap_in_R.html