Source file ⇒ Assignment_8.Rmd

Back to the Streets

# Fetch the provider address file (written in binary mode) and load it.
download.file(
  url = "http://tiny.cc/dcf/CMS_ProvidersSimple.rds",
  destfile = "MedAddresses.rds",
  mode = "wb"
)
MedAddresses <- readRDS("MedAddresses.rds")

# Addresses containing any of these substrings are dropped below.
pattern <- "(ST|RD|ROAD)"

# Keep the rows whose address matches neither `pattern` nor the
# apartment/unit expression (" APT" anywhere, or "UNIT <digits>" at the end).
LeftOvers <- filter(
  MedAddresses,
  !(grepl(pattern, address) | grepl(" APT|UNIT [[:digit:]]+$", address))
)

head(LeftOvers)
##            address first_name sex
## 1     900 SETON DR    ARDALAN   M
## 2   2650 RIDGE AVE     THOMAS   M
## 3    456 MAGEE AVE      DAVID   M
## 4 11100 EUCLID AVE   JENNIFER   F
## 5 12605 E 16TH AVE      KEVIN   M
## 6    1021 PARK AVE     AMANDA   F
  1. Find the most common endings.

*LeftOvers is created to contain all the addresses that do not match our pattern.

# Convert every address to upper case so that the case-sensitive patterns
# used in the following steps see consistently cased text. The converted
# column is stored back into MedAddresses so that later steps actually use it.
MedAddresses$address <- toupper(MedAddresses$address)
# Preview the first few normalised addresses.
head(MedAddresses$address)
## [1] "900 SETON DR"               "2650 RIDGE AVE"            
## [3] "4126 N HOLLAND SYLVANIA RD" "456 MAGEE AVE"             
## [5] "11100 EUCLID AVE"           "12605 E 16TH AVE"
# Keep only the addresses that begin with one or more digits.
numbpattern <- "^[[:digit:]]+"
Digits <- filter(MedAddresses, grepl(numbpattern, address))

# Drop the addresses that contain "BOX" followed by one or more blanks and
# then one or more digits.
POpattern <- "BOX[[:blank:]]+([[:digit:]]+)"
NoPOBox <- filter(Digits, !grepl(POpattern, address))
# Street-type words and their abbreviations. The \\b on each side is a word
# boundary, so an entry such as "RD" matches only as a whole word and not as
# part of a longer word such as "STAFFORD".
shortpattern <- "\\b(RD|ROAD|AVE|AVENUE|ST|STREET|WAY|DR|DRIVE|HWY|HIGHWAY|PL|PLACE|PKWY|PARKWAY|CTR|CENTER|BLD|BLVD|BOULEVARD|LN|LANE|CIR|CIRCLE|EXPY|CT|COURT|TERRACE)\\b"

# Keep the addresses that contain at least one of those words.
short <- filter(NoPOBox, grepl(shortpattern, address))
# Tally how often each street-type word occurs, after folding abbreviations
# and spelling variants into a single label per type.
TotalCount <- local({
  # Exact-match lookup: extracted word -> label used in the tally.
  # Words that are not listed here keep their extracted spelling.
  canonical <- c(
    RD = "ROAD", ST = "STREET", BLD = "BLVD", BOULEVARD = "BLVD",
    CIR = "CIRCLE", DR = "DRIVE", LN = "LANE", AVE = "AVENUE",
    PL = "PLACE", CT = "COURT", CTR = "CENTER", HWY = "HIGHWAY",
    PKWY = "PARKWAY"
  )

  # Replace each value that is a key of `canonical` with its label.
  relabel <- function(x) {
    x <- as.character(x)
    hit <- match(x, names(canonical))
    found <- !is.na(hit)
    x[found] <- canonical[hit[found]]
    x
  }

  short %>%
    extractMatches(shortpattern, address, name = 1) %>%
    mutate(name = relabel(name)) %>%
    group_by(name) %>%
    summarise(total = n()) %>%
    arrange(desc(total))
})

And here are the most popular street name endings:

# Print the tally as a plain data frame.
as.data.frame(TotalCount)
##       name  total
## 1   STREET 242260
## 2   AVENUE 181212
## 3     ROAD 150300
## 4    DRIVE  95518
## 5     BLVD  66825
## 6  HIGHWAY  24516
## 7  PARKWAY  18115
## 8   CENTER  13668
## 9     LANE  13080
## 10     WAY  10723
## 11   PLACE   7998
## 12   COURT   7734
## 13  CIRCLE   6645
## 14    EXPY    996
## 15 TERRACE    356