Assignment 8

Back to the Streets

# Download the provider address data into the working directory and load it.
# mode = "wb" writes the file in binary mode, as required for an .rds file.
rds_file <- "MedAddresses.rds"
download.file(
  url = "http://tiny.cc/dcf/CMS_ProvidersSimple.rds",
  destfile = rds_file,
  mode = "wb"
)
MedAddresses <- readRDS(rds_file)

# `pattern` matches "ST", "RD" or "ROAD" anywhere in the address (there are no
# word boundaries, so it also matches inside longer words).
pattern <- "(ST|RD|ROAD)"
# Matches " APT" anywhere, or "UNIT " followed by one or more digits at the
# end of the address (the alternation splits the whole expression in two).
unit_pattern <- " APT|UNIT [[:digit:]]+$"

# Keep only the rows that match neither of the two patterns.
LeftOvers <- filter(
  MedAddresses,
  !(grepl(pattern, address) | grepl(unit_pattern, address))
)

head(LeftOvers)

##            address first_name sex
## 1     900 SETON DR    ARDALAN   M
## 2   2650 RIDGE AVE     THOMAS   M
## 3    456 MAGEE AVE      DAVID   M
## 4 11100 EUCLID AVE   JENNIFER   F
## 5 12605 E 16TH AVE      KEVIN   M
## 6    1021 PARK AVE     AMANDA   F

Find the most common endings.

We first set `pattern` to the common street endings, i.e. “ST”, “RD”, and “ROAD”.
`grepl` then compares a regular expression to a string and reports whether it matches.

`LeftOvers` is created to contain all the addresses that do not match our pattern.

First, strings containing ST, RD, or ROAD are filtered out.
Then, strings containing " APT", or ending in "UNIT" followed by one or more digits, are also filtered out.

# Preview the first few addresses converted to upper case. The result is only
# printed; MedAddresses itself is left unchanged.
head(toupper(MedAddresses$address))

## [1] "900 SETON DR"               "2650 RIDGE AVE"            
## [3] "4126 N HOLLAND SYLVANIA RD" "456 MAGEE AVE"             
## [5] "11100 EUCLID AVE"           "12605 E 16TH AVE"

# Keep only the addresses that begin with one or more digits
# (the pattern is anchored to the start of the string).
numbpattern <- "^[[:digit:]]+"
Digits <- filter(MedAddresses, grepl(numbpattern, address))

# Drop the addresses that contain "BOX" followed by one or more blanks and
# then one or more digits; everything else in Digits is kept.
POpattern <- "BOX[[:blank:]]+([[:digit:]]+)"
NoPOBox <- filter(Digits, !grepl(POpattern, address))

# Keep the addresses that contain a listed street ending, abbreviated or
# spelled out. The \\b word boundaries make an ending match only as a whole
# word, so "RD" matches on its own but not inside a name such as "STAFFORD".
shortpattern <- "\\b(RD|ROAD|AVE|AVENUE|ST|STREET|WAY|DR|DRIVE|HWY|HIGHWAY|PL|PLACE|PKWY|PARKWAY|CTR|CENTER|BLD|BLVD|BOULEVARD|LN|LANE|CIR|CIRCLE|EXPY|CT|COURT|TERRACE)\\b"
short <- filter(NoPOBox, grepl(shortpattern, address))

# Regex -> replacement pairs that fold each abbreviation or variant spelling
# into a single label. They are applied one after another, in this order.
ending_aliases <- c(
  "^RD$" = "ROAD",
  "^ST$" = "STREET",
  "^BLD$" = "BLVD",
  "^BOULEVARD$" = "BLVD",
  "^CIR$" = "CIRCLE",
  "^DR$" = "DRIVE",
  "^LN$" = "LANE",
  "^AVE$" = "AVENUE",
  "^PL$" = "PLACE",
  "^CT$" = "COURT",
  "^CTR$" = "CENTER",
  "^HWY$" = "HIGHWAY",
  "^PKWY$" = "PARKWAY"
)

# Run every alias substitution in turn over a vector of extracted endings.
fold_aliases <- function(endings) {
  for (alias in names(ending_aliases)) {
    endings <- gsub(alias, ending_aliases[[alias]], endings)
  }
  endings
}

# Extract the first capture group of shortpattern into `name`, normalise it,
# then count the rows per ending, most frequent first.
TotalCount <- extractMatches(short, shortpattern, address, name = 1) %>%
  mutate(name = fold_aliases(name)) %>%
  group_by(name) %>%
  summarise(total = n()) %>%
  arrange(desc(total))

And here are the most popular street name endings:

# Show the counts as a plain data frame.
TotalCount %>%
  as.data.frame()

##       name  total
## 1   STREET 242260
## 2   AVENUE 181212
## 3     ROAD 150300
## 4    DRIVE  95518
## 5     BLVD  66825
## 6  HIGHWAY  24516
## 7  PARKWAY  18115
## 8   CENTER  13668
## 9     LANE  13080
## 10     WAY  10723
## 11   PLACE   7998
## 12   COURT   7734
## 13  CIRCLE   6645
## 14    EXPY    996
## 15 TERRACE    356

Assignment 8

Daniel Alonzo

March 30, 2016

Back to the Streets