Source file ⇒ Assignment_8.Rmd
# Fetch the provider address file and load it. mode = "wb" writes the download
# in binary mode, which a serialized .rds file requires.
download.file(
  url = "http://tiny.cc/dcf/CMS_ProvidersSimple.rds",
  destfile = "MedAddresses.rds",
  mode = "wb"
)
MedAddresses <- readRDS("MedAddresses.rds")
# Street-ending tokens to look for. There are no word boundaries here, so the
# letters match anywhere inside the address string.
pattern <- "(ST|RD|ROAD)"

# Keep only the addresses that match neither the street-ending pattern nor the
# apartment/unit pattern. In the second regex the alternation splits it into
# " APT" (anywhere in the string) or "UNIT <digits>" at the end of the string.
LeftOvers <- MedAddresses %>%
  filter(
    !(grepl(pattern, address) |
      grepl(" APT|UNIT [[:digit:]]+$", address))
  )

head(LeftOvers)
## address first_name sex
## 1 900 SETON DR ARDALAN M
## 2 2650 RIDGE AVE THOMAS M
## 3 456 MAGEE AVE DAVID M
## 4 11100 EUCLID AVE JENNIFER F
## 5 12605 E 16TH AVE KEVIN M
## 6 1021 PARK AVE AMANDA F
We first set `pattern` to the common street endings, i.e. "ST", "RD", and "ROAD".
`grepl()` then compares a regular expression to each string and reports whether it matches.
`LeftOvers` is created to contain all the addresses that do not match our pattern.
# Make the addresses consistent in casing. The upper-cased values are stored
# back into MedAddresses (previously they were only printed and then discarded),
# so the upper-case regex patterns used on MedAddresses afterwards see
# consistently cased text.
MedAddresses$address <- toupper(MedAddresses$address)
head(MedAddresses$address)
## [1] "900 SETON DR" "2650 RIDGE AVE"
## [3] "4126 N HOLLAND SYLVANIA RD" "456 MAGEE AVE"
## [5] "11100 EUCLID AVE" "12605 E 16TH AVE"
# Keep the cases whose address begins with one or more digits; the ^ anchors
# the match to the start of the string.
numbpattern <- "^[[:digit:]]+"
Digits <- filter(MedAddresses, grepl(numbpattern, address))
# Drop the cases whose address contains the string "BOX" followed by blanks and
# one or more digits; everything else is kept.
POpattern <- "BOX[[:blank:]]+([[:digit:]]+)"
NoPOBox <- filter(Digits, !grepl(POpattern, address))
# Street-ending tokens, both abbreviated and spelled out. \\b is a word
# boundary, so a token such as "RD" only matches as a standalone word and not
# as letters inside a longer word (e.g. the "RD" in "STAFFORD").
shortpattern <- "\\b(RD|ROAD|AVE|AVENUE|ST|STREET|WAY|DR|DRIVE|HWY|HIGHWAY|PL|PLACE|PKWY|PARKWAY|CTR|CENTER|BLD|BLVD|BOULEVARD|LN|LANE|CIR|CIRCLE|EXPY|CT|COURT|TERRACE)\\b"

# Keep only the addresses that contain at least one of those tokens.
short <- filter(NoPOBox, grepl(shortpattern, address))
# Lookup from an extracted token to the label it is tallied under. Tokens that
# are not listed here (e.g. WAY, EXPY, TERRACE, or already-full names) are kept
# unchanged.
ending_labels <- c(
  RD = "ROAD",
  ST = "STREET",
  BLD = "BLVD",
  BOULEVARD = "BLVD",
  CIR = "CIRCLE",
  DR = "DRIVE",
  LN = "LANE",
  AVE = "AVENUE",
  PL = "PLACE",
  CT = "COURT",
  CTR = "CENTER",
  HWY = "HIGHWAY",
  PKWY = "PARKWAY"
)

# Extract the street-ending token from each address into `name`, translate it
# through the lookup, then count the cases per label, most frequent first.
TotalCount <- short %>%
  extractMatches(shortpattern, address, name = 1) %>%
  mutate(
    name = coalesce(
      unname(ending_labels[as.character(name)]),
      as.character(name)
    )
  ) %>%
  group_by(name) %>%
  summarise(total = n()) %>%
  arrange(desc(total))
And here are the most popular street name endings:
# Show the tally as a plain data frame.
TotalCount %>%
  as.data.frame()
## name total
## 1 STREET 242260
## 2 AVENUE 181212
## 3 ROAD 150300
## 4 DRIVE 95518
## 5 BLVD 66825
## 6 HIGHWAY 24516
## 7 PARKWAY 18115
## 8 CENTER 13668
## 9 LANE 13080
## 10 WAY 10723
## 11 PLACE 7998
## 12 COURT 7734
## 13 CIRCLE 6645
## 14 EXPY 996
## 15 TERRACE 356