Source file ⇒ HW_8.Rmd

download.file(url="http://tiny.cc/dcf/CMS_ProvidersSimple.rds",
              destfile = "YourNameForTheFile.rds")
DataTable <- readRDS("YourNameForTheFile.rds")
head(DataTable)
##                      address first_name sex
## 1               900 SETON DR    ARDALAN   M
## 2             2650 RIDGE AVE     THOMAS   M
## 3 4126 N HOLLAND SYLVANIA RD     RASHID   M
## 4              456 MAGEE AVE      DAVID   M
## 5           11100 EUCLID AVE   JENNIFER   F
## 6           12605 E 16TH AVE      KEVIN   M
sample_n(DataTable,size=50)
##                              address   first_name sex
## 93229              9337 KREWSTOWN RD        KEVIN   M
## 297011                 124 W 32ND ST       THOMAS   M
## 481995                215 COLLEGE ST     CLIFFORD   M
## 535412                 1275 YORK AVE        SCOTT   M
## 521235          1025 SILAS DEANE HWY       JANICE   F
## 740286               2148 DULUTH HWY        SUSIE   F
## 718718                 3 JENNIFER CT         RYAN   M
## 332025              3575 BROADWAY ST       JOANNA   F
## 699342               1300 MEDICAL DR        SUSAN   F
## 897844               7507 NE 51ST ST      BEVERLY   F
## 247873                1675 E MAIN ST         PAUL   M
## 311342         15410 S MOUNTAIN PKWY         LORI   F
## 28069              20601 E DIXIE HWY         ROBB   M
## 204925                   7068 SKYWAY      KENNETH   M
## 708733              2 DAVIS POINT LN      LINDSEY   F
## 539046                509 N BROAD ST MARIA TERESA   F
## 617940     1617 N. CALIFORNIA STREET     KIMBERLY   F
## 869902             434 S KIWANIS AVE      JEFFREY   M
## 462151 2006 BROOKWOOD MEDICAL CTR DR        KATHY   F
## 518394               2300 OPITZ BLVD     DANIELLE   F
## 512733        210 JUPITER LAKES BLVD         JOHN   M
## 170688      180 HARVESTER DR STE 110          IRA   M
## 93961             1570 E HERNDON AVE      PATRICK   M
## 380617        4117 MEDICAL CENTER DR      MARYANN   F
## 492313                  1421 3RD AVE      JEFFREY   M
## 20683             4708 ALLIANCE BLVD         MARK   M
## 883263                 7710 MERCY RD      MATTHEW   M
## 599425               900 PACIFIC AVE          EVE   F
## 1332              5454 WISCONSIN AVE       RACHEL   F
## 425854                7 CARNEGIE PLZ      KARILYN   F
## 834908             10215 FERNWOOD RD         ERIC   M
## 347313                  7402 YORK RD        DONNA   F
## 813669                   55 FRUIT ST        GILES   M
## 521381                15 HOSPITAL DR     MOHAMMAD   M
## 171488         15837 PAUL VEGA MD DR     BENJAMIN   M
## 866265     2100 NAPA-VALLEJO HIGHWAY         LEIF   M
## 167874        5849 DIAMOND POINT CIR       CARLOS   M
## 799910          9200 W WISCONSIN AVE      BARBARA   F
## 862287          3471 5TH AVE STE 910        BABAK   M
## 57613           1135 W UNIVERSITY DR      JINPING   F
## 401612         9055 E. DEL CAMINO DR       ROBERT   M
## 225589               5838 W BRICK RD         LISA   F
## 547456              80 MILL RIVER ST        WAYNE   M
## 427259               12501 JUDSON RD       JEANNE   F
## 474258                  400 E 8TH ST       NELSON   M
## 802109        736 N BATTLEFIELD BLVD       VICTOR   M
## 206230                 800 S MAIN ST        LOUIS   M
## 260693                 790 TIMBER DR                 
## 315561           1221 N HIGHLAND AVE        MARIA   F
## 483555          1002 S OLD DIXIE HWY        BARRY   M

Running the sample multiple times I listed out the common street endings. The most common endings are: RD, BLVD, ST, AVE, PKWY, DR, STREET, HWY, FWY, LN, ROAD, AVENUE, WAY, CT, SQ, CTR, HIGHWAY, DRIVE, CIR, PL

pattern <- "[[:blank:]]+(RD|BLVD|ST|AVE|PKWY|DR|STREET|HWY|FWY|LN|ROAD|AVENUE|WAY|CT|SQ|CTR|HIGHWAY|DRIVE|CIR|PL)\\.?" #address endings in the parentheses plus the possibility of having a period after the endings e.g. "RD."

sample <- DataTable %>%
  sample_n(size=1000) #take a sample of size 1000

LeftOvers <-
  sample %>% 
  filter(!grepl(pattern,address)) #extract addresses from the sample without the endings from the pattern

LeftOvers
##                           address  first_name sex
## 1                      302 W. HAY        RANA   M
## 2     2 CALLE SATURNINO RODRIGUEZ        ROSA   F
## 3           10800 MIDLOTHIAN TPKE      DENISE   F
## 4                    865 ROUTE 45                
## 5                    1055 N 500 W        KURT   M
## 6             11124 KINGSTON PIKE       BLAKE   M
## 7                    7510 FM 1765       TAGHI   M
## 8      1104-4 MEDICAL ARTS BLDG 4     MELVINA   F
## 9                       20 S PARK       KLAUS   M
## 10                    380 N 200 W CHRISTOPHER   M
## 11                 226 ROUTE 37 W        LORI   F
## 12              1975 LIN LOR LANE       DAVID   M
## 13     1491 HEALTH CENTER PARKWAY      CURTIS   M
## 14                    965 S 100 W      STEVEN   M
## 15               1147 NW 64TH TER       LAURA   F
## 16               1147 NW 64TH TER       PETER   M
## 17      170 ALAMEDA DE LAS PULGAS    CLARENCE   M
## 18               5212 BRANDT PIKE       MAGDY   M
## 19                  26 LILAC MALL                
## 20                 310 E BROADWAY     WILLIAM   M
## 21                    6570 N SR 7                
## 22             5221 US ROUTE 60 E        MARK   M
## 23                   401 BROADWAY     XINGONG   M
## 24                619 DON ROSALIO       JAMES   M
## 25              9 MEDICAL PARKWAY     REBECCA   F
## 26  MMC - REHABILITATION MEDICINE        ANNA   F
## 27              731 S IL ROUTE 21    MOHAMMAD   M
## 28                  196 PARKWAY S    KIMBERLY   F
## 29        CENTRAL DUPAGE HOSPITAL       NILAY   M
## 30                   1055 N 300 W       CLARK   M
## 31             1942 MILITARY TPKE        DAWN   F
## 32           1330 S FORT HARRISON      DIONNE   F
## 33                       213 MAIN                
## 34                27401 LOS ALTOS        MARK   M
## 35                 550 N HILLSIDE     TIMOTHY   M
## 36                    604 INDIANA      JUSTIN   M
## 37            1111 LEFFINGWELL NE    KIMBERLY   F
## 38           8490 W HOMOSASSA TRL      HAROLD   M
## 39        2801 W KK RIVER PARKWAY   CHRISTINE   F
## 40           7605 ROYAL TROON TER       TAREK   M
## 41                   851 COLORADO      JOSHUA   M
## 42                 31-00 BROADWAY        JACK   M
## 43                    555 S 200 W                
## 44               825 E LINCOLNWAY        ANIL   M
## 45          658 HARLEYSVILLE PIKE      SANDRA   F
## 46              1355 AKALANI LOOP     VINCENT   M
## 47                   615 BROADWAY                
## 48    23961 CALLE DE LA MAGDALENA      ROBERT   M
## 49             200 HAWTHORNE LANE       KASEY   F
## 50                    3000 N I-35       LALIT   M
## 51                  3333 ROUTE 27                
## 52          KENNEDY HEALTH SYSTEM       LARRY   M
## 53                5600 EUPER LANE       HENRY   M
## 54        631 SW HORNE, SUITE 209       JAMIE   F
## 55                  500 SW RAMSEY       BRETT   M
## 56         JOHNS HOPKINS HOSPITAL     SANDEEP   M
## 57          MEDICAL ARTS PAVILION      RUBINA   F
## 58       707 WEST LACEY BOULEVARD                
## 59                    100 FOX GLN     KATHRYN   F
## 60 PRESBYTERIAN HEART GROUP (PHG)       HIRAK   M

From the LeftOvers table I add the following street endings to my list: TRAIL, TRL, PARK, TPKE, LOOP, PIKE, LANE, EXPY, VLY, TER, CSWY, BOULEVARD, CENTER

pattern <- "[[:blank:]]+(RD|BLVD|ST|AVE|PKWY|DR|STREET|HWY|FWY|LN|ROAD|AVENUE|WAY|CT|SQ|CTR|HIGHWAY|DRIVE|CIR|PL|TRAIL|TRL|PARK|TPKE|LOOP|PIKE|LANE|EXPY|VLY|TER|CSWY|BOULEVARD|CENTER)\\.?" #add more endings to the pattern

sample <- DataTable %>%
  sample_n(size=1000)

LeftOvers <-
  sample %>%
  filter(!grepl(pattern,address))

LeftOvers
##                                     address first_name sex
## 1  UNIVERSITY OF TN DEPARTMENT OF PATHOLOGY       LISA   F
## 2                            5979 LAKESHORE      RANDY   M
## 3                             1010 N KANSAS  AVEEKSHIT   M
## 4                          34 LAVELLE COURT     PAMELA   F
## 5                             5622 BROADWAY               
## 6                         10862 CALLE VERDE      ERICK   M
## 7                            1001 N WALDROP       DEAN   M
## 8                          25 CALLE PERAL N   BENJAMIN   M
## 9                      701 E EL CAMINO REAL    LOURDES   F
## 10                           818 US 31W BYP               
## 11                         2301 SOUTH LAMAR    MICHAEL   M
## 12                           200 MINOR HALL   JENNIFER   F
## 13                           48 CALLE CAPRI    JULIEVA   F
## 14                            9720 S 1300 E      BRENT   M
## 15                         BO. GURABO ABAJO               
## 16             568 NORTH SUNRISE, SUITE 100      DAVID   M
## 17              UTMB FACULTY GROUP PRACTICE     LISSET   F
## 18                         220 EAST HEMLOCK       JOHN   M
## 19                 11 CALLE RODRIGUEZ SERRA  ELIZABETH   F
## 20                             320 W PUEBLO     ENRICO   M
## 21                              598 N UNION   KATHLEEN   F
## 22                     8118 CALLE CONCORDIA   REYNALDO   M
## 23                   1048 N. EL CAMINO REAL               
## 24                115 CALLE EMILIO GONZALEZ      JORGE   M
## 25                        LOCAL 18A BLD. #4               
## 26                            1200 E 3900 S   PRATHIMA   F
## 27            STANFORD UNIV SCH OF MEDICINE       JOSE   M
## 28                          740 S LIMESTONE    MICHAEL   M
## 29   UK DIVISION OF CARDIOVASCULAR MEDICINE       MARY   F
## 30                         TAYLOR AT MARION    STEPHEN   M
## 31                   650 SIERRA MADRE VILLA       MARK   M
## 32                       G1B CALLE FRONTERA               
## 33                            700 W CENTRAL    CAMILLE   F
## 34                  23521 PASEO DE VALENCIA      SASAN   M
## 35                         101 E BEVERLY BL      DAVID   M
## 36                              85 BROADWAY     THOMAS   M
## 37                       610 S. MAPLE #3420    ABRAHAM   M
## 38                            718 ARLINGTON     ROBERT   M
## 39                            8939 BROADWAY    RICHARD   M
## 40                   43 BATAVIA CITY CENTRE    NASHIHA   F
## 41                              503 S ASPEN       GARY   M
## 42                         500 EAST DECATUR      BRIAN   M
## 43                3243 E MURDOCK, SUITE 404      DAVID   M
## 44                 24401 CALLE DE LA LOUISA    PATRICK   M
## 45                         115 LOCUST SREET     NICOLE   F
## 46             HARPER PROFESSIONAL BUILDING    KENNETH   M
## 47                        26 CITY HALL MALL       PAUL   M
## 48                       4253 ROUTE 9 NORTH    KENNETH   M

The LeftOvers table the addresses have roughly different street names with no common street endings. Now I can apply the code to the whole DataTable.

NonEndings <- DataTable %>%
  filter(!grepl(pattern,address)) #extract the leftovers from the whole table

head(NonEndings,100)
##                                           address first_name sex
## 1                                    7401 S. MAIN   JULIETTE   F
## 2                                   BUILDING H100    MICHAEL   M
## 3                     UNC DEPT OF FAMILY MEDICINE    STEPHEN   M
## 4                  BQ-17 CALLE FERNANDO 1 LOCAL 4   JANNETTE   F
## 5                            4015 INTERSTATE 45 N    ANTHONY   M
## 6                  MASSACHUSETTS GENERAL HOSPITAL      ALAKA   F
## 7                          UAB SURGERY GI SECTION    MELANIE   F
## 8                                   1802 S YAKIMA       NYEN   M
## 9                                      MSC08 4770       CARL   M
## 10                                   1455 MONTEGO    JESSICA   F
## 11                                        CMR 442        ZIA   M
## 12                                  1000 BROADWAY     ROBERT   M
## 13                            7000 CARR 844 # 844    ANTONIO   M
## 14                            405 WEST CALIFORNIA               
## 15                           6121 PASEO DEL NORTE               
## 16                              536 BROADWAY FL 5    MIRJANA   F
## 17                            HENRY FORD HOSPITAL   MOHAMMAD   M
## 18                                       3601 TVC    LEALANI   F
## 19                              25387 SAINT JAMES      SCOTT   M
## 20                        1375 IDLEWOOD PARC XING      LAURA   F
## 21                      1 CALLE SAN MIGUEL APT 75      JOYCE   F
## 22                               7000 NORTH MOPAC    TATIANA   F
## 23             MCCLURE 5 FLETCHERALLEN HEALTHCARE    MAUREEN   F
## 24                        65 NEWBURYPORT TURNPIKE       JOAN   F
## 25                             6830 VILLAGREEN VW    ELLISSA   F
## 26                           2409 E EXPRESSWAY 83               
## 27                               183 RT 206 NORTH   PHYGENIA   F
## 28                                   7900 FM 1826      CHRIS   F
## 29                      1232 UNIVERSITY OF OREGON    CYNTHIA   F
## 30                                   UVA HOSPITAL     FARNAZ   F
## 31                                  3725 W 4100 S      CASEY   M
## 32               MSU-EM RESIDENCY PROGRAM-LANSING    HILLARY   F
## 33                                  2 GRAFTON CMN       MARK   M
## 34                                      2ND FLOOR    NGOZIKA   F
## 35                                   391 BROADWAY   MARSHALL   M
## 36                                  8300 BROADWAY        ANN   F
## 37                                425 US ROUTE 30    MATTHEW   M
## 38                       520 SW RAMSEY, SUITE 101  VOLODYMYR   M
## 39                                    30 N 1900 E       ANNE   F
## 40                        HIGHWAY 160 TO ROUTE 59      DEREK   M
## 41                              248 CALLE DIAMELA               
## 42                                       3601 TVC       LYNN   F
## 43                                      96 RTE 37   JENNIFER   F
## 44                          CB 7450 101 BRAUER HL    GREGORY   M
## 45                  22971 VIA DESONRISA DEL NORTE     LESLIE   F
## 46                 MASSACHUSETTS GENERAL HOSPITAL      PETER   M
## 47                        5369 S CALLE SANTA CRUZ   PATRICIA   F
## 48                 DEPARTMENT OF PATHOLOGY WFUBMC       KYLE   M
## 49                          STONY BROOK RADIOLOGY    AVRAHAM   M
## 50                                 37399 GARFIELD     CHIRAG   M
## 51                           1650 GRAND CONCOURSE     JYOTHI   F
## 52                         RONALD REAGAN HOSPITAL       INNA   F
## 53                             656 ELMWOOD ANENUE    CHERYLE   F
## 54                                    475 W 940 N    ALLISON   F
## 55                       121 CALLE DEL PRESIDENTE     TAMARA   F
## 56                            LOCAL 3R, SUITE 201    CYNTHIA   F
## 57                                 320 N. MADISON     BRANDI   F
## 58                        354 NEWNAN CROSSING BYP     GORDON   M
## 59                                   1034 N 500 W       KORT   M
## 60                       UNIVERSITY OF NEW MEXICO       JENS   M
## 61                                 3110 NOGALITOS      DAVID   M
## 62                                  8300 BROADWAY  JASWINDER   M
## 63                               20905 GREENFIELD      LINDA   F
## 64                   ROUTE 4 & 20 S. INTERSECTION      AIMEE   F
## 65                                118 MED SURGE I    CAMERON   M
## 66                                    725 E 400 N     NATHAN   M
## 67                    UWMC DEPARTMENT OF MEDICINE      SUSAN   F
## 68                          3115 OCEAN FRONT WALK      EMILY   F
## 69                            27401 W IL ROUTE 22  AGNIESZKA   F
## 70                     415 BARTOW MUNICIPAL ARPRT               
## 71  LILLY MINI MALL CARR 165 KM 10.2 BO. CONTORNO               
## 72                         330 ARKANSAS SUITE 120     MIRIAM   F
## 73                                     11899 M 32     ANGELA   F
## 74                                  3115 ROUTE 38     JEREMY   M
## 75                     1 UNIVERSITY OF NEW MEXICO        MIA   F
## 76                             9149 ESTATE THOMAS     JOSEPH   M
## 77                          256 COLUMBIA TURNPIKE       MARY   F
## 78                STONY BROOK UNIVERSITY HOSPITAL   BENJAMIN   M
## 79                           1650 GRAND CONCOURSE SRILAKSHMI   F
## 80                          CARRETERA 173 KM. 8.7    JUANITA   F
## 81                              2351 INDIAN WELLS      CHRIS   M
## 82                           1770 GRAND CONCOURSE     GUNJAN   F
## 83                   DEPARTMENT OF ANESTHESIOLOGY       RYAN   M
## 84                                 912 TREATY OAK     NNENNA   F
## 85                               PASEO DEL PARQUE       ILSA   F
## 86                                  2465 BROADWAY  KATHERINE   F
## 87                               100 NEWPARK MALL      CALEB   M
## 88                                  255 ROUTE 108               
## 89                              1672 CALLE PARANA       JUAN   M
## 90                               400 N. FULLERTON     SYDNEY   F
## 91                              830 SOUTH GLOSTER      SARAH   F
## 92                         7350 SAND LAKE COMMONS       JEAN   F
## 93                     200 NASH MEDICAL ARTS MALL               
## 94                                1600 E BROADWAY      DIANA   F
## 95                                   100 E FM 495               
## 96                       34 LAVELLE COURT SUITE A  ELIZABETH   F
## 97                             4435 EASTGATE MALL       SEAN   M
## 98                         6TH MEDICAL GROUP/SGHC    HEATHER   F
## 99                                 327 IL ROUTE 2    RUSSELL   M
## 100                      OCEAN COUNTY FAMILY CARE       NEHA   F
nrow(NonEndings)
## [1] 39383
Endings <- DataTable %>%
  extractMatches(pattern,address,ending=1) %>% #extract the street endings that we want
  group_by(ending) %>%
  summarise(total=n()) %>%
  arrange(desc(total))

as.data.frame(Endings)
##       ending  total
## 1         ST 253982
## 2        AVE 176705
## 3         RD 129897
## 4         DR  88093
## 5       BLVD  61450
## 6       <NA>  39383
## 7       PARK  19884
## 8         PL  17478
## 9       PKWY  15181
## 10    CENTER  14654
## 11       HWY  13527
## 12      ROAD  12675
## 13       WAY  11006
## 14        LN  10569
## 15   HIGHWAY  10286
## 16        CT   8016
## 17       CIR   6215
## 18      PIKE   3973
## 19        SQ   2725
## 20       TRL   2243
## 21      LANE   1990
## 22      LOOP   1891
## 23       TER   1684
## 24      TPKE   1681
## 25 BOULEVARD   1566
## 26       FWY   1156
## 27      EXPY   1012
## 28     TRAIL    605
## 29      CSWY     71
## 30       VLY      7

From the table the endings STREET, AVENUE, DRIVE, and CTR are included in the counts for ST,AVE,DR, and CT respectively. However, since the abbreviation CTR has different definition to CT (CTR stands for CENTER while CT stands for COURT), I need to change the pattern.

pattern <- "[[:blank:]]+(RD|BLVD|ST|AVE|PKWY|DR|STREET|HWY|FWY|LN|ROAD|AVENUE|WAY|CT[R]?|SQ|HIGHWAY|DRIVE|CIR|PL|TRAIL|TRL|PARK|TPKE|LOOP|PIKE|LANE|EXPY|VLY|TER|CSWY|BOULEVARD|CENTER)\\.?" #want to extract both CT and CTR

Endings <- DataTable %>%
  extractMatches(pattern,address,ending=1) %>% 
  group_by(ending) %>%
  filter(ending!="<NA>") %>% #rule out the addresses without common endings
  summarise(total=n()) %>%
  arrange(desc(total))

Endings <- as.data.frame(Endings)
Endings$ending <- as.character(Endings$ending)
Endings
##       ending  total
## 1         ST 253982
## 2        AVE 176705
## 3         RD 129897
## 4         DR  88093
## 5       BLVD  61450
## 6       PARK  19884
## 7         PL  17478
## 8       PKWY  15181
## 9     CENTER  14654
## 10       HWY  13527
## 11      ROAD  12675
## 12       WAY  11006
## 13        LN  10569
## 14   HIGHWAY  10286
## 15       CIR   6215
## 16        CT   5619
## 17      PIKE   3973
## 18        SQ   2725
## 19       CTR   2397
## 20       TRL   2243
## 21      LANE   1990
## 22      LOOP   1891
## 23       TER   1684
## 24      TPKE   1681
## 25 BOULEVARD   1566
## 26       FWY   1156
## 27      EXPY   1012
## 28     TRAIL    605
## 29      CSWY     71
## 30       VLY      7

Since multiple endings are not in abbreviated form, I need to include them into the counts of their abbreviation forms (e.g. include the total of ROAD into the total of RD)

renamed <- Endings %>%
  filter(ending %in% c("ROAD","BOULEVARD","HIGHWAY","LANE","TRAIL","CENTER")) %>%
  spread(ending,total) %>%
  rename(RD=ROAD,BLVD=BOULEVARD,LN=LANE,TRL=TRAIL,HWY=HIGHWAY,CTR=CENTER) %>%
  gather(ending,total1)

renamed
##   ending total1
## 1   BLVD   1566
## 2    CTR  14654
## 3    HWY  10286
## 4     LN   1990
## 5     RD  12675
## 6    TRL    605
final <- Endings %>%
  left_join(renamed) %>%
  filter(ending != "ROAD", ending !="BOULEVARD", ending != "HIGHWAY", ending != "LANE", ending !="TRAIL", ending != "CENTER") %>%
  group_by(ending) %>%
  mutate(total=sum(c(total,total1),na.rm=TRUE)) %>% #add the non-abbreviated form endings to the abbreviated form endings (e.g. add the count of BOULEVARD to BLVD)
  select(ending,total) %>%
  as.data.frame()
## Joining by: "ending"
final
##    ending  total
## 1      ST 253982
## 2     AVE 176705
## 3      RD 142572
## 4      DR  88093
## 5    BLVD  63016
## 6    PARK  19884
## 7      PL  17478
## 8    PKWY  15181
## 9     HWY  23813
## 10    WAY  11006
## 11     LN  12559
## 12    CIR   6215
## 13     CT   5619
## 14   PIKE   3973
## 15     SQ   2725
## 16    CTR  17051
## 17    TRL   2848
## 18   LOOP   1891
## 19    TER   1684
## 20   TPKE   1681
## 21    FWY   1156
## 22   EXPY   1012
## 23   CSWY     71
## 24    VLY      7

From the final table we can see that the 3 most common street name endings are ST, AVE, and RD.

References: http://www.gis.co.clay.mn.us/USPS.htm#top