library(DT)
unloc <- list.files(path = getwd(), pattern = 'UNLOCODE', full.names = F)
unloc
## [1] "2022-2 UNLOCODE CodeListPart1.csv" "2022-2 UNLOCODE CodeListPart2.csv"
## [3] "2022-2 UNLOCODE CodeListPart3.csv"
unloc_1 <- read.csv(unloc[1], header = F)
unloc_2 <- read.csv(unloc[2], header = F)
unloc_3 <- read.csv(unloc[3], header = F)
unloc <- data.frame(rbind(unloc_1, unloc_2, unloc_3))
world_cities <- read.csv('World_cities.csv')
acs <- read.csv('acs_code.csv')

UN locaiton list

Observe the data structure, only few columns were retained.

head(unloc)
##   V1 V2  V3                 V4                 V5 V6       V7 V8   V9 V10
## 1    AD               .ANDORRA                                     NA    
## 2    AD ALV   Andorra la Vella   Andorra la Vella    --34-6-- AI  601    
## 3    AD CAN            Canillo            Canillo    --3----- RL  307    
## 4    AD ENC             Encamp             Encamp    --3----- RL  307    
## 5    AD ESC Escaldes-Engordany Escaldes-Engordany    --3----- RL  307    
## 6    AD EAC           Esc\xe0s              Escas 04 --3----- RL 1407    
##            V11 V12
## 1                 
## 2 4230N 00131E    
## 3 4234N 00135E    
## 4 4232N 00134E    
## 5 4231N 00133E    
## 6 4233N 00131E
unloc <- unloc[,c('V2', 'V4', 'V5')]
head(unloc)
##   V2                 V4                 V5
## 1 AD           .ANDORRA                   
## 2 AD   Andorra la Vella   Andorra la Vella
## 3 AD            Canillo            Canillo
## 4 AD             Encamp             Encamp
## 5 AD Escaldes-Engordany Escaldes-Engordany
## 6 AD           Esc\xe0s              Escas

Extract Country Names

un_country <- unloc[grepl('^\\.', unloc$V4),]
un_country <- un_country[,c('V2','V4')]
un_country$V4 <- gsub('\\.', '', un_country$V4)
un_country[is.na(un_country$V2),]
##         V2      V4
## 78589 <NA> NAMIBIA
un_country$V2[is.na(un_country$V2)] <- 'NA'

Grab Special Characters

un_country[grepl('<',un_country$V4),]
##       V2                  V4
## 4952  AX    <c5>LAND ISLANDS
## 7429  BL SAINT BARTH<c9>LEMY
## 18395 CI    C<d4>TE D'IVOIRE
## 21431 CW          CURA<c7>AO
## 87600 RE          R<c9>UNION
## 92580 TR          T<dc>RKIYE
un_country$V4[un_country$V2 == 'AX'] <- 'ALAND ISLANDS'
un_country$V4[un_country$V2 == 'BL'] <- 'SAINT BARTHELEMY'
un_country$V4[un_country$V2 == 'CI'] <- 'COTE D\'IOIRE'
un_country$V4[un_country$V2 == 'CW'] <- 'CURACAO'
un_country$V4[un_country$V2 == 'RE'] <- 'REUNION'
un_country$V4[un_country$V2 == 'TR'] <- 'TURKEY'

Attempts to Uniform Country Names

un_country[grepl('\\(', un_country$V4),]
##        V2                                 V4
## 13     AE         UNITED ARAB EMIRATES (THE)
## 16674  CC            COCOS (KEELING) ISLANDS
## 40269  FK        FALKLAND ISLANDS (MALVINAS)
## 76591  MF         SAINT MARTIN (FRENCH PART)
## 76681  MH             MARSHALL ISLANDS (THE)
## 76929  MP     NORTHERN MARIANA ISLANDS (THE)
## 83663  PH                  PHILIPPINES (THE)
## 88496  RU           RUSSIAN FEDERATION (THE)
## 89539  SD                        SUDAN (THE)
## 92063  SX          SINT MAARTEN (DUTCH PART)
## 92069  SY         SYRIAN ARAB REPUBLIC (THE)
## 92108  TC     TURKS AND CAICOS ISLANDS (THE)
## 93304  TW         TAIWAN (PROVINCE OF CHINA)
## 93778  US     UNITED STATES OF AMERICA (THE)
## 114765 VA      HOLY SEE (VATICAN CITY STATE)
## 114778 VE VENEZUELA (BOLIVARIAN REPUBLIC OF)
## 114989 VG           VIRGIN ISLANDS (BRITISH)
## 115001 VI                VIRGIN ISLANDS (US)

Removing () should be fine, except for the VIRGIN ISLANDS since they could be either BRITISH or US.

un_country$V4[un_country$V2=='VI'] = 'U.S. VIRGIN ISLANDS'
un_country$V4[un_country$V2=='VG'] = 'BRITISH VIRGIN ISLANDS'
un_country$V4[un_country$V2=='KP'] = 'NORTH KOREA'
un_country$V4[un_country$V2=='KR'] = 'SOUTH KOREA'
un_country$V4[un_country$V2=='BO'] = 'BOLIVIA'
un_country$V4[un_country$V2=='BQ'] = 'BONAIRE'
un_country$V4[un_country$V2=='CD'] = 'DEMOCRATIC REPUBLIC OF CONGO (ZAIRE)'
un_country$V4[un_country$V2=='FM'] = 'MICRONESIA'
un_country$V4[un_country$V2=='IR'] = 'IRAN'
un_country$V4[un_country$V2=='MD'] = 'MOLDOVA'
un_country$V4[un_country$V2=='PS'] = 'PALESTINE'
un_country$V4[un_country$V2=='SH'] = 'SAINT HELENA'
un_country$V4[un_country$V2=='TZ'] = 'TANZANIA'
un_country$V4[un_country$V2=='RU'] = 'RUSSIA'
un_country$V4[un_country$V2=='VN'] = 'VIETNAM'
un_country$V4[un_country$V2=='SY'] = "SYRIA"
un_country[grepl('^ST ', un_country$V4),]
## [1] V2 V4
## <0 rows> (or 0-length row.names)
un_country$V4 <- gsub("\\(.*)","",un_country$V4)
un_country[grepl(',', un_country$V4),]
## [1] V2 V4
## <0 rows> (or 0-length row.names)
un_country$V4 <- trimws(un_country$V4)
un_city <- unloc[,c('V2','V5')]
class(un_city$V5)
## [1] "character"
un_city <- un_city[!grepl("([0-9]+).*$", un_city$V5),] # places with digits are not meaningful, remove
un_city$V5 <- trimws(un_city$V5)
un_city$V5 <- toupper(un_city$V5)
un_city <- un_city[grepl("[[:alpha:]]", un_city$V5),] # remove blank cells (no digits detected)

ACS

acs$location <- toupper(acs$location)
acs[grepl('&', acs$location),]
##     code                    location
## 183  321           ANTIGUA & BARBUDA
## 194  340 ST VINCENT & THE GRENADINES
## 195  341           TRINIDAD & TOBAGO
## 207  342      TURKS & CAICOS ISLANDS
## 227  304        ST PIERRE & MIQUELON
## 295  443         SAO TOME & PRINCIPE
## 315  507    HEARD & MCDONALD ISLANDS
## 329  526     WALLIS & FUTUNA ISLANDS
acs$location <- gsub('&', 'AND', acs$location)
acs[grepl('^ST ', acs$location),]
##     code                      location
## 193  339                      ST LUCIA
## 194  340 ST VINCENT AND THE GRENADINES
## 206  337                 ST BARTHELEMY
## 227  304        ST PIERRE AND MIQUELON
## 286  450                     ST HELENA
acs$location  <- gsub('^ST ', 'SAINT ', acs$location )
acs$location  <- gsub('^ST. ', 'SAINT ', acs$location )
un <- merge(un_country, un_city, by = 'V2', all = TRUE)
names(un) <- c('code2d','country','city')

World Cities

names(world_cities)
##  [1] "X"          "Y"          "FID"        "OBJECTID"   "CITY_NAME" 
##  [6] "GMI_ADMIN"  "ADMIN_NAME" "FIPS_CNTRY" "CNTRY_NAME" "STATUS"    
## [11] "POP"        "POP_RANK"   "POP_CLASS"  "PORT_ID"    "LABEL_FLAG"
## [16] "POP_SOURCE"
city2500 <- world_cities[,c("CNTRY_NAME", "CITY_NAME")]
names(city2500) <- c('country','city')
city2500$city <- toupper(city2500$city)
city2500$country <- toupper(city2500$country)
city2500$country <- gsub('IS.$', 'ISLANDS', city2500$country)
city2500$country <- gsub('&', 'AND', city2500$country)
city2500$country <- gsub('^ST. ', 'SAINT ', city2500$country)
head(city2500)
##    country                 city
## 1   BRAZIL               CUIABA
## 2   BRAZIL             BRASILIA
## 3   BRAZIL              GOIANIA
## 4   BRAZIL         CAMPO GRANDE
## 5 PARAGUAY PEDRO JUAN CABALLERO
## 6 PARAGUAY     SALTO DEL GUAIRA
head(un)
##   code2d country               city
## 1     AD ANDORRA ESCALDES-ENGORDANY
## 2     AD ANDORRA              ESCAS
## 3     AD ANDORRA  LA FARGA DE MOLES
## 4     AD ANDORRA         LA MASSANA
## 5     AD ANDORRA             ORDINO
## 6     AD ANDORRA     PAS DE LA CASA
un <- data.frame(rbind(un[,c('country', 'city')], city2500))
un <- unique(un)

Merging ACS with UN lists

setdiff(acs$location[acs$code>56],un_country$V4)
##  [1] "COMMONWEALTH OF NORTHERN MARIANA ISLANDS"
##  [2] "CZECHOSLOVAKIA"                          
##  [3] "MADEIRA ISLANDS"                         
##  [4] "AZORES ISLANDS"                          
##  [5] "WALES"                                   
##  [6] "ENGLAND"                                 
##  [7] "SCOTLAND"                                
##  [8] "NORTHERN IRELAND"                        
##  [9] "YUGOSLAVIA"                              
## [10] "CZECH REPUBLIC"                          
## [11] "MACEDONIA"                               
## [12] "USSR"                                    
## [13] "EUROPE"                                  
## [14] "KOSOVO"                                  
## [15] "GIBRALTER"                               
## [16] "JAN MAYEN"                               
## [17] "SVALBARD"                                
## [18] "VATICAN CITY"                            
## [19] "MACAU"                                   
## [20] "PARACEL ISLANDS"                         
## [21] "KOREA"                                   
## [22] "LAOS"                                    
## [23] "ASIA"                                    
## [24] "BRUNEI"                                  
## [25] "SPRATLEY ISLANDS"                        
## [26] "EAST TIMOR"                              
## [27] "SAINT KITTS-NEVIS"                       
## [28] "WEST INDIES"                             
## [29] "NAVASSA ISLAND"                          
## [30] "NETHERLANDS ANTILLES"                    
## [31] "CURAƇAO"                                 
## [32] "SABA"                                    
## [33] "SINT EUSTATIUS"                          
## [34] "SOUTH AMERICA"                           
## [35] "NORTH AMERICA"                           
## [36] "CENTRAL AMERICA"                         
## [37] "CABO VERDE"                              
## [38] "IVORY COAST"                             
## [39] "DEMOCRATIC REPUBLIC OF CONGO (ZAIRE)"    
## [40] "AFRICA"                                  
## [41] "EUROPA ISLAND"                           
## [42] "GLORIOSO ISLANDS"                        
## [43] "JUAN DE NOVA ISLAND"                     
## [44] "TROMELIN ISLAND"                         
## [45] "SWAZILAND"                               
## [46] "JOHNSTON ATOLL"                          
## [47] "MIDWAY ISLANDS"                          
## [48] "WAKE ISLAND"                             
## [49] "BAKER ISLAND"                            
## [50] "HOWLAND ISLAND"                          
## [51] "JARVIS ISLAND"                           
## [52] "KINGMAN REEF"                            
## [53] "PALMYRA ATOLL"                           
## [54] "CHRISTMAS ISLAND, INDIAN OCEAN"          
## [55] "CORAL SEA ISLANDS"                       
## [56] "HEARD AND MCDONALD ISLANDS"              
## [57] "PITCAIRN ISLANDS"                        
## [58] "WALLIS AND FUTUNA ISLANDS"               
## [59] "OCEANIA"                                 
## [60] "AT SEA"
setdiff(un_country$V4, acs$location[acs$code>56])
##  [1] "ANTARCTICA"                                  
##  [2] "ALAND ISLANDS"                               
##  [3] "BRUNEI DARUSSALAM"                           
##  [4] "COCOS  ISLANDS"                              
##  [5] "DEMOCRATIC REPUBLIC OF CONGO"                
##  [6] "COTE D'IOIRE"                                
##  [7] "CAPE VERDE"                                  
##  [8] "CURACAO"                                     
##  [9] "CHRISTMAS ISLAND"                            
## [10] "CZECHIA"                                     
## [11] "GIBRALTAR"                                   
## [12] "SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS"
## [13] "HEARD ISLAND AND MCDONALD ISLANDS"           
## [14] "SAINT KITTS AND NEVIS"                       
## [15] "LAO PEOPLE'S DEMOCRATIC REPUBLIC"            
## [16] "NORTH MACEDONIA"                             
## [17] "MACAO"                                       
## [18] "NORTHERN MARIANA ISLANDS"                    
## [19] "PITCAIRN"                                    
## [20] "PALESTINE"                                   
## [21] "SVALBARD AND JAN MAYEN"                      
## [22] "ESWATINI"                                    
## [23] "FRENCH SOUTHERN TERRITORIES"                 
## [24] "TIMOR-LESTE"                                 
## [25] "UNITED STATES MINOR OUTLYING ISLANDS"        
## [26] "UNITED STATES OF AMERICA"                    
## [27] "HOLY SEE"                                    
## [28] "WALLIS AND FUTUNA"                           
## [29] "INSTALLATIONS IN INTERNATIONAL WATERS"
df1 <- merge(acs[acs$code>56,], un, by.x = 'location', by.y = 'country', all = TRUE)
df1 <- df1[df1$location != 'ANTARCTICA',]
df1 <- df1[df1$location != 'UNITED STATES OF AMERICA',]
names(df1)[1] <- 'country'
length(unique(df1$city))
## [1] 90278
#write.csv(df1, 'loc_ref.csv', row.names = F)