library(DT)
unloc <- list.files(path = getwd(), pattern = 'UNLOCODE', full.names = F)
unloc
## [1] "2022-2 UNLOCODE CodeListPart1.csv" "2022-2 UNLOCODE CodeListPart2.csv"
## [3] "2022-2 UNLOCODE CodeListPart3.csv"
unloc_1 <- read.csv(unloc[1], header = F)
unloc_2 <- read.csv(unloc[2], header = F)
unloc_3 <- read.csv(unloc[3], header = F)
unloc <- data.frame(rbind(unloc_1, unloc_2, unloc_3))
world_cities <- read.csv('World_cities.csv')
acs <- read.csv('acs_code.csv')
Observe the data structure, only few columns were retained.
head(unloc)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 AD .ANDORRA NA
## 2 AD ALV Andorra la Vella Andorra la Vella --34-6-- AI 601
## 3 AD CAN Canillo Canillo --3----- RL 307
## 4 AD ENC Encamp Encamp --3----- RL 307
## 5 AD ESC Escaldes-Engordany Escaldes-Engordany --3----- RL 307
## 6 AD EAC Esc\xe0s Escas 04 --3----- RL 1407
## V11 V12
## 1
## 2 4230N 00131E
## 3 4234N 00135E
## 4 4232N 00134E
## 5 4231N 00133E
## 6 4233N 00131E
unloc <- unloc[,c('V2', 'V4', 'V5')]
head(unloc)
## V2 V4 V5
## 1 AD .ANDORRA
## 2 AD Andorra la Vella Andorra la Vella
## 3 AD Canillo Canillo
## 4 AD Encamp Encamp
## 5 AD Escaldes-Engordany Escaldes-Engordany
## 6 AD Esc\xe0s Escas
un_country <- unloc[grepl('^\\.', unloc$V4),]
un_country <- un_country[,c('V2','V4')]
un_country$V4 <- gsub('\\.', '', un_country$V4)
un_country[is.na(un_country$V2),]
## V2 V4
## 78589 <NA> NAMIBIA
un_country$V2[is.na(un_country$V2)] <- 'NA'
un_country[grepl('<',un_country$V4),]
## V2 V4
## 4952 AX <c5>LAND ISLANDS
## 7429 BL SAINT BARTH<c9>LEMY
## 18395 CI C<d4>TE D'IVOIRE
## 21431 CW CURA<c7>AO
## 87600 RE R<c9>UNION
## 92580 TR T<dc>RKIYE
un_country$V4[un_country$V2 == 'AX'] <- 'ALAND ISLANDS'
un_country$V4[un_country$V2 == 'BL'] <- 'SAINT BARTHELEMY'
un_country$V4[un_country$V2 == 'CI'] <- 'COTE D\'IOIRE'
un_country$V4[un_country$V2 == 'CW'] <- 'CURACAO'
un_country$V4[un_country$V2 == 'RE'] <- 'REUNION'
un_country$V4[un_country$V2 == 'TR'] <- 'TURKEY'
un_country[grepl('\\(', un_country$V4),]
## V2 V4
## 13 AE UNITED ARAB EMIRATES (THE)
## 16674 CC COCOS (KEELING) ISLANDS
## 40269 FK FALKLAND ISLANDS (MALVINAS)
## 76591 MF SAINT MARTIN (FRENCH PART)
## 76681 MH MARSHALL ISLANDS (THE)
## 76929 MP NORTHERN MARIANA ISLANDS (THE)
## 83663 PH PHILIPPINES (THE)
## 88496 RU RUSSIAN FEDERATION (THE)
## 89539 SD SUDAN (THE)
## 92063 SX SINT MAARTEN (DUTCH PART)
## 92069 SY SYRIAN ARAB REPUBLIC (THE)
## 92108 TC TURKS AND CAICOS ISLANDS (THE)
## 93304 TW TAIWAN (PROVINCE OF CHINA)
## 93778 US UNITED STATES OF AMERICA (THE)
## 114765 VA HOLY SEE (VATICAN CITY STATE)
## 114778 VE VENEZUELA (BOLIVARIAN REPUBLIC OF)
## 114989 VG VIRGIN ISLANDS (BRITISH)
## 115001 VI VIRGIN ISLANDS (US)
Removing ()
should be fine, except for the
VIRGIN ISLANDS
since they could be either
BRITISH
or US
.
un_country$V4[un_country$V2=='VI'] = 'U.S. VIRGIN ISLANDS'
un_country$V4[un_country$V2=='VG'] = 'BRITISH VIRGIN ISLANDS'
un_country$V4[un_country$V2=='KP'] = 'NORTH KOREA'
un_country$V4[un_country$V2=='KR'] = 'SOUTH KOREA'
un_country$V4[un_country$V2=='BO'] = 'BOLIVIA'
un_country$V4[un_country$V2=='BQ'] = 'BONAIRE'
un_country$V4[un_country$V2=='CD'] = 'DEMOCRATIC REPUBLIC OF CONGO (ZAIRE)'
un_country$V4[un_country$V2=='FM'] = 'MICRONESIA'
un_country$V4[un_country$V2=='IR'] = 'IRAN'
un_country$V4[un_country$V2=='MD'] = 'MOLDOVA'
un_country$V4[un_country$V2=='PS'] = 'PALESTINE'
un_country$V4[un_country$V2=='SH'] = 'SAINT HELENA'
un_country$V4[un_country$V2=='TZ'] = 'TANZANIA'
un_country$V4[un_country$V2=='RU'] = 'RUSSIA'
un_country$V4[un_country$V2=='VN'] = 'VIETNAM'
un_country$V4[un_country$V2=='SY'] = "SYRIA"
un_country[grepl('^ST ', un_country$V4),]
## [1] V2 V4
## <0 rows> (or 0-length row.names)
un_country$V4 <- gsub("\\(.*)","",un_country$V4)
un_country[grepl(',', un_country$V4),]
## [1] V2 V4
## <0 rows> (or 0-length row.names)
un_country$V4 <- trimws(un_country$V4)
un_city <- unloc[,c('V2','V5')]
class(un_city$V5)
## [1] "character"
un_city <- un_city[!grepl("([0-9]+).*$", un_city$V5),] # places with digits are not meaningful, remove
un_city$V5 <- trimws(un_city$V5)
un_city$V5 <- toupper(un_city$V5)
un_city <- un_city[grepl("[[:alpha:]]", un_city$V5),] # remove blank cells (no digits detected)
acs$location <- toupper(acs$location)
acs[grepl('&', acs$location),]
## code location
## 183 321 ANTIGUA & BARBUDA
## 194 340 ST VINCENT & THE GRENADINES
## 195 341 TRINIDAD & TOBAGO
## 207 342 TURKS & CAICOS ISLANDS
## 227 304 ST PIERRE & MIQUELON
## 295 443 SAO TOME & PRINCIPE
## 315 507 HEARD & MCDONALD ISLANDS
## 329 526 WALLIS & FUTUNA ISLANDS
acs$location <- gsub('&', 'AND', acs$location)
acs[grepl('^ST ', acs$location),]
## code location
## 193 339 ST LUCIA
## 194 340 ST VINCENT AND THE GRENADINES
## 206 337 ST BARTHELEMY
## 227 304 ST PIERRE AND MIQUELON
## 286 450 ST HELENA
acs$location <- gsub('^ST ', 'SAINT ', acs$location )
acs$location <- gsub('^ST. ', 'SAINT ', acs$location )
un <- merge(un_country, un_city, by = 'V2', all = TRUE)
names(un) <- c('code2d','country','city')
names(world_cities)
## [1] "X" "Y" "FID" "OBJECTID" "CITY_NAME"
## [6] "GMI_ADMIN" "ADMIN_NAME" "FIPS_CNTRY" "CNTRY_NAME" "STATUS"
## [11] "POP" "POP_RANK" "POP_CLASS" "PORT_ID" "LABEL_FLAG"
## [16] "POP_SOURCE"
city2500 <- world_cities[,c("CNTRY_NAME", "CITY_NAME")]
names(city2500) <- c('country','city')
city2500$city <- toupper(city2500$city)
city2500$country <- toupper(city2500$country)
city2500$country <- gsub('IS.$', 'ISLANDS', city2500$country)
city2500$country <- gsub('&', 'AND', city2500$country)
city2500$country <- gsub('^ST. ', 'SAINT ', city2500$country)
head(city2500)
## country city
## 1 BRAZIL CUIABA
## 2 BRAZIL BRASILIA
## 3 BRAZIL GOIANIA
## 4 BRAZIL CAMPO GRANDE
## 5 PARAGUAY PEDRO JUAN CABALLERO
## 6 PARAGUAY SALTO DEL GUAIRA
head(un)
## code2d country city
## 1 AD ANDORRA ESCALDES-ENGORDANY
## 2 AD ANDORRA ESCAS
## 3 AD ANDORRA LA FARGA DE MOLES
## 4 AD ANDORRA LA MASSANA
## 5 AD ANDORRA ORDINO
## 6 AD ANDORRA PAS DE LA CASA
un <- data.frame(rbind(un[,c('country', 'city')], city2500))
un <- unique(un)
setdiff(acs$location[acs$code>56],un_country$V4)
## [1] "COMMONWEALTH OF NORTHERN MARIANA ISLANDS"
## [2] "CZECHOSLOVAKIA"
## [3] "MADEIRA ISLANDS"
## [4] "AZORES ISLANDS"
## [5] "WALES"
## [6] "ENGLAND"
## [7] "SCOTLAND"
## [8] "NORTHERN IRELAND"
## [9] "YUGOSLAVIA"
## [10] "CZECH REPUBLIC"
## [11] "MACEDONIA"
## [12] "USSR"
## [13] "EUROPE"
## [14] "KOSOVO"
## [15] "GIBRALTER"
## [16] "JAN MAYEN"
## [17] "SVALBARD"
## [18] "VATICAN CITY"
## [19] "MACAU"
## [20] "PARACEL ISLANDS"
## [21] "KOREA"
## [22] "LAOS"
## [23] "ASIA"
## [24] "BRUNEI"
## [25] "SPRATLEY ISLANDS"
## [26] "EAST TIMOR"
## [27] "SAINT KITTS-NEVIS"
## [28] "WEST INDIES"
## [29] "NAVASSA ISLAND"
## [30] "NETHERLANDS ANTILLES"
## [31] "CURAĆAO"
## [32] "SABA"
## [33] "SINT EUSTATIUS"
## [34] "SOUTH AMERICA"
## [35] "NORTH AMERICA"
## [36] "CENTRAL AMERICA"
## [37] "CABO VERDE"
## [38] "IVORY COAST"
## [39] "DEMOCRATIC REPUBLIC OF CONGO (ZAIRE)"
## [40] "AFRICA"
## [41] "EUROPA ISLAND"
## [42] "GLORIOSO ISLANDS"
## [43] "JUAN DE NOVA ISLAND"
## [44] "TROMELIN ISLAND"
## [45] "SWAZILAND"
## [46] "JOHNSTON ATOLL"
## [47] "MIDWAY ISLANDS"
## [48] "WAKE ISLAND"
## [49] "BAKER ISLAND"
## [50] "HOWLAND ISLAND"
## [51] "JARVIS ISLAND"
## [52] "KINGMAN REEF"
## [53] "PALMYRA ATOLL"
## [54] "CHRISTMAS ISLAND, INDIAN OCEAN"
## [55] "CORAL SEA ISLANDS"
## [56] "HEARD AND MCDONALD ISLANDS"
## [57] "PITCAIRN ISLANDS"
## [58] "WALLIS AND FUTUNA ISLANDS"
## [59] "OCEANIA"
## [60] "AT SEA"
setdiff(un_country$V4, acs$location[acs$code>56])
## [1] "ANTARCTICA"
## [2] "ALAND ISLANDS"
## [3] "BRUNEI DARUSSALAM"
## [4] "COCOS ISLANDS"
## [5] "DEMOCRATIC REPUBLIC OF CONGO"
## [6] "COTE D'IOIRE"
## [7] "CAPE VERDE"
## [8] "CURACAO"
## [9] "CHRISTMAS ISLAND"
## [10] "CZECHIA"
## [11] "GIBRALTAR"
## [12] "SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS"
## [13] "HEARD ISLAND AND MCDONALD ISLANDS"
## [14] "SAINT KITTS AND NEVIS"
## [15] "LAO PEOPLE'S DEMOCRATIC REPUBLIC"
## [16] "NORTH MACEDONIA"
## [17] "MACAO"
## [18] "NORTHERN MARIANA ISLANDS"
## [19] "PITCAIRN"
## [20] "PALESTINE"
## [21] "SVALBARD AND JAN MAYEN"
## [22] "ESWATINI"
## [23] "FRENCH SOUTHERN TERRITORIES"
## [24] "TIMOR-LESTE"
## [25] "UNITED STATES MINOR OUTLYING ISLANDS"
## [26] "UNITED STATES OF AMERICA"
## [27] "HOLY SEE"
## [28] "WALLIS AND FUTUNA"
## [29] "INSTALLATIONS IN INTERNATIONAL WATERS"
df1 <- merge(acs[acs$code>56,], un, by.x = 'location', by.y = 'country', all = TRUE)
df1 <- df1[df1$location != 'ANTARCTICA',]
df1 <- df1[df1$location != 'UNITED STATES OF AMERICA',]
names(df1)[1] <- 'country'
length(unique(df1$city))
## [1] 90278
#write.csv(df1, 'loc_ref.csv', row.names = F)