This is a demonstration of cleaning Administrative Records (AR) downloaded from the United States Department of State—Bureau of Consular Affairs.
The raw data were stored in 67 .txt files. Each of the .txt files contains the monthly non-immigrant visa assurances by nationality and visa class.
The raw data tables are available here.
This is how the raw data looks like online.
.pdf online
Download each of them and store them in .txt
.txt
The final product after cleaning looks like this:
Country names of different lengths were successfully extracted;
Headers and footers were removed;
The comma formatted numeric values were removed
.csv
Sys.time()
## [1] "2022-11-11 23:14:18 EST"
file_list <- list.files(path = getwd(), pattern = "*.txt", full.names = T)
Sys.time()
## [1] "2022-11-11 23:14:18 EST"
## Create a list with pre-defined names
filenames <- NA
for (n in 1: length(file_list)) {
filenames[n] <- substr(file_list[n],nchar(file_list[n])-16, nchar(file_list[n])-4)
}
head(filenames)
## [1] "nonimm_012018" "nonimm_012019" "nonimm_012020" "nonimm_012021"
## [5] "nonimm_012022" "nonimm_022018"
filenames <- filenames[grepl("nonimm_", filenames, ignore.case = T)]
nlist <- vector("list", length(filenames))
names(nlist) <- filenames
head(names(nlist))
## [1] "nonimm_012018" "nonimm_012019" "nonimm_012020" "nonimm_012021"
## [5] "nonimm_012022" "nonimm_022018"
The headers and footers can be removed at this step using
grepl()
. The comma(,) in numbers can be removed using
gsub()
.
Sys.time()
## [1] "2022-11-11 23:14:18 EST"
# n = number of .txt files in the directory
for (n in 1: length(nlist)) {
# Import: read in every file in the directory. The name of each file is index by `file_list[n]`
nlist[n] <- read.csv(file_list[n], sep = ";")
# Data Cleaning: remove non-observation rows
nlist[[n]] <- (nlist[[n]])[!grepl("page ", nlist[[n]], ignore.case = T)]
nlist[[n]] <- (nlist[[n]])[!grepl("Nationality Visa Class Issuances", nlist[[n]], ignore.case = T)]
nlist[[n]] <- (nlist[[n]])[!grepl("Nonimmigrant Visa Issuances by Nationality", nlist[[n]], ignore.case = T)]
nlist[[n]] <- (nlist[[n]])[!grepl("FY 20", nlist[[n]], ignore.case = T)]
# keep the first item until the content reaches the "grand total"
# Error in 1:which(grepl("grand total", nlist[[n]], ignore.case = T)) : argument of length 0
# To avoid the error message, make sure all files imported contain the "grand total" string.
# In this case, we only need to make sure all files imported are non-immigrant visa monthly statistics.
nlist[[n]] <-(nlist[[n]])[1: which(grepl("grand total", nlist[[n]], ignore.case = T))]
nlist[[n]] <- (nlist[[n]])[!grepl("grand total", nlist[[n]], ignore.case = T)] #
nlist[[n]] <- (nlist[[n]])[!grepl("SBU - IMMIGRATION", nlist[[n]], ignore.case = T)]
# Format: remove "," in the numbers >=1000
#nlist[[n]] <- gsub(",", "", nlist[[n]])
nlist[[n]] <- data.frame(nlist[[n]])
names(nlist[[n]]) <- "V1" # assign the column name for the only column in the data frames
}
tail(nlist[[1]]) # This is how the first item in the list looks like
## V1
## 3541 Zimbabwe L1 6
## 3542 Zimbabwe L2 4
## 3543 Zimbabwe M1 1
## 3544 Zimbabwe Q1 1
## 3545 Zimbabwe R1 3
## 3546 Zimbabwe U3 1
do.call(what = "rbind", lapply(nlist, as.dta.frame))
Extract the year and month information from the row names.
Sys.time()
## [1] "2022-11-11 23:14:19 EST"
df <- do.call(what = "rbind", lapply(nlist, as.data.frame))
# add the MMYYYY information
# observe the structure of the row names and extract the strings that reflect the time info
tail(rownames(df)) # "nonimm_122021.3020"
## [1] "nonimm_122021.3020" "nonimm_122021.3021" "nonimm_122021.3022"
## [4] "nonimm_122021.3023" "nonimm_122021.3024" "nonimm_122021.3025"
# The 8th - 13th character includes the time information
df$year <- as.numeric(substr(rownames(df), 10, 13)) # year
df$month <- as.numeric(substr(rownames(df), 8, 9)) # month
rownames(df) <- seq_len(nrow(df)) # re-assign the row names from 1 to nrow()
head(df)
## V1 year month
## 1 Afghanistan A1 9 2018 1
## 2 Afghanistan A2 24 2018 1
## 3 Afghanistan B1/B2 117 2018 1
## 4 Afghanistan F1 17 2018 1
## 5 Afghanistan F2 5 2018 1
## 6 Afghanistan G1 4 2018 1
At this stage, the combined data frame becomes super long with 212385 rows.
In R, extracting country names of different lengths means to split one string into multiple variables.
Observe the structure of the resultant data input. Indexing the content by space is the key for each row.
After the last space, there’s a number.
After the last but second space, it’s the visa type.
Everything before the last but second space, is the country/region name.
See the V1
column:
df[c(1, 800, 171673),]
## V1 year month
## 1 Afghanistan A1 9 2018 1
## 800 Costa Rica O1 2 2018 1
## 171673 United Arab Emirates F1 21 2018 10
NOTE:
scan(text = df$V1, what = " ")[length(scan(text = df$V1, what = " "))]
cannot be used to fill in each row of the data frame (or each item in a
vector). This is because the resultant lengths of the split string and
the targeted data frame (or vector) are different.Try
strsplit(text, split = " ", fixed = T)
instead.
# strsplit() runs faster than scan()
strings <- strsplit(df$V1, split = " ", fixed = T)
(strings[[800]])[1:(length(strings[[800]])-2)] # Costa Rica
## [1] "Costa" "Rica"
# The following takes tremendous time to run
#for (i in 1:nrow(df)) {strings[[i]] <- scan(text = df$V1[i], what = " ")}
# Try strsplit
Sys.time() # It takes 59s (1 min) to run if using `scan()`. This is too slow.
## [1] "2022-11-11 23:14:19 EST"
#To reduce the processing time, create vectors to fill in the content, don't do it within the data frame.
#for (i in 1:nrow(df)) {
# df$issuances[i] <- (strings[[i]])[length(strings[[i]])]
#}
#df$issuances <- as.numeric(df$issuances)
issuances <- NA
visa <- NA
nationality_conc <- NA
nationality <- vector("list", length(strings))
Sys.time()
## [1] "2022-11-11 23:14:19 EST"
for (i in 1:length(strings)) {
visa[i] <- (strings[[i]])[length(strings[[i]])-1]
issuances[i] <- (strings[[i]])[length(strings[[i]])]
nationality[[i]] <- (strings[[i]])[1: (length(strings[[i]])-2)]
# concatenate strings using collapse = " " not sep = " "
nationality_conc[i] <- paste(nationality[[i]], collapse = " ")
}
Sys.time()
## [1] "2022-11-11 23:14:20 EST"
nationality_conc <- toupper(nationality_conc)
df$nationality <- nationality_conc
df$visa <- visa
issuances <- gsub(",", "", issuances)
df$issuances <- issuances
df <- df[,names(df)!="V1"] # drop V1 after all useful information has been extracted
df <- df[order(df$year, df$month),] # ascending order first by year then by month
# A quick glance at the visa class tabulation. Note this is not the aggregated visa issuances.
table(df$visa)
##
## A1 A2 A3 B1 B1/B2 B2 BBBCC BBBCV C1 C1/D C2 C3 CW1
## 7728 10521 1614 6967 12447 4092 156 67 4362 8273 44 3087 494
## CW2 D E1 E2 E2C E3 E3D E3R F1 F2 G1 G2 G3
## 243 1252 1509 4089 114 89 1344 67 11334 6028 6147 6659 738
## G4 G5 H1B H1B1 H1C H2A H2B H3 H4 I J1 J2 K1
## 9737 967 7621 134 2 1630 2112 1133 5444 4789 10657 6538 8330
## K2 K3 K4 L1 L2 M1 M2 N8 N9 NATO1 NATO2 NATO4 NATO5
## 3186 24 5 6606 5798 4807 664 88 31 40 1575 517 62
## NATO6 NATO7 O1 O2 O3 P1 P2 P3 P4 Q1 R1 R2 S7
## 446 17 5282 2521 3200 4547 198 2441 1552 672 3934 1606 1
## T2 T3 T4 T5 T6 TD TN U1 U2 U3 U4 U5
## 196 386 108 56 49 1601 131 299 320 686 86 88
issuances <- as.numeric(issuances)
countrylist <- data.frame(table(nationality_conc)) # the country names has to be reconciled with spelling variations
df$type <- "NIV" # add a column that indicates the non-immigrant visa type. Just in case this table will be combined with the immigrant visa table later.
write.csv(df,"nonimm.csv",row.names = F)
For those countries whose names have multiple strings (e.g. United
Arab Emirates), we need to make sure the correct strings (all strings
before the last but second space) are properly extracted and
concatenated. We want "United Arab Emirates"
, not
"United" "Arab" "Emirates"
. The following shows different
attempts and what each of them generate.
names(nationality) <- as.numeric(seq_along(nationality))
nationality[171673]
## $`171673`
## [1] "United" "Arab" "Emirates"
paste(nationality[171673], collapse = " ")
## [1] "c(\"United\", \"Arab\", \"Emirates\")"
paste(unlist(nationality[171673]), collapse = " ")
## [1] "United Arab Emirates"
nationality[[171673]]
## [1] "United" "Arab" "Emirates"
unlist(nationality[[171673]])
## [1] "United" "Arab" "Emirates"
paste(nationality[[171673]], collapse = " ")
## [1] "United Arab Emirates"
paste(unlist(nationality[[171673]]), collapse = " ")
## [1] "United Arab Emirates"
#write.csv(df,"nonimm.csv", row.names = F)