This is a demonstration of cleaning Administrative Records (AR) downloaded from the United States Department of State—Bureau of Consular Affairs.

The raw data were stored in 67 .txt files. Each of the .txt files contains the monthly non-immigrant visa assurances by nationality and visa class.

The raw data tables are available here.

This is how the raw data looks like online.

.pdf online

Download each of them and store them in .txt

.txt

The final product after cleaning looks like this:

  1. Country names of different lengths were successfully extracted;

  2. Headers and footers were removed;

  3. The comma formatted numeric values were removed

.csv

Sys.time()
## [1] "2022-11-11 23:14:18 EST"
file_list <- list.files(path = getwd(), pattern = "*.txt", full.names = T)

Create a List

Sys.time()
## [1] "2022-11-11 23:14:18 EST"
## Create a list with pre-defined names
filenames <- NA
for (n in 1: length(file_list)) {
  filenames[n] <- substr(file_list[n],nchar(file_list[n])-16, nchar(file_list[n])-4)
}
head(filenames)
## [1] "nonimm_012018" "nonimm_012019" "nonimm_012020" "nonimm_012021"
## [5] "nonimm_012022" "nonimm_022018"
filenames <- filenames[grepl("nonimm_", filenames, ignore.case = T)]
nlist <- vector("list", length(filenames))
names(nlist) <- filenames
head(names(nlist))
## [1] "nonimm_012018" "nonimm_012019" "nonimm_012020" "nonimm_012021"
## [5] "nonimm_012022" "nonimm_022018"

Add 67 .txt Files to List

The headers and footers can be removed at this step using grepl(). The comma(,) in numbers can be removed using gsub().

Sys.time()
## [1] "2022-11-11 23:14:18 EST"
# n = number of .txt files in the directory
for (n in 1: length(nlist)) {
  # Import: read in every file in the directory. The name of each file is index by `file_list[n]`
  nlist[n] <- read.csv(file_list[n], sep = ";")
  # Data Cleaning: remove non-observation rows
  nlist[[n]] <- (nlist[[n]])[!grepl("page ", nlist[[n]], ignore.case = T)]
  nlist[[n]] <- (nlist[[n]])[!grepl("Nationality Visa Class Issuances", nlist[[n]], ignore.case = T)]
  nlist[[n]] <- (nlist[[n]])[!grepl("Nonimmigrant Visa Issuances by Nationality", nlist[[n]], ignore.case = T)]
  nlist[[n]] <- (nlist[[n]])[!grepl("FY 20", nlist[[n]], ignore.case = T)]
  # keep the first item until the content reaches the "grand total"
  # Error in 1:which(grepl("grand total", nlist[[n]], ignore.case = T)) : argument of length 0
  # To avoid the error message, make sure all files imported contain the "grand total" string.
  # In this case, we only need to make sure all files imported are non-immigrant visa monthly statistics.
  nlist[[n]] <-(nlist[[n]])[1: which(grepl("grand total", nlist[[n]], ignore.case = T))]
  nlist[[n]] <- (nlist[[n]])[!grepl("grand total", nlist[[n]], ignore.case = T)] # 
  nlist[[n]] <- (nlist[[n]])[!grepl("SBU - IMMIGRATION", nlist[[n]], ignore.case = T)]
  # Format: remove "," in the numbers >=1000
  #nlist[[n]] <- gsub(",", "", nlist[[n]])
  nlist[[n]] <- data.frame(nlist[[n]])
  names(nlist[[n]]) <- "V1" # assign the column name for the only column in the data frames
}
tail(nlist[[1]]) # This is how the first item in the list looks like
##                 V1
## 3541 Zimbabwe L1 6
## 3542 Zimbabwe L2 4
## 3543 Zimbabwe M1 1
## 3544 Zimbabwe Q1 1
## 3545 Zimbabwe R1 3
## 3546 Zimbabwe U3 1

List to data frame

  1. do.call(what = "rbind", lapply(nlist, as.dta.frame))

  2. Extract the year and month information from the row names.

Sys.time()
## [1] "2022-11-11 23:14:19 EST"
df <- do.call(what = "rbind", lapply(nlist, as.data.frame))
# add the MMYYYY information
# observe the structure of the row names and extract the strings that reflect the time info
tail(rownames(df)) # "nonimm_122021.3020"
## [1] "nonimm_122021.3020" "nonimm_122021.3021" "nonimm_122021.3022"
## [4] "nonimm_122021.3023" "nonimm_122021.3024" "nonimm_122021.3025"
# The 8th - 13th character includes the time information
df$year <- as.numeric(substr(rownames(df), 10, 13)) # year
df$month <- as.numeric(substr(rownames(df), 8, 9)) # month
rownames(df) <- seq_len(nrow(df)) # re-assign the row names from 1 to nrow()
head(df)
##                      V1 year month
## 1      Afghanistan A1 9 2018     1
## 2     Afghanistan A2 24 2018     1
## 3 Afghanistan B1/B2 117 2018     1
## 4     Afghanistan F1 17 2018     1
## 5      Afghanistan F2 5 2018     1
## 6      Afghanistan G1 4 2018     1

At this stage, the combined data frame becomes super long with 212385 rows.

Extract Country Names, Visa Class, and Issuances

In R, extracting country names of different lengths means to split one string into multiple variables.

Observe the structure of the resultant data input. Indexing the content by space is the key for each row.

  1. After the last space, there’s a number.

  2. After the last but second space, it’s the visa type.

  3. Everything before the last but second space, is the country/region name.

See the V1 column:

df[c(1, 800, 171673),]
##                                V1 year month
## 1                Afghanistan A1 9 2018     1
## 800               Costa Rica O1 2 2018     1
## 171673 United Arab Emirates F1 21 2018    10

NOTE: scan(text = df$V1, what = " ")[length(scan(text = df$V1, what = " "))] cannot be used to fill in each row of the data frame (or each item in a vector). This is because the resultant lengths of the split string and the targeted data frame (or vector) are different.Try strsplit(text, split = " ", fixed = T) instead.

# strsplit() runs faster than scan()
strings <- strsplit(df$V1, split = " ", fixed = T)
(strings[[800]])[1:(length(strings[[800]])-2)] # Costa Rica
## [1] "Costa" "Rica"
# The following takes tremendous time to run
#for (i in 1:nrow(df)) {strings[[i]] <- scan(text = df$V1[i], what = " ")}
# Try strsplit
Sys.time() # It takes 59s (1 min) to run if using `scan()`. This is too slow. 
## [1] "2022-11-11 23:14:19 EST"
#To reduce the processing time, create vectors to fill in the content, don't do it within the data frame.
#for (i in 1:nrow(df)) {
#  df$issuances[i] <- (strings[[i]])[length(strings[[i]])]
#}
#df$issuances <- as.numeric(df$issuances)
issuances <- NA
visa <- NA
nationality_conc <- NA
nationality <- vector("list", length(strings))
Sys.time()
## [1] "2022-11-11 23:14:19 EST"
for (i in 1:length(strings)) {
  visa[i] <- (strings[[i]])[length(strings[[i]])-1]
  issuances[i] <- (strings[[i]])[length(strings[[i]])]
  nationality[[i]] <- (strings[[i]])[1: (length(strings[[i]])-2)]
  # concatenate strings using collapse = " " not sep = " "
  nationality_conc[i] <- paste(nationality[[i]], collapse = " ")
}
Sys.time()
## [1] "2022-11-11 23:14:20 EST"
nationality_conc <- toupper(nationality_conc)
df$nationality <- nationality_conc 
df$visa <- visa
issuances <- gsub(",", "", issuances)
df$issuances <- issuances
df <- df[,names(df)!="V1"] # drop V1 after all useful information has been extracted
df <- df[order(df$year, df$month),] # ascending order first by year then by month
# A quick glance at the visa class tabulation. Note this is not the aggregated visa issuances.
table(df$visa)
## 
##    A1    A2    A3    B1 B1/B2    B2 BBBCC BBBCV    C1  C1/D    C2    C3   CW1 
##  7728 10521  1614  6967 12447  4092   156    67  4362  8273    44  3087   494 
##   CW2     D    E1    E2   E2C    E3   E3D   E3R    F1    F2    G1    G2    G3 
##   243  1252  1509  4089   114    89  1344    67 11334  6028  6147  6659   738 
##    G4    G5   H1B  H1B1   H1C   H2A   H2B    H3    H4     I    J1    J2    K1 
##  9737   967  7621   134     2  1630  2112  1133  5444  4789 10657  6538  8330 
##    K2    K3    K4    L1    L2    M1    M2    N8    N9 NATO1 NATO2 NATO4 NATO5 
##  3186    24     5  6606  5798  4807   664    88    31    40  1575   517    62 
## NATO6 NATO7    O1    O2    O3    P1    P2    P3    P4    Q1    R1    R2    S7 
##   446    17  5282  2521  3200  4547   198  2441  1552   672  3934  1606     1 
##    T2    T3    T4    T5    T6    TD    TN    U1    U2    U3    U4    U5 
##   196   386   108    56    49  1601   131   299   320   686    86    88
issuances <- as.numeric(issuances)
countrylist <- data.frame(table(nationality_conc)) # the country names has to be reconciled with spelling variations

Write .csv

df$type <- "NIV" # add a column that indicates the non-immigrant visa type. Just in case this table will be combined with the immigrant visa table later.
write.csv(df,"nonimm.csv",row.names = F)

Experimental Codes

For those countries whose names have multiple strings (e.g. United Arab Emirates), we need to make sure the correct strings (all strings before the last but second space) are properly extracted and concatenated. We want "United Arab Emirates", not "United" "Arab" "Emirates". The following shows different attempts and what each of them generate.

names(nationality) <- as.numeric(seq_along(nationality))
nationality[171673]
## $`171673`
## [1] "United"   "Arab"     "Emirates"
paste(nationality[171673], collapse = " ")
## [1] "c(\"United\", \"Arab\", \"Emirates\")"
paste(unlist(nationality[171673]), collapse = " ")
## [1] "United Arab Emirates"
nationality[[171673]]
## [1] "United"   "Arab"     "Emirates"
unlist(nationality[[171673]])
## [1] "United"   "Arab"     "Emirates"
paste(nationality[[171673]], collapse = " ")
## [1] "United Arab Emirates"
paste(unlist(nationality[[171673]]), collapse = " ")
## [1] "United Arab Emirates"
#write.csv(df,"nonimm.csv", row.names = F)