Last update: 2015-07-06 17:46:32

mdbdata <- data.table::fread(file.path(Sys.getenv("HOME"), "r_adhoc", "mdb_read", "hs_fcl_full.csv"),
                              data.table = F)


mdbdata$validyear[mdbdata$validyear == ""] <- "0"
mdbdata$validyear <- as.integer(mdbdata$validyear)

head(mdbdata)
##   area   flow fromcode   tocode  fcl fclorig validyear mdbyear fao iso2     country   cpc
## 1  AFG Import 01010000 01010000 1096    1096         0    2012   2   AF Afghanistan 02131
## 2  AFG Import 01011000 01011000 1096    1096         0    2012   2   AF Afghanistan 02131
## 3  AFG Import 01011100 01011199 1096    1096         0    2012   2   AF Afghanistan 02131
## 4  AFG Import 01011900 01011999 1096    1096         0    2012   2   AF Afghanistan 02131
## 5  AFG Import 01012000 01012000 1107    1107         0    2012   2   AF Afghanistan 02132
## 6  AFG Import 01012010 01012010 1107    1107         0    2012   2   AF Afghanistan 02132

Proportion of records where there is only one code and not a range of codes: 73.2%

tbl(trade_src, sql("select * from tariffline limit 10"))
## Source: postgres 9.4.4 [trade@localhost:5432/sws_data]
## From: <derived table> [?? x 9]
## 
##    year reporter partner     hs flow weight qty qunit        value
## 1  2011      275     642 843890    1     NA  NA     1    1041.9004
## 2  2011      275     276 280429    1     NA  NA     1     748.2794
## 3  2011      275     458 420292    1     NA  NA     1    2202.4601
## 4  2011      275     699 392690    1     NA  NA     1    2087.4333
## 5  2011      275     208 851762    1     NA  NA     1    2017.5091
## 6  2011      275     380 392630    1     NA  NA     1    6336.1592
## 7  2011      275     376 440900    1     NA  NA     1     557.8799
## 8  2011      275     344 610449    1     NA  NA     1   15228.6371
## 9  2011      275     840 680229    2     NA  NA     1 1552139.7831
## 10 2011      275     724 903149    1     NA  NA     1    4246.3043
## ..  ...      ...     ...    ...  ...    ... ...   ...          ...
agriItems <- paste0("'", paste(getAgriHSCodes(), collapse = "','"), "'")

tl_us2011 <- tbl(trade_src, sql(paste0(
"select * from (select year, reporter, partner, flow, hs, substring(hs from 1 for 6) AS hs6  
from tariffline
where reporter = '842' and year = '2011') as tbl1 
where hs6 in(", agriItems, ") "))) %>% 
  select(-hs6) %>% collect()

mdb_us2011 <- mdbdata %>% 
  select(-area, -iso2, -country, -cpc) %>% 
  filter(fao == faoarea(842), 
         validyear %in% c("0", "2011"))

Proportion of records where there is only one code and not a range of codes: 100%

OK, so for US 2011 there is no ranges of HS codes so we can postpone problem of matching HS-code to range from mdb data.

Degree of matching HS-codes between MDB base and TariffLine.

tl_us2011 %>% 
  filter(flow == "1") %>% 
  select(hs) %>% 
  distinct() %>%  
  anti_join(mdb_us2011 %>% 
              filter(flow == "Import") %>% 
              select(hs = fromcode, fcl) %>% 
              distinct(), 
            by = "hs") %>% 
  nrow()
## [1] 0