Last update: 2015-07-03 10:22:01

Introduction

We got tariff line data from UNSD. It’s stored in ad hoc table ess.ct_tariffline_adhoc at QA SWS server. Currently it’s possible to access the data only from R.

Structure of Tariff Line table

Size of dataset

For all countries and years.

cttl %>% summarize(nrows = n())
## Source: postgres 9.4.4 [trade@localhost:5432/sws_data]
## From: <derived table> [?? x 1]
## 
##       nrows
## 1  86001690
## ..      ...

Example of data

cttl
## Source: postgres 9.4.4 [trade@localhost:5432/sws_data]
## From: ct_tariffline_adhoc [86,001,690 x 14]
## 
##    rep tyear curr hsrep flow repcurr     comm prt weight    qty qunit tvalue est ht
## 1  156  2009  USD    H3    1     USD 87012000 616     NA      1     5 110761   0  0
## 2  156  2009  USD    H3    2     USD 87012000 516     NA     20     5 798000   0  0
## 3  156  2009  USD    H3    2     USD 82152000 662   1700   1700     8   5100   0  0
## 4  156  2009  USD    H3    2     USD 28321000 400 199070 199070     8  48153   0  0
## 5  156  2009  USD    H3    2     USD 72287090 158  44116  44116     8  63260   0  0
## 6  156  2009  USD    H3    2     USD 73151200  50  16536  16536     8  64610   0  0
## 7  156  2009  USD    H3    1     USD 20059991 826     19     19     8    152   0  0
## 8  156  2009  USD    H3    2     USD 63039100 208     NA  65491     5 667807   0  0
## 9  156  2009  USD    H3    1     USD 07133900 643   1180   1180     8   9042   0  0
## 10 156  2009  USD    H3    2     USD 44184000   8  75000  75000     8  46616   0  0
## .. ...   ...  ...   ...  ...     ...      ... ...    ...    ...   ...    ... ... ..
tbl(trade_src, sql(
"select * 
from ess.ct_tariffline_adhoc 
where rep in ('376', '842', '276') order by random() limit 20")) %>%
  collect() %>% as.data.frame()
##    rep tyear curr hsrep flow repcurr       comm prt  weight     qty qunit       tvalue est ht
## 1  842  2009  USD    H3    1     USD 9026906000 484      NA      NA     1 11351116.000   0  0
## 2  376  2012  USD    H4    2     USD   29110000 231      NA      NA     1    26000.000   0  0
## 3  376  2012  USD    H4    1     USD   84798199 528      NA      NA     1    33000.000   0  0
## 4  276  2012  USD    H4    2     EUR     151800 620  490736  490736     8   968331.583   0  0
## 5  842  2010  USD    H3    2     USD 6903200000 620    5470    5470     8    28744.000   0  0
## 6  276  2011  USD    H3    1     EUR     811300 642    1573    1573     8   117001.768   0  0
## 7  276  2011  USD    H3    1     EUR     870422 756 3253755 3253755     8 10580197.367   0  0
## 8  276  2009  USD    H3    2     USD     780411 642     900     900     8     3000.000   0  0
## 9  842  2010  USD    H3    2     USD 8544200000 772     392     392     8     6924.000   0  0
## 10 276  2013  USD    H4    2     EUR     390521 504   66868   66868     8   102274.185   0  0
## 11 842  2009  USD    H3    2     USD 8413200000 203      NA      19     5     3135.000   0  0
## 12 842  2013  USD    H4    1     USD 7113195085 858      NA      NA     1    31792.000   0  0
## 13 842  2013  USD    H4    2     USD 5904100000 208       9      15     2     2700.000   0  0
## 14 276  2009  USD    H3    1     USD     190531 586     200     200     8     1000.000   0  0
## 15 276  2009  USD    H3    2     USD     910511 840     300     300     8    32000.000   0  0
## 16 276  2011  USD    H3    2     EUR     520821 372      37      37     8     2040.247   0  0
## 17 842  2013  USD    H4    2     USD 9617002000 826      NA   14145     5   149538.000   0  0
## 18 276  2009  USD    H3    2     USD     630622 862     100     100     8     6000.000   0  0
## 19 276  2012  USD    H4    2     EUR     902139 188    1063    1063     8   116371.604   0  0
## 20 842  2013  USD    H4    1     USD 8474900020 688      NA      NA     1    37560.000   0  0

Range of years

For all countries

cttl %>% 
  select(tyear) %>% 
  group_by(tyear) %>% 
  summarize(nrows = n()) %>% 
  arrange(tyear) %>% collect()
## Source: local data frame [5 x 2]
## 
##   tyear    nrows
## 1  2009 17163862
## 2  2010 17219485
## 3  2011 17235566
## 4  2012 17028103
## 5  2013 17354674

Type of classification used by reporter

For all countires.

tlsql(select = "hsrep, count(hsrep)", where = "group by hsrep order by hsrep") %>%  
  mutate(hsrep = factor(hsrep, levels = c("H0", "H1", "H2", "H3", "H4", "S3"),
                        labels = c("HS1992", "HS1996", "HS2002", 
                                   "HS2007", "HS2012", "SITCr3")))
##     hsrep    count
## 1: HS1992    10643
## 2: HS1996   352140
## 3: HS2002  3557201
## 4: HS2007 51569744
## 5: HS2012 30511939
## 6: SITCr3       23

Lenght of hs codes

Full data set

tbl(trade_src, sql("
select hs_length, count(hs_length) as countn 
from (select char_length(comm) as hs_length 
from ess.ct_tariffline_adhoc) as tbl1 
group by hs_length order by hs_length
                   ")) %>% collect()
## Source: local data frame [14 x 2]
## 
##    hs_length   countn
## 1          0       23
## 2          2   176657
## 3          4   683576
## 4          5     7482
## 5          6 22500222
## 6          7    50107
## 7          8 46338748
## 8          9  1930341
## 9         10 12839762
## 10        11   669938
## 11        12   804818
## 12        13        2
## 13        15        3
## 14        16       11

Countries of the question

tbl(trade_src, sql("
select hs_length, count(hs_length) as countn 
from (select char_length(comm) as hs_length 
from ess.ct_tariffline_adhoc where rep in ('376', '842', '276')) as tbl1 
group by hs_length order by hs_length
                   ")) %>% collect()
## Source: local data frame [4 x 2]
## 
##   hs_length  countn
## 1         5    1835
## 2         6 2518689
## 3         8  581226
## 4        10 2316355

Reported flows

For all countires.

tlsql(select = "flow, count(flow)", where = "group by flow order by flow") 
##    flow    count
## 1:    1 47108217
## 2:    2 37005984
## 3:    3  1844379
## 4:    4    43110

Reported units

For all countires.

tlsql(select = "qunit, count(qunit)", where = "group by qunit order by qunit") 
##     qunit    count
##  1:     1  9667561
##  2:    10    31572
##  3:    11   309038
##  4:    12   313255
##  5:    13    25881
##  6:     2  1842513
##  7:     3     2448
##  8:     4   214816
##  9:     5 15976097
## 10:     6   916446
## 11:     7   720910
## 12:     8 55916692
## 13:     9    64461

See description of units in Annex I of Quantity and Weight Data in UN Comtrade.

Are there estimations?

For all countires.

# tlsql(select = "est, count(est)", where = "group by est order by est") 
tlsql(select = "sum( case when est != '0' then 1 else 0 end) as estimations") 
##    estimations
## 1:           0

There is no estimated values.

What does contain ht column?

tlsql(select = "ht, count(ht)", where = "group by ht") 
##    ht    count
## 1:  0 86001690

There is only zero in column ht.

What is missing value in Tariff Line data

It could be zero or NULL. We already saw NA weight and qty in random sample.

For all countries.

miss <- tbl(trade_src, sql("select sum (case when weight is null then 1 else 0 end) as weight_null,
      sum (case when qty is null then 1 else 0 end) as qty_null,
      sum (case when tvalue is null then 1 else 0 end) as tvalue_null,
      sum (case when weight = 0 then 1 else 0 end) as weight_0,
      sum (case when qty = 0 then 1 else 0 end) as qty_0,
      sum (case when tvalue = 0 then 1 else 0 end) as tvalue_0 from ess.ct_tariffline_adhoc ")) %>% 
  collect() %>% as.integer()

dim(miss) <- c(3, 2)
colnames(miss) <- c("NA", "zero")
rownames(miss) <- c("weight", "qty", "tvalue")
miss
##              NA   zero
## weight 16261537      0
## qty     9667561      0
## tvalue        0 349742

We have null (in R they are converted into NA) cases for weight and qty and zero cases for value.

Conflicts between units

Are there cases when for the same trade flow we have several records with different units (note from Giorgio)?

agriItems <- paste0("'", paste(getAgriHSCodes(), collapse = "','"), "'")

cttlagri <- tbl(trade_src, sql(paste0(
"select * 
from (select rep, tyear, flow, comm, substring(comm from 1 for 6) as hs6, 
prt, weight, qty, qunit, tvalue 
from ess.ct_tariffline_adhoc 
where rep in ('376', '842', '276') 
and tyear = '2011') as tbl1 
where hs6 in(", agriItems, ") "))) %>%
  select(-hs6)

cttlagri %>%
  group_by(tyear, rep, prt, flow, comm)  %>% 
  summarize(nrows = n()) %>% 
  ungroup() %>% 
  filter(nrows > 1)
## Source: postgres 9.4.4 [trade@localhost:5432/sws_data]
## From: <derived table> [?? x 6]
## Filter: nrows > 1

There are now such cases for the pilot countries, but they exist in other countries. For example:

   tyear rep prt flow       comm nrows
1   2011 328 780    1 2106909000     5
2   2011 499 899    1     220290     4
3   2011 328 531    2 0709909000     4
4   2011 328 531    2 1904200000     4
5   2011 328 780    1 2009112000     4
6   2011 328 780    1 2203001000     4
7   2011 328 780    1 2202909000     4
8   2011 328 780    1 2202101000     4
9   2011 328 780    2 2208401000     4
10  2011 328 840    1 0402999000     4
cttl %>% 
  select(-est, -ht) %>%
  filter(tyear == "2011",
         rep == "328",
         prt == "780",
         comm == "2106909000")
## Source: postgres 9.4.4 [trade@localhost:5432/sws_data]
## From: ct_tariffline_adhoc [11 x 12]
## Filter: tyear == "2011", rep == "328", prt == "780", comm == "2106909000" 
## 
##    rep tyear curr hsrep flow repcurr       comm prt    weight      qty qunit       tvalue
## 1  328  2011  USD    H3    1     GYD 2106909000 780 517844.89       NA     1 474988.98763
## 2  328  2011  USD    H3    1     GYD 2106909000 780   1130.00     4.00     2  20502.46073
## 3  328  2011  USD    H3    1     GYD 2106909000 780   3998.55 43303.00     5  43912.06895
## 4  328  2011  USD    H3    1     GYD 2106909000 780    139.78   136.00     7   1371.76044
## 5  328  2011  USD    H3    1     GYD 2106909000 780    806.54   806.54     8  10655.44112
## 6  328  2011  USD    H3    2     GYD 2106909000 780   5281.53       NA     1  75846.41057
## 7  328  2011  USD    H3    2     GYD 2106909000 780    208.40    60.00     5     74.61267
## 8  328  2011  USD    H3    2     GYD 2106909000 780   1088.16  1088.16     8  19533.35835
## 9  328  2011  USD    H3    3         2106909000 780   1635.72       NA     1   1412.85921
## 10 328  2011  USD    H3    3         2106909000 780      6.00    20.00     5     19.99604
## 11 328  2011  USD    H3    3         2106909000 780    200.00   200.00     8    598.57384

Missing values

cttl_missing <- cttlagri %>% 
  select(rep, tyear, flow, comm, prt, weight, qty, qunit, tvalue) %>% 
  filter(rep %in% c('376', '842', '276'),
         tyear == "2011") %>% 
  collect() %>% 
  group_by(tyear, rep) %>% 
  summarize(totalflows  = n(),
            noqty       = sum(is.na(qty)  & is.na(weight) & tvalue > 0),
            noqty_prcnt = round(100 * noqty / totalflows, 2),
            novl        = sum((!is.na(qty) | !is.na(weight)) & tvalue == 0),
            novl_prcnt  = round(100 * novl / totalflows, 2)) %>% 
  mutate(reporter = getCountryName(rep)) %>% 
  select(reporter, totalflows, noqty, noqty_prcnt, novl, novl_prcnt)

tfsource <- gettfsource(reporter = c(231, 105, 79), year = 2011) %>% 
  mutate(hs6 = stringr::str_extract(hs, "^.{6}"))

tfsource <- data.frame(hs = getAgriHSCodes(), stringsAsFactors = F) %>% 
  inner_join(tfsource, by = c("hs" = "hs6")) %>% 
  select(-hs) %>% 
  rename(hs = hs.y) %>% 
  mutate(reporter = getCountryName(unctarea(reporter)))
  

tfsource %>% 
  group_by(reporter) %>% 
  summarize(totalflows  = n(),
            noqty       = sum(qty == 0 & qty2 == 0 & value > 0),
            noqty_prcnt = round(100 * noqty / totalflows, 2),
            novl        = sum((qty > 0 | qty2 > 0) & value == 0),
            novl_prcnt  = round(100 * novl / totalflows, 2)) 
## Source: local data frame [3 x 6]
## 
##         reporter totalflows noqty noqty_prcnt novl novl_prcnt
## 1 Germany(1991-)      80001  6286        7.86   28       0.03
## 2         Israel      10145  6126       60.38    0       0.00
## 3     USA(1981-)      62420   549        0.88    0       0.00
cttl_missing
## Source: local data frame [3 x 7]
## Groups: tyear
## 
##   tyear       reporter totalflows noqty noqty_prcnt novl novl_prcnt
## 1  2011 Germany(1991-)      44951    85        0.19    0          0
## 2  2011         Israel      10167  6275       61.72    0          0
## 3  2011     USA(1981-)      28107   341        1.21    0          0

Why do sizes of TariffLine and TF_SOURCE differ?

Differencies in sets of unique HS codes

tfsource_uniq_hs <- tfsource %>% 
  select(hs) %>% 
  distinct() %>% 
  collect()

tariffline_uniq_hs <- cttlagri %>% 
  select(rep, tyear, comm) %>% 
  filter(rep %in% c('376', '842', '276'),
         tyear == "2011") %>% 
  select(comm) %>% 
  distinct() %>% 
  select(hs = comm) %>%
  collect()

  data.frame(source = c("tfsource", "tariffline"),
             uniq_hs = c(nrow(tfsource_uniq_hs),
                         nrow(tariffline_uniq_hs)))
##       source uniq_hs
## 1   tfsource    5431
## 2 tariffline    4126
  # HS codes which are absent in TF_SOURCE

  tariffline_uniq_hs %>% 
    anti_join(tfsource_uniq_hs, by = "hs") %>% 
    sample_n(50) %>% 
    unlist() %>% unname() %>% as.factor() 
##  [1] 130232 090111 020900 040610 070490 070990 090611 070511 210500 110100 080212 121410 090820 080610 040210 090619
## [17] 040291 121010 071340 170290 090830 021019 240310 150810 230210 040490 020732 030760 120720 110290 070190 330190
## [33] 200110 151511 240110 110520 180620 020712 020735 410210 151790 010420 151190 120590 071151 020423 120922 220900
## [49] 410320 220421
## 50 Levels: 010420 020423 020712 020732 020735 020900 021019 030760 040210 040291 040490 040610 070190 070490 ... 410320
  # HS codes which are absent in Tariff Line
  
  tfsource_uniq_hs %>% 
    anti_join(tariffline_uniq_hs, by = "hs") %>% 
    sample_n(50) %>% 
    unlist() %>% unname() %>% as.factor() 
##  [1] 20094919   04051030   1901901500 24022090   1602310090 15030019   4102291000 33012510   21033010   2401102040
## [11] 02101111   20098036   1108110000 06011090   01031000   12092590   22042917   01029005   02071370   11022090  
## [21] 2202903400 04049089   04029130   50030000   5101110000 22042119   1702300040 20087071   02101920   02062991  
## [31] 02101139   04064010   10061023   1209230030 0705190010 04041034   16024911   08022200   15071010   35011090  
## [41] 10062094   15132919   13023290   22086019   02068010   11071019   02061010   15121199   17029010   02073359  
## 50 Levels: 01029005 01031000 02061010 02062991 02068010 02071370 02073359 02101111 02101139 02101920 ... 5101110000
  # Trimmed to 6 digits HS codes which are absent in TF_SOURCE
  
  tariffline_uniq_hs %>% 
    mutate(hs = stringr::str_extract(hs, "^.{6}")) %>% 
    anti_join(tfsource_uniq_hs %>% 
                mutate(hs = stringr::str_extract(hs, "^.{6}")),
              by = "hs") 
## Source: local data frame [0 x 1]
## 
## Variables not shown: hs (chr)
  # Trimmed to 6 digits HS codes which are absent in Tariff Line
  
  tfsource_uniq_hs %>% 
    mutate(hs = stringr::str_extract(hs, "^.{6}")) %>% 
    anti_join(tariffline_uniq_hs %>% 
                mutate(hs = stringr::str_extract(hs, "^.{6}")),
              by = "hs") 
## [1] hs
## <0 rows> (or 0-length row.names)