Last update: 2015-07-03 10:22:01
We got tariff line data from UNSD. It’s stored in ad hoc table ess.ct_tariffline_adhoc
at QA SWS server. Currently it’s possible to access the data only from R.
For all countries and years.
cttl %>% summarize(nrows = n())
## Source: postgres 9.4.4 [trade@localhost:5432/sws_data]
## From: <derived table> [?? x 1]
##
## nrows
## 1 86001690
## .. ...
cttl
## Source: postgres 9.4.4 [trade@localhost:5432/sws_data]
## From: ct_tariffline_adhoc [86,001,690 x 14]
##
## rep tyear curr hsrep flow repcurr comm prt weight qty qunit tvalue est ht
## 1 156 2009 USD H3 1 USD 87012000 616 NA 1 5 110761 0 0
## 2 156 2009 USD H3 2 USD 87012000 516 NA 20 5 798000 0 0
## 3 156 2009 USD H3 2 USD 82152000 662 1700 1700 8 5100 0 0
## 4 156 2009 USD H3 2 USD 28321000 400 199070 199070 8 48153 0 0
## 5 156 2009 USD H3 2 USD 72287090 158 44116 44116 8 63260 0 0
## 6 156 2009 USD H3 2 USD 73151200 50 16536 16536 8 64610 0 0
## 7 156 2009 USD H3 1 USD 20059991 826 19 19 8 152 0 0
## 8 156 2009 USD H3 2 USD 63039100 208 NA 65491 5 667807 0 0
## 9 156 2009 USD H3 1 USD 07133900 643 1180 1180 8 9042 0 0
## 10 156 2009 USD H3 2 USD 44184000 8 75000 75000 8 46616 0 0
## .. ... ... ... ... ... ... ... ... ... ... ... ... ... ..
tbl(trade_src, sql(
"select *
from ess.ct_tariffline_adhoc
where rep in ('376', '842', '276') order by random() limit 20")) %>%
collect() %>% as.data.frame()
## rep tyear curr hsrep flow repcurr comm prt weight qty qunit tvalue est ht
## 1 842 2009 USD H3 1 USD 9026906000 484 NA NA 1 11351116.000 0 0
## 2 376 2012 USD H4 2 USD 29110000 231 NA NA 1 26000.000 0 0
## 3 376 2012 USD H4 1 USD 84798199 528 NA NA 1 33000.000 0 0
## 4 276 2012 USD H4 2 EUR 151800 620 490736 490736 8 968331.583 0 0
## 5 842 2010 USD H3 2 USD 6903200000 620 5470 5470 8 28744.000 0 0
## 6 276 2011 USD H3 1 EUR 811300 642 1573 1573 8 117001.768 0 0
## 7 276 2011 USD H3 1 EUR 870422 756 3253755 3253755 8 10580197.367 0 0
## 8 276 2009 USD H3 2 USD 780411 642 900 900 8 3000.000 0 0
## 9 842 2010 USD H3 2 USD 8544200000 772 392 392 8 6924.000 0 0
## 10 276 2013 USD H4 2 EUR 390521 504 66868 66868 8 102274.185 0 0
## 11 842 2009 USD H3 2 USD 8413200000 203 NA 19 5 3135.000 0 0
## 12 842 2013 USD H4 1 USD 7113195085 858 NA NA 1 31792.000 0 0
## 13 842 2013 USD H4 2 USD 5904100000 208 9 15 2 2700.000 0 0
## 14 276 2009 USD H3 1 USD 190531 586 200 200 8 1000.000 0 0
## 15 276 2009 USD H3 2 USD 910511 840 300 300 8 32000.000 0 0
## 16 276 2011 USD H3 2 EUR 520821 372 37 37 8 2040.247 0 0
## 17 842 2013 USD H4 2 USD 9617002000 826 NA 14145 5 149538.000 0 0
## 18 276 2009 USD H3 2 USD 630622 862 100 100 8 6000.000 0 0
## 19 276 2012 USD H4 2 EUR 902139 188 1063 1063 8 116371.604 0 0
## 20 842 2013 USD H4 1 USD 8474900020 688 NA NA 1 37560.000 0 0
For all countries
cttl %>%
select(tyear) %>%
group_by(tyear) %>%
summarize(nrows = n()) %>%
arrange(tyear) %>% collect()
## Source: local data frame [5 x 2]
##
## tyear nrows
## 1 2009 17163862
## 2 2010 17219485
## 3 2011 17235566
## 4 2012 17028103
## 5 2013 17354674
For all countires.
tlsql(select = "hsrep, count(hsrep)", where = "group by hsrep order by hsrep") %>%
mutate(hsrep = factor(hsrep, levels = c("H0", "H1", "H2", "H3", "H4", "S3"),
labels = c("HS1992", "HS1996", "HS2002",
"HS2007", "HS2012", "SITCr3")))
## hsrep count
## 1: HS1992 10643
## 2: HS1996 352140
## 3: HS2002 3557201
## 4: HS2007 51569744
## 5: HS2012 30511939
## 6: SITCr3 23
tbl(trade_src, sql("
select hs_length, count(hs_length) as countn
from (select char_length(comm) as hs_length
from ess.ct_tariffline_adhoc) as tbl1
group by hs_length order by hs_length
")) %>% collect()
## Source: local data frame [14 x 2]
##
## hs_length countn
## 1 0 23
## 2 2 176657
## 3 4 683576
## 4 5 7482
## 5 6 22500222
## 6 7 50107
## 7 8 46338748
## 8 9 1930341
## 9 10 12839762
## 10 11 669938
## 11 12 804818
## 12 13 2
## 13 15 3
## 14 16 11
tbl(trade_src, sql("
select hs_length, count(hs_length) as countn
from (select char_length(comm) as hs_length
from ess.ct_tariffline_adhoc where rep in ('376', '842', '276')) as tbl1
group by hs_length order by hs_length
")) %>% collect()
## Source: local data frame [4 x 2]
##
## hs_length countn
## 1 5 1835
## 2 6 2518689
## 3 8 581226
## 4 10 2316355
For all countires.
tlsql(select = "flow, count(flow)", where = "group by flow order by flow")
## flow count
## 1: 1 47108217
## 2: 2 37005984
## 3: 3 1844379
## 4: 4 43110
For all countires.
tlsql(select = "qunit, count(qunit)", where = "group by qunit order by qunit")
## qunit count
## 1: 1 9667561
## 2: 10 31572
## 3: 11 309038
## 4: 12 313255
## 5: 13 25881
## 6: 2 1842513
## 7: 3 2448
## 8: 4 214816
## 9: 5 15976097
## 10: 6 916446
## 11: 7 720910
## 12: 8 55916692
## 13: 9 64461
See description of units in Annex I of Quantity and Weight Data in UN Comtrade.
For all countires.
# tlsql(select = "est, count(est)", where = "group by est order by est")
tlsql(select = "sum( case when est != '0' then 1 else 0 end) as estimations")
## estimations
## 1: 0
There is no estimated values.
tlsql(select = "ht, count(ht)", where = "group by ht")
## ht count
## 1: 0 86001690
There is only zero in column ht.
It could be zero or NULL. We already saw NA weight and qty in random sample.
For all countries.
miss <- tbl(trade_src, sql("select sum (case when weight is null then 1 else 0 end) as weight_null,
sum (case when qty is null then 1 else 0 end) as qty_null,
sum (case when tvalue is null then 1 else 0 end) as tvalue_null,
sum (case when weight = 0 then 1 else 0 end) as weight_0,
sum (case when qty = 0 then 1 else 0 end) as qty_0,
sum (case when tvalue = 0 then 1 else 0 end) as tvalue_0 from ess.ct_tariffline_adhoc ")) %>%
collect() %>% as.integer()
dim(miss) <- c(3, 2)
colnames(miss) <- c("NA", "zero")
rownames(miss) <- c("weight", "qty", "tvalue")
miss
## NA zero
## weight 16261537 0
## qty 9667561 0
## tvalue 0 349742
We have null (in R they are converted into NA) cases for weight and qty and zero cases for value.
Are there cases when for the same trade flow we have several records with different units (note from Giorgio)?
agriItems <- paste0("'", paste(getAgriHSCodes(), collapse = "','"), "'")
cttlagri <- tbl(trade_src, sql(paste0(
"select *
from (select rep, tyear, flow, comm, substring(comm from 1 for 6) as hs6,
prt, weight, qty, qunit, tvalue
from ess.ct_tariffline_adhoc
where rep in ('376', '842', '276')
and tyear = '2011') as tbl1
where hs6 in(", agriItems, ") "))) %>%
select(-hs6)
cttlagri %>%
group_by(tyear, rep, prt, flow, comm) %>%
summarize(nrows = n()) %>%
ungroup() %>%
filter(nrows > 1)
## Source: postgres 9.4.4 [trade@localhost:5432/sws_data]
## From: <derived table> [?? x 6]
## Filter: nrows > 1
There are now such cases for the pilot countries, but they exist in other countries. For example:
tyear rep prt flow comm nrows
1 2011 328 780 1 2106909000 5
2 2011 499 899 1 220290 4
3 2011 328 531 2 0709909000 4
4 2011 328 531 2 1904200000 4
5 2011 328 780 1 2009112000 4
6 2011 328 780 1 2203001000 4
7 2011 328 780 1 2202909000 4
8 2011 328 780 1 2202101000 4
9 2011 328 780 2 2208401000 4
10 2011 328 840 1 0402999000 4
cttl %>%
select(-est, -ht) %>%
filter(tyear == "2011",
rep == "328",
prt == "780",
comm == "2106909000")
## Source: postgres 9.4.4 [trade@localhost:5432/sws_data]
## From: ct_tariffline_adhoc [11 x 12]
## Filter: tyear == "2011", rep == "328", prt == "780", comm == "2106909000"
##
## rep tyear curr hsrep flow repcurr comm prt weight qty qunit tvalue
## 1 328 2011 USD H3 1 GYD 2106909000 780 517844.89 NA 1 474988.98763
## 2 328 2011 USD H3 1 GYD 2106909000 780 1130.00 4.00 2 20502.46073
## 3 328 2011 USD H3 1 GYD 2106909000 780 3998.55 43303.00 5 43912.06895
## 4 328 2011 USD H3 1 GYD 2106909000 780 139.78 136.00 7 1371.76044
## 5 328 2011 USD H3 1 GYD 2106909000 780 806.54 806.54 8 10655.44112
## 6 328 2011 USD H3 2 GYD 2106909000 780 5281.53 NA 1 75846.41057
## 7 328 2011 USD H3 2 GYD 2106909000 780 208.40 60.00 5 74.61267
## 8 328 2011 USD H3 2 GYD 2106909000 780 1088.16 1088.16 8 19533.35835
## 9 328 2011 USD H3 3 2106909000 780 1635.72 NA 1 1412.85921
## 10 328 2011 USD H3 3 2106909000 780 6.00 20.00 5 19.99604
## 11 328 2011 USD H3 3 2106909000 780 200.00 200.00 8 598.57384
cttl_missing <- cttlagri %>%
select(rep, tyear, flow, comm, prt, weight, qty, qunit, tvalue) %>%
filter(rep %in% c('376', '842', '276'),
tyear == "2011") %>%
collect() %>%
group_by(tyear, rep) %>%
summarize(totalflows = n(),
noqty = sum(is.na(qty) & is.na(weight) & tvalue > 0),
noqty_prcnt = round(100 * noqty / totalflows, 2),
novl = sum((!is.na(qty) | !is.na(weight)) & tvalue == 0),
novl_prcnt = round(100 * novl / totalflows, 2)) %>%
mutate(reporter = getCountryName(rep)) %>%
select(reporter, totalflows, noqty, noqty_prcnt, novl, novl_prcnt)
tfsource <- gettfsource(reporter = c(231, 105, 79), year = 2011) %>%
mutate(hs6 = stringr::str_extract(hs, "^.{6}"))
tfsource <- data.frame(hs = getAgriHSCodes(), stringsAsFactors = F) %>%
inner_join(tfsource, by = c("hs" = "hs6")) %>%
select(-hs) %>%
rename(hs = hs.y) %>%
mutate(reporter = getCountryName(unctarea(reporter)))
tfsource %>%
group_by(reporter) %>%
summarize(totalflows = n(),
noqty = sum(qty == 0 & qty2 == 0 & value > 0),
noqty_prcnt = round(100 * noqty / totalflows, 2),
novl = sum((qty > 0 | qty2 > 0) & value == 0),
novl_prcnt = round(100 * novl / totalflows, 2))
## Source: local data frame [3 x 6]
##
## reporter totalflows noqty noqty_prcnt novl novl_prcnt
## 1 Germany(1991-) 80001 6286 7.86 28 0.03
## 2 Israel 10145 6126 60.38 0 0.00
## 3 USA(1981-) 62420 549 0.88 0 0.00
cttl_missing
## Source: local data frame [3 x 7]
## Groups: tyear
##
## tyear reporter totalflows noqty noqty_prcnt novl novl_prcnt
## 1 2011 Germany(1991-) 44951 85 0.19 0 0
## 2 2011 Israel 10167 6275 61.72 0 0
## 3 2011 USA(1981-) 28107 341 1.21 0 0
tfsource_uniq_hs <- tfsource %>%
select(hs) %>%
distinct() %>%
collect()
tariffline_uniq_hs <- cttlagri %>%
select(rep, tyear, comm) %>%
filter(rep %in% c('376', '842', '276'),
tyear == "2011") %>%
select(comm) %>%
distinct() %>%
select(hs = comm) %>%
collect()
data.frame(source = c("tfsource", "tariffline"),
uniq_hs = c(nrow(tfsource_uniq_hs),
nrow(tariffline_uniq_hs)))
## source uniq_hs
## 1 tfsource 5431
## 2 tariffline 4126
# HS codes which are absent in TF_SOURCE
tariffline_uniq_hs %>%
anti_join(tfsource_uniq_hs, by = "hs") %>%
sample_n(50) %>%
unlist() %>% unname() %>% as.factor()
## [1] 130232 090111 020900 040610 070490 070990 090611 070511 210500 110100 080212 121410 090820 080610 040210 090619
## [17] 040291 121010 071340 170290 090830 021019 240310 150810 230210 040490 020732 030760 120720 110290 070190 330190
## [33] 200110 151511 240110 110520 180620 020712 020735 410210 151790 010420 151190 120590 071151 020423 120922 220900
## [49] 410320 220421
## 50 Levels: 010420 020423 020712 020732 020735 020900 021019 030760 040210 040291 040490 040610 070190 070490 ... 410320
# HS codes which are absent in Tariff Line
tfsource_uniq_hs %>%
anti_join(tariffline_uniq_hs, by = "hs") %>%
sample_n(50) %>%
unlist() %>% unname() %>% as.factor()
## [1] 20094919 04051030 1901901500 24022090 1602310090 15030019 4102291000 33012510 21033010 2401102040
## [11] 02101111 20098036 1108110000 06011090 01031000 12092590 22042917 01029005 02071370 11022090
## [21] 2202903400 04049089 04029130 50030000 5101110000 22042119 1702300040 20087071 02101920 02062991
## [31] 02101139 04064010 10061023 1209230030 0705190010 04041034 16024911 08022200 15071010 35011090
## [41] 10062094 15132919 13023290 22086019 02068010 11071019 02061010 15121199 17029010 02073359
## 50 Levels: 01029005 01031000 02061010 02062991 02068010 02071370 02073359 02101111 02101139 02101920 ... 5101110000
# Trimmed to 6 digits HS codes which are absent in TF_SOURCE
tariffline_uniq_hs %>%
mutate(hs = stringr::str_extract(hs, "^.{6}")) %>%
anti_join(tfsource_uniq_hs %>%
mutate(hs = stringr::str_extract(hs, "^.{6}")),
by = "hs")
## Source: local data frame [0 x 1]
##
## Variables not shown: hs (chr)
# Trimmed to 6 digits HS codes which are absent in Tariff Line
tfsource_uniq_hs %>%
mutate(hs = stringr::str_extract(hs, "^.{6}")) %>%
anti_join(tariffline_uniq_hs %>%
mutate(hs = stringr::str_extract(hs, "^.{6}")),
by = "hs")
## [1] hs
## <0 rows> (or 0-length row.names)