このノートでは、CEPIIのGravityデータセットを読み込み、必要な変数を抽出して保存する。
CEPII のGravityデータセットは、無料で以下のURLからダウンロードできる。
http://www.cepii.fr/CEPII/en/bdd_modele/bdd_modele_item.asp?id=8
R形式(131MB)、Stata形式(1.4GB)、csv形式(1.25GB)のファイルが提供されているが、ここでは最も軽量なR形式のファイルを用いる。
http://www.cepii.fr/DATA_DOWNLOAD/gravity/data/Gravity_rds_V202211.zip
用いるのは、202211版のファイルである。
このファイルをダウンロードして解凍し、Gravity.rds
を読み込む。
# Load the dataset
library(rio)
cepii0 <- import("../Data_raw/Gravity_V202211.rds")
names(cepii0)
## [1] "year" "country_id_o" "country_id_d"
## [4] "iso3_o" "iso3_d" "iso3num_o"
## [7] "iso3num_d" "country_exists_o" "country_exists_d"
## [10] "gmt_offset_2020_o" "gmt_offset_2020_d" "distw_harmonic"
## [13] "distw_arithmetic" "distw_harmonic_jh" "distw_arithmetic_jh"
## [16] "dist" "main_city_source_o" "main_city_source_d"
## [19] "distcap" "contig" "diplo_disagreement"
## [22] "scaled_sci_2021" "comlang_off" "comlang_ethno"
## [25] "comcol" "col45" "legal_old_o"
## [28] "legal_old_d" "legal_new_o" "legal_new_d"
## [31] "comleg_pretrans" "comleg_posttrans" "transition_legalchange"
## [34] "comrelig" "heg_o" "heg_d"
## [37] "col_dep_ever" "col_dep" "col_dep_end_year"
## [40] "col_dep_end_conflict" "empire" "sibling_ever"
## [43] "sibling" "sever_year" "sib_conflict"
## [46] "pop_o" "pop_d" "gdp_o"
## [49] "gdp_d" "gdpcap_o" "gdpcap_d"
## [52] "pop_source_o" "pop_source_d" "gdp_source_o"
## [55] "gdp_source_d" "gdp_ppp_o" "gdp_ppp_d"
## [58] "gdpcap_ppp_o" "gdpcap_ppp_d" "pop_pwt_o"
## [61] "pop_pwt_d" "gdp_ppp_pwt_o" "gdp_ppp_pwt_d"
## [64] "gatt_o" "gatt_d" "wto_o"
## [67] "wto_d" "eu_o" "eu_d"
## [70] "fta_wto" "fta_wto_raw" "rta_coverage"
## [73] "rta_type" "entry_cost_o" "entry_cost_d"
## [76] "entry_proc_o" "entry_proc_d" "entry_time_o"
## [79] "entry_time_d" "entry_tp_o" "entry_tp_d"
## [82] "tradeflow_comtrade_o" "tradeflow_comtrade_d" "tradeflow_baci"
## [85] "manuf_tradeflow_baci" "tradeflow_imf_o" "tradeflow_imf_d"
use "Gravity_V202211.dta",clear
describe, replace clear
export excel using varlist.xlsx,replace first(variables)
position | name | type | varlab |
---|---|---|---|
1 | year | int | Year |
2 | country_id_o | str5 | Gravity country ID, origin |
3 | country_id_d | str5 | Gravity country ID, destination |
4 | iso3_o | str3 | Origin ISO3 alphabetic |
5 | iso3_d | str3 | Destination ISO3 alphabetic |
6 | iso3num_o | int | Origin ISO3 numeric |
7 | iso3num_d | int | Destination ISO3 numeric |
8 | country_exists_o | byte | 1 = Origin country exists |
9 | country_exists_d | byte | 1 = Destination country exists |
10 | gmt_offset_2020_o | float | Origin GMT offset (hours) |
11 | gmt_offset_2020_d | float | Destination GMT offset (hours) |
12 | distw_harmonic | int | Population-weighted distance between most populated cities (harmonic mean) |
13 | distw_arithmetic | int | Population-weighted distance between most populated cities (arithmetic mean) |
14 | distw_harmonic_jh | int | Weighted distance by J. Hinz (harmonic mean) |
15 | distw_arithmetic_jh | int | Weighted distance by J. Hinz (arithmetic mean) |
16 | dist | int | Simple distance between most populated cities, measured in km |
17 | main_city_source_o | str15 | Source of most populated city, origin country |
18 | main_city_source_d | str15 | Source of most populated city, destination country |
19 | distcap | int | Simple distance between capital city, measured km |
20 | contig | byte | =1 if origin and destination are contiguous |
21 | diplo_disagreement | float | UN diplomatic disagreement score |
22 | scaled_sci_2021 | long | Social connectedness index in 2021 |
23 | comlang_off | byte | 1 = Common official or primary language |
24 | comlang_ethno | byte | 1 = Language is spoken by at least 9% of the population |
25 | comcol | byte | 1 = Common colonizer post 1945 |
26 | col45 | byte | 1 = Pair in colonial relationship post 1945 |
27 | legal_old_o | byte | Origin legal system before transition |
28 | legal_old_d | byte | Destination legal system before transition |
29 | legal_new_o | byte | Origin legal system after transition |
30 | legal_new_d | byte | Destination legal system after transition |
31 | comleg_pretrans | byte | 1 = Common legal origins before transition |
32 | comleg_posttrans | byte | 1 = Common legal origins after transition |
33 | transition_legalchange | byte | 1 = Common legal origin changed since transition |
34 | comrelig | float | Common religion index |
35 | heg_o | byte | 1 = Origin is current or former hegemon of destination |
36 | heg_d | byte | 1 = Destination is current or former hegemon of origin |
37 | col_dep_ever | byte | 1 = Pair ever in colonial or dependency relationship |
38 | col_dep | byte | 1 = Pair currently in colonial or dependency relationship |
39 | col_dep_end_year | int | Independence date, if col_dep = 1 |
40 | col_dep_end_conflict | byte | 1 = Independence involved conflict, if col_dep_ever = 1 |
41 | empire | str3 | Hegemon if sibling = 1 and year < sever_year |
42 | sibling_ever | byte | 1 = Pair ever in sibling relationship |
43 | sibling | byte | 1 = Pair currently in sibling relationship |
44 | sever_year | int | Severance year for pairs if sibling == 1 |
45 | sib_conflict | byte | 1 = Pair ever in sibling relationship and conflict with hegemon |
46 | pop_o | double | Origin Population, total in thousands |
47 | pop_d | double | Destination Population, total in thousands |
48 | gdp_o | double | Origin GDP (current thousands US\()| |49|gdp_d|double|Destination GDP (current thousands US\)) |
50 | gdpcap_o | float | Origin GDP per cap (current thousands US\()| |51|gdpcap_d|float|Destination GDP per cap (current thousands US\)) |
52 | pop_source_o | byte | Origin Population source |
53 | pop_source_d | byte | Destination Population source |
54 | gdp_source_o | byte | Origin GDP source |
55 | gdp_source_d | byte | Destination GDP source |
56 | gdp_ppp_o | double | Origin GDP, PPP (current thousands international $) |
57 | gdp_ppp_d | double | Destination GDP, PPP (current thousands international $) |
58 | gdpcap_ppp_o | double | Origin GDP per cap, PPP (current thousands international $) |
59 | gdpcap_ppp_d | double | Destination GDP per cap, PPP (current thousands international $) |
position | name | type | varlab |
---|---|---|---|
60 | pop_pwt_o | float | Origin Population, total in thousands (PWT) |
61 | pop_pwt_d | float | Destination Population, total in thousands (PWT) |
62 | gdp_ppp_pwt_o | float | Origin GDP, current PPP (2011 thousands US$) (PWT) |
position | name | type | varlab |
---|---|---|---|
63 | gdp_ppp_pwt_d | float | Destination GDP, current PPP (2011 thousands US$) (PWT) |
64 | gatt_o | byte | Origin GATT membership |
65 | gatt_d | byte | Destination GATT membership |
66 | wto_o | byte | Origin WTO membership |
67 | wto_d | byte | Destination WTO membership |
68 | eu_o | byte | 1 = Origin is a EU member |
69 | eu_d | byte | 1 = Destination is a EU member |
70 | fta_wto | byte | 1=RTA (Source: WTO, 2011) |
71 | fta_wto_raw | byte | |
72 | rta_coverage | byte | Coverage |
73 | rta_type | byte | Type |
74 | entry_cost_o | double | Origin Cost of business start-up procedures (% of GNI per capita) |
75 | entry_cost_d | double | Destination Cost of business start-up procedures (% of GNI per capita) |
76 | entry_proc_o | byte | Origin Start-up procedures to register a business (number) |
77 | entry_proc_d | byte | Destination Start-up procedures to register a business (number) |
78 | entry_time_o | double | Origin Time required to start a business (days) |
79 | entry_time_d | double | Destination Time required to start a business (days) |
80 | entry_tp_o | float | Origin Days + procedures to start a business |
81 | entry_tp_d | float | Destination Days + procedures to start a business |
82 | tradeflow_comtrade_o | double | Trade flows as reported by the origin, 1000 Current USD (source: UNSD) |
83 | tradeflow_comtrade_d | double | Trade flows as reported by the destination, 1000 Current USD (source: UNSD) |
84 | tradeflow_baci | double | Trade flow, 1000 USD (source: BACI) |
85 | manuf_tradeflow_baci | double | Trade flow of manufactured goods, 1000 USD (source: BACI) |
86 | tradeflow_imf_o | float | Trade flows as reported by the origin, 1000 current USD (source: IMF) |
87 | tradeflow_imf_d | float | Trade flows as reported by the destination, 1000 current USD (source: IMF) |
残す変数は以下の通り。
year
country_id_o
country_id_d
iso3_o
iso3_d
iso3num_o
iso3num_d
dist
contig
comlang_off
gdp_o
gdp_d
gatt_o
gatt_d
wto_o
wto_d
eu_o
eu_d
fta_wto
tradeflow_comtrade_o
tradeflow_comtrade_d
tradeflow_baci
tradeflow_imf_o
tradeflow_imf_d
# Extract the above variables
cepii1 <- cepii0[, c("year",
"iso3_o", "iso3_d", "iso3num_o", "iso3num_d",
"dist", "comlang_off",
"gdp_o", "gdp_d",
"gatt_o", "gatt_d", "wto_o", "wto_d",
"eu_o", "eu_d",
"fta_wto",
"tradeflow_comtrade_o", "tradeflow_comtrade_d",
"tradeflow_baci",
"tradeflow_imf_o", "tradeflow_imf_d")]
# N. obs
nrow(cepii1)
## [1] 4699296
# distが欠損であれば、削除
cepii1 <- subset(cepii1, !is.na(cepii1$dist))
nrow(cepii1)
## [1] 3627844
nrow(cepii1)
## [1] 3627844
cepii1 <- subset(cepii1, iso3_o != iso3_d)
nrow(cepii1)
## [1] 3611478
# In Stata: egen pairid = group(iso3_o iso3_d)
# R codes which are equivalent to the Stata codes above
# create country pair ID
library(dplyr)
# Assuming your data frame is named df
cepii1 <- cepii1 %>%
mutate(pairid = group_indices(., iso3_o, iso3_d))
# Move the pairid to the first column
cepii1 <- cepii1 %>%
select(pairid, everything())
cepii1.rds
とする。# Save the dataset
library(rio)
rio::export(cepii1, "../Data_output/cepii1.rds")