このノートでは、CEPIIのGravityデータセットを読み込み、必要な変数を抽出して保存する。
CEPII のGravityデータセットは、無料で以下のURLからダウンロードできる。
http://www.cepii.fr/CEPII/en/bdd_modele/bdd_modele_item.asp?id=8
R形式(131MB)、Stata形式(1.4GB)、csv形式(1.25GB)のファイルが提供されているが、ここでは最も軽量なR形式のファイルを用いる。
http://www.cepii.fr/DATA_DOWNLOAD/gravity/data/Gravity_rds_V202211.zip
用いるのは、202211版のファイルである。
このファイルをダウンロードして解凍し、Gravity.rdsを読み込む。
# Load the dataset
library(rio)
cepii0 <- import("../Data_raw/Gravity_V202211.rds")
names(cepii0)
## [1] "year" "country_id_o" "country_id_d"
## [4] "iso3_o" "iso3_d" "iso3num_o"
## [7] "iso3num_d" "country_exists_o" "country_exists_d"
## [10] "gmt_offset_2020_o" "gmt_offset_2020_d" "distw_harmonic"
## [13] "distw_arithmetic" "distw_harmonic_jh" "distw_arithmetic_jh"
## [16] "dist" "main_city_source_o" "main_city_source_d"
## [19] "distcap" "contig" "diplo_disagreement"
## [22] "scaled_sci_2021" "comlang_off" "comlang_ethno"
## [25] "comcol" "col45" "legal_old_o"
## [28] "legal_old_d" "legal_new_o" "legal_new_d"
## [31] "comleg_pretrans" "comleg_posttrans" "transition_legalchange"
## [34] "comrelig" "heg_o" "heg_d"
## [37] "col_dep_ever" "col_dep" "col_dep_end_year"
## [40] "col_dep_end_conflict" "empire" "sibling_ever"
## [43] "sibling" "sever_year" "sib_conflict"
## [46] "pop_o" "pop_d" "gdp_o"
## [49] "gdp_d" "gdpcap_o" "gdpcap_d"
## [52] "pop_source_o" "pop_source_d" "gdp_source_o"
## [55] "gdp_source_d" "gdp_ppp_o" "gdp_ppp_d"
## [58] "gdpcap_ppp_o" "gdpcap_ppp_d" "pop_pwt_o"
## [61] "pop_pwt_d" "gdp_ppp_pwt_o" "gdp_ppp_pwt_d"
## [64] "gatt_o" "gatt_d" "wto_o"
## [67] "wto_d" "eu_o" "eu_d"
## [70] "fta_wto" "fta_wto_raw" "rta_coverage"
## [73] "rta_type" "entry_cost_o" "entry_cost_d"
## [76] "entry_proc_o" "entry_proc_d" "entry_time_o"
## [79] "entry_time_d" "entry_tp_o" "entry_tp_d"
## [82] "tradeflow_comtrade_o" "tradeflow_comtrade_d" "tradeflow_baci"
## [85] "manuf_tradeflow_baci" "tradeflow_imf_o" "tradeflow_imf_d"
use "Gravity_V202211.dta",clear
describe, replace clear
export excel using varlist.xlsx,replace first(variables)
| position | name | type | varlab |
|---|---|---|---|
| 1 | year | int | Year |
| 2 | country_id_o | str5 | Gravity country ID, origin |
| 3 | country_id_d | str5 | Gravity country ID, destination |
| 4 | iso3_o | str3 | Origin ISO3 alphabetic |
| 5 | iso3_d | str3 | Destination ISO3 alphabetic |
| 6 | iso3num_o | int | Origin ISO3 numeric |
| 7 | iso3num_d | int | Destination ISO3 numeric |
| 8 | country_exists_o | byte | 1 = Origin country exists |
| 9 | country_exists_d | byte | 1 = Destination country exists |
| 10 | gmt_offset_2020_o | float | Origin GMT offset (hours) |
| 11 | gmt_offset_2020_d | float | Destination GMT offset (hours) |
| 12 | distw_harmonic | int | Population-weighted distance between most populated cities (harmonic mean) |
| 13 | distw_arithmetic | int | Population-weighted distance between most populated cities (arithmetic mean) |
| 14 | distw_harmonic_jh | int | Weighted distance by J. Hinz (harmonic mean) |
| 15 | distw_arithmetic_jh | int | Weighted distance by J. Hinz (arithmetic mean) |
| 16 | dist | int | Simple distance between most populated cities, measured in km |
| 17 | main_city_source_o | str15 | Source of most populated city, origin country |
| 18 | main_city_source_d | str15 | Source of most populated city, destination country |
| 19 | distcap | int | Simple distance between capital city, measured km |
| 20 | contig | byte | =1 if origin and destination are contiguous |
| 21 | diplo_disagreement | float | UN diplomatic disagreement score |
| 22 | scaled_sci_2021 | long | Social connectedness index in 2021 |
| 23 | comlang_off | byte | 1 = Common official or primary language |
| 24 | comlang_ethno | byte | 1 = Language is spoken by at least 9% of the population |
| 25 | comcol | byte | 1 = Common colonizer post 1945 |
| 26 | col45 | byte | 1 = Pair in colonial relationship post 1945 |
| 27 | legal_old_o | byte | Origin legal system before transition |
| 28 | legal_old_d | byte | Destination legal system before transition |
| 29 | legal_new_o | byte | Origin legal system after transition |
| 30 | legal_new_d | byte | Destination legal system after transition |
| 31 | comleg_pretrans | byte | 1 = Common legal origins before transition |
| 32 | comleg_posttrans | byte | 1 = Common legal origins after transition |
| 33 | transition_legalchange | byte | 1 = Common legal origin changed since transition |
| 34 | comrelig | float | Common religion index |
| 35 | heg_o | byte | 1 = Origin is current or former hegemon of destination |
| 36 | heg_d | byte | 1 = Destination is current or former hegemon of origin |
| 37 | col_dep_ever | byte | 1 = Pair ever in colonial or dependency relationship |
| 38 | col_dep | byte | 1 = Pair currently in colonial or dependency relationship |
| 39 | col_dep_end_year | int | Independence date, if col_dep = 1 |
| 40 | col_dep_end_conflict | byte | 1 = Independence involved conflict, if col_dep_ever = 1 |
| 41 | empire | str3 | Hegemon if sibling = 1 and year < sever_year |
| 42 | sibling_ever | byte | 1 = Pair ever in sibling relationship |
| 43 | sibling | byte | 1 = Pair currently in sibling relationship |
| 44 | sever_year | int | Severance year for pairs if sibling == 1 |
| 45 | sib_conflict | byte | 1 = Pair ever in sibling relationship and conflict with hegemon |
| 46 | pop_o | double | Origin Population, total in thousands |
| 47 | pop_d | double | Destination Population, total in thousands |
| 48 | gdp_o | double | Origin GDP (current thousands US\()| |49|gdp_d|double|Destination GDP (current thousands US\)) |
| 50 | gdpcap_o | float | Origin GDP per cap (current thousands US\()| |51|gdpcap_d|float|Destination GDP per cap (current thousands US\)) |
| 52 | pop_source_o | byte | Origin Population source |
| 53 | pop_source_d | byte | Destination Population source |
| 54 | gdp_source_o | byte | Origin GDP source |
| 55 | gdp_source_d | byte | Destination GDP source |
| 56 | gdp_ppp_o | double | Origin GDP, PPP (current thousands international $) |
| 57 | gdp_ppp_d | double | Destination GDP, PPP (current thousands international $) |
| 58 | gdpcap_ppp_o | double | Origin GDP per cap, PPP (current thousands international $) |
| 59 | gdpcap_ppp_d | double | Destination GDP per cap, PPP (current thousands international $) |
| position | name | type | varlab |
|---|---|---|---|
| 60 | pop_pwt_o | float | Origin Population, total in thousands (PWT) |
| 61 | pop_pwt_d | float | Destination Population, total in thousands (PWT) |
| 62 | gdp_ppp_pwt_o | float | Origin GDP, current PPP (2011 thousands US$) (PWT) |
| position | name | type | varlab |
|---|---|---|---|
| 63 | gdp_ppp_pwt_d | float | Destination GDP, current PPP (2011 thousands US$) (PWT) |
| 64 | gatt_o | byte | Origin GATT membership |
| 65 | gatt_d | byte | Destination GATT membership |
| 66 | wto_o | byte | Origin WTO membership |
| 67 | wto_d | byte | Destination WTO membership |
| 68 | eu_o | byte | 1 = Origin is a EU member |
| 69 | eu_d | byte | 1 = Destination is a EU member |
| 70 | fta_wto | byte | 1=RTA (Source: WTO, 2011) |
| 71 | fta_wto_raw | byte | |
| 72 | rta_coverage | byte | Coverage |
| 73 | rta_type | byte | Type |
| 74 | entry_cost_o | double | Origin Cost of business start-up procedures (% of GNI per capita) |
| 75 | entry_cost_d | double | Destination Cost of business start-up procedures (% of GNI per capita) |
| 76 | entry_proc_o | byte | Origin Start-up procedures to register a business (number) |
| 77 | entry_proc_d | byte | Destination Start-up procedures to register a business (number) |
| 78 | entry_time_o | double | Origin Time required to start a business (days) |
| 79 | entry_time_d | double | Destination Time required to start a business (days) |
| 80 | entry_tp_o | float | Origin Days + procedures to start a business |
| 81 | entry_tp_d | float | Destination Days + procedures to start a business |
| 82 | tradeflow_comtrade_o | double | Trade flows as reported by the origin, 1000 Current USD (source: UNSD) |
| 83 | tradeflow_comtrade_d | double | Trade flows as reported by the destination, 1000 Current USD (source: UNSD) |
| 84 | tradeflow_baci | double | Trade flow, 1000 USD (source: BACI) |
| 85 | manuf_tradeflow_baci | double | Trade flow of manufactured goods, 1000 USD (source: BACI) |
| 86 | tradeflow_imf_o | float | Trade flows as reported by the origin, 1000 current USD (source: IMF) |
| 87 | tradeflow_imf_d | float | Trade flows as reported by the destination, 1000 current USD (source: IMF) |
残す変数は以下の通り。
year
country_id_o
country_id_d
iso3_o
iso3_d
iso3num_o
iso3num_d
dist
contig
comlang_off
gdp_o
gdp_d
gatt_o
gatt_d
wto_o
wto_d
eu_o
eu_d
fta_wto
tradeflow_comtrade_o
tradeflow_comtrade_d
tradeflow_baci
tradeflow_imf_o
tradeflow_imf_d
# Extract the above variables
cepii1 <- cepii0[, c("year",
"iso3_o", "iso3_d", "iso3num_o", "iso3num_d",
"dist", "comlang_off",
"gdp_o", "gdp_d",
"gatt_o", "gatt_d", "wto_o", "wto_d",
"eu_o", "eu_d",
"fta_wto",
"tradeflow_comtrade_o", "tradeflow_comtrade_d",
"tradeflow_baci",
"tradeflow_imf_o", "tradeflow_imf_d")]
# N. obs
nrow(cepii1)
## [1] 4699296
# distが欠損であれば、削除
cepii1 <- subset(cepii1, !is.na(cepii1$dist))
nrow(cepii1)
## [1] 3627844
nrow(cepii1)
## [1] 3627844
cepii1 <- subset(cepii1, iso3_o != iso3_d)
nrow(cepii1)
## [1] 3611478
# In Stata: egen pairid = group(iso3_o iso3_d)
# R codes which are equivalent to the Stata codes above
# create country pair ID
library(dplyr)
# Assuming your data frame is named df
cepii1 <- cepii1 %>%
mutate(pairid = group_indices(., iso3_o, iso3_d))
# Move the pairid to the first column
cepii1 <- cepii1 %>%
select(pairid, everything())
cepii1.rdsとする。# Save the dataset
library(rio)
rio::export(cepii1, "../Data_output/cepii1.rds")