1 はじめに

2 Gravity dataset

# Load the dataset
library(rio)
cepii0 <- import("../Data_raw/Gravity_V202211.rds")

names(cepii0)
##  [1] "year"                   "country_id_o"           "country_id_d"          
##  [4] "iso3_o"                 "iso3_d"                 "iso3num_o"             
##  [7] "iso3num_d"              "country_exists_o"       "country_exists_d"      
## [10] "gmt_offset_2020_o"      "gmt_offset_2020_d"      "distw_harmonic"        
## [13] "distw_arithmetic"       "distw_harmonic_jh"      "distw_arithmetic_jh"   
## [16] "dist"                   "main_city_source_o"     "main_city_source_d"    
## [19] "distcap"                "contig"                 "diplo_disagreement"    
## [22] "scaled_sci_2021"        "comlang_off"            "comlang_ethno"         
## [25] "comcol"                 "col45"                  "legal_old_o"           
## [28] "legal_old_d"            "legal_new_o"            "legal_new_d"           
## [31] "comleg_pretrans"        "comleg_posttrans"       "transition_legalchange"
## [34] "comrelig"               "heg_o"                  "heg_d"                 
## [37] "col_dep_ever"           "col_dep"                "col_dep_end_year"      
## [40] "col_dep_end_conflict"   "empire"                 "sibling_ever"          
## [43] "sibling"                "sever_year"             "sib_conflict"          
## [46] "pop_o"                  "pop_d"                  "gdp_o"                 
## [49] "gdp_d"                  "gdpcap_o"               "gdpcap_d"              
## [52] "pop_source_o"           "pop_source_d"           "gdp_source_o"          
## [55] "gdp_source_d"           "gdp_ppp_o"              "gdp_ppp_d"             
## [58] "gdpcap_ppp_o"           "gdpcap_ppp_d"           "pop_pwt_o"             
## [61] "pop_pwt_d"              "gdp_ppp_pwt_o"          "gdp_ppp_pwt_d"         
## [64] "gatt_o"                 "gatt_d"                 "wto_o"                 
## [67] "wto_d"                  "eu_o"                   "eu_d"                  
## [70] "fta_wto"                "fta_wto_raw"            "rta_coverage"          
## [73] "rta_type"               "entry_cost_o"           "entry_cost_d"          
## [76] "entry_proc_o"           "entry_proc_d"           "entry_time_o"          
## [79] "entry_time_d"           "entry_tp_o"             "entry_tp_d"            
## [82] "tradeflow_comtrade_o"   "tradeflow_comtrade_d"   "tradeflow_baci"        
## [85] "manuf_tradeflow_baci"   "tradeflow_imf_o"        "tradeflow_imf_d"

3 変数の定義

  • Stata形式のファイル(Gravity_V202211.dta)で変数の定義を確認しておく。
use "Gravity_V202211.dta",clear
describe, replace clear
export excel using varlist.xlsx,replace first(variables)
position name type varlab
1 year int Year
2 country_id_o str5 Gravity country ID, origin
3 country_id_d str5 Gravity country ID, destination
4 iso3_o str3 Origin ISO3 alphabetic
5 iso3_d str3 Destination ISO3 alphabetic
6 iso3num_o int Origin ISO3 numeric
7 iso3num_d int Destination ISO3 numeric
8 country_exists_o byte 1 = Origin country exists
9 country_exists_d byte 1 = Destination country exists
10 gmt_offset_2020_o float Origin GMT offset (hours)
11 gmt_offset_2020_d float Destination GMT offset (hours)
12 distw_harmonic int Population-weighted distance between most populated cities (harmonic mean)
13 distw_arithmetic int Population-weighted distance between most populated cities (arithmetic mean)
14 distw_harmonic_jh int Weighted distance by J. Hinz (harmonic mean)
15 distw_arithmetic_jh int Weighted distance by J. Hinz (arithmetic mean)
16 dist int Simple distance between most populated cities, measured in km
17 main_city_source_o str15 Source of most populated city, origin country
18 main_city_source_d str15 Source of most populated city, destination country
19 distcap int Simple distance between capital city, measured km
20 contig byte =1 if origin and destination are contiguous
21 diplo_disagreement float UN diplomatic disagreement score
22 scaled_sci_2021 long Social connectedness index in 2021
23 comlang_off byte 1 = Common official or primary language
24 comlang_ethno byte 1 = Language is spoken by at least 9% of the population
25 comcol byte 1 = Common colonizer post 1945
26 col45 byte 1 = Pair in colonial relationship post 1945
27 legal_old_o byte Origin legal system before transition
28 legal_old_d byte Destination legal system before transition
29 legal_new_o byte Origin legal system after transition
30 legal_new_d byte Destination legal system after transition
31 comleg_pretrans byte 1 = Common legal origins before transition
32 comleg_posttrans byte 1 = Common legal origins after transition
33 transition_legalchange byte 1 = Common legal origin changed since transition
34 comrelig float Common religion index
35 heg_o byte 1 = Origin is current or former hegemon of destination
36 heg_d byte 1 = Destination is current or former hegemon of origin
37 col_dep_ever byte 1 = Pair ever in colonial or dependency relationship
38 col_dep byte 1 = Pair currently in colonial or dependency relationship
39 col_dep_end_year int Independence date, if col_dep = 1
40 col_dep_end_conflict byte 1 = Independence involved conflict, if col_dep_ever = 1
41 empire str3 Hegemon if sibling = 1 and year < sever_year
42 sibling_ever byte 1 = Pair ever in sibling relationship
43 sibling byte 1 = Pair currently in sibling relationship
44 sever_year int Severance year for pairs if sibling == 1
45 sib_conflict byte 1 = Pair ever in sibling relationship and conflict with hegemon
46 pop_o double Origin Population, total in thousands
47 pop_d double Destination Population, total in thousands
48 gdp_o double Origin GDP (current thousands US\()| |49|gdp_d|double|Destination GDP (current thousands US\))
50 gdpcap_o float Origin GDP per cap (current thousands US\()| |51|gdpcap_d|float|Destination GDP per cap (current thousands US\))
52 pop_source_o byte Origin Population source
53 pop_source_d byte Destination Population source
54 gdp_source_o byte Origin GDP source
55 gdp_source_d byte Destination GDP source
56 gdp_ppp_o double Origin GDP, PPP (current thousands international $)
57 gdp_ppp_d double Destination GDP, PPP (current thousands international $)
58 gdpcap_ppp_o double Origin GDP per cap, PPP (current thousands international $)
59 gdpcap_ppp_d double Destination GDP per cap, PPP (current thousands international $)
position name type varlab
60 pop_pwt_o float Origin Population, total in thousands (PWT)
61 pop_pwt_d float Destination Population, total in thousands (PWT)
62 gdp_ppp_pwt_o float Origin GDP, current PPP (2011 thousands US$) (PWT)
position name type varlab
63 gdp_ppp_pwt_d float Destination GDP, current PPP (2011 thousands US$) (PWT)
64 gatt_o byte Origin GATT membership
65 gatt_d byte Destination GATT membership
66 wto_o byte Origin WTO membership
67 wto_d byte Destination WTO membership
68 eu_o byte 1 = Origin is a EU member
69 eu_d byte 1 = Destination is a EU member
70 fta_wto byte 1=RTA (Source: WTO, 2011)
71 fta_wto_raw byte
72 rta_coverage byte Coverage
73 rta_type byte Type
74 entry_cost_o double Origin Cost of business start-up procedures (% of GNI per capita)
75 entry_cost_d double Destination Cost of business start-up procedures (% of GNI per capita)
76 entry_proc_o byte Origin Start-up procedures to register a business (number)
77 entry_proc_d byte Destination Start-up procedures to register a business (number)
78 entry_time_o double Origin Time required to start a business (days)
79 entry_time_d double Destination Time required to start a business (days)
80 entry_tp_o float Origin Days + procedures to start a business
81 entry_tp_d float Destination Days + procedures to start a business
82 tradeflow_comtrade_o double Trade flows as reported by the origin, 1000 Current USD (source: UNSD)
83 tradeflow_comtrade_d double Trade flows as reported by the destination, 1000 Current USD (source: UNSD)
84 tradeflow_baci double Trade flow, 1000 USD (source: BACI)
85 manuf_tradeflow_baci double Trade flow of manufactured goods, 1000 USD (source: BACI)
86 tradeflow_imf_o float Trade flows as reported by the origin, 1000 current USD (source: IMF)
87 tradeflow_imf_d float Trade flows as reported by the destination, 1000 current USD (source: IMF)

4 必要な変数の抽出

  • 残す変数は以下の通り。

  • year

  • country_id_o

  • country_id_d

  • iso3_o

  • iso3_d

  • iso3num_o

  • iso3num_d

  • dist

  • contig

  • comlang_off

  • gdp_o

  • gdp_d

  • gatt_o

  • gatt_d

  • wto_o

  • wto_d

  • eu_o

  • eu_d

  • fta_wto

  • tradeflow_comtrade_o

  • tradeflow_comtrade_d

  • tradeflow_baci

  • tradeflow_imf_o

  • tradeflow_imf_d

# Extract the above variables
cepii1 <- cepii0[, c("year",
                     "iso3_o", "iso3_d", "iso3num_o", "iso3num_d",
                     "dist", "comlang_off", 
                     "gdp_o", "gdp_d", 
                     "gatt_o", "gatt_d", "wto_o", "wto_d", 
                     "eu_o", "eu_d", 
                     "fta_wto",
                     "tradeflow_comtrade_o", "tradeflow_comtrade_d", 
                     "tradeflow_baci", 
                     "tradeflow_imf_o", "tradeflow_imf_d")]

5 不要な観測値の削除

# N. obs
nrow(cepii1)
## [1] 4699296
# distが欠損であれば、削除
cepii1 <- subset(cepii1, !is.na(cepii1$dist))

nrow(cepii1)
## [1] 3627844

6 iso3_o=iso3_dの観測値の削除

nrow(cepii1)
## [1] 3627844
cepii1 <- subset(cepii1, iso3_o != iso3_d)
nrow(cepii1)
## [1] 3611478

7 country pair IDの作成

  • group_indices()関数は、指定した変数の値に基づいてグループIDを作成する。
# In Stata:  egen pairid = group(iso3_o iso3_d)
# R codes which are equivalent to the Stata codes above
# create country pair ID

library(dplyr)

# Assuming your data frame is named df
cepii1 <- cepii1 %>%
  mutate(pairid = group_indices(., iso3_o, iso3_d))

# Move the pairid to the first column
cepii1 <- cepii1 %>%
  select(pairid, everything())

8 データの保存

  • 保存するファイル名はcepii1.rdsとする。
# Save the dataset
library(rio)
rio::export(cepii1, "../Data_output/cepii1.rds")