1 データ

# Attach rio and read the saved data set into `cepii1`
library(rio)
cepii1 <- import(file = "../Data_output/cepii1.rds")

# List the column names of the loaded data
names(cepii1)
##  [1] "pairid"               "year"                 "iso3_o"              
##  [4] "iso3_d"               "iso3num_o"            "iso3num_d"           
##  [7] "dist"                 "comlang_off"          "gdp_o"               
## [10] "gdp_d"                "gatt_o"               "gatt_d"              
## [13] "wto_o"                "wto_d"                "eu_o"                
## [16] "eu_d"                 "fta_wto"              "tradeflow_comtrade_o"
## [19] "tradeflow_comtrade_d" "tradeflow_baci"       "tradeflow_imf_o"     
## [22] "tradeflow_imf_d"

2 変数ごとの年次別欠損状況の確認

  • “tradeflow_comtrade_o”
  • “tradeflow_comtrade_d”
  • “tradeflow_baci”
  • “tradeflow_imf_o”
  • “tradeflow_imf_d”

3 Check missing values by year

  • select()関数は、指定した変数を抽出する。
  • gather()関数は、複数の変数を縦に積み重ね、横長のデータを縦長に変換する(現在は後継のpivot_longer()関数の使用が推奨されている)。
  • group_by()関数は、指定した変数でグループ化する。
  • summarise()関数は、指定した変数の集計値を計算する。
  • arrange()関数は、指定した変数でデータを並び替える。
library(tidyverse)

# Count missing values for every trade-flow variable in every year.
check <- cepii1 %>%
  # keep `year` plus every column whose name contains "tradeflow"
  select(contains("tradeflow"), year) %>%
  # reshape to long format: one row per (year, trade-flow variable) pair.
  # pivot_longer() is the successor of the superseded gather();
  # `tradeflow` holds the former column name, `value` the recorded flow
  pivot_longer(cols = -year, names_to = "tradeflow", values_to = "value") %>%
  # one group per trade-flow variable and year
  group_by(tradeflow, year) %>%
  # number of NA entries per group; "drop_last" spells out summarise()'s
  # default, so the result stays grouped by `tradeflow`
  summarise(missing = sum(is.na(value)), .groups = "drop_last") %>%
  # largest number of missing values first
  arrange(desc(missing))

# display the result
check
## # A tibble: 370 × 3
## # Groups:   tradeflow [5]
##    tradeflow             year missing
##    <chr>                <int>   <int>
##  1 tradeflow_baci        2021   54990
##  2 tradeflow_comtrade_d  2021   54990
##  3 tradeflow_comtrade_o  2021   54990
##  4 tradeflow_imf_d       2021   54990
##  5 tradeflow_imf_o       2021   54990
##  6 tradeflow_baci        1993   54056
##  7 tradeflow_baci        1994   53592
##  8 tradeflow_baci        1995   53592
##  9 tradeflow_baci        1992   53130
## 10 tradeflow_baci        1991   52670
## # ℹ 360 more rows

4 欠損値数を割合で表示

# Number of observations (rows of cepii1) in each year
nobs <- cepii1 %>%
  group_by(year) %>%
  summarise(n = n())

# Attach the yearly row count to the missing-value counts and express the
# missing values as a share of that count.
check <- check %>%
  # drop the grouping by `tradeflow` left over from the earlier summarise()
  # so that it does not silently carry into later steps
  ungroup() %>%
  left_join(nobs, by = "year") %>%
  mutate(ratio = missing / n)

5 目的地の貿易フローの欠損値と出発地の貿易フローの欠損値の比較

  • 目的地の貿易フローの欠損値の割合が出発地の貿易フローの欠損値の割合よりも小さい傾向が見られる。
# keep the `_d` and `_o` IMF series together with the BACI series
check_do <- filter(
  check,
  tradeflow %in% c("tradeflow_imf_d", "tradeflow_imf_o", "tradeflow_baci")
)

# yearly share of missing values, one line per selected variable
check_do %>%
  ggplot(aes(x = year, y = ratio, colour = tradeflow)) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Ratio of missing values by year",
    x = "Year",
    y = "Ratio of missing values"
  )

6 目的地の貿易フローの欠損値の割合

  • filter()関数は、指定した条件に合致する観測値を抽出する。
# keep the `_d` Comtrade and IMF series together with the BACI series
check_d <- filter(
  check,
  tradeflow %in% c("tradeflow_comtrade_d", "tradeflow_imf_d", "tradeflow_baci")
)

# yearly share of missing values, one line per selected variable
check_d %>%
  ggplot(aes(x = year, y = ratio, colour = tradeflow)) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Ratio of missing values by year",
    x = "Year",
    y = "Ratio of missing values"
  )

7 欠損の少ない貿易変数のみを残す

  • tradeflow_imf_dのみを残し、以下の変数は削除する。

  • “tradeflow_comtrade_o”

  • “tradeflow_comtrade_d”

  • “tradeflow_baci”

  • “tradeflow_imf_o”

# Drop every column whose name contains "comtrade", "baci" or "imf_o";
# of the trade-flow columns only tradeflow_imf_d is left
cepii2 <- cepii1 %>%
  select(-contains(c("comtrade", "baci", "imf_o")))

# Check the remaining variables
names(cepii2)
##  [1] "pairid"          "year"            "iso3_o"          "iso3_d"         
##  [5] "iso3num_o"       "iso3num_d"       "dist"            "comlang_off"    
##  [9] "gdp_o"           "gdp_d"           "gatt_o"          "gatt_d"         
## [13] "wto_o"           "wto_d"           "eu_o"            "eu_d"           
## [17] "fta_wto"         "tradeflow_imf_d"

8 データの保存

# Write the reduced data set to an .rds file
library(rio)
export(x = cepii2, file = "../Data_output/cepii2.rds")