# Load the dataset
library(rio)
# read the prepared CEPII data set from the project's output folder
cepii1 <- rio::import(file = "../Data_output/cepii1.rds")
# list the available columns
names(cepii1)
## [1] "pairid" "year" "iso3_o"
## [4] "iso3_d" "iso3num_o" "iso3num_d"
## [7] "dist" "comlang_off" "gdp_o"
## [10] "gdp_d" "gatt_o" "gatt_d"
## [13] "wto_o" "wto_d" "eu_o"
## [16] "eu_d" "fta_wto" "tradeflow_comtrade_o"
## [19] "tradeflow_comtrade_d" "tradeflow_baci" "tradeflow_imf_o"
## [22] "tradeflow_imf_d"
library(tidyverse)
# Count the missing values of every trade-flow variable in each year.
check <- cepii1 %>%
  # keep year plus every column whose name contains "tradeflow"
  select(contains("tradeflow"), year) %>%
  # reshape to long format: the column name goes to `tradeflow`, the cell to
  # `value`, and year is kept as an identifier column.
  # pivot_longer() replaces the superseded gather(); it assumes the
  # trade-flow columns share a compatible (numeric) type -- TODO confirm
  pivot_longer(cols = -year, names_to = "tradeflow", values_to = "value") %>%
  # one group per (trade-flow variable, year) combination
  group_by(tradeflow, year) %>%
  # number of NA cells per group; .groups = "drop_last" states summarise()'s
  # default explicitly, so the result stays grouped by tradeflow
  summarise(missing = sum(is.na(value)), .groups = "drop_last") %>%
  # largest missing counts first (arrange() ignores the grouping by default)
  arrange(desc(missing))
# display the result
check
## # A tibble: 370 × 3
## # Groups: tradeflow [5]
## tradeflow year missing
## <chr> <int> <int>
## 1 tradeflow_baci 2021 54990
## 2 tradeflow_comtrade_d 2021 54990
## 3 tradeflow_comtrade_o 2021 54990
## 4 tradeflow_imf_d 2021 54990
## 5 tradeflow_imf_o 2021 54990
## 6 tradeflow_baci 1993 54056
## 7 tradeflow_baci 1994 53592
## 8 tradeflow_baci 1995 53592
## 9 tradeflow_baci 1992 53130
## 10 tradeflow_baci 1991 52670
## # ℹ 360 more rows
# count the number of observations by year
# number of rows recorded for each year
nobs <- summarise(group_by(cepii1, year), n = n())
# attach the yearly row count to the missing-value counts, then express
# each missing count as a share of that year's rows
check <- check %>%
  left_join(nobs, by = "year") %>%
  mutate(ratio = missing / n)
# keep only the imf_d, imf_o and baci series
check_do <- filter(
  check,
  tradeflow %in% c("tradeflow_imf_d", "tradeflow_imf_o", "tradeflow_baci")
)
# line chart of the missing-value ratio by year, one line per kept series
ggplot(data = check_do, mapping = aes(x = year, y = ratio, colour = tradeflow)) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Ratio of missing values by year",
    x = "Year",
    y = "Ratio of missing values"
  )
# keep only the comtrade_d, imf_d and baci series
check_d <- check %>%
  filter(
    tradeflow %in% c("tradeflow_comtrade_d", "tradeflow_imf_d", "tradeflow_baci")
  )
# line chart of the missing-value ratio by year, one line per kept series
check_d %>%
  ggplot(aes(x = year, y = ratio, color = tradeflow)) +
  geom_line() +
  labs(
    x = "Year",
    y = "Ratio of missing values",
    title = "Ratio of missing values by year"
  ) +
  theme_minimal()
# Keep only tradeflow_imf_d and drop the following variables:
# - "tradeflow_comtrade_o"
# - "tradeflow_comtrade_d"
# - "tradeflow_baci"
# - "tradeflow_imf_o"
# drop every column whose name contains "comtrade", "baci" or "imf_o"
cepii2 <- select(
  cepii1,
  -c(contains("comtrade"), contains("baci"), contains("imf_o"))
)
# list the columns that remain
names(cepii2)
## [1] "pairid" "year" "iso3_o" "iso3_d"
## [5] "iso3num_o" "iso3num_d" "dist" "comlang_off"
## [9] "gdp_o" "gdp_d" "gatt_o" "gatt_d"
## [13] "wto_o" "wto_d" "eu_o" "eu_d"
## [17] "fta_wto" "tradeflow_imf_d"