This file and code consolidates my data cleaning steps into one document. This file does not include any analysis steps.
setwd("~/Desktop/University of Utah PhD /Research/r_code")
## Reminder this data *INCLUDES* imputed conflict counts which was completed in excel
hma.con.df = read.csv("../data/conflict/hma_initial_model_df_v3_20240304.csv")
## Tidy up the dataframe
## pivot from wide to long
# load tidyverse library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Pivot Longer
hma.con.df.long = hma.con.df %>%
pivot_longer(
cols = -c(country),
names_sep = "\\.",
names_to = c(".value", "year")
)
## Check
head(hma.con.df.long)
## # A tibble: 6 × 15
## country year gdp pop temp cd ge con cpt rol rq polstab
## <chr> <chr> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan… 2010 1.56e10 2.82e7 1.61 4 -1.48 10727 -1.65 -1.87 -1.52 -2.58
## 2 Afghan… 2011 1.82e10 2.92e7 1.40 4 -1.48 10727 -1.6 -1.92 -1.54 -2.5
## 3 Afghan… 2012 2.02e10 3.05e7 0.223 10 -1.38 10727 -1.43 -1.65 -1.19 -2.42
## 4 Afghan… 2013 2.06e10 3.15e7 1.28 5 -1.4 10727 -1.45 -1.61 -1.19 -2.52
## 5 Afghan… 2014 2.06e10 3.27e7 0.456 3 -1.36 10727 -1.36 -1.44 -1.12 -2.41
## 6 Afghan… 2015 2.00e10 3.38e7 1.09 5 -1.35 10727 -1.35 -1.52 -1.02 -2.56
## # ℹ 3 more variables: voice <dbl>, urbanpop <dbl>, waterwith <dbl>
# save the updated data frame to the clean folder
# imputed dataframe
#write.csv(hma.con.df.long, file = "../cleaned_data/hma_country_df_clean_20240418.csv")
This section of code drops observations that use estimated conflict counts for countries without data in ACLED. The list below denotes when data collection began: - Afghanistan: 2017 - Bangladesh: 2010 - Bhutan: 2020 - China: 2018 - India: 2016 - Kyrgyz Republic: 2018 - Nepal: 2010 - Pakistan: 2010 - Myanmar: 2010 - Tajikistan: 2018 - Turkmenistan: 2018 - Uzbekistan: 2018
## Duplicate the long dataframe
con2 = hma.con.df.long
##Subset all of the other countries
##subset dataframe based on Afghanistan and years 2017 and on
afg = subset(con2, con2$country == 'Afghanistan' & con2$year >= 2017)
##subset dataframe based on Bangladesh and years 2010 and on
ban = subset(con2, con2$country == 'Bangladesh' & con2$year >= 2010)
##subset dataframe based on Bangladesh and years 2020 and on
bhu = subset(con2, con2$country == 'Bhutan' & con2$year >= 2020)
##subset dataframe based on China and years 2018 and on
chi = subset(con2, con2$country == 'China' & con2$year >= 2018)
##subset dataframe based on India and years 2016 and on
ind = subset(con2, con2$country == 'India' & con2$year >= 2016)
##subset dataframe based on Kyrgyz Republic and years 2018 and on
kyr = subset(con2, con2$country == 'Kyrgyz Republic' & con2$year >= 2018)
##subset dataframe based on Nepal and years 2010 and on
nep = subset(con2, con2$country == 'Nepal' & con2$year >= 2010)
##subset dataframe based on Pakistan and years 2010 and on
pak = subset(con2, con2$country == 'Pakistan' & con2$year >= 2010)
##subset dataframe based on Myanmar and years 2010 and on
mya = subset(con2, con2$country == 'Myanmar' & con2$year >= 2010)
##subset dataframe based on Tajikistan and years 2010 and on
taj = subset(con2, con2$country == 'Tajikistan' & con2$year >= 2018)
##subset dataframe based on Turkmenistan and years 2010 and on
tur = subset(con2, con2$country == 'Turkmenistan' & con2$year >= 2018)
##subset dataframe based on Uzbekistan and years 2010 and on
uzb = subset(con2, con2$country == 'Uzbekistan' & con2$year >= 2018)
##Merge all of the countries back together into one dataframe
##Merge by rows using rbind
con.reduced = rbind(afg, ban, bhu, chi, ind, kyr, nep, pak, mya, taj, tur, uzb)
##Check
#view(con.reduced)
# save the clean data frame to the clean folder
#write.csv(con.reduced, file = "../cleaned_data/hma_country_non_imp_df_clean_20240418.csv")
Note:
## Load the data frame with adm level 1
# 2022 population was imputed in excel using a standard growth rate calculation
# nlights 2010, 2011, 2021, and 2022 are missing
# conflict counts by country/year are missing
hma.con.prov = read.csv("../data/conflict/hma_df_provinces_20240318.csv")
## Tidy up the dataframe
## pivot from wide to long
## Pivot Longer
hma.con.prov.long = hma.con.prov %>%
pivot_longer(
cols = -c(country, adm1),
names_sep = "\\.",
names_to = c(".value", "year")
)
## Check
head(hma.con.prov.long)
## # A tibble: 6 × 14
## country adm1 year pop temp precip nlights ge cpt rol rq
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AFG Badakhshan 2022 1.03e6 2.74 52.0 NA -1.44 -1.18 -1.66 -1.27
## 2 AFG Badakhshan 2021 8.89e5 2.30 35.0 NA -1.63 -1.15 -1.88 -1.31
## 3 AFG Badakhshan 2020 7.66e5 1.24 61.8 0.00162 -1.59 -1.49 -1.83 -1.39
## 4 AFG Badakhshan 2019 7.15e5 1.73 61.0 0.00153 -1.5 -1.42 -1.74 -1.11
## 5 AFG Badakhshan 2018 6.01e5 2.25 44.4 0.00140 -1.48 -1.5 -1.69 -1.14
## 6 AFG Badakhshan 2017 1.19e6 1.85 53.5 0.00167 -1.36 -1.53 -1.58 -1.37
## # ℹ 3 more variables: polstab <dbl>, voice <dbl>, con <int>
# filter out rows with NAs
prov.new = hma.con.prov.long %>%
filter(nlights != "NA") %>%
filter(con != "NA")
# check to ensure no NAs are in the data frame
summary(prov.new)
## country adm1 year pop
## Length:899 Length:899 Length:899 Min. : 3174
## Class :character Class :character Class :character 1st Qu.: 1134621
## Mode :character Mode :character Mode :character Median : 2940323
## Mean : 16711188
## 3rd Qu.: 19063332
## Max. :222543957
## temp precip nlights ge
## Min. :-3.056 Min. : 2.633 Min. : 0.00004 Min. :-1.5900
## 1st Qu.:12.690 1st Qu.: 41.803 1st Qu.: 0.01986 1st Qu.:-1.1700
## Median :19.986 Median : 92.840 Median : 0.12605 Median :-0.7400
## Mean :18.271 Mean :105.605 Mean : 1.16155 Mean :-0.5845
## 3rd Qu.:25.630 3rd Qu.:145.229 3rd Qu.: 0.46848 3rd Qu.: 0.0900
## Max. :29.199 Max. :372.400 Max. :87.28505 Max. : 0.6800
## cpt rol rq polstab
## Min. :-1.5300 Min. :-1.8300 Min. :-2.0500 Min. :-2.790
## 1st Qu.:-1.0700 1st Qu.:-1.1900 1st Qu.:-1.1100 1st Qu.:-1.510
## Median :-0.8000 Median :-0.7800 Median :-0.7500 Median :-0.960
## Mean :-0.7279 Mean :-0.7665 Mean :-0.7634 Mean :-1.186
## 3rd Qu.:-0.3000 3rd Qu.:-0.1000 3rd Qu.:-0.3000 3rd Qu.:-0.560
## Max. : 1.6200 Max. : 0.5800 Max. :-0.1300 Max. : 1.010
## voice con
## Min. :-2.1800 Min. : 0.0
## 1st Qu.:-1.2200 1st Qu.: 16.0
## Median :-0.8600 Median : 92.0
## Mean :-0.7385 Mean : 263.9
## 3rd Qu.:-0.1500 3rd Qu.: 303.0
## Max. : 0.4400 Max. :5129.0
# write the non-imputed data frame to csv
#write.csv(prov.new, file = "../cleaned_data/hma_adm1_non_imp_df_clean_20240418.csv")
The following imputation uses the missForest R package. Details on the imputation procedures can be found here.
Note: missForest requires the use of a data frame object (not a tibble), factors (not characters) for non-numeric data, and cannot impute with a large number of classes. In this case we change country, adm1, and year in to factors or numerics. We subsequently removed the adm1 from the data frame for imputation, then return it afterwards.
Additionally, the missForest object stores two things: 1. ximp - this contains the imputed data frame. 2. OOB Error - this is the Normalized RMSE error associated with the imputation.
#Load missForest
library(missForest)
# create a new data frame
prov.imp.temp = hma.con.prov.long
##Convert Characters into factors
prov.imp.temp$country = as.factor(prov.imp.temp$country)
prov.imp.temp$adm1 = as.factor(prov.imp.temp$adm1)
prov.imp.temp$year = as.numeric(prov.imp.temp$year)
##Convert to a dataframe (not tibble) for missForest
prov.imp.temp = as.data.frame(prov.imp.temp)
##Remove the adm1 levels for missForest to run correctly
hma.prov.ex = subset(prov.imp.temp, select = -adm1)
##Impute using missForest
prov.miss.forest = missForest(hma.prov.ex)
##Check object
##two portions
# ximp = stores the imputed data frame
# OOBerror stores the error associated with the imputation
summary(prov.miss.forest)
## Length Class Mode
## ximp 13 data.frame list
## OOBerror 2 -none- numeric
##Check the imputation to see how well it did
##nlights
plot(density(hma.prov.ex$nlights, na.rm=TRUE))
lines(density(prov.miss.forest$ximp$nlights), col = "red", lty = 2)
##conflict
plot(density(hma.prov.ex$con, na.rm=TRUE))
lines(density(prov.miss.forest$ximp$con), col = "red", lty = 2)
##Create a new dataframe with the imputed values
prov.df = prov.miss.forest$ximp
##Add back in the adm1
prov.df$adm1 = hma.con.prov.long$adm1
##relocate adm1 column after country
prov.df = prov.df %>%
relocate(adm1, .after = country)
##Check
str(prov.df)
## 'data.frame': 2535 obs. of 14 variables:
## $ country: Factor w/ 12 levels "AFG","BGD","BTN",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ adm1 : chr "Badakhshan" "Badakhshan" "Badakhshan" "Badakhshan" ...
## $ year : num 2022 2021 2020 2019 2018 ...
## $ pop : num 1031700 888742 765594 715202 600919 ...
## $ temp : num 2.74 2.3 1.24 1.73 2.25 ...
## $ precip : num 52 35 61.8 61 44.4 ...
## $ nlights: num 0.49843 0.28196 0.00162 0.00153 0.0014 ...
## $ ge : num -1.44 -1.63 -1.59 -1.5 -1.48 -1.36 -1.25 -1.35 -1.36 -1.4 ...
## $ cpt : num -1.18 -1.15 -1.49 -1.42 -1.5 -1.53 -1.54 -1.35 -1.36 -1.45 ...
## $ rol : num -1.66 -1.88 -1.83 -1.74 -1.69 -1.58 -1.52 -1.52 -1.44 -1.61 ...
## $ rq : num -1.27 -1.31 -1.39 -1.11 -1.14 -1.37 -1.34 -1.02 -1.12 -1.19 ...
## $ polstab: num -2.55 -2.52 -2.7 -2.65 -2.75 -2.79 -2.66 -2.56 -2.41 -2.52 ...
## $ voice : num -1.75 -1.57 -1.08 -1.01 -1.01 -0.99 -1.04 -1.12 -1.14 -1.24 ...
## $ con : num 129 250 250 200 177 ...
##write the finished dataframe to a new csv
#write.csv(prov.df, file = "../cleaned_data/hma_adm1_imp_df_clean_20240418.csv")