Introduction

This file and code consolidates my data cleaning steps into one document. This file does not include any analysis steps.

setwd("~/Desktop/University of Utah PhD /Research/r_code")

Country Level Dataframe

## Reminder this data *INCLUDES* imputed conflict counts which was completed in excel

hma.con.df = read.csv("../data/conflict/hma_initial_model_df_v3_20240304.csv")
## Tidy up the dataframe
## pivot from wide to long

# load tidyverse library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Pivot Longer
hma.con.df.long = hma.con.df %>% 
  pivot_longer(
    cols = -c(country),
    names_sep = "\\.",
    names_to = c(".value", "year")
  )


## Check
head(hma.con.df.long)
## # A tibble: 6 × 15
##   country year      gdp    pop  temp    cd    ge   con   cpt   rol    rq polstab
##   <chr>   <chr>   <dbl>  <int> <dbl> <int> <dbl> <int> <dbl> <dbl> <dbl>   <dbl>
## 1 Afghan… 2010  1.56e10 2.82e7 1.61      4 -1.48 10727 -1.65 -1.87 -1.52   -2.58
## 2 Afghan… 2011  1.82e10 2.92e7 1.40      4 -1.48 10727 -1.6  -1.92 -1.54   -2.5 
## 3 Afghan… 2012  2.02e10 3.05e7 0.223    10 -1.38 10727 -1.43 -1.65 -1.19   -2.42
## 4 Afghan… 2013  2.06e10 3.15e7 1.28      5 -1.4  10727 -1.45 -1.61 -1.19   -2.52
## 5 Afghan… 2014  2.06e10 3.27e7 0.456     3 -1.36 10727 -1.36 -1.44 -1.12   -2.41
## 6 Afghan… 2015  2.00e10 3.38e7 1.09      5 -1.35 10727 -1.35 -1.52 -1.02   -2.56
## # ℹ 3 more variables: voice <dbl>, urbanpop <dbl>, waterwith <dbl>
# save the updated data frame to the clean folder

# imputed dataframe
#write.csv(hma.con.df.long, file = "../cleaned_data/hma_country_df_clean_20240418.csv")

Remove Imputed Years

This section of code drops observations that use estimated conflict counts for countries without data in ACLED. The list below denotes when data collection began: - Afghanistan: 2017 - Bangladesh: 2010 - Bhutan: 2020 - China: 2018 - India: 2016 - Kyrgyz Republic: 2018 - Nepal: 2010 - Pakistan: 2010 - Myanmar: 2010 - Tajikistan: 2018 - Turkmenistan: 2018 - Uzbekistan: 2018

## Duplicate the long dataframe
con2 = hma.con.df.long
##Subset all of the other countries

##subset dataframe based on Afghanistan and years 2017 and on
afg = subset(con2, con2$country == 'Afghanistan' & con2$year >= 2017)

##subset dataframe based on Bangladesh and years 2010 and on
ban = subset(con2, con2$country == 'Bangladesh' & con2$year >= 2010)

##subset dataframe based on Bangladesh and years 2020 and on
bhu = subset(con2, con2$country == 'Bhutan' & con2$year >= 2020)

##subset dataframe based on China and years 2018 and on
chi = subset(con2, con2$country == 'China' & con2$year >= 2018)

##subset dataframe based on India and years 2016 and on
ind = subset(con2, con2$country == 'India' & con2$year >= 2016)

##subset dataframe based on Kyrgyz Republic and years 2018 and on
kyr = subset(con2, con2$country == 'Kyrgyz Republic' & con2$year >= 2018)

##subset dataframe based on Nepal and years 2010 and on
nep = subset(con2, con2$country == 'Nepal' & con2$year >= 2010)

##subset dataframe based on Pakistan and years 2010 and on
pak = subset(con2, con2$country == 'Pakistan' & con2$year >= 2010)

##subset dataframe based on Myanmar and years 2010 and on
mya = subset(con2, con2$country == 'Myanmar' & con2$year >= 2010)

##subset dataframe based on Tajikistan and years 2010 and on
taj = subset(con2, con2$country == 'Tajikistan' & con2$year >= 2018)

##subset dataframe based on Turkmenistan and years 2010 and on
tur = subset(con2, con2$country == 'Turkmenistan' & con2$year >= 2018)

##subset dataframe based on Uzbekistan and years 2010 and on
uzb = subset(con2, con2$country == 'Uzbekistan' & con2$year >= 2018)
##Merge all of the countries back together into one dataframe

##Merge by rows using rbind
con.reduced = rbind(afg, ban, bhu, chi, ind, kyr, nep, pak, mya, taj, tur, uzb)

##Check
#view(con.reduced)
# save the clean data frame to the clean folder
#write.csv(con.reduced, file = "../cleaned_data/hma_country_non_imp_df_clean_20240418.csv")

Administrative Level 1 Dataframe

Note:

  • 2022 population was imputed in excel using a standard growth rate calculation
  • nlights 2010, 2011, 2021, and 2022 are missing
  • Conflict counts by country/year are missing:
    • Afghanistan: 2017
    • Bangladesh: 2010
    • Bhutan: 2020
    • China: 2018
    • India: 2016
    • Kyrgyz Republic: 2018
    • Nepal: 2010
    • Pakistan: 2010
    • Myanmar: 2010
    • Tajikistan: 2018
    • Turkmenistan: 2018
    • Uzbekistan: 2018
## Load the data frame with adm level 1
# 2022 population was imputed in excel using a standard growth rate calculation
# nlights 2010, 2011, 2021, and 2022 are missing
# conflict counts by country/year are missing

hma.con.prov = read.csv("../data/conflict/hma_df_provinces_20240318.csv")
## Tidy up the dataframe
## pivot from wide to long
## Pivot Longer

hma.con.prov.long = hma.con.prov %>% 
  pivot_longer(
    cols = -c(country, adm1),
    names_sep = "\\.",
    names_to = c(".value", "year")
  )


## Check
head(hma.con.prov.long)
## # A tibble: 6 × 14
##   country adm1       year      pop  temp precip  nlights    ge   cpt   rol    rq
##   <chr>   <chr>      <chr>   <dbl> <dbl>  <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AFG     Badakhshan 2022   1.03e6  2.74   52.0 NA       -1.44 -1.18 -1.66 -1.27
## 2 AFG     Badakhshan 2021   8.89e5  2.30   35.0 NA       -1.63 -1.15 -1.88 -1.31
## 3 AFG     Badakhshan 2020   7.66e5  1.24   61.8  0.00162 -1.59 -1.49 -1.83 -1.39
## 4 AFG     Badakhshan 2019   7.15e5  1.73   61.0  0.00153 -1.5  -1.42 -1.74 -1.11
## 5 AFG     Badakhshan 2018   6.01e5  2.25   44.4  0.00140 -1.48 -1.5  -1.69 -1.14
## 6 AFG     Badakhshan 2017   1.19e6  1.85   53.5  0.00167 -1.36 -1.53 -1.58 -1.37
## # ℹ 3 more variables: polstab <dbl>, voice <dbl>, con <int>

Non-imputed Data Frame

# filter out rows with NAs
prov.new = hma.con.prov.long %>% 
  filter(nlights != "NA") %>% 
  filter(con != "NA")

# check to ensure no NAs are in the data frame
summary(prov.new)
##    country              adm1               year                pop           
##  Length:899         Length:899         Length:899         Min.   :     3174  
##  Class :character   Class :character   Class :character   1st Qu.:  1134621  
##  Mode  :character   Mode  :character   Mode  :character   Median :  2940323  
##                                                           Mean   : 16711188  
##                                                           3rd Qu.: 19063332  
##                                                           Max.   :222543957  
##       temp            precip           nlights               ge         
##  Min.   :-3.056   Min.   :  2.633   Min.   : 0.00004   Min.   :-1.5900  
##  1st Qu.:12.690   1st Qu.: 41.803   1st Qu.: 0.01986   1st Qu.:-1.1700  
##  Median :19.986   Median : 92.840   Median : 0.12605   Median :-0.7400  
##  Mean   :18.271   Mean   :105.605   Mean   : 1.16155   Mean   :-0.5845  
##  3rd Qu.:25.630   3rd Qu.:145.229   3rd Qu.: 0.46848   3rd Qu.: 0.0900  
##  Max.   :29.199   Max.   :372.400   Max.   :87.28505   Max.   : 0.6800  
##       cpt               rol                rq             polstab      
##  Min.   :-1.5300   Min.   :-1.8300   Min.   :-2.0500   Min.   :-2.790  
##  1st Qu.:-1.0700   1st Qu.:-1.1900   1st Qu.:-1.1100   1st Qu.:-1.510  
##  Median :-0.8000   Median :-0.7800   Median :-0.7500   Median :-0.960  
##  Mean   :-0.7279   Mean   :-0.7665   Mean   :-0.7634   Mean   :-1.186  
##  3rd Qu.:-0.3000   3rd Qu.:-0.1000   3rd Qu.:-0.3000   3rd Qu.:-0.560  
##  Max.   : 1.6200   Max.   : 0.5800   Max.   :-0.1300   Max.   : 1.010  
##      voice              con        
##  Min.   :-2.1800   Min.   :   0.0  
##  1st Qu.:-1.2200   1st Qu.:  16.0  
##  Median :-0.8600   Median :  92.0  
##  Mean   :-0.7385   Mean   : 263.9  
##  3rd Qu.:-0.1500   3rd Qu.: 303.0  
##  Max.   : 0.4400   Max.   :5129.0
# write the non-imputed data frame to csv
#write.csv(prov.new, file = "../cleaned_data/hma_adm1_non_imp_df_clean_20240418.csv")

Imputation Data Frame

The following imputation uses the missForest R package. Details on the imputation procedures can be found here.

Note: missForest requires the use of a data frame object (not a tibble), factors (not characters) for non-numeric data, and cannot impute with a large number of classes. In this case we change country, adm1, and year in to factors or numerics. We subsequently removed the adm1 from the data frame for imputation, then return it afterwards.

Additionally, the missForest object stores two things: 1. ximp - this contains the imputed data frame. 2. OOB Error - this is the Normalized RMSE error associated with the imputation.

#Load missForest
library(missForest)
# create a new data frame
prov.imp.temp = hma.con.prov.long

##Convert Characters into factors
prov.imp.temp$country = as.factor(prov.imp.temp$country)
prov.imp.temp$adm1 = as.factor(prov.imp.temp$adm1)
prov.imp.temp$year = as.numeric(prov.imp.temp$year)

##Convert to a dataframe (not tibble) for missForest
prov.imp.temp = as.data.frame(prov.imp.temp)

##Remove the adm1 levels for missForest to run correctly
hma.prov.ex = subset(prov.imp.temp, select = -adm1)
##Impute using missForest
prov.miss.forest = missForest(hma.prov.ex)
          

##Check object
##two portions
# ximp = stores the imputed data frame
# OOBerror stores the error associated with the imputation
summary(prov.miss.forest)
##          Length Class      Mode   
## ximp     13     data.frame list   
## OOBerror  2     -none-     numeric
##Check the imputation to see how well it did
##nlights
plot(density(hma.prov.ex$nlights, na.rm=TRUE))
lines(density(prov.miss.forest$ximp$nlights), col = "red", lty = 2)

##conflict
plot(density(hma.prov.ex$con, na.rm=TRUE))
lines(density(prov.miss.forest$ximp$con), col = "red", lty = 2)

##Create a new dataframe with the imputed values
prov.df = prov.miss.forest$ximp

##Add back in the adm1
prov.df$adm1 = hma.con.prov.long$adm1

##relocate adm1 column after country
prov.df = prov.df %>% 
  relocate(adm1, .after = country)

##Check
str(prov.df)
## 'data.frame':    2535 obs. of  14 variables:
##  $ country: Factor w/ 12 levels "AFG","BGD","BTN",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ adm1   : chr  "Badakhshan" "Badakhshan" "Badakhshan" "Badakhshan" ...
##  $ year   : num  2022 2021 2020 2019 2018 ...
##  $ pop    : num  1031700 888742 765594 715202 600919 ...
##  $ temp   : num  2.74 2.3 1.24 1.73 2.25 ...
##  $ precip : num  52 35 61.8 61 44.4 ...
##  $ nlights: num  0.49843 0.28196 0.00162 0.00153 0.0014 ...
##  $ ge     : num  -1.44 -1.63 -1.59 -1.5 -1.48 -1.36 -1.25 -1.35 -1.36 -1.4 ...
##  $ cpt    : num  -1.18 -1.15 -1.49 -1.42 -1.5 -1.53 -1.54 -1.35 -1.36 -1.45 ...
##  $ rol    : num  -1.66 -1.88 -1.83 -1.74 -1.69 -1.58 -1.52 -1.52 -1.44 -1.61 ...
##  $ rq     : num  -1.27 -1.31 -1.39 -1.11 -1.14 -1.37 -1.34 -1.02 -1.12 -1.19 ...
##  $ polstab: num  -2.55 -2.52 -2.7 -2.65 -2.75 -2.79 -2.66 -2.56 -2.41 -2.52 ...
##  $ voice  : num  -1.75 -1.57 -1.08 -1.01 -1.01 -0.99 -1.04 -1.12 -1.14 -1.24 ...
##  $ con    : num  129 250 250 200 177 ...
##write the finished dataframe to a new csv
#write.csv(prov.df, file = "../cleaned_data/hma_adm1_imp_df_clean_20240418.csv")