NDMA Data De-Identification

The following are steps undertaken for deidentifying NDMA data. This dataset covers the month of September 2024 for HHA, KIA and MUAC. Since the geocoordinates in the HHA dataset represent household coordinates, we will mask them (random displacement) using the Haversine Formula to randomly distribute a point around a central coordinate within a radius of 2.5 KM and drop other P.I.I.s in the relevant sheets.

1. HHA

# Load required libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(geosphere)
## Warning: package 'geosphere' was built under R version 4.3.3
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
##      (status 2 uses the sf package in place of rgdal)
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.3.3
file_path <- "C:/Users/AAH USER/Downloads/HHA, KIA & MUAC September 2024.xlsx"

# Read the specific HHA sheet into a data frame
hha_sep_2024 <- read.xlsx(file_path, sheet = "HHA", startRow = 2)

The dataset contains household coordinates in columns “Lat” and “Long” which are considered P.I.I’s so we mask the coordinates, verify by plotting a histogram of the distribution of displacement distances of the original and displaced coordinates to establish uniformity.

We check for and deal with outliers if any in the “Lat” and “Long” columns

# Verify the dataset
summary(hha_sep_2024$Lat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -4.54397 -1.70073 -0.02622 -0.07814  1.41724  4.07071
summary(hha_sep_2024$Long)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34.86   36.78   37.87   38.02   39.46   41.86

All good. We proceed to mask the coordinates.

# Create backup columns for original coordinates
hha_sep_2024$Original_Lat <- hha_sep_2024$Lat
hha_sep_2024$Original_Long <- hha_sep_2024$Long

# Function to generate random displaced coordinates with uniform distance distribution
mask_coordinates_uniform <- function(lat, lon, radius_km) {
  R <- 6371  # Earth radius in kilometers
  
  # Random bearing angle (in radians)
  bearing <- runif(1, 0, 2 * pi)
  
  # Random distance uniformly sampled from [0, radius_km]
  rand_dist <- runif(1, 0, radius_km) / R  # Uniformly sampled distance in radians
  
  # Convert original coordinates to radians
  lat_rad <- lat * pi / 180
  lon_rad <- lon * pi / 180
  
  # Calculate new latitude (in radians)
  new_lat <- asin(sin(lat_rad) * cos(rand_dist) + 
                    cos(lat_rad) * sin(rand_dist) * cos(bearing))
  
  # Calculate new longitude (in radians)
  new_lon <- lon_rad + atan2(sin(bearing) * sin(rand_dist) * cos(lat_rad),
                             cos(rand_dist) - sin(lat_rad) * sin(new_lat))
  
  # Convert back to degrees
  new_lat <- new_lat * 180 / pi
  new_lon <- new_lon * 180 / pi
  
  return(c(new_lat, new_lon))
}

# Set displacement radius in kilometers
radius_km <- 2.5  

# Generate masked coordinates for each row using the modified function
masked_coords <- t(apply(hha_sep_2024, 1, function(row) {
  mask_coordinates_uniform(as.numeric(row["Original_Lat"]), as.numeric(row["Original_Long"]), radius_km)
}))

# Replace the original Lat and Long columns with the masked coordinates
hha_sep_2024$Lat <- masked_coords[, 1]
hha_sep_2024$Long <- masked_coords[, 2]

We then evaluate the distribution of the displacement distances before dropping the original coordinates by plotting a histogram and a scatter plot.

# Calculate displacement distances (in kilometers) as before
displacement_distances <- distHaversine(
  cbind(hha_sep_2024$Long, hha_sep_2024$Lat),  # Masked coordinates
  cbind(hha_sep_2024$Original_Long, hha_sep_2024$Original_Lat)  # Original coordinates
) / 1000  # Convert meters to kilometers

# Add displacement distances to the dataset for further analysis
hha_sep_2024$Displacement_Distance <- displacement_distances

# Plot: Histogram of displacement distances
ggplot(hha_sep_2024, aes(x = Displacement_Distance)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Displacement Distances",
       x = "Displacement Distance (km)", y = "Count") +
  theme_minimal()

There are no significant peaks or valleys in the histogram, suggesting that the displacements are indeed more uniformly distributed, as intended.

# Plot: Original vs Masked Coordinates Scatterplot
ggplot() +
  geom_point(data = hha_sep_2024, aes(x = Original_Long, y = Original_Lat, color = "Original"), 
             alpha = 0.5, size = 1, shape = 16) +
  geom_point(data = hha_sep_2024, aes(x = Long, y = Lat, color = "Displaced"), 
             alpha = 0.5, size = 1, shape = 16) +
  labs(title = "Original vs Displaced Coordinates",
       x = "Longitude", y = "Latitude", color = "Coordinate Type") +  # Legend title
  scale_color_manual(values = c("Original" = "red", "Displaced" = "blue")) +  # Custom colors
  theme_minimal() +
  coord_fixed()  # Ensures aspect ratio is 1:1 for accurate geographic representation

The scatterplot displays the original coordinates in red and the displaced coordinates in blue. It helps visualize how the coordinates were shifted.

We then drop the original coordinates column leaving only the masked coordinates columns. We also drop other PII’s which are the “HouseholdName”, “HouseHoldHead”, and “RespondentName”.

# Drop the specified PII columns along with original coordinates
hha_sep_2024 <- hha_sep_2024 %>%
  select(-c(Original_Lat, Original_Long, Displacement_Distance, HouseholdName, HouseHoldHead, RespondentName))  

# Check the updated dataset
head(hha_sep_2024)
##      QID County    SubCounty  Ward LivelihoodZone     Month Year       Lat
## 1 532993 Isiolo Isiolo North Burat  Agro Pastoral September 2024 0.2614619
## 2 532995 Isiolo Isiolo North Burat  Agro Pastoral September 2024 0.2758737
## 3 532996 Isiolo Isiolo North Burat  Agro Pastoral September 2024 0.2770786
## 4 532997 Isiolo Isiolo North Burat  Agro Pastoral September 2024 0.2757905
## 5 532998 Isiolo Isiolo North Burat  Agro Pastoral September 2024 0.2760298
## 6 532999 Isiolo Isiolo North Burat  Agro Pastoral September 2024 0.2697390
##       Long InterviewDate HouseholdCode HeadEducationLevel MainHHIncomeSource
## 1 37.56740      45538.00       ISL0417               None   3. Sale of crops
## 2 37.55013      45544.00       ISL0405               None   3. Sale of crops
## 3 37.54974      45544.00       ISL0430       2. Secondary   3. Sale of crops
## 4 37.55557      45549.00       ISL0429         1. Primary   4. Casual labour
## 5 37.54705      45549.34       ISL0406               None   3. Sale of crops
## 6 37.56004      45549.00       ISL0401         1. Primary   4. Casual labour
##   HeadGender RespondentGender MaleMembers FemaleMembers ChildrenBelow5
## 1     Female           Female           3             2              2
## 2     Female           Female           3             4              2
## 3     Female           Female           4             2              2
## 4     Female           Female           2             3              3
## 5     Female           Female           3             2              2
## 6     Female           Female           5             2              2
##   KeepLivestock MilkAnimals               MilkSource HowOftenMilked
## 1          TRUE        TRUE Own livestock production             NA
## 2          TRUE        TRUE Own livestock production             NA
## 3          TRUE       FALSE                                      NA
## 4          TRUE       FALSE                                      NA
## 5          TRUE       FALSE                     <NA>             NA
## 6          TRUE       FALSE                                      NA
##   AverageMilkedPerDay AverageMilkConsumedPerDay           WhoDrankMilk
## 1                  NA                        NA Children under 5 years
## 2                  NA                        NA Children under 5 years
## 3                  NA                        NA                       
## 4                  NA                        NA                       
## 5                  NA                        NA Children under 5 years
## 6                  NA                        NA                       
##   AverageMilkPrice HarvestedInLastWeeks AcresHarvested BagsHarvested
## 1               NA                FALSE             NA            NA
## 2               NA                FALSE             NA            NA
## 3               NA                 TRUE             NA            NA
## 4               NA                FALSE             NA            NA
## 5               NA                 TRUE             NA            NA
## 6               NA                FALSE             NA            NA
##   HaveFoodStock FoodStockSources DaysStockLast WaterSource1 WaterSource2
## 1         FALSE                             NA       Rivers             
## 2         FALSE         Purchase             3       Rivers             
## 3         FALSE                             NA       Rivers             
## 4          TRUE         Purchase             3       Rivers             
## 5         FALSE             <NA>            NA       Rivers         <NA>
## 6          TRUE         Purchase             3       Rivers             
##   WaterSource3 NormalWaterSource   WhyNotNormalWaterSource
## 1                           TRUE Breakdown of water source
## 2                          FALSE Breakdown of water source
## 3                           TRUE Breakdown of water source
## 4                           TRUE Breakdown of water source
## 5         <NA>              TRUE                      <NA>
## 6                           TRUE Breakdown of water source
##   DaysWaterSourceExpectedToLast DistanceFromWaterSource NoWaterJerryCans
## 1                            12                     2.5                9
## 2                            12                     2.5                8
## 3                            12                     2.6                8
## 4                            12                     2.5                9
## 5                            12                     2.5                8
## 6                            12                     2.6                9
##   JerryCansCost NormalHHWaterConsumption HHPayForWater CostTransportJerryCan
## 1            NA                        5         FALSE                    NA
## 2            NA                        4         FALSE                    NA
## 3            NA                        5         FALSE                    NA
## 4            NA                        5         FALSE                    NA
## 5            NA                        5         FALSE                    NA
## 6            NA                        5         FALSE                    NA
##   TreatWaterBeforeDrinking WaterTreatmentMethodUsed CSI_ReliedOnLess
## 1                    FALSE                                         2
## 2                    FALSE                                         2
## 3                    FALSE                                         4
## 4                    FALSE                                         2
## 5                    FALSE                     <NA>                2
## 6                    FALSE                                         2
##   CSI_BorrowedFood CSI_ReducedNoOfMeals CSI_ReducedPortionMealSize
## 1                0                    3                          1
## 2                0                    1                          0
## 3                1                    3                          2
## 4                0                    3                          1
## 5                0                    3                          2
## 6                0                    3                          1
##   CSI_QuantityForAdult CSI_SoldHouseholdAssets CSI_ReducedNonFoodExpenses
## 1                    0                       1                          1
## 2                    0                       1                          3
## 3                    0                       1                          1
## 4                    0                       1                          1
## 5                    3                       1                          3
## 6                    0                       1                          1
##   CSI_SoldProductiveAssets CSI_SpentSavings CSI_BorrowedMoney CSI_SoldHouseLand
## 1                        1                1                 1                 1
## 2                        1                1                 1                 1
## 3                        1                1                 1                 1
## 4                        1                1                 1                 1
## 5                        1                1                 1                 1
## 6                        1                1                 1                 1
##   CSI_WithdrewChildrenSchool CSI_SoldLastFemaleAnimal CSI_Begging
## 1                          1                        1           1
## 2                          1                        1           1
## 3                          1                        1           1
## 4                          1                        1           1
## 5                          1                        1           1
## 6                          1                        1           1
##   CSI_SoldMoreAnimals HFC_GrainDays HFC_GrainSource HFC_RootsDays
## 1                   1             7               5            NA
## 2                   1             7               5            NA
## 3                   1             7               5            NA
## 4                   1             7               5            NA
## 5                   1             7               5            NA
## 6                   1             7               5            NA
##   HFC_RootsSource HFC_PulsesNutsDays HFC_PulsesNutsSource HFC_OrangeVegDays
## 1              NA                  3                    5                NA
## 2              NA                  3                    5                NA
## 3              NA                  3                    5                NA
## 4              NA                  3                    5                NA
## 5              NA                  0                   NA                NA
## 6              NA                  3                    5                NA
##   HFC_OrangeVegSource HFC_GreenLeafyDays HFC_GreenLeafySource HFC_OtherVegDays
## 1                  NA                 NA                   NA                4
## 2                  NA                 NA                   NA                5
## 3                  NA                 NA                   NA                2
## 4                  NA                 NA                   NA                5
## 5                  NA                 NA                   NA                2
## 6                  NA                 NA                   NA                6
##   HFC_OtherVegSource HFC_OrangeFruitsDays HFC_OrangeFruitsSource
## 1                  5                   NA                     NA
## 2                  5                   NA                     NA
## 3                  5                   NA                     NA
## 4                  5                   NA                     NA
## 5                  5                   NA                     NA
## 6                  5                   NA                     NA
##   HFC_OtherFruitsDays HFC_OtherFruitsSource HFC_MeatDays HFC_MeatSource
## 1                   2                     5            0             10
## 2                   3                     5            1              5
## 3                   2                     5            0             10
## 4                   2                     5            0             10
## 5                   2                     5            0             NA
## 6                   2                     5            0             10
##   HFC_LiverDays HFC_LiverSource HFC_FishDays HFC_EggsDays HFC_EggsSource
## 1            NA              NA           NA           NA             NA
## 2            NA              NA           NA           NA             NA
## 3            NA              NA           NA           NA             NA
## 4            NA              NA           NA           NA             NA
## 5            NA              NA           NA           NA             NA
## 6            NA              NA           NA           NA             NA
##   HFC_MilkDays HFC_MilkSource HFC_OilDays HFC_OilSource HFC_SugarDays
## 1            4              1           6             5             7
## 2            4              1           7             5             7
## 3            0             10           7             5             6
## 4            2              5           7             5             7
## 5            2              1           7             5             6
## 6            2              5           7             5             7
##   HFC_SugarSource HFC_CondimentsDays HFC_CondimentsSource
## 1               5                  0                   NA
## 2               5                  0                   NA
## 3               5                  3                    5
## 4               5                  3                    5
## 5               5                  0                   NA
## 6               5                  0                   NA
##                MainIncomeSource MaleCasualLabour FemaleCasualLabour
## 1               5. Sale of wood               NA                 NA
## 2              1. Sale of crops               NA                 NA
## 3              1. Sale of crops                1                 NA
## 4 3. Sale of livestock products               NA                 NA
## 5              1. Sale of crops               NA                 NA
## 6 3. Sale of livestock products               NA                 NA
##   CasualLabourEarn CharcoalSaleEarn WoodSaleEarn DivisionID CountyID SiteID
## 1               NA               NA           NA         39        4      1
## 2               NA               NA           NA         39        4      1
## 3               NA               NA           NA         39        4      1
## 4               NA               NA           NA         39        4      1
## 5               NA               NA           NA         39        4      1
## 6               NA               NA           NA         39        4      1
##   LivelihoodZoneID DateCaptured
## 1                2           NA
## 2                2           NA
## 3                2           NA
## 4                2           NA
## 5                2           NA
## 6                2           NA

We also have to ensure that the “InterviewDate” column is parsed correctly as a date before saving the worksheet to the new workbook.

# Ensure the column is numeric
hha_sep_2024$InterviewDate <- as.numeric(hha_sep_2024$InterviewDate)

# Convert the numeric date to Date format
hha_sep_2024$InterviewDate <- as.Date(hha_sep_2024$InterviewDate, origin = "1899-12-30")

# View the first few dates to verify the conversion
head(hha_sep_2024$InterviewDate)
## [1] "2024-09-03" "2024-09-09" "2024-09-09" "2024-09-14" "2024-09-14"
## [6] "2024-09-14"

We save the sheet to a new workbook for the deidentified data for NDMA September 2024. We will write subsequent deidentified KIA and MUAC sheets to this workbook.

# Define the path for the new workbook
new_file_path <- "C:/Users/AAH USER/OneDrive - Action Against Hunger USA/Documents/NDMA_DeIdentified/HHA_KIA_MUAC_September_2024.xlsx"

# Create a new workbook
wb <- createWorkbook()

# Add the HHA data to the new workbook
addWorksheet(wb, "HHA")
writeData(wb, "HHA", hha_sep_2024)

# Save the new workbook
saveWorkbook(wb, new_file_path, overwrite = TRUE)

2. KIA

# Load required libraries
library(dplyr)
library(ggplot2)
library(geosphere)
library(openxlsx)

file_path <- "C:/Users/AAH USER/Downloads/HHA, KIA & MUAC September 2024.xlsx"

# Read the specific HHA sheet into a data frame
kia_sep_2024 <- read.xlsx(file_path, sheet = "KIA", startRow = 2)

The dataset contains coordinates, however these are not household coordinates so we do not need to mask them. We ensure the Interview date column is parsed correctly and save this subsequent sheet to the workbook.

# Ensure the column is numeric
kia_sep_2024$InterviewDate <- as.numeric(kia_sep_2024$InterviewDate)

# Convert the numeric date to Date format
kia_sep_2024$InterviewDate <- as.Date(kia_sep_2024$InterviewDate, origin = "1899-12-30")

# View the first few dates to verify the conversion
head(kia_sep_2024$InterviewDate)
## [1] "2024-09-13" "2024-09-12" "2024-09-11" "2024-09-10" "2024-09-05"
## [6] "2024-09-07"

We save this new sheet alongside the previous in the workbook created

# Define the path for the existing Excel workbook
existing_file_path <- "C:/Users/AAH USER/OneDrive - Action Against Hunger USA/Documents/NDMA_DeIdentified/HHA_KIA_MUAC_September_2024.xlsx"

# Load the existing workbook
wb <- loadWorkbook(existing_file_path)

# Add the KIA data to the existing workbook
addWorksheet(wb, "KIA ")
writeData(wb, "KIA ", kia_sep_2024)

# Save the updated workbook
saveWorkbook(wb, existing_file_path, overwrite = TRUE)

3. MUAC

# Load required libraries
library(dplyr)
library(ggplot2)
library(geosphere)
library(openxlsx)

file_path <- "C:/Users/AAH USER/Downloads/HHA, KIA & MUAC September 2024.xlsx"

# Read the specific HHA sheet into a data frame
muac_sep_2024 <- read.xlsx(file_path, sheet = "MUAC", startRow = 2)

This data set has P.I.I’s in the “ChildName” column so we will drop that, ensure the Interview date column is parsed correctly and save this subsequent sheet to the workbook.

# Drop the specified PII column 
muac_sep_2024 <- muac_sep_2024 %>%
  select(-ChildName)  

# Ensure the column is numeric
muac_sep_2024$InterviewDate <- as.numeric(muac_sep_2024$InterviewDate)

# Convert the numeric date to Date format
muac_sep_2024$InterviewDate <- as.Date(muac_sep_2024$InterviewDate, origin = "1899-12-30")

# Check the updated dataset
head(muac_sep_2024)
##   MUACIndicatorID    QID County SubCounty             Ward    LivelihoodZone
## 1         1470356 527698  Kwale   Kinango Chengoni-Samburu Livestock farming
## 2         1470357 527698  Kwale   Kinango Chengoni-Samburu Livestock farming
## 3         1470358 527698  Kwale   Kinango Chengoni-Samburu Livestock farming
## 4         1470359 527698  Kwale   Kinango Chengoni-Samburu Livestock farming
## 5         1470360 527698  Kwale   Kinango Chengoni-Samburu Livestock farming
## 6         1470361 527698  Kwale   Kinango Chengoni-Samburu Livestock farming
##       Month Year HouseholdCode     Gender MUAC MUAC_Color AgeInMonths
## 1 September 2024       KWL0610 Female      146       <NA>          39
## 2 September 2024       KWL0610 Female      140       <NA>          39
## 3 September 2024       KWL0610 Male        146       <NA>          25
## 4 September 2024       KWL0610 Male        146       <NA>          55
## 5 September 2024       KWL0610 Female      145       <NA>          54
## 6 September 2024       KWL0610 Female      146       <NA>          43
##   LiveInHousehold SufferedIllnesses InterviewDate DivisionID CountyID SiteID
## 1           FALSE              <NA>    2024-09-01        107       20    222
## 2           FALSE              <NA>    2024-09-01        107       20    222
## 3           FALSE              <NA>    2024-09-01        107       20    222
## 4           FALSE              <NA>    2024-09-01        107       20    222
## 5           FALSE              <NA>    2024-09-01        107       20    222
## 6           FALSE              <NA>    2024-09-01        107       20    222
##   LivelihoodZoneID
## 1               13
## 2               13
## 3               13
## 4               13
## 5               13
## 6               13

We save this new sheet alongside the previous two in the workbook created

# Define the path for the existing Excel workbook
existing_file_path <- "C:/Users/AAH USER/OneDrive - Action Against Hunger USA/Documents/NDMA_DeIdentified/HHA_KIA_MUAC_September_2024.xlsx"

# Load the existing workbook
wb <- loadWorkbook(existing_file_path)

# Add the MUAC data to the existing workbook
addWorksheet(wb, "MUAC ")
writeData(wb, "MUAC ", muac_sep_2024)

# Save the updated workbook
saveWorkbook(wb, existing_file_path, overwrite = TRUE)