NDMA Data De-Identification

The following are steps undertaken for deidentifying NDMA data. This dataset covers 2020 through to June 2024 for HHA Since the geocoordinates in the HHA dataset represent household coordinates, we will mask them (random displacement) using the Haversine Formula to randomly distribute a point around a central coordinate within a radius of 2.5 KM and drop other P.I.I.s.

1. HHA

# Load required libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(geosphere)
## Warning: package 'geosphere' was built under R version 4.3.3
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
##      (status 2 uses the sf package in place of rgdal)
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.3.3
file_path <- "C:/Users/AAH USER/Downloads/01_HHA_NDMA_2020 to June 2024.xlsx"

# Read the specific HHA sheet into a data frame
hha_2020_2024 <- read.xlsx(file_path, sheet = "HHA")

The dataset contains household coordinates in columns “Lat” and “Long” which are considered P.I.I’s so we mask the coordinates, verify by plotting a histogram of the distribution of displacement distances of the original and displaced coordinates to establish uniformity.

We check for and deal with outliers if any in the “Lat” and “Long” columns

# Verify the dataset
summary(hha_2020_2024$Lat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -8.1896 -1.7200 -0.0629 -0.1178  1.4070 33.5989    1296
summary(hha_2020_2024$Long)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -35.43   36.78   37.88   37.83   39.49  105.39    1296

There seems to be erroneous entries since the max values in the “Lat” and “Long” columns are well above the mean. 1296 rows also dont have entries for coordinates. We deal with this entry.

# Calculate the mean for Lat and Long, ignoring NA values
lat_mean <- mean(hha_2020_2024$Lat, na.rm = TRUE)
lon_mean <- mean(hha_2020_2024$Long, na.rm = TRUE)

lat_sd <- sd(hha_2020_2024$Lat, na.rm = TRUE)
lon_sd <- sd(hha_2020_2024$Long, na.rm = TRUE)

# Calculate Z-scores
hha_2020_2024 <- hha_2020_2024 %>%
  mutate(lat_z = (Lat - lat_mean) / lat_sd,
         lon_z = (Long - lon_mean) / lon_sd)

# Set threshold for identifying outliers
threshold <- 3  # Common threshold for Z-scores

# Replace outliers with NA
hha_2020_2024 <- hha_2020_2024 %>%
  mutate(Lat = ifelse(abs(lat_z) > threshold & !is.na(lat_z), NA, Lat),
         Long = ifelse(abs(lon_z) > threshold & !is.na(lon_z), NA, Long))

# Remove the Z-score columns 
hha_2020_2024 <- hha_2020_2024 %>%
  select(-lat_z, -lon_z)

# Verify the dataset
summary(hha_2020_2024$Lat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -6.4786 -1.7207 -0.0631 -0.1208  1.4063  4.3153    1329
summary(hha_2020_2024$Long)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   34.76   36.78   37.89   38.02   39.50   42.79    2518

1222 rows in total are affected by the outlier and are replaced with NAs, bringing the total to 2518 NAs. We proceed to mask the coordinates.

# Create backup columns for original coordinates
hha_2020_2024$Original_Lat <- hha_2020_2024$Lat
hha_2020_2024$Original_Long <- hha_2020_2024$Long

# Function to generate random displaced coordinates with uniform distance distribution
mask_coordinates_uniform <- function(lat, lon, radius_km) {
  R <- 6371  # Earth radius in kilometers
  
  # Random bearing angle (in radians)
  bearing <- runif(1, 0, 2 * pi)
  
  # Random distance uniformly sampled from [0, radius_km]
  rand_dist <- runif(1, 0, radius_km) / R  # Uniformly sampled distance in radians
  
  # Convert original coordinates to radians
  lat_rad <- lat * pi / 180
  lon_rad <- lon * pi / 180
  
  # Calculate new latitude (in radians)
  new_lat <- asin(sin(lat_rad) * cos(rand_dist) + 
                    cos(lat_rad) * sin(rand_dist) * cos(bearing))
  
  # Calculate new longitude (in radians)
  new_lon <- lon_rad + atan2(sin(bearing) * sin(rand_dist) * cos(lat_rad),
                             cos(rand_dist) - sin(lat_rad) * sin(new_lat))
  
  # Convert back to degrees
  new_lat <- new_lat * 180 / pi
  new_lon <- new_lon * 180 / pi
  
  return(c(new_lat, new_lon))
}

# Set displacement radius in kilometers
radius_km <- 2.5  

# Generate masked coordinates for each row using the modified function
masked_coords <- t(apply(hha_2020_2024, 1, function(row) {
  mask_coordinates_uniform(as.numeric(row["Original_Lat"]), as.numeric(row["Original_Long"]), radius_km)
}))

# Replace the original Lat and Long columns with the masked coordinates
hha_2020_2024$Lat <- masked_coords[, 1]
hha_2020_2024$Long <- masked_coords[, 2]

We then evaluate the distribution of the displacement distances before dropping the original coordinates by plotting a histogram and a scatter plot.

# Calculate displacement distances (in kilometers) as before
displacement_distances <- distHaversine(
  cbind(hha_2020_2024$Long, hha_2020_2024$Lat),  # Masked coordinates
  cbind(hha_2020_2024$Original_Long, hha_2020_2024$Original_Lat)  # Original coordinates
) / 1000  # Convert meters to kilometers

# Add displacement distances to the dataset for further analysis
hha_2020_2024$Displacement_Distance <- displacement_distances

# Plot: Histogram of displacement distances
ggplot(hha_2020_2024, aes(x = Displacement_Distance)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Displacement Distances",
       x = "Displacement Distance (km)", y = "Count") +
  theme_minimal()
## Warning: Removed 2518 rows containing non-finite values (`stat_bin()`).

There are no significant peaks or valleys in the histogram, suggesting that the displacements are indeed more uniformly distributed, as intended.

# Plot: Original vs Masked Coordinates Scatterplot
ggplot() +
  geom_point(data = hha_2020_2024, aes(x = Original_Long, y = Original_Lat, color = "Original"), 
             alpha = 0.5, size = 1, shape = 16) +
  geom_point(data = hha_2020_2024, aes(x = Long, y = Lat, color = "Displaced"), 
             alpha = 0.5, size = 1, shape = 16) +
  labs(title = "Original vs Displaced Coordinates",
       x = "Longitude", y = "Latitude", color = "Coordinate Type") +  # Legend title
  scale_color_manual(values = c("Original" = "red", "Displaced" = "blue")) +  # Custom colors
  theme_minimal() +
  coord_fixed()  # Ensures aspect ratio is 1:1 for accurate geographic representation
## Warning: Removed 2518 rows containing missing values (`geom_point()`).
## Removed 2518 rows containing missing values (`geom_point()`).

The scatterplot displays the original coordinates in red and the displaced coordinates in blue. It helps visualize how the coordinates were shifted.

We then drop the original coordinates column leaving only the masked coordinates columns. We also drop other PII’s which are the “HouseholdName”, “HouseHoldHead”, and “RespondentName”.

# Drop the specified PII columns along with original coordinates
hha_2020_2024 <- hha_2020_2024 %>%
  select(-c(Original_Lat, Original_Long, Displacement_Distance, HouseholdName, HouseHoldHead, RespondentName))  

# Check the updated dataset
head(hha_2020_2024)
##      QID  County     SubCounty     Ward LivelihoodZone   Month Year       Lat
## 1 221362 Baringo Baringo South Mukutani       Pastoral January 2020 0.4532842
## 2 221364 Baringo Baringo South Mukutani       Pastoral January 2020 0.4400325
## 3 221365 Baringo Baringo South Mukutani       Pastoral January 2020 0.4469403
## 4 221366 Baringo Baringo South Mukutani       Pastoral January 2020 0.4466160
## 5 221367 Baringo Baringo South Mukutani       Pastoral January 2020 0.4469495
## 6 221368 Baringo Baringo South Mukutani       Pastoral January 2020 0.4751906
##       Long InterviewDate HouseholdCode HeadEducationLevel
## 1 36.11514      43834.00       BAR0303       2. Secondary
## 2 36.10135      43834.58       BAR0301         1. Primary
## 3 36.11736      43834.00       BAR0304         1. Primary
## 4 36.08576      43834.58       BAR0305         1. Primary
## 5 36.10973      43835.58       BAR0306       2. Secondary
## 6 36.11407      43835.00       BAR0307         1. Primary
##                            MainHHIncomeSource HeadGender RespondentGender
## 1 2. Sale of livestock and livestock products       Male           Female
## 2 2. Sale of livestock and livestock products       Male           Female
## 3 2. Sale of livestock and livestock products       Male           Female
## 4 2. Sale of livestock and livestock products       Male           Female
## 5                        1. Employment/Salary       Male           Female
## 6 2. Sale of livestock and livestock products       Male           Female
##   MaleMembers FemaleMembers ChildrenBelow5 KeepLivestock MilkAnimals
## 1           2             3              1          TRUE       FALSE
## 2           2             3              1          TRUE       FALSE
## 3           3             4              2          TRUE       FALSE
## 4           2             2              1          TRUE       FALSE
## 5           2             5              2          TRUE       FALSE
## 6           5             3              2          TRUE       FALSE
##                                 MilkSource HowOftenMilked AverageMilkedPerDay
## 1 Shared (free from neighbour or relative)             NA                  NA
## 2                 Own livestock production             NA                  NA
## 3                 Own livestock production             NA                  NA
## 4                                 Purchase             NA                  NA
## 5                 Own livestock production             NA                  NA
## 6 Shared (free from neighbour or relative)             NA                  NA
##   AverageMilkConsumedPerDay              WhoDrankMilk AverageMilkPrice
## 1                        NA    Children under 5 years               NA
## 2                        NA Everyone in the household               NA
## 3                        NA    Children under 5 years               NA
## 4                        NA    Children under 5 years               NA
## 5                        NA Everyone in the household               NA
## 6                        NA    Children under 5 years               NA
##   HarvestedInLastWeeks AcresHarvested BagsHarvested HaveFoodStock
## 1                FALSE             NA            NA         FALSE
## 2                FALSE             NA            NA         FALSE
## 3                FALSE             NA            NA         FALSE
## 4                FALSE             NA            NA         FALSE
## 5                FALSE             NA            NA         FALSE
## 6                FALSE             NA            NA         FALSE
##   FoodStockSources DaysStockLast WaterSource1  WaterSource2 WaterSource3
## 1             <NA>            NA    Boreholes Pans and dams         <NA>
## 2             <NA>            NA    Boreholes Pans and dams         <NA>
## 3             <NA>            NA    Boreholes          <NA>         <NA>
## 4             <NA>            NA    Boreholes          <NA>         <NA>
## 5             <NA>            NA    Boreholes          <NA>         <NA>
## 6             <NA>            NA    Boreholes          <NA>         <NA>
##   NormalWaterSource   WhyNotNormalWaterSource DaysWaterSourceExpectedToLast
## 1             FALSE Breakdown of water source                             3
## 2              TRUE                      <NA>                             2
## 3             FALSE Breakdown of water source                             3
## 4              TRUE                      <NA>                             3
## 5              TRUE                      <NA>                             3
## 6             FALSE Breakdown of water source                             4
##   DistanceFromWaterSource NoWaterJerryCans JerryCansCost
## 1                       3                2             2
## 2                       2                3             2
## 3                       2                3             2
## 4                       2                2             2
## 5                       2                3             2
## 6                       2                3             2
##   NormalHHWaterConsumption HHPayForWater CostTransportJerryCan
## 1                       40         FALSE                     0
## 2                       60          TRUE                     0
## 3                       60         FALSE                     0
## 4                       40          TRUE                     0
## 5                       60          TRUE                     0
## 6                       60         FALSE                     0
##   TreatWaterBeforeDrinking WaterTreatmentMethodUsed CSI_ReliedOnLess
## 1                    FALSE                     <NA>                0
## 2                    FALSE                     <NA>                0
## 3                    FALSE                     <NA>                0
## 4                    FALSE                     <NA>                0
## 5                    FALSE                     <NA>                0
## 6                    FALSE                     <NA>                0
##   CSI_BorrowedFood CSI_ReducedNoOfMeals CSI_ReducedPortionMealSize
## 1                0                    0                          3
## 2                2                    0                          3
## 3                0                    2                          3
## 4                0                    0                          0
## 5                0                    0                          0
## 6                0                    0                          3
##   CSI_QuantityForAdult CSI_SoldHouseholdAssets CSI_ReducedNonFoodExpenses
## 1                    0                       4                          1
## 2                    0                       4                          1
## 3                    0                       4                          1
## 4                    0                       4                          1
## 5                    0                       4                          1
## 6                    0                       4                          1
##   CSI_SoldProductiveAssets CSI_SpentSavings CSI_BorrowedMoney CSI_SoldHouseLand
## 1                        4                1                 1                 1
## 2                        4                1                 1                 1
## 3                        4                1                 1                 1
## 4                        4                1                 1                 1
## 5                        4                1                 3                 1
## 6                        4                1                 1                 1
##   CSI_WithdrewChildrenSchool CSI_SoldLastFemaleAnimal CSI_Begging
## 1                          1                        1           1
## 2                          1                        1           1
## 3                          1                        1           1
## 4                          1                        1           1
## 5                          1                        1           1
## 6                          1                        1           1
##   CSI_SoldMoreAnimals HFC_GrainDays HFC_GrainSource HFC_RootsDays
## 1                   1             6               5            NA
## 2                   1             4               5            NA
## 3                   1             4               5            NA
## 4                   1             5               5            NA
## 5                   1             6               5            NA
## 6                   1             4               5            NA
##   HFC_RootsSource HFC_PulsesNutsDays HFC_PulsesNutsSource HFC_OrangeVegDays
## 1              NA                  3                    5                NA
## 2              NA                  3                    3                NA
## 3              NA                  2                    5                NA
## 4              NA                  3                    5                NA
## 5              NA                  5                    5                NA
## 6              NA                  3                    5                NA
##   HFC_OrangeVegSource HFC_GreenLeafyDays HFC_GreenLeafySource HFC_OtherVegDays
## 1                  NA                 NA                   NA                4
## 2                  NA                 NA                   NA                4
## 3                  NA                 NA                   NA                4
## 4                  NA                 NA                   NA                4
## 5                  NA                 NA                   NA                5
## 6                  NA                 NA                   NA                4
##   HFC_OtherVegSource HFC_OrangeFruitsDays HFC_OrangeFruitsSource
## 1                  5                   NA                     NA
## 2                  5                   NA                     NA
## 3                  5                   NA                     NA
## 4                  5                   NA                     NA
## 5                  5                   NA                     NA
## 6                  5                   NA                     NA
##   HFC_OtherFruitsDays HFC_OtherFruitsSource HFC_MeatDays HFC_MeatSource
## 1                   0                    NA            1              1
## 2                   0                    NA            2              1
## 3                   0                    NA            1              1
## 4                   0                    NA            1              1
## 5                   2                     5            1              1
## 6                   0                    NA            2              1
##   HFC_LiverDays HFC_LiverSource HFC_FishDays HFC_EggsDays HFC_EggsSource
## 1            NA              NA           NA           NA             NA
## 2            NA              NA           NA           NA             NA
## 3            NA              NA           NA           NA             NA
## 4            NA              NA           NA           NA             NA
## 5            NA              NA           NA           NA             NA
## 6            NA              NA           NA           NA             NA
##   HFC_MilkDays HFC_MilkSource HFC_OilDays HFC_OilSource HFC_SugarDays
## 1            0              1           4             5             4
## 2            4              1           4             5             5
## 3            0             NA           4             5             4
## 4            0             NA           0            NA             4
## 5            4              1           5             5             7
## 6            0             NA           4             5             5
##   HFC_SugarSource HFC_CondimentsDays HFC_CondimentsSource     MainIncomeSource
## 1               5                  3                    5 2. Sale of livestock
## 2               5                  4                    5 2. Sale of livestock
## 3               5                  3                    5 2. Sale of livestock
## 4               5                  3                    5 2. Sale of livestock
## 5               5                  4                    5 8. Employment income
## 6               5                  3                    5 2. Sale of livestock
##   MaleCasualLabour FemaleCasualLabour CasualLabourEarn CharcoalSaleEarn
## 1                0                  0                0              500
## 2                0                  0                0                0
## 3                0                  0                0             1000
## 4                0                  0                0              500
## 5                0                  0                0                0
## 6                0                  0                0             1000
##   WoodSaleEarn DivisionID CountyID SiteID LivelihoodZoneID DateCaptured
## 1            0       1142        9   1422                1           NA
## 2            0       1142        9   1422                1           NA
## 3            0       1142        9   1422                1           NA
## 4            0       1142        9   1422                1           NA
## 5            0       1142        9   1422                1           NA
## 6            0       1142        9   1422                1           NA

We also have to ensure that the “InterviewDate” column is parsed correctly as a date before saving the worksheet to the new workbook.

# Ensure the column is numeric
hha_2020_2024$InterviewDate <- as.numeric(hha_2020_2024$InterviewDate)

# Convert the numeric date to Date format
hha_2020_2024$InterviewDate <- as.Date(hha_2020_2024$InterviewDate, origin = "1899-12-30")

# View the first few dates to verify the conversion
head(hha_2020_2024$InterviewDate)
## [1] "2020-01-04" "2020-01-04" "2020-01-04" "2020-01-04" "2020-01-05"
## [6] "2020-01-05"

We save the sheet to a new workbook for the deidentified data for HHA 2020_2024.

# Define the path for the new workbook
new_file_path <- "C:/Users/AAH USER/OneDrive - Action Against Hunger USA/Documents/NDMA_DeIdentified/HHA 2020_June 2024.xlsx"

# Create a new workbook
wb <- createWorkbook()

# Add the HHA data to the new workbook
addWorksheet(wb, "HHA")
writeData(wb, "HHA", hha_2020_2024)

# Save the new workbook
saveWorkbook(wb, new_file_path, overwrite = TRUE)