library(StatMatch)
library(dplyr)

The data shall be imputed as follows:

As an example, a dataframe will be created.

# Example reference data set: five complete observations with two
# character columns (Sex1, Sex2) and three numeric columns (Age1, Age2, Len3).
df <- data.frame(
  Sex1 = c("M", "M", "F", "F", "M"),
  Sex2 = c("M", "M", "F", "F", "F"),
  Age1 = c(16, 14, 15, 15, 4),
  Age2 = c(15, 36, 58, 78, 100),
  Len3 = c(33, 44, 33, 22, 33)
)

The new “row” whose missing values are to be imputed is the following:

# New observation to impute: Sex1 and Age1 are known, the NA entries are the
# values still to be filled in.
new_with_null <- data.frame(
  Sex1 = "M",
  Sex2 = NA,
  Age1 = 16,
  Age2 = NA,
  Len3 = NA
)

In this case it is necessary to choose how many “neighbors” will be used to impute the data.

# Number of nearest neighbors used for the imputation.
n_neighbors <- 3

Analyze the data

The first step is to find the columns in which the entered patient (the new row) has missing values.

# Stack the reference data and the new row; the new row becomes the last row.
datos <- rbind(df, new_with_null)

# Count the missing values per column, then keep the names of the columns
# that contain at least one NA.
na_por_columna <- colSums(is.na(datos))
columnas_con_na <- names(na_por_columna)[na_por_columna > 0]
columnas_con_na
## [1] "Sex2" "Age2" "Len3"

Calculate Gower Distance and select nearest neighbors

The next step is to calculate the Gower distance using only the variables shared between the data set and the new “row”, i.e. the columns in which the new row has no missing values.

# Data used to calculate the distance: every column without missing values.
# all_of() states explicitly that columnas_con_na is an external character
# vector of column names; passing a bare external vector to select() is
# ambiguous (it would be read as a column if one had that name) and is
# deprecated in tidyselect.
datos_filtrados <- datos %>%
  select(-all_of(columnas_con_na))

datos_filtrados
##   Sex1 Age1
## 1    M   16
## 2    M   14
## 3    F   15
## 4    F   15
## 5    M    4
## 6    M   16
# Gower distance between every reference row (all rows except the last) and
# the new row (the last row). drop = FALSE keeps both inputs as data frames
# even when only a single column is shared.
distance <- gower.dist(
  data.x = datos_filtrados[-nrow(datos_filtrados), , drop = FALSE],
  data.y = datos_filtrados[nrow(datos_filtrados), , drop = FALSE]
)

distance_array <- array(distance) # We convert the output into an array

distance_array
## [1] 0.00000000 0.08333333 0.54166667 0.54166667 0.50000000

The n nearest neighbors are selected; they are used to calculate the mean (numeric variables) and the mode (non-numeric variables) of the missing variables.

# Distances in ascending order and the n_neighbors smallest values.
distance_array_ordenado <- sort(distance_array)
n_primeros <- head(distance_array_ordenado, n = n_neighbors)

# Nearest Neighbour Indexes.
# order() yields exactly one position per row, so at most n_neighbors rows
# are selected even when several rows are tied at the cut-off distance
# (matching distance values with %in% would select every tied row and relies
# on exact floating-point equality). sort() keeps ascending row order.
indices <- sort(head(order(distance_array), n = n_neighbors))

These are the n rows that would be in the same group as the new “row” with missing values.

# Show the rows of the combined data selected as nearest neighbours.
datos[indices, ]
##   Sex1 Sex2 Age1 Age2 Len3
## 1    M    M   16   15   33
## 2    M    M   14   36   44
## 5    M    F    4  100   33

Impute data

Non-shared variables

# Column names of the full data and of the subset used for the distance.
names_df1 <- colnames(datos)
names_df2 <- colnames(datos_filtrados)

# Columns present in the full data but absent from the distance subset.
columnas_no_compartidas <- setdiff(x = names_df1, y = names_df2)
columnas_no_compartidas
## [1] "Sex2" "Age2" "Len3"

Non-shared numerical variables

# Numeric columns of the full data that are not part of the distance subset.
# The is.numeric mask is applied to the column names instead of subsetting the
# data frame: datos[, mask] collapses to a bare vector when exactly one column
# is numeric (names() is then NULL), and a column mask must not be used as a
# row index on datos_filtrados.
columnas_no_compartidas_numericas <- setdiff(
  names(datos)[vapply(datos, is.numeric, logical(1))],
  names(datos_filtrados)
)
columnas_no_compartidas_numericas
## [1] "Age2" "Len3"

Non-numeric non-shared variables

# The non-shared columns that are not numeric.
columnas_no_compartidas_character <- setdiff(
  x = columnas_no_compartidas,
  y = columnas_no_compartidas_numericas
)
columnas_no_compartidas_character
## [1] "Sex2"
# Impute every non-shared numeric column of the new row with the mean of that
# column over the nearest neighbours. The mean is computed once per column;
# na.rm = TRUE keeps a missing value in one neighbour from turning the
# imputed value into NA.
for (columna in columnas_no_compartidas_numericas) {
  print(columna)
  media_vecinos <- mean(datos[indices, columna], na.rm = TRUE)
  new_with_null[, columna] <- media_vecinos
  print(media_vecinos)
}
## [1] "Age2"
## [1] 50.33333
## [1] "Len3"
## [1] 36.66667
# Impute every non-shared, non-numeric column of the new row with the most
# frequent value (mode) of that column among the nearest neighbours.
for (columna in columnas_no_compartidas_character) {
  print(columna)
  frecuencias <- table(datos[indices, columna])
  moda <- names(frecuencias)[which.max(frecuencias)]
  print(moda)
  new_with_null[, columna] <- moda
}
## [1] "Sex2"
## [1] "M"

Result

# Display the new row; its previously missing columns now hold the values
# assigned by the imputation loops.
new_with_null
##   Sex1 Sex2 Age1     Age2     Len3
## 1    M    M   16 50.33333 36.66667