# identify-and-remove-duplicated-rows-irrespective-of-column-order.R

# Identify and remove duplicated rows irrespective of column order
# Return a character matrix in which each row holds the values of the
# corresponding row of `data`, sorted. Rows that contain the same values in a
# different column order therefore become identical.
#
# data: a data frame with atomic columns.
# Returns a character matrix with nrow(data) rows and length(data) columns
# and no dimnames; missing values are kept and placed last in each row.
sort_row_values <- function(data) {
  # Convert column by column with as.character(). Calling apply() directly on
  # a data frame goes through as.matrix(), which format()s numeric columns of
  # a mixed-type data frame to a common width (" 5" vs "10"), so equal values
  # stop comparing equal.
  values <- as.character(
    unlist(lapply(data, as.character), use.names = FALSE)
  )
  sorted <- matrix(values, nrow = nrow(data), ncol = length(data))

  # Sort row by row in place: unlike t(apply(...)), this keeps the matrix
  # shape for zero-row and single-column input.
  for (i in seq_len(nrow(sorted))) {
    # na.last = TRUE keeps NAs (sort() drops them by default, which would
    # make rows ragged); radix sorting uses the C locale, so the canonical
    # order does not depend on the session's collation.
    sorted[i, ] <- sort(sorted[i, ], na.last = TRUE, method = "radix")
  }
  sorted
}

# Create an example data frame
df <- data.frame(
  A = c(1, 2, 3, 4, 5),
  B = c("a", "b", "c", "d", "e"),
  C = c(10, 20, 30, 40, 50)
)

# Add a row with the same data as row 2 (changing the order of columns).
# Binding a character vector turns every column of df into character.
df <- rbind(df, c("b", 2, 20))

# Display the original data frame
print("Original Data Frame:")

## [1] "Original Data Frame:"

print(df)

##   A B  C
## 1 1 a 10
## 2 2 b 20
## 3 3 c 30
## 4 4 d 40
## 5 5 e 50
## 6 b 2 20

# Sort the values in each row so that column order no longer matters
df_sorted <- sort_row_values(df)
df_sorted

##      [,1] [,2] [,3]
## [1,] "1"  "10" "a" 
## [2,] "2"  "20" "b" 
## [3,] "3"  "30" "c" 
## [4,] "4"  "40" "d" 
## [5,] "5"  "50" "e" 
## [6,] "2"  "20" "b"

# Find duplicated rows: TRUE marks a row whose sorted values already appeared
# in an earlier row
duplicate_rows <- duplicated(df_sorted)
duplicate_rows

## [1] FALSE FALSE FALSE FALSE FALSE  TRUE

# Keep only the rows that are NOT flagged as duplicates (despite its name,
# this holds the de-duplicated data frame)
duplicate_rows_df <- df[!duplicate_rows, ]
duplicate_rows_df

##   A B  C
## 1 1 a 10
## 2 2 b 20
## 3 3 c 30
## 4 4 d 40
## 5 5 e 50

# The original data frame is left unchanged
df

##   A B  C
## 1 1 a 10
## 2 2 b 20
## 3 3 c 30
## 4 4 d 40
## 5 5 e 50
## 6 b 2 20

# identify-and-remove-duplicated-rows-irrespective-of-column-order.R
#
# liyix
#
# 2024-01-31