Duplicated rows
# Build a small example data frame with a repeated ID/SSN pair and some NAs.
# stringsAsFactors = FALSE keeps the Name column as character instead of
# converting it to a factor (this is already the default since R 4.0);
# numeric columns such as Age stay numeric either way.
dat1 <- data.frame(
  ID = c(100, 100, 200, 300),
  Name = c("Ha", NA, "Ly", "Ly"),
  Age = c(30, 0, NA, 40),
  SSN = c(1, 1, 2, 2),
  stringsAsFactors = FALSE
)
dat1
## ID Name Age SSN
## 1 100 Ha 30 1
## 2 100 <NA> 0 1
## 3 200 Ly NA 2
## 4 300 Ly 40 2
# Flag rows whose (ID, SSN) pair already appeared in an earlier row;
# the result is a logical vector (TRUE/FALSE) with one entry per row.
duplicated(dat1[c("ID", "SSN")])
## [1] FALSE TRUE FALSE FALSE
# Show the duplicated rows themselves (people sharing both ID and SSN).
dat1[duplicated(dat1[c("ID", "SSN")]), ]
## ID Name Age SSN
## 2 100 <NA> 0 1
#install.packages("dplyr")
library(dplyr)
# Drop duplicated rows based on Name and SSN with dplyr. Only the listed
# columns are returned; pass .keep_all = TRUE to retain the other columns.
dat1 %>% distinct(Name, SSN)
## Name SSN
## 1 Ha 1
## 2 <NA> 1
## 3 Ly 2
# Remove duplicated rows based on Name and SSN with base R (keeps all columns).
# The key columns must be handed to duplicated() as a single data frame: in
# duplicated(dat1$Name, dat1$SSN) the second argument is matched to
# `incomparables`, so rows would be compared on Name alone and SSN ignored
# as part of the key.
dat1[!duplicated(dat1[, c("Name", "SSN")]), ]
## ID Name Age SSN
## 1 100 Ha 30 1
## 2 100 <NA> 0 1
## 3 200 Ly NA 2
# Delete one row using multiple conditions (here: ID is 100 and Age is 0).
# %in% never returns NA, so a row with a missing ID or Age is simply kept.
# With `==` the condition can evaluate to NA (e.g. ID is 100 but Age is NA),
# and an NA in a logical row index produces an all-NA row in the result.
dat1[!(dat1$ID %in% 100 & dat1$Age %in% 0), ]
## ID Name Age SSN
## 1 100 Ha 30 1
## 3 200 Ly NA 2
## 4 300 Ly 40 2