# read the data from "MASS" package
library(MASS)
library(knitr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the Animals data set from MASS and show its first six rows as a table
data(list = "Animals", package = "MASS")
knitr::kable(head(Animals, n = 6L))
|                |     body| brain|
|:---------------|--------:|-----:|
|Mountain beaver |     1.35|   8.1|
|Cow             |   465.00| 423.0|
|Grey wolf       |    36.33| 119.5|
|Goat            |    27.66| 115.0|
|Guinea pig      |     1.04|   5.5|
|Dipliodocus     | 11700.00|  50.0|
# Load the mammals data set from MASS and show its first six rows as a table
data(list = "mammals", package = "MASS")
knitr::kable(head(mammals, n = 6L))
|                |    body| brain|
|:---------------|-------:|-----:|
|Arctic fox      |   3.385|  44.5|
|Owl monkey      |   0.480|  15.5|
|Mountain beaver |   1.350|   8.1|
|Cow             | 465.000| 423.0|
|Grey wolf       |  36.330| 119.5|
|Goat            |  27.660| 115.0|
# Animals and mammals have the same two columns (body, brain), so the two
# data frames can be stacked row-wise into a single data frame.
dta <- rbind(Animals, mammals)
str(dta)
## 'data.frame': 90 obs. of 2 variables:
## $ body : num 1.35 465 36.33 27.66 1.04 ...
## $ brain: num 8.1 423 119.5 115 5.5 ...
# Locate the duplicated data: duplicated() returns one logical value per row
# of dta, where TRUE marks a row whose values repeat those of an earlier row.
duplicated(dta)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [49] TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE
## [61] TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
## [73] TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE
# There are 25 duplicated observations.
# Keep the subset in dta1 first and inspect it afterwards: str() returns NULL
# invisibly, so assigning the result of a pipe that ends in str() would store
# NULL in dta1 instead of the duplicated rows.
dta1 <- dta[duplicated(dta), ]
str(dta1)
## 'data.frame': 25 obs. of 2 variables:
## $ body : num 1.35 465 36.33 27.66 1.04 ...
## $ brain: num 8.1 423 119.5 115 5.5 ...
# By removing the duplicated observations, we have 65 observations of 2 variables.
# Keep the de-duplicated rows in dta2 first and inspect them afterwards: str()
# returns NULL invisibly, so assigning the result of a pipe that ends in str()
# would store NULL in dta2 instead of the data frame.
dta2 <- dta[!duplicated(dta), ]
str(dta2)
## 'data.frame': 65 obs. of 2 variables:
## $ body : num 1.35 465 36.33 27.66 1.04 ...
## $ brain: num 8.1 423 119.5 115 5.5 ...
# The result is the same as that of the function unique().
unique(dta) %>% str()
## 'data.frame': 65 obs. of 2 variables:
## $ body : num 1.35 465 36.33 27.66 1.04 ...
## $ brain: num 8.1 423 119.5 115 5.5 ...
# We can also do it with the function distinct() from the dplyr package.
# It keeps only the unique/distinct rows of a data frame; if there are
# duplicate rows, only the first one is preserved.
# str() is called with the piped data frame only; no extra argument is needed.
dta %>%
  dplyr::distinct(body, brain, .keep_all = TRUE) %>%
  str()
## 'data.frame': 65 obs. of 2 variables:
## $ body : num 1.35 465 36.33 27.66 1.04 ...
## $ brain: num 8.1 423 119.5 115 5.5 ...