# excel file
data <- read_excel("Salaries.xlsx")
data
## # A tibble: 397 × 6
## rank discipline yrs.since.phd yrs.service sex salary
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Prof B 19 18 Male 139750
## 2 Prof B 20 16 Male 173200
## 3 AsstProf B 4 3 Male 79750
## 4 Prof B 45 39 Male 115000
## 5 Prof B 40 41 Male 141500
## 6 AssocProf B 6 6 Male 97000
## 7 Prof B 30 23 Male 175000
## 8 Prof B 45 45 Male 147765
## 9 Prof B 21 20 Male 119250
## 10 Prof B 18 18 Female 129000
## # … with 387 more rows
Used all my variables and there were no unique data points that stood out from the rest.
data %>% count(rank, discipline, yrs.since.phd, yrs.service, sex, salary) %>% filter(n > 1)
## # A tibble: 3 × 7
## rank discipline yrs.since.phd yrs.service sex salary n
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <int>
## 1 AsstProf B 3 3 Male 89942 2
## 2 AsstProf B 4 4 Male 92000 3
## 3 AsstProf B 4 4 Male 92700 2
Divide it using dplyr::select in a way the two have a common variable, which you could use to join the two.
data_1half <- data %>% select(rank:yrs.since.phd)
data_2half <- data %>% select(yrs.since.phd:salary)
Use tidyr::left_join or other joining functions.
left_join(data_1half, data_2half)
## Joining, by = "yrs.since.phd"
## # A tibble: 3,819 × 6
## rank discipline yrs.since.phd yrs.service sex salary
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Prof B 19 18 Male 139750
## 2 Prof B 19 20 Male 101000
## 3 Prof B 19 19 Male 94384
## 4 Prof B 19 16 Male 82100
## 5 Prof B 19 11 Male 106608
## 6 Prof B 19 19 Male 151575
## 7 Prof B 19 18 Male 122100
## 8 Prof B 19 19 Male 86250
## 9 Prof B 19 5 Male 165000
## 10 Prof B 19 6 Male 96200
## # … with 3,809 more rows
data%>%
summarise(sum(str_detect(yrs.service, "5")))
## # A tibble: 1 × 1
## `sum(str_detect(yrs.service, "5"))`
## <int>
## 1 37
ratings <- c("1", "2", "3", "4")
ratings_whole <- str_c(ratings, collapse = "|")
ratings_whole
## [1] "1|2|3|4"
has_whole_rating <- str_subset(data, ratings_whole)
## Warning in stri_subset_regex(string, pattern, omit_na = TRUE, negate = negate, :
## argument is not an atomic vector; coercing
str_extract(ratings_whole, has_whole_rating)
## [1] NA NA NA
data %>% mutate(yrs.service = yrs.since.phd %>% str_replace("[0-20]", "-"))
## # A tibble: 397 × 6
## rank discipline yrs.since.phd yrs.service sex salary
## <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 Prof B 19 -9 Male 139750
## 2 Prof B 20 -0 Male 173200
## 3 AsstProf B 4 4 Male 79750
## 4 Prof B 45 45 Male 115000
## 5 Prof B 40 4- Male 141500
## 6 AssocProf B 6 6 Male 97000
## 7 Prof B 30 3- Male 175000
## 8 Prof B 45 45 Male 147765
## 9 Prof B 21 -1 Male 119250
## 10 Prof B 18 -8 Female 129000
## # … with 387 more rows