Import data
# excel file
data <- read_excel("data/mydatasal.xlsx")
data
## # A tibble: 32,562 × 13
## age workclass degree marital_status occupation relationship race gender
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 39 State-gov Bache… Never-married Adm-cleri… Not-in-fami… White Male
## 2 50 Self-emp-no… Bache… Married-civ-s… Exec-mana… Husband White Male
## 3 38 Private HS-gr… Divorced Handlers-… Not-in-fami… White Male
## 4 53 Private 11th Married-civ-s… Handlers-… Husband Black Male
## 5 28 Private Bache… Married-civ-s… Prof-spec… Wife Black Female
## 6 37 Private Maste… Married-civ-s… Exec-mana… Wife White Female
## 7 49 Private 9th Married-spous… Other-ser… Not-in-fami… Black Female
## 8 52 Self-emp-no… HS-gr… Married-civ-s… Exec-mana… Husband White Male
## 9 31 Private Maste… Never-married Prof-spec… Not-in-fami… White Female
## 10 42 Private Bache… Married-civ-s… Exec-mana… Husband White Male
## # ℹ 32,552 more rows
## # ℹ 5 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## # country <chr>, salary <chr>
Apply the following dplyr verbs to your data
Filter rows
filter(data, age == 1, race == "white")
## # A tibble: 0 × 13
## # ℹ 13 variables: age <dbl>, workclass <chr>, degree <chr>,
## # marital_status <chr>, occupation <chr>, relationship <chr>, race <chr>,
## # gender <chr>, Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## # country <chr>, salary <chr>
Arrange rows
arrange(data, race)
## # A tibble: 32,562 × 13
## age workclass degree marital_status occupation relationship race gender
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 34 Private 7th-8th Married-civ-s… Transport… Husband Amer… Male
## 2 35 Private Some-col… Never-married Sales Not-in-fami… Amer… Female
## 3 60 ? 10th Divorced ? Not-in-fami… Amer… Female
## 4 51 Private HS-grad Married-civ-s… Craft-rep… Husband Amer… Male
## 5 25 Private Some-col… Never-married Adm-cleri… Not-in-fami… Amer… Female
## 6 20 Private HS-grad Never-married Handlers-… Own-child Amer… Male
## 7 23 ? Some-col… Never-married ? Unmarried Amer… Female
## 8 29 Private HS-grad Never-married Craft-rep… Not-in-fami… Amer… Male
## 9 27 ? Assoc-ac… Married-civ-s… ? Own-child Amer… Male
## 10 31 Private Assoc-ac… Divorced Other-ser… Unmarried Amer… Female
## # ℹ 32,552 more rows
## # ℹ 5 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## # country <chr>, salary <chr>
Select columns
arrange(data, desc(age))
## # A tibble: 32,562 × 13
## age workclass degree marital_status occupation relationship race gender
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 90 Private HS-grad Never-married Other-ser… Not-in-fami… Black Male
## 2 90 Private HS-grad Never-married Other-ser… Not-in-fami… White Female
## 3 90 Private Bachelors Married-civ-s… Exec-mana… Husband White Male
## 4 90 Private Some-col… Never-married Other-ser… Not-in-fami… Asia… Male
## 5 90 Private Some-col… Separated Adm-cleri… Own-child White Female
## 6 90 Private 11th Never-married Handlers-… Own-child White Male
## 7 90 ? Bachelors Widowed ? Other-relat… White Female
## 8 90 Private Some-col… Never-married Other-ser… Not-in-fami… Asia… Male
## 9 90 Private 9th Never-married Adm-cleri… Not-in-fami… White Female
## 10 90 Local-gov Masters Married-civ-s… Exec-mana… Husband White Male
## # ℹ 32,552 more rows
## # ℹ 5 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## # country <chr>, salary <chr>
Add columns
mutate(data,
age_plus_10 = age + 10)
## # A tibble: 32,562 × 14
## age workclass degree marital_status occupation relationship race gender
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 39 State-gov Bache… Never-married Adm-cleri… Not-in-fami… White Male
## 2 50 Self-emp-no… Bache… Married-civ-s… Exec-mana… Husband White Male
## 3 38 Private HS-gr… Divorced Handlers-… Not-in-fami… White Male
## 4 53 Private 11th Married-civ-s… Handlers-… Husband Black Male
## 5 28 Private Bache… Married-civ-s… Prof-spec… Wife Black Female
## 6 37 Private Maste… Married-civ-s… Exec-mana… Wife White Female
## 7 49 Private 9th Married-spous… Other-ser… Not-in-fami… Black Female
## 8 52 Self-emp-no… HS-gr… Married-civ-s… Exec-mana… Husband White Male
## 9 31 Private Maste… Never-married Prof-spec… Not-in-fami… White Female
## 10 42 Private Bache… Married-civ-s… Exec-mana… Husband White Male
## # ℹ 32,552 more rows
## # ℹ 6 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## # country <chr>, salary <chr>, age_plus_10 <dbl>
Summarize by groups
data %>%
group_by(race) %>%
summarize(
avg_age = mean(age, na.rm = TRUE),
count = n()
)
## # A tibble: 6 × 3
## race avg_age count
## <chr> <dbl> <int>
## 1 Amer-Indian-Eskimo 37.2 311
## 2 Asian-Pac-Islander 37.7 1039
## 3 Black 37.8 3124
## 4 Other 33.5 271
## 5 White 38.8 27816
## 6 <NA> NaN 1