Import data

# excel file
data <- read_excel("data/mydatasal.xlsx")
data

## # A tibble: 32,562 × 13
##      age workclass    degree marital_status occupation relationship race  gender
##    <dbl> <chr>        <chr>  <chr>          <chr>      <chr>        <chr> <chr> 
##  1    39 State-gov    Bache… Never-married  Adm-cleri… Not-in-fami… White Male  
##  2    50 Self-emp-no… Bache… Married-civ-s… Exec-mana… Husband      White Male  
##  3    38 Private      HS-gr… Divorced       Handlers-… Not-in-fami… White Male  
##  4    53 Private      11th   Married-civ-s… Handlers-… Husband      Black Male  
##  5    28 Private      Bache… Married-civ-s… Prof-spec… Wife         Black Female
##  6    37 Private      Maste… Married-civ-s… Exec-mana… Wife         White Female
##  7    49 Private      9th    Married-spous… Other-ser… Not-in-fami… Black Female
##  8    52 Self-emp-no… HS-gr… Married-civ-s… Exec-mana… Husband      White Male  
##  9    31 Private      Maste… Never-married  Prof-spec… Not-in-fami… White Female
## 10    42 Private      Bache… Married-civ-s… Exec-mana… Husband      White Male  
## # ℹ 32,552 more rows
## # ℹ 5 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## #   country <chr>, salary <chr>

Apply the following dplyr verbs to your data

Filter rows

filter(data, age == 1, race == "white")

## # A tibble: 0 × 13
## # ℹ 13 variables: age <dbl>, workclass <chr>, degree <chr>,
## #   marital_status <chr>, occupation <chr>, relationship <chr>, race <chr>,
## #   gender <chr>, Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## #   country <chr>, salary <chr>

Arrange rows

arrange(data, race)

## # A tibble: 32,562 × 13
##      age workclass degree    marital_status occupation relationship race  gender
##    <dbl> <chr>     <chr>     <chr>          <chr>      <chr>        <chr> <chr> 
##  1    34 Private   7th-8th   Married-civ-s… Transport… Husband      Amer… Male  
##  2    35 Private   Some-col… Never-married  Sales      Not-in-fami… Amer… Female
##  3    60 ?         10th      Divorced       ?          Not-in-fami… Amer… Female
##  4    51 Private   HS-grad   Married-civ-s… Craft-rep… Husband      Amer… Male  
##  5    25 Private   Some-col… Never-married  Adm-cleri… Not-in-fami… Amer… Female
##  6    20 Private   HS-grad   Never-married  Handlers-… Own-child    Amer… Male  
##  7    23 ?         Some-col… Never-married  ?          Unmarried    Amer… Female
##  8    29 Private   HS-grad   Never-married  Craft-rep… Not-in-fami… Amer… Male  
##  9    27 ?         Assoc-ac… Married-civ-s… ?          Own-child    Amer… Male  
## 10    31 Private   Assoc-ac… Divorced       Other-ser… Unmarried    Amer… Female
## # ℹ 32,552 more rows
## # ℹ 5 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## #   country <chr>, salary <chr>

Select columns

arrange(data, desc(age))

## # A tibble: 32,562 × 13
##      age workclass degree    marital_status occupation relationship race  gender
##    <dbl> <chr>     <chr>     <chr>          <chr>      <chr>        <chr> <chr> 
##  1    90 Private   HS-grad   Never-married  Other-ser… Not-in-fami… Black Male  
##  2    90 Private   HS-grad   Never-married  Other-ser… Not-in-fami… White Female
##  3    90 Private   Bachelors Married-civ-s… Exec-mana… Husband      White Male  
##  4    90 Private   Some-col… Never-married  Other-ser… Not-in-fami… Asia… Male  
##  5    90 Private   Some-col… Separated      Adm-cleri… Own-child    White Female
##  6    90 Private   11th      Never-married  Handlers-… Own-child    White Male  
##  7    90 ?         Bachelors Widowed        ?          Other-relat… White Female
##  8    90 Private   Some-col… Never-married  Other-ser… Not-in-fami… Asia… Male  
##  9    90 Private   9th       Never-married  Adm-cleri… Not-in-fami… White Female
## 10    90 Local-gov Masters   Married-civ-s… Exec-mana… Husband      White Male  
## # ℹ 32,552 more rows
## # ℹ 5 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## #   country <chr>, salary <chr>

Add columns

mutate(data,
       age_plus_10 = age + 10)

## # A tibble: 32,562 × 14
##      age workclass    degree marital_status occupation relationship race  gender
##    <dbl> <chr>        <chr>  <chr>          <chr>      <chr>        <chr> <chr> 
##  1    39 State-gov    Bache… Never-married  Adm-cleri… Not-in-fami… White Male  
##  2    50 Self-emp-no… Bache… Married-civ-s… Exec-mana… Husband      White Male  
##  3    38 Private      HS-gr… Divorced       Handlers-… Not-in-fami… White Male  
##  4    53 Private      11th   Married-civ-s… Handlers-… Husband      Black Male  
##  5    28 Private      Bache… Married-civ-s… Prof-spec… Wife         Black Female
##  6    37 Private      Maste… Married-civ-s… Exec-mana… Wife         White Female
##  7    49 Private      9th    Married-spous… Other-ser… Not-in-fami… Black Female
##  8    52 Self-emp-no… HS-gr… Married-civ-s… Exec-mana… Husband      White Male  
##  9    31 Private      Maste… Never-married  Prof-spec… Not-in-fami… White Female
## 10    42 Private      Bache… Married-civ-s… Exec-mana… Husband      White Male  
## # ℹ 32,552 more rows
## # ℹ 6 more variables: Column11 <dbl>, Column12 <dbl>, hoursperweek <dbl>,
## #   country <chr>, salary <chr>, age_plus_10 <dbl>

Summarize by groups

data %>%
  group_by(race) %>%
  summarize(
    avg_age = mean(age, na.rm = TRUE),
    count = n()
  )

## # A tibble: 6 × 3
##   race               avg_age count
##   <chr>                <dbl> <int>
## 1 Amer-Indian-Eskimo    37.2   311
## 2 Asian-Pac-Islander    37.7  1039
## 3 Black                 37.8  3124
## 4 Other                 33.5   271
## 5 White                 38.8 27816
## 6 <NA>                 NaN       1

Module 6: Apply 5

Connor Sweeney

Import data

Apply the following dplyr verbs to your data

Filter rows

Arrange rows

Select columns

Add columns

Summarize by groups