Import data

# Read the Excel workbook into a tibble, then print a preview of it
data <- read_excel(path = "../00_data/data/myData.xlsx")
data
## # A tibble: 9,355 × 12
##    work_year job_title    job_category      salary_currency salary salary_in_usd
##        <dbl> <chr>        <chr>             <chr>            <dbl>         <dbl>
##  1      2023 AI Architect Machine Learning… USD             305100        305100
##  2      2023 AI Architect Machine Learning… USD             146900        146900
##  3      2023 AI Architect Machine Learning… USD             330000        330000
##  4      2023 AI Architect Machine Learning… USD             204000        204000
##  5      2023 AI Architect Machine Learning… USD             330000        330000
##  6      2023 AI Architect Machine Learning… USD             204000        204000
##  7      2023 AI Architect Machine Learning… EUR             200000        215936
##  8      2023 AI Architect Machine Learning… USD             330000        330000
##  9      2023 AI Architect Machine Learning… USD             204000        204000
## 10      2023 AI Architect Machine Learning… USD             200000        200000
## # ℹ 9,345 more rows
## # ℹ 6 more variables: employee_residence <chr>, experience_level <chr>,
## #   employment_type <chr>, work_setting <chr>, company_location <chr>,
## #   company_size <chr>

Apply the following dplyr verbs to your data

Filter rows

# Keep only the rows whose job category is "Data Science and Research"
filtered <- data %>%
    filter(job_category == "Data Science and Research")

Arrange rows

# Sort the filtered rows from the highest to the lowest USD salary
arrange(filtered, desc(salary_in_usd))
## # A tibble: 3,014 × 12
##    work_year job_title         job_category salary_currency salary salary_in_usd
##        <dbl> <chr>             <chr>        <chr>            <dbl>         <dbl>
##  1      2020 Research Scienti… Data Scienc… USD             450000        450000
##  2      2021 Principal Data S… Data Scienc… USD             416000        416000
##  3      2020 Data Scientist    Data Scienc… USD             412000        412000
##  4      2023 Research Scienti… Data Scienc… USD             405000        405000
##  5      2023 Research Engineer Data Scienc… USD             385000        385000
##  6      2022 Applied Data Sci… Data Scienc… USD             380000        380000
##  7      2023 Director of Data… Data Scienc… USD             375500        375500
##  8      2022 Data Science Tec… Data Scienc… USD             375000        375000
##  9      2023 Research Scienti… Data Scienc… USD             374000        374000
## 10      2023 Data Scientist    Data Scienc… USD             370000        370000
## # ℹ 3,004 more rows
## # ℹ 6 more variables: employee_residence <chr>, experience_level <chr>,
## #   employment_type <chr>, work_setting <chr>, company_location <chr>,
## #   company_size <chr>

Select columns

# Keep only the employee residence and company size columns
data %>%
    select(employee_residence, company_size)
## # A tibble: 9,355 × 2
##    employee_residence company_size
##    <chr>              <chr>       
##  1 United States      M           
##  2 United States      M           
##  3 United States      M           
##  4 United States      M           
##  5 United States      M           
##  6 United States      M           
##  7 Belgium            L           
##  8 United States      M           
##  9 United States      M           
## 10 United States      L           
## # ℹ 9,345 more rows

Add columns

There are no columns to add to my data set.

Summarize by groups

Summarise the data set by calculating the mean salary (in USD) for each job category.

# Mean USD salary per job category, listed from highest to lowest
data %>%
    # Form one group per job category
    group_by(job_category) %>%
    # Average the USD salary within each group. na.rm = TRUE replaces the
    # dangling empty argument of the earlier call and keeps a single missing
    # salary from turning the whole category's mean into NA
    summarise(salary = mean(salary_in_usd, na.rm = TRUE)) %>%
    # Put the highest-paying categories first
    arrange(desc(salary)) %>%
    # Safeguard: make sure the result carries no leftover grouping
    ungroup()
## # A tibble: 10 × 2
##    job_category                    salary
##    <chr>                            <dbl>
##  1 Machine Learning and AI        178926.
##  2 Data Science and Research      163759.
##  3 Data Architecture and Modeling 156002.
##  4 Cloud and Database             155000 
##  5 Data Engineering               146198.
##  6 Leadership and Management      145476.
##  7 BI and Visualization           135092.
##  8 Data Analysis                  108506.
##  9 Data Management and Strategy   103140.
## 10 Data Quality and Operations    100879.