library(readxl)
# excel file
data <- read_excel("../00_data/data/myData.xlsx")
data
## # A tibble: 9,355 × 12
## work_year job_title job_category salary_currency salary salary_in_usd
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 2023 AI Architect Machine Learning… USD 305100 305100
## 2 2023 AI Architect Machine Learning… USD 146900 146900
## 3 2023 AI Architect Machine Learning… USD 330000 330000
## 4 2023 AI Architect Machine Learning… USD 204000 204000
## 5 2023 AI Architect Machine Learning… USD 330000 330000
## 6 2023 AI Architect Machine Learning… USD 204000 204000
## 7 2023 AI Architect Machine Learning… EUR 200000 215936
## 8 2023 AI Architect Machine Learning… USD 330000 330000
## 9 2023 AI Architect Machine Learning… USD 204000 204000
## 10 2023 AI Architect Machine Learning… USD 200000 200000
## # ℹ 9,345 more rows
## # ℹ 6 more variables: employee_residence <chr>, experience_level <chr>,
## # employment_type <chr>, work_setting <chr>, company_location <chr>,
## # company_size <chr>
data_small <- data %>%
select(job_category, salary_in_usd, company_location, work_year) %>%
filter(company_location %in% c("United States")) %>%
filter(work_year == 2023) %>%
filter(job_category == "Data Analysis")
data_long <- data_small %>%
pivot_longer(c('job_category', 'company_location'),
names_to = "category",
values_to = "location")
data_long
## # A tibble: 1,966 × 4
## salary_in_usd work_year category location
## <dbl> <dbl> <chr> <chr>
## 1 85000 2023 job_category Data Analysis
## 2 85000 2023 company_location United States
## 3 70000 2023 job_category Data Analysis
## 4 70000 2023 company_location United States
## 5 60000 2023 job_category Data Analysis
## 6 60000 2023 company_location United States
## 7 35000 2023 job_category Data Analysis
## 8 35000 2023 company_location United States
## 9 180000 2023 job_category Data Analysis
## 10 180000 2023 company_location United States
## # ℹ 1,956 more rows
I don’t know if there is a useful way to apply this to my data. I think it is not because there are no columns with the same type of data that is comparable. I tried and am confident to use it on Data when it is applicable.
data_long %>%
pivot_wider(names_from = category,
values_from = location)
## Warning: Values from `location` are not uniquely identified; output will contain
## list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
## {data} |>
## dplyr::summarise(n = dplyr::n(), .by = c(salary_in_usd, work_year, category))
## |>
## dplyr::filter(n > 1L)
## # A tibble: 305 × 4
## salary_in_usd work_year job_category company_location
## <dbl> <dbl> <list> <list>
## 1 85000 2023 <chr [24]> <chr [24]>
## 2 70000 2023 <chr [21]> <chr [21]>
## 3 60000 2023 <chr [12]> <chr [12]>
## 4 35000 2023 <chr [1]> <chr [1]>
## 5 180000 2023 <chr [5]> <chr [5]>
## 6 78000 2023 <chr [6]> <chr [6]>
## 7 105000 2023 <chr [24]> <chr [24]>
## 8 99000 2023 <chr [1]> <chr [1]>
## 9 95000 2023 <chr [26]> <chr [26]>
## 10 75000 2023 <chr [40]> <chr [40]>
## # ℹ 295 more rows
Thus no wide to long form
data_analysis <- data %>%
filter(job_category == "Data Analysis")
data_analysis
## # A tibble: 1,457 × 12
## work_year job_title job_category salary_currency salary salary_in_usd
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 2023 BI Data Analyst Data Analysis USD 25000 25000
## 2 2023 BI Data Analyst Data Analysis USD 50000 50000
## 3 2023 BI Data Analyst Data Analysis USD 85000 85000
## 4 2023 BI Data Analyst Data Analysis USD 70000 70000
## 5 2023 BI Data Analyst Data Analysis USD 60000 60000
## 6 2023 BI Data Analyst Data Analysis EUR 67000 72338
## 7 2022 BI Data Analyst Data Analysis EUR 58000 60938
## 8 2022 BI Data Analyst Data Analysis EUR 100000 105066
## 9 2022 BI Data Analyst Data Analysis USD 57000 57000
## 10 2022 BI Data Analyst Data Analysis AUD 65000 45050
## # ℹ 1,447 more rows
## # ℹ 6 more variables: employee_residence <chr>, experience_level <chr>,
## # employment_type <chr>, work_setting <chr>, company_location <chr>,
## # company_size <chr>
data_analysis_sep <- data_analysis %>%
separate(work_year, into = c("century", "year"), sep = 3)
data_analysis_sep
## # A tibble: 1,457 × 13
## century year job_title job_category salary_currency salary salary_in_usd
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 202 3 BI Data Anal… Data Analys… USD 25000 25000
## 2 202 3 BI Data Anal… Data Analys… USD 50000 50000
## 3 202 3 BI Data Anal… Data Analys… USD 85000 85000
## 4 202 3 BI Data Anal… Data Analys… USD 70000 70000
## 5 202 3 BI Data Anal… Data Analys… USD 60000 60000
## 6 202 3 BI Data Anal… Data Analys… EUR 67000 72338
## 7 202 2 BI Data Anal… Data Analys… EUR 58000 60938
## 8 202 2 BI Data Anal… Data Analys… EUR 100000 105066
## 9 202 2 BI Data Anal… Data Analys… USD 57000 57000
## 10 202 2 BI Data Anal… Data Analys… AUD 65000 45050
## # ℹ 1,447 more rows
## # ℹ 6 more variables: employee_residence <chr>, experience_level <chr>,
## # employment_type <chr>, work_setting <chr>, company_location <chr>,
## # company_size <chr>
data_res_loc <- data %>%
unite(col = "residence_company-location", c(employee_residence,company_location), sep = " - ")
data_res_loc
## # A tibble: 9,355 × 11
## work_year job_title job_category salary_currency salary salary_in_usd
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 2023 AI Architect Machine Learning… USD 305100 305100
## 2 2023 AI Architect Machine Learning… USD 146900 146900
## 3 2023 AI Architect Machine Learning… USD 330000 330000
## 4 2023 AI Architect Machine Learning… USD 204000 204000
## 5 2023 AI Architect Machine Learning… USD 330000 330000
## 6 2023 AI Architect Machine Learning… USD 204000 204000
## 7 2023 AI Architect Machine Learning… EUR 200000 215936
## 8 2023 AI Architect Machine Learning… USD 330000 330000
## 9 2023 AI Architect Machine Learning… USD 204000 204000
## 10 2023 AI Architect Machine Learning… USD 200000 200000
## # ℹ 9,345 more rows
## # ℹ 5 more variables: `residence_company-location` <chr>,
## # experience_level <chr>, employment_type <chr>, work_setting <chr>,
## # company_size <chr>
data_analysis_small <- data_analysis %>%
select(job_category, salary_in_usd, company_location, work_year) %>%
filter(company_location %in% c("Canada","United States"))