This dataset is from my Week 5 discussion. It looked at the employment and salaries of healthcare employees from 2000 to 2020. My analysis was to see if there was a trend between healthcare employment and wages.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

#Importing the CSV

df <- read.csv("https://raw.githubusercontent.com/LeJQC/MSDS/main/DATA%20607/Project%202/healthcare%20salary.csv",skip = 4, header = TRUE, stringsAsFactors = FALSE)
head(df)
##                                      Occupation.title   X2000   X2005   X2009
## 1 Health care practitioners and technical occupations                        
## 2                                        Audiologists  11,530  10,030  12,590
## 3        Cardiovascular technologists and technicians  40,080  43,560  48,070
## 4   Clinical laboratory technologists and technicians       ?       ?       ?
## 5                                   Dental hygienists 148,460 161,140 173,900
## 6                     Diagnostic medical sonographers  31,760  43,590  51,630
##     X2010   X2015   X2016   X2020 X2000.1 X2005.1 X2009.1 X2010.1 X2015.1
## 1                                                                        
## 2  12,860  12,070  12,310  13,300   22.92   27.72   32.14   33.58   37.22
## 3  48,720  51,400  53,760  55,980   16.81   19.99   23.91   24.38   26.97
## 4       ? 320,550 326,920 326,220       ?       ?       ?       ?   24.91
## 5 177,520 200,550 204,990 194,830   24.99   29.15   32.63   33.02   34.96
## 6  53,010  61,250  65,790  73,920   22.03   26.65   30.60   31.20   34.08
##   X2016.1 X2020.1
## 1              NA
## 2   38.12   42.90
## 3   27.45   29.30
## 4   25.13   26.92
## 5   35.31   37.53
## 6   34.49   37.40

#Just getting employment data

df_employment <- df %>% 
  select(seq(1,8))
years <- c("Occupation",2000,2005,2009,2010,2015,2016,2020)
colnames(df_employment) <- years

#Cleaning up the non-values
df_employment <- df_employment %>% 
  filter(`2020` >= 0)

#Getting rid of the footnotes
df_employment$Occupation <- gsub("\\\\[345]", "", df_employment$Occupation)
head(df_employment)
##                                          Occupation    2000    2005    2009
## 1                                      Audiologists  11,530  10,030  12,590
## 2      Cardiovascular technologists and technicians  40,080  43,560  48,070
## 3 Clinical laboratory technologists and technicians       ?       ?       ?
## 4                                 Dental hygienists 148,460 161,140 173,900
## 5                   Diagnostic medical sonographers  31,760  43,590  51,630
## 6                              Dietetic technicians  28,010  23,780  24,510
##      2010    2015    2016    2020
## 1  12,860  12,070  12,310  13,300
## 2  48,720  51,400  53,760  55,980
## 3       ? 320,550 326,920 326,220
## 4 177,520 200,550 204,990 194,830
## 5  53,010  61,250  65,790  73,920
## 6  23,890  28,950  32,240  26,430

#Converting the table into long format

df_long <- df_employment %>% 
  pivot_longer(
    cols = seq(2,8),
    names_to = "Year",
    values_to = "Employment",
    values_drop_na = TRUE) %>% 
  filter(Employment >= 0)

#Calculating the employment rate of change from 2000 to 2020

df_long_growth <- df_long %>% 
  #Converting Employment column from char to numeric
  mutate(Employment = as.numeric(gsub(",", "", Employment)))%>% 
  filter(Year == 2000 | Year == 2020) %>% 
  group_by(Occupation) %>% 
  #There were a couple of occupations that were not around in 2000
  filter(n_distinct(Year) == 2) %>% 
  mutate(pct_rate_change = round(((Employment[Year == 2020] - Employment[Year == 2000]) / Employment[Year == 2000]) * 100)) %>% 
  arrange(desc(pct_rate_change)) %>%
  filter(Year == 2020) %>% 
  select("Occupation","pct_rate_change")

#Plotting the rate of change

df_long_growth %>%
  ggplot(aes(x = Occupation, y = pct_rate_change)) +
  geom_bar(stat = "identity") +
  coord_flip()+
  theme_minimal()+
  labs(title = "Employment Percentage change from 2000 to 2020", y = "Percent change")

#Taking the 10 occupations with the highest rate of change I did all of them at first but the graphs were really congested and I could not get any information out of it

top_10 <- head(df_long_growth,10)

top_10_occupation <- top_10[1]
top_10_occupation
## # A tibble: 10 × 1
## # Groups:   Occupation [10]
##    Occupation                     
##    <chr>                          
##  1 Massage therapists             
##  2 Occupational therapy assistants
##  3 Diagnostic medical sonographers
##  4 Physician assistants           
##  5 Pharmacy technicians           
##  6 Medical assistants             
##  7 Physical therapist assistants  
##  8 Physical therapists            
##  9 Speech-language pathologists   
## 10 Medical equipment preparers

#Creating a dataframe with the salary information and joining it with the top 10 occupations by employment growth

df_salary <- df[c(1,9:15)]
colnames(df_salary) <- years
df_salary$Occupation <- gsub("\\\\[345]", "", df_salary$Occupation)

df_salary_top <- semi_join(df_salary, top_10_occupation, by = "Occupation")

#Had to manually convert these to numeric so I can use pivot_longer
df_salary_top$`2000` <- as.numeric(df_salary_top$`2000`)
df_salary_top$`2005` <- as.numeric(df_salary_top$`2005`)
df_salary_top$`2010` <- as.numeric(df_salary_top$`2010`)
df_salary_top$`2009` <- as.numeric(df_salary_top$`2009`)
df_salary_top$`2015` <- as.numeric(df_salary_top$`2015`)
df_salary_top$`2016` <- as.numeric(df_salary_top$`2016`)

df_salary_top
##                         Occupation  2000  2005  2009  2010  2015  2016  2020
## 1  Diagnostic medical sonographers 22.03 26.65 30.60 31.20 34.08 34.49 37.40
## 2             Pharmacy technicians 10.38 12.19 13.92 14.10 15.23 15.47 17.52
## 3              Physical therapists 27.62 31.42 36.64 37.50 41.25 41.93 44.08
## 4             Physician assistants 29.17 34.17 40.78 41.89 47.73 49.08 55.81
## 5     Speech-language pathologists 23.31 27.89 32.86 33.60 36.97 37.60 40.02
## 6               Massage therapists 15.51 19.33 19.13 19.12 20.76 21.39 22.77
## 7               Medical assistants 11.46 12.58 14.16 14.31 15.34 15.79 17.75
## 8      Medical equipment preparers 10.68 12.42 14.32 14.59 16.80 17.29 19.75
## 9  Occupational therapy assistants 16.76 19.13 24.44 24.66 28.05 28.62 30.49
## 10   Physical therapist assistants 16.52 18.98 23.36 23.95 26.56 27.33 28.58

#Converting the salary dataframe to long data

df_long_salary <- df_salary_top %>% 
  pivot_longer(
    cols = seq(2,8),
    names_to = "Year",
    values_to = "Salary",
    values_drop_na = TRUE)
df_long_salary
## # A tibble: 70 × 3
##    Occupation                      Year  Salary
##    <chr>                           <chr>  <dbl>
##  1 Diagnostic medical sonographers 2000    22.0
##  2 Diagnostic medical sonographers 2005    26.6
##  3 Diagnostic medical sonographers 2009    30.6
##  4 Diagnostic medical sonographers 2010    31.2
##  5 Diagnostic medical sonographers 2015    34.1
##  6 Diagnostic medical sonographers 2016    34.5
##  7 Diagnostic medical sonographers 2020    37.4
##  8 Pharmacy technicians            2000    10.4
##  9 Pharmacy technicians            2005    12.2
## 10 Pharmacy technicians            2009    13.9
## # … with 60 more rows

#Plotting the Salary by year

df_long_salary %>% 
  ggplot(aes(x=Year, y= Salary, color=Occupation))+
  geom_point()+
  geom_line(aes(group=Occupation))

#Calculating percentage change of salary

df_salary_change <- df_long_salary %>% 
  filter(Year == 2000 | Year == 2020) %>% 
  group_by(Occupation) %>% 
  mutate(pct_salary_change = round(((Salary[Year == 2020] - Salary[Year == 2000]) / Salary[Year == 2000]) * 100)) %>% 
  filter(Year == 2020) %>% 
  arrange(desc(pct_salary_change)) %>% 
  select("Occupation","pct_salary_change")

df_salary_change
## # A tibble: 10 × 2
## # Groups:   Occupation [10]
##    Occupation                      pct_salary_change
##    <chr>                                       <dbl>
##  1 Physician assistants                           91
##  2 Medical equipment preparers                    85
##  3 Occupational therapy assistants                82
##  4 Physical therapist assistants                  73
##  5 Speech-language pathologists                   72
##  6 Diagnostic medical sonographers                70
##  7 Pharmacy technicians                           69
##  8 Physical therapists                            60
##  9 Medical assistants                             55
## 10 Massage therapists                             47

#Joining that dataframe with the employment growth dataframe

df_change <- left_join(df_salary_change, df_long_growth, by = "Occupation")
df_change
## # A tibble: 10 × 3
## # Groups:   Occupation [10]
##    Occupation                      pct_salary_change pct_rate_change
##    <chr>                                       <dbl>           <dbl>
##  1 Physician assistants                           91             126
##  2 Medical equipment preparers                    85              74
##  3 Occupational therapy assistants                82             169
##  4 Physical therapist assistants                  73             110
##  5 Speech-language pathologists                   72              79
##  6 Diagnostic medical sonographers                70             133
##  7 Pharmacy technicians                           69             118
##  8 Physical therapists                            60              83
##  9 Medical assistants                             55             115
## 10 Massage therapists                             47             245

#Plotting the results

df_change %>% 
  ggplot(aes(x= pct_rate_change, y=pct_salary_change, color = Occupation))+
  geom_point(size = 5)+
  theme_bw()+
  labs(title= "Employment Growth vs. Salary Change", x = "Employment Growth (%)", y= "Salary Growth (%)")

Of the 10 occupations with the highest rate of growth from 2000 to 2020, physician assistants have the highest salary increase. Surprisingly, massage therapists, who have the highest employment growth rate, had the least amount of salary growth. It doesn’t look like there is a trend between employment growth and salary growth. However,on the bright side, every occupation’s salary has grown within the past 20 years.