This dataset is from my Week 5 discussion. It looked at the employment and salaries of healthcare employees from 2000 to 2020. My analysis was to see if there was a trend between healthcare employment and wages.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.0
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
#Importing the CSV
df <- read.csv("https://raw.githubusercontent.com/LeJQC/MSDS/main/DATA%20607/Project%202/healthcare%20salary.csv",skip = 4, header = TRUE, stringsAsFactors = FALSE)
head(df)
## Occupation.title X2000 X2005 X2009
## 1 Health care practitioners and technical occupations
## 2 Audiologists 11,530 10,030 12,590
## 3 Cardiovascular technologists and technicians 40,080 43,560 48,070
## 4 Clinical laboratory technologists and technicians ? ? ?
## 5 Dental hygienists 148,460 161,140 173,900
## 6 Diagnostic medical sonographers 31,760 43,590 51,630
## X2010 X2015 X2016 X2020 X2000.1 X2005.1 X2009.1 X2010.1 X2015.1
## 1
## 2 12,860 12,070 12,310 13,300 22.92 27.72 32.14 33.58 37.22
## 3 48,720 51,400 53,760 55,980 16.81 19.99 23.91 24.38 26.97
## 4 ? 320,550 326,920 326,220 ? ? ? ? 24.91
## 5 177,520 200,550 204,990 194,830 24.99 29.15 32.63 33.02 34.96
## 6 53,010 61,250 65,790 73,920 22.03 26.65 30.60 31.20 34.08
## X2016.1 X2020.1
## 1 NA
## 2 38.12 42.90
## 3 27.45 29.30
## 4 25.13 26.92
## 5 35.31 37.53
## 6 34.49 37.40
#Just getting employment data
df_employment <- df %>%
select(seq(1,8))
years <- c("Occupation",2000,2005,2009,2010,2015,2016,2020)
colnames(df_employment) <- years
#Cleaning up the non-values
df_employment <- df_employment %>%
filter(`2020` >= 0)
#Getting rid of the footnotes
df_employment$Occupation <- gsub("\\\\[345]", "", df_employment$Occupation)
head(df_employment)
## Occupation 2000 2005 2009
## 1 Audiologists 11,530 10,030 12,590
## 2 Cardiovascular technologists and technicians 40,080 43,560 48,070
## 3 Clinical laboratory technologists and technicians ? ? ?
## 4 Dental hygienists 148,460 161,140 173,900
## 5 Diagnostic medical sonographers 31,760 43,590 51,630
## 6 Dietetic technicians 28,010 23,780 24,510
## 2010 2015 2016 2020
## 1 12,860 12,070 12,310 13,300
## 2 48,720 51,400 53,760 55,980
## 3 ? 320,550 326,920 326,220
## 4 177,520 200,550 204,990 194,830
## 5 53,010 61,250 65,790 73,920
## 6 23,890 28,950 32,240 26,430
#Converting the table into long format
df_long <- df_employment %>%
pivot_longer(
cols = seq(2,8),
names_to = "Year",
values_to = "Employment",
values_drop_na = TRUE) %>%
filter(Employment >= 0)
#Calculating the employment rate of change from 2000 to 2020
df_long_growth <- df_long %>%
#Converting Employment column from char to numeric
mutate(Employment = as.numeric(gsub(",", "", Employment)))%>%
filter(Year == 2000 | Year == 2020) %>%
group_by(Occupation) %>%
#There were a couple of occupations that were not around in 2000
filter(n_distinct(Year) == 2) %>%
mutate(pct_rate_change = round(((Employment[Year == 2020] - Employment[Year == 2000]) / Employment[Year == 2000]) * 100)) %>%
arrange(desc(pct_rate_change)) %>%
filter(Year == 2020) %>%
select("Occupation","pct_rate_change")
#Plotting the rate of change
df_long_growth %>%
ggplot(aes(x = Occupation, y = pct_rate_change)) +
geom_bar(stat = "identity") +
coord_flip()+
theme_minimal()+
labs(title = "Employment Percentage change from 2000 to 2020", y = "Percent change")
#Taking the 10 occupations with the highest rate of change I did all of
them at first but the graphs were really congested and I could not get
any information out of it
top_10 <- head(df_long_growth,10)
top_10_occupation <- top_10[1]
top_10_occupation
## # A tibble: 10 × 1
## # Groups: Occupation [10]
## Occupation
## <chr>
## 1 Massage therapists
## 2 Occupational therapy assistants
## 3 Diagnostic medical sonographers
## 4 Physician assistants
## 5 Pharmacy technicians
## 6 Medical assistants
## 7 Physical therapist assistants
## 8 Physical therapists
## 9 Speech-language pathologists
## 10 Medical equipment preparers
#Creating a dataframe with the salary information and joining it with the top 10 occupations by employment growth
df_salary <- df[c(1,9:15)]
colnames(df_salary) <- years
df_salary$Occupation <- gsub("\\\\[345]", "", df_salary$Occupation)
df_salary_top <- semi_join(df_salary, top_10_occupation, by = "Occupation")
#Had to manually convert these to numeric so I can use pivot_longer
df_salary_top$`2000` <- as.numeric(df_salary_top$`2000`)
df_salary_top$`2005` <- as.numeric(df_salary_top$`2005`)
df_salary_top$`2010` <- as.numeric(df_salary_top$`2010`)
df_salary_top$`2009` <- as.numeric(df_salary_top$`2009`)
df_salary_top$`2015` <- as.numeric(df_salary_top$`2015`)
df_salary_top$`2016` <- as.numeric(df_salary_top$`2016`)
df_salary_top
## Occupation 2000 2005 2009 2010 2015 2016 2020
## 1 Diagnostic medical sonographers 22.03 26.65 30.60 31.20 34.08 34.49 37.40
## 2 Pharmacy technicians 10.38 12.19 13.92 14.10 15.23 15.47 17.52
## 3 Physical therapists 27.62 31.42 36.64 37.50 41.25 41.93 44.08
## 4 Physician assistants 29.17 34.17 40.78 41.89 47.73 49.08 55.81
## 5 Speech-language pathologists 23.31 27.89 32.86 33.60 36.97 37.60 40.02
## 6 Massage therapists 15.51 19.33 19.13 19.12 20.76 21.39 22.77
## 7 Medical assistants 11.46 12.58 14.16 14.31 15.34 15.79 17.75
## 8 Medical equipment preparers 10.68 12.42 14.32 14.59 16.80 17.29 19.75
## 9 Occupational therapy assistants 16.76 19.13 24.44 24.66 28.05 28.62 30.49
## 10 Physical therapist assistants 16.52 18.98 23.36 23.95 26.56 27.33 28.58
#Converting the salary dataframe to long data
df_long_salary <- df_salary_top %>%
pivot_longer(
cols = seq(2,8),
names_to = "Year",
values_to = "Salary",
values_drop_na = TRUE)
df_long_salary
## # A tibble: 70 × 3
## Occupation Year Salary
## <chr> <chr> <dbl>
## 1 Diagnostic medical sonographers 2000 22.0
## 2 Diagnostic medical sonographers 2005 26.6
## 3 Diagnostic medical sonographers 2009 30.6
## 4 Diagnostic medical sonographers 2010 31.2
## 5 Diagnostic medical sonographers 2015 34.1
## 6 Diagnostic medical sonographers 2016 34.5
## 7 Diagnostic medical sonographers 2020 37.4
## 8 Pharmacy technicians 2000 10.4
## 9 Pharmacy technicians 2005 12.2
## 10 Pharmacy technicians 2009 13.9
## # … with 60 more rows
#Plotting the Salary by year
df_long_salary %>%
ggplot(aes(x=Year, y= Salary, color=Occupation))+
geom_point()+
geom_line(aes(group=Occupation))
#Calculating percentage change of salary
df_salary_change <- df_long_salary %>%
filter(Year == 2000 | Year == 2020) %>%
group_by(Occupation) %>%
mutate(pct_salary_change = round(((Salary[Year == 2020] - Salary[Year == 2000]) / Salary[Year == 2000]) * 100)) %>%
filter(Year == 2020) %>%
arrange(desc(pct_salary_change)) %>%
select("Occupation","pct_salary_change")
df_salary_change
## # A tibble: 10 × 2
## # Groups: Occupation [10]
## Occupation pct_salary_change
## <chr> <dbl>
## 1 Physician assistants 91
## 2 Medical equipment preparers 85
## 3 Occupational therapy assistants 82
## 4 Physical therapist assistants 73
## 5 Speech-language pathologists 72
## 6 Diagnostic medical sonographers 70
## 7 Pharmacy technicians 69
## 8 Physical therapists 60
## 9 Medical assistants 55
## 10 Massage therapists 47
#Joining that dataframe with the employment growth dataframe
df_change <- left_join(df_salary_change, df_long_growth, by = "Occupation")
df_change
## # A tibble: 10 × 3
## # Groups: Occupation [10]
## Occupation pct_salary_change pct_rate_change
## <chr> <dbl> <dbl>
## 1 Physician assistants 91 126
## 2 Medical equipment preparers 85 74
## 3 Occupational therapy assistants 82 169
## 4 Physical therapist assistants 73 110
## 5 Speech-language pathologists 72 79
## 6 Diagnostic medical sonographers 70 133
## 7 Pharmacy technicians 69 118
## 8 Physical therapists 60 83
## 9 Medical assistants 55 115
## 10 Massage therapists 47 245
#Plotting the results
df_change %>%
ggplot(aes(x= pct_rate_change, y=pct_salary_change, color = Occupation))+
geom_point(size = 5)+
theme_bw()+
labs(title= "Employment Growth vs. Salary Change", x = "Employment Growth (%)", y= "Salary Growth (%)")
Of the 10 occupations with the highest rate of growth from 2000 to 2020, physician assistants have the highest salary increase. Surprisingly, massage therapists, who have the highest employment growth rate, had the least amount of salary growth. It doesn’t look like there is a trend between employment growth and salary growth. However,on the bright side, every occupation’s salary has grown within the past 20 years.