GitHub:
* https://github.com/CUNY-SPS-Data-Science-Program/story-4-seung-m1nsong
I have introduced the term “Data Practitioner” as a generic job descriptor because we have so many different job role titles for individuals whose work activities overlap including Data Scientist, Data Engineer, Data Analyst, Business Analyst, Data Architect, etc.
For this story we will answer the question, “How much do we get paid?” Your analysis and data visualizations must address the variation in average salary based on role descriptor and state.
Used dataset provided from U.S. Bureau of Labor Statics
Reference:
* https://data.bls.gov/oes/#/home
* https://www.monster.com/career-advice/article/tech-jobs-easiest-states-hire
#Data Scientist
data <- read.csv("https://raw.githubusercontent.com/seung-m1nsong/608/main/OES_Report.csv")
#DS_salary_df <- data[-c(1:5), ]
DS_salary_df <- data[, -c(3:7)]
colnames(DS_salary_df) <- gsub("\\.", "_", colnames(DS_salary_df))
colnames(DS_salary_df) <- sub("_2_", "", colnames(DS_salary_df))
DS_salary_df$Area_Name <- gsub("\\(.*\\)", "", DS_salary_df$Area_Name)
colnames(DS_salary_df)[colnames(DS_salary_df) == "Annual_mean_wage"] <- "DS_Annual_mean_wage"
str(DS_salary_df)
## 'data.frame': 51 obs. of 2 variables:
## $ Area_Name : chr "Alabama" "Alaska" "Arizona" "California" ...
## $ DS_Annual_mean_wage: chr " 91570" " 103810" " 104220" " 147390" ...
DS_salary_df$DS_Annual_mean_wage <- as.numeric(DS_salary_df$DS_Annual_mean_wage)
## Warning: NAs introduced by coercion
print(DS_salary_df)
## Area_Name DS_Annual_mean_wage
## 1 Alabama 91570
## 2 Alaska 103810
## 3 Arizona 104220
## 4 California 147390
## 5 Colorado 116560
## 6 Connecticut 110220
## 7 Delaware 133320
## 8 District of Columbia 110190
## 9 Florida 100780
## 10 Georgia 106400
## 11 Hawaii 103470
## 12 Idaho 95520
## 13 Illinois 109870
## 14 Indiana 78890
## 15 Iowa 94190
## 16 Kansas 94920
## 17 Kentucky 86470
## 18 Louisiana 71710
## 19 Maine 101080
## 20 Maryland 118960
## 21 Massachusetts 120340
## 22 Michigan 92820
## 23 Minnesota 111740
## 24 Mississippi 65940
## 25 Missouri 81860
## 26 Montana 74720
## 27 Nebraska 89340
## 28 Nevada 96870
## 29 New Hampshire 87300
## 30 New Jersey 129980
## 31 New Mexico 87090
## 32 New York 128200
## 33 North Carolina 119960
## 34 North Dakota 93340
## 35 Ohio 92080
## 36 Oklahoma 87810
## 37 Oregon 111090
## 38 Pennsylvania 95940
## 39 Puerto Rico 64910
## 40 Rhode Island 117570
## 41 South Carolina 100550
## 42 South Dakota 108430
## 43 Tennessee 112070
## 44 Texas 102230
## 45 Utah 88660
## 46 Vermont NA
## 47 Virginia 133990
## 48 Washington 140780
## 49 West Virginia 72550
## 50 Wisconsin 90760
## 51 Wyoming 128860
#Database Administrators
data1 <- read.csv("https://raw.githubusercontent.com/seung-m1nsong/608/main/OES_Report%20(1).csv")
#DA_salary_df <- data[-c(1:5), ]
DA_salary_df <- data1[, -c(3:7)]
colnames(DA_salary_df) <- gsub("\\.", "_", colnames(DA_salary_df))
colnames(DA_salary_df) <- sub("_2_", "", colnames(DA_salary_df))
DA_salary_df$Area_Name <- gsub("\\(.*\\)", "", DA_salary_df$Area_Name)
colnames(DA_salary_df)[colnames(DA_salary_df) == "Annual_mean_wage"] <- "DA_Annual_mean_wage"
DA_salary_df$DA_Annual_mean_wage <- as.numeric(DA_salary_df$DA_Annual_mean_wage)
print(DA_salary_df)
## Area_Name DA_Annual_mean_wage
## 1 Alabama 87090
## 2 Alaska 92040
## 3 Arizona 99370
## 4 Arkansas 77560
## 5 California 114240
## 6 Colorado 111210
## 7 Connecticut 114910
## 8 Delaware 104620
## 9 District of Columbia 111530
## 10 Florida 100260
## 11 Georgia 99220
## 12 Hawaii 96040
## 13 Idaho 76200
## 14 Illinois 109270
## 15 Indiana 81630
## 16 Iowa 92130
## 17 Kansas 99070
## 18 Kentucky 82420
## 19 Louisiana 102810
## 20 Maine 97330
## 21 Maryland 116970
## 22 Massachusetts 112300
## 23 Michigan 86470
## 24 Minnesota 97640
## 25 Mississippi 75980
## 26 Missouri 90560
## 27 Montana 77750
## 28 Nebraska 90020
## 29 Nevada 90850
## 30 New Hampshire 111680
## 31 New Jersey 122410
## 32 New Mexico 86150
## 33 New York 109150
## 34 North Carolina 102300
## 35 North Dakota 76720
## 36 Ohio 91200
## 37 Oklahoma 79190
## 38 Oregon 99770
## 39 Pennsylvania 89590
## 40 Puerto Rico 57410
## 41 Rhode Island 115340
## 42 South Carolina 92710
## 43 South Dakota 80780
## 44 Tennessee 105920
## 45 Texas 103910
## 46 Utah 94610
## 47 Virginia 103300
## 48 Washington 109590
## 49 West Virginia 70180
## 50 Wisconsin 93590
## 51 Wyoming 73420
#Database Architects
data2 <- read.csv("https://raw.githubusercontent.com/seung-m1nsong/608/main/OES_Report%20(2).csv")
#DA_salary_df <- data[-c(1:5), ]
DArch_salary_df <- data2[, -c(3:7)]
colnames(DArch_salary_df) <- gsub("\\.", "_", colnames(DArch_salary_df))
colnames(DArch_salary_df) <- sub("_2_", "", colnames(DArch_salary_df))
DArch_salary_df$Area_Name <- gsub("\\(.*\\)", "", DArch_salary_df$Area_Name)
colnames(DArch_salary_df)[colnames(DArch_salary_df) == "Annual_mean_wage"] <- "DArch_Annual_mean_wage"
DArch_salary_df$DArch_Annual_mean_wage <- as.numeric(DArch_salary_df$DArch_Annual_mean_wage)
## Warning: NAs introduced by coercion
print(DArch_salary_df)
## Area_Name DArch_Annual_mean_wage
## 1 Alabama 119590
## 2 Alaska 108900
## 3 Arizona 122560
## 4 Arkansas 97210
## 5 California 152960
## 6 Colorado 129930
## 7 Connecticut 127060
## 8 Delaware 142470
## 9 District of Columbia 141820
## 10 Florida 132370
## 11 Georgia 138450
## 12 Hawaii 108910
## 13 Idaho 126450
## 14 Illinois 132120
## 15 Indiana 91780
## 16 Iowa 124760
## 17 Kentucky 99400
## 18 Maine 130480
## 19 Maryland 152230
## 20 Massachusetts 144770
## 21 Michigan 90420
## 22 Minnesota 120830
## 23 Mississippi 96260
## 24 Missouri 107150
## 25 Montana 105250
## 26 Nebraska 116500
## 27 Nevada 87110
## 28 New Hampshire 125420
## 29 New Jersey 145870
## 30 New Mexico 110740
## 31 New York 137390
## 32 North Carolina 124060
## 33 North Dakota 97790
## 34 Ohio 115860
## 35 Oklahoma 120310
## 36 Oregon 125200
## 37 Pennsylvania 120180
## 38 Puerto Rico 70010
## 39 Rhode Island 126390
## 40 South Carolina 116040
## 41 Tennessee 114130
## 42 Texas 146970
## 43 Utah 145760
## 44 Vermont 121650
## 45 Virginia NA
## 46 Washington 159650
## 47 West Virginia 126100
## 48 Wisconsin 121770
#Database Architects
data3 <- read.csv("https://raw.githubusercontent.com/seung-m1nsong/608/main/OES_Report%20(4).csv")
#DA_salary_df <- data[-c(1:5), ]
CP_salary_df <- data3[, -c(3:7)]
colnames(CP_salary_df) <- gsub("\\.", "_", colnames(CP_salary_df))
colnames(CP_salary_df) <- sub("_2_", "", colnames(CP_salary_df))
CP_salary_df$Area_Name <- gsub("\\(.*\\)", "", CP_salary_df$Area_Name)
colnames(CP_salary_df)[colnames(CP_salary_df) == "Annual_mean_wage"] <- "CP_Annual_mean_wage"
CP_salary_df$CP_Annual_mean_wage <- as.numeric(CP_salary_df$CP_Annual_mean_wage)
## Warning: NAs introduced by coercion
print(CP_salary_df)
## Area_Name CP_Annual_mean_wage
## 1 Alabama 91450
## 2 Alaska 97440
## 3 Arizona 86700
## 4 Arkansas 83910
## 5 California 122660
## 6 Colorado 120370
## 7 Connecticut 93320
## 8 Delaware 96650
## 9 District of Columbia 111350
## 10 Florida 93320
## 11 Georgia 94330
## 12 Hawaii NA
## 13 Idaho 82660
## 14 Illinois 84670
## 15 Indiana 94180
## 16 Iowa 87770
## 17 Kansas 64350
## 18 Kentucky 83340
## 19 Louisiana NA
## 20 Maine 81750
## 21 Maryland 106800
## 22 Massachusetts 111270
## 23 Michigan 94500
## 24 Minnesota NA
## 25 Mississippi 71770
## 26 Missouri 76360
## 27 Montana 94920
## 28 Nebraska 92860
## 29 Nevada 97550
## 30 New Hampshire 79060
## 31 New Jersey 112070
## 32 New Mexico 74420
## 33 New York 108940
## 34 North Carolina 104890
## 35 North Dakota 89220
## 36 Ohio 85630
## 37 Oklahoma 88590
## 38 Oregon 102490
## 39 Pennsylvania 97590
## 40 Puerto Rico 52100
## 41 Rhode Island 89170
## 42 South Carolina 98810
## 43 South Dakota 65450
## 44 Tennessee NA
## 45 Texas 93570
## 46 Utah 99840
## 47 Vermont 81600
## 48 Virginia 119410
## 49 Washington 126320
## 50 West Virginia 66400
## 51 Wisconsin 101510
## 52 Wyoming 80850
#Database Architects
data4 <- read.csv("https://raw.githubusercontent.com/seung-m1nsong/608/main/OES_Report%20(6).csv")
#DA_salary_df <- data[-c(1:5), ]
MA_salary_df <- data4[, -c(3:7)]
colnames(MA_salary_df) <- gsub("\\.", "_", colnames(MA_salary_df))
colnames(MA_salary_df) <- sub("_2_", "", colnames(MA_salary_df))
MA_salary_df$Area_Name <- gsub("\\(.*\\)", "", MA_salary_df$Area_Name)
colnames(MA_salary_df)[colnames(MA_salary_df) == "Annual_mean_wage"] <- "MA_Annual_mean_wage"
MA_salary_df$MA_Annual_mean_wage <- as.numeric(MA_salary_df$MA_Annual_mean_wage)
## Warning: NAs introduced by coercion
print(MA_salary_df)
## Area_Name MA_Annual_mean_wage
## 1 Alabama 100680
## 2 Alaska 101820
## 3 Arizona 92950
## 4 Arkansas 78380
## 5 California 103180
## 6 Colorado 104980
## 7 Connecticut 105650
## 8 Delaware 89700
## 9 District of Columbia 115720
## 10 Florida 89760
## 11 Georgia 108840
## 12 Guam 68540
## 13 Hawaii 89950
## 14 Idaho 87090
## 15 Illinois 116650
## 16 Indiana 88770
## 17 Iowa 89390
## 18 Kansas 80940
## 19 Kentucky 95040
## 20 Louisiana 96870
## 21 Maine NA
## 22 Maryland 110260
## 23 Massachusetts 123010
## 24 Michigan 102780
## 25 Minnesota 104710
## 26 Mississippi 96520
## 27 Missouri 102790
## 28 Montana 91540
## 29 Nebraska 83710
## 30 Nevada 82570
## 31 New Hampshire 121320
## 32 New Jersey 126000
## 33 New Mexico 81840
## 34 New York 120730
## 35 North Carolina 102310
## 36 North Dakota 87950
## 37 Ohio 94970
## 38 Oklahoma 87410
## 39 Oregon 94640
## 40 Pennsylvania 94630
## 41 Puerto Rico 59400
## 42 Rhode Island 109920
## 43 South Carolina 87780
## 44 South Dakota 86120
## 45 Tennessee 92970
## 46 Texas 101760
## 47 Utah 87340
## 48 Vermont NA
## 49 Virgin Islands 80220
## 50 Virginia 113020
## 51 Washington 109560
## 52 West Virginia 90030
## 53 Wisconsin 96980
## 54 Wyoming 87600
combined_df <- DS_salary_df %>%
full_join(DA_salary_df, by = "Area_Name") %>%
full_join(DArch_salary_df, by = "Area_Name") %>%
full_join(CP_salary_df, by = "Area_Name") %>%
full_join(MA_salary_df, by = "Area_Name")
head(combined_df)
## Area_Name DS_Annual_mean_wage DA_Annual_mean_wage DArch_Annual_mean_wage
## 1 Alabama 91570 87090 119590
## 2 Alaska 103810 92040 108900
## 3 Arizona 104220 99370 122560
## 4 California 147390 114240 152960
## 5 Colorado 116560 111210 129930
## 6 Connecticut 110220 114910 127060
## CP_Annual_mean_wage MA_Annual_mean_wage
## 1 91450 100680
## 2 97440 101820
## 3 86700 92950
## 4 122660 103180
## 5 120370 104980
## 6 93320 105650
summary(DS_salary_df, na.rm = TRUE)
## Area_Name DS_Annual_mean_wage
## Length:51 Min. : 64910
## Class :character 1st Qu.: 89695
## Mode :character Median :100930
## Mean :102146
## 3rd Qu.:111988
## Max. :147390
## NA's :1
summary(DA_salary_df, na.rm = TRUE)
## Area_Name DA_Annual_mean_wage
## Length:51 Min. : 57410
## Class :character 1st Qu.: 86310
## Mode :character Median : 96040
## Mean : 95224
## 3rd Qu.:105270
## Max. :122410
summary(DArch_salary_df, na.rm = TRUE)
## Area_Name DArch_Annual_mean_wage
## Length:48 Min. : 70010
## Class :character 1st Qu.:109825
## Mode :character Median :122560
## Mean :121681
## 3rd Qu.:132245
## Max. :159650
## NA's :1
summary(CP_salary_df, na.rm = TRUE)
## Area_Name CP_Annual_mean_wage
## Length:52 Min. : 52100
## Class :character 1st Qu.: 83170
## Mode :character Median : 93320
## Mean : 92379
## 3rd Qu.:100258
## Max. :126320
## NA's :4
summary(MA_salary_df, na.rm = TRUE)
## Area_Name MA_Annual_mean_wage
## Length:54 Min. : 59400
## Class :character 1st Qu.: 87735
## Mode :character Median : 94805
## Mean : 96486
## 3rd Qu.:104778
## Max. :126000
## NA's :2
Median: The median salary is the value in the middle when the data is sorted by salary. There is relatively little influence from outliers or extremely high paid individuals. Therefore, because we wanted to identify the central tendency more robustly, we used the median value.
median_values <- c(median(combined_df$DS_Annual_mean_wage, na.rm = TRUE),
median(combined_df$DA_Annual_mean_wage, na.rm = TRUE),
median(combined_df$DArch_Annual_mean_wage, na.rm = TRUE),
median(combined_df$CP_Annual_mean_wage, na.rm = TRUE),
median(combined_df$DA_Annual_mean_wage, na.rm = TRUE))
median_df <- data.frame(Career = c("Data Scientists", "Database Administrators", "Database Architects", "Computer Programmers", "Management Analysts"), Median = median_values)
p <- ggplot(median_df, aes(x = Career, y = Median, fill = Career)) +
geom_bar(stat = "identity", position = "dodge", width = 0.6) +
labs(title = "National Median Annual Wages by Career 2022",
subtitle = "Period: May 2022",
x = "Career",
y = "Wage") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none",
plot.title = element_text(size = 12),
plot.subtitle = element_text(size = 8),
panel.grid = element_blank())
p + geom_text(aes(label = paste0("$", scales::comma(Median))), vjust = -0.5, size = 2) +
scale_fill_manual(values = c(
"Data Scientists" = "#91b1fd",
"Database Administrators" = "#fd9394",
"Database Architects" = "#fd9394",
"Computer Programmers" = "#fd9394",
"Management Analysts" = "#fd9394")) +
scale_y_continuous(labels = scales::comma)
California, Washington, Texas, New York, New Jersey, Massachusetts, and Florida stand out as states with significant tech-related job opportunities nationwide. This analysis examines wage disparities across different occupations within these states.
data <- data.table(
Career = c("Data Scientist", "Database Administrator", "Database Architect", "Computer Programmer", "Management Analyst"),
CA = c(147390, 114240, 152960, 122660, 103180),
WA = c(140780, 109590, 129930, 120370, 104980),
TX = c(102230, 103910, 108910, 111350, 115720),
NY = c(128200, 109150, 137390, 108940, 120730),
NJ = c(129980, 122410, 145870, 112070, 126000),
MA = c(120340, 112300, 144770, 111270, 123010),
FL = c(100780, 100260, 132370, 93320, 89760)
)
data_melted <- melt(data, id.vars = "Career")
# Data Scientist - Median
ds_mean <- 102146
# Database Administrator - Median
da_mean <- 95224
# Database Architect - Mean
darch_mean <- 121681
# Computer Programmer - Mean
cp_mean <- 92379
# Management Analyst - Mean
ma_mean <- 96486
mean_data <- data.frame(
Career = c("Data Scientist", "Database Administrator", "Database Architect", "Computer Programmer", "Management Analyst"),
Mean = c(ds_mean, da_mean, darch_mean, cp_mean, ma_mean)
)
# melt and combine with data_melted
data_melted <- merge(data_melted, mean_data, by = "Career")
# graph
plot <- ggplot(data_melted, aes(x = Career, y = value, fill = variable, label = variable)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "white") +
geom_text(position = position_dodge(width = 0.9), vjust = 1.5, size = 3, format = "comma") +
geom_point(aes(x = Career, y = Mean), color = "red", size = 0.05) +
labs(title = "Annual mean wage by the best states in the US for tech jobs",
subtitle = "Period: May 2022\n(Red dot = National average)",
x = "Career",
y = "Wage") +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
scale_fill_manual(values = c(
"CA" = "#a5c3f2",
"WA" = "#a5c3f2",
"TX" = "#a5c3f2",
"NY" = "#a5c3f2",
"NJ" = "#a5c3f2",
"MA" = "#a5c3f2",
"FL" = "#a5c3f2"
)) +
theme(legend.position = "none",
plot.title = element_text(size = 12),
plot.subtitle = element_text(size = 8),
panel.grid = element_blank())
## Warning in geom_text(position = position_dodge(width = 0.9), vjust = 1.5, :
## Ignoring unknown parameters: `format`
# Save the graph as an image file
ggsave("salary.png", plot, width = 10, height = 8, bg = "white") # width & height
image <- readPNG("C:/Users/SeungminSong/Downloads/608R/salary_final.png")
grid.newpage()
grid.raster(image)