# data and libraries:

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## Warning: package 'tidyr' was built under R version 4.3.2

## Warning: package 'readr' was built under R version 4.3.2

## Warning: package 'purrr' was built under R version 4.3.2

## Warning: package 'stringr' was built under R version 4.3.2

## Warning: package 'lubridate' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(scales)

## Warning: package 'scales' was built under R version 4.3.2

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

# Load the OES salary extract into `data` and preview its first rows.
# NOTE(review): the path below is machine-specific; confirm it (or point it at
# a project-relative copy of salaries.csv) before running on another computer.
data <- read.csv("C:/Users/Chafiaa/Downloads/salaries.csv")
head(data)

##   AREA AREA_TITLE AREA_TYPE PRIM_STATE NAICS    NAICS_TITLE        I_GROUP
## 1    1    Alabama         2         AL     0 Cross-industry cross-industry
## 2    2     Alaska         2         AK     0 Cross-industry cross-industry
## 3    4    Arizona         2         AZ     0 Cross-industry cross-industry
## 4    5   Arkansas         2         AR     0 Cross-industry cross-industry
## 5    6 California         2         CA     0 Cross-industry cross-industry
## 6    8   Colorado         2         CO     0 Cross-industry cross-industry
##   OWN_CODE OCC_CODE               OCC_TITLE  O_GROUP TOT_EMP EMP_PRSE JOBS_1000
## 1     1235  15-1242 Database Administrators detailed    1740      4.8     0.868
## 2     1235  15-1242 Database Administrators detailed     110     10.7     0.347
## 3     1235  15-1242 Database Administrators detailed    1960      5.4     0.647
## 4     1235  15-1242 Database Administrators detailed     400      9.6     0.321
## 5     1235  15-1242 Database Administrators detailed    7830      3.4     0.444
## 6     1235  15-1242 Database Administrators detailed    2200       12     0.797
##   LOC_QUOTIENT PCT_TOTAL PCT_RPT H_MEAN A_MEAN MEAN_PRSE H_PCT10 H_PCT25
## 1         1.59        NA      NA  41.87  87090       1.2   25.16   31.94
## 2         0.64        NA      NA  44.25  92040       3.9   28.35   33.39
## 3         1.19        NA      NA  47.78  99370       1.4   25.37   35.08
## 4         0.59        NA      NA  37.29  77560       1.8   21.22   27.78
## 5         0.82        NA      NA  54.92 114240       1.7   27.53   36.65
## 6         1.46        NA      NA  53.47 111210       3.5   28.12   40.44
##   H_MEDIAN H_PCT75 H_PCT90     A_PCT10     A_PCT25     A_MEDIAN      A_PCT75
## 1    39.33   50.78   62.10  52,340.00   66,440.00    81,810.00   105,630.00 
## 2    40.06   52.57   60.24  58,970.00   69,450.00    83,330.00   109,340.00 
## 3    49.84   58.24   66.07  52,770.00   72,970.00   103,670.00   121,140.00 
## 4    36.67   44.05   53.55  44,130.00   57,780.00    76,280.00    91,620.00 
## 5    52.88   68.62   82.97  57,260.00   76,230.00   109,990.00   142,720.00 
## 6    51.82   64.99   75.81  58,500.00   84,100.00   107,780.00   135,180.00 
##        A_PCT90 ANNUAL HOURLY
## 1  129,160.00      NA     NA
## 2  125,300.00      NA     NA
## 3  137,430.00      NA     NA
## 4  111,380.00      NA     NA
## 5  172,580.00      NA     NA
## 6  157,680.00      NA     NA

I found the data in the Department of Labor's OES 2022 release. I chose to work with only a few job titles: Data Scientists, Database Administrators, Database Architects, and Management Analysts.

I started with the following bar plot:

# Bar plot of the average annual salary (A_MEAN) for each job title.
#
# geom_col() stacks every row that shares an x position, so plotting the raw
# state-level rows draws the SUM of the per-state means for each title rather
# than a salary. The data is therefore collapsed to one (unweighted) mean per
# job title first; rows with a missing A_MEAN are excluded from that mean.
palette_rainbow <- c("red", "blue", "green", "purple")

title_salary <- data %>%
  filter(!is.na(A_MEAN)) %>%
  group_by(OCC_TITLE) %>%
  summarise(A_MEAN = mean(A_MEAN), .groups = "drop")

title_bp <- ggplot(title_salary, aes(x = " ", y = A_MEAN, fill = OCC_TITLE)) +
  geom_col() +
  # One panel per job title; the fill colour repeats the panel label, so the
  # legend is switched off below.
  facet_grid(. ~ OCC_TITLE) +
  scale_y_continuous(labels = label_comma()) +
  scale_fill_manual(values = palette_rainbow) +
  theme_minimal() +
  theme(
    legend.position = "none",
    text = element_text(size = 12),
    axis.title = element_text(size = 12)
  ) +
  labs(title = "Salaries in USA", x = " ", y = "Salary")

title_bp

## Warning: Removed 4 rows containing missing values (`position_stack()`).

# From the “Salaries in USA” bar plot that I created, we can see that Database Architects have the highest salaries.

# Bar plot of the salaries by state:

# Horizontal bar plot of the average annual salary (A_MEAN) for each state.
#
# geom_col() stacks all rows that share a state, so plotting the raw rows draws
# the SUM of the job-title means per state; a state with a missing title then
# looks lower-paid than it is. The data is collapsed to one (unweighted) mean
# per state first; rows with a missing A_MEAN are excluded from that mean.
state_salary <- data %>%
  filter(!is.na(A_MEAN)) %>%
  group_by(PRIM_STATE) %>%
  summarise(A_MEAN = mean(A_MEAN), .groups = "drop")

state_bp <- ggplot(state_salary, aes(x = PRIM_STATE, y = A_MEAN, fill = PRIM_STATE)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    # The fill colour only repeats the state label on the axis.
    legend.position = "none",
    text = element_text(size = 8),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 8)
  ) +
  labs(title = "Salaries in USA by state", x = "State", y = "Salary")

state_bp

## Warning: Removed 4 rows containing missing values (`position_stack()`).

# From the “Salaries in USA by state” plot, we can say that there are a few states with high salaries, such as WA, CA, NY, NJ, and MA, for the selected job titles (Data Scientists, Database Administrators, Database Architects, and Management Analysts).

# Data filtering: one data frame per job title:

# Split `data` into one data frame per job title (matched on OCC_TITLE).

# data1: Data Scientists
data1 <- filter(data, OCC_TITLE == "Data Scientists")

# data2: Database Administrators
data2 <- filter(data, OCC_TITLE == "Database Administrators")

# data3: Database Architects
data3 <- filter(data, OCC_TITLE == "Database Architects")

# data4: Management Analysts
data4 <- filter(data, OCC_TITLE == "Management Analysts")

# Plot each job title by state in the USA:


# Dot plot of the Data Scientists average annual salary (A_MEAN) by state,
# with states ordered by salary.
#
# - Rows without a reported A_MEAN are dropped explicitly, so they neither
#   trigger a "removed rows" warning nor leave an empty state on the axis.
# - No position adjustment is used: there is one point per state, so dodging
#   has nothing to separate and only emits a "Width not defined" warning.
# - No fill aesthetic is mapped: the default point shape has no fill, so the
#   mapping (and its legend label) had no visible effect.
data1 %>%
  filter(!is.na(A_MEAN)) %>%
  ggplot(aes(x = reorder(PRIM_STATE, -A_MEAN), y = A_MEAN)) +
  geom_point() +
  coord_flip() +
  theme(legend.position = "none", text = element_text(size = 8)) +
  labs(
    title = "Data Scientists salary by state in USA",
    x = "states",
    y = "salaries"
  )

## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`

## Warning: Removed 1 rows containing missing values (`geom_point()`).

According to the “Data Scientists salary by state in USA” plot, the states where data scientists are paid best are CA and WA.

# Dot plot of the Database Administrators average annual salary (A_MEAN) by
# state, with states ordered by salary.
#
# - Rows without a reported A_MEAN are dropped explicitly before plotting.
# - No position adjustment is used: there is one point per state, so dodging
#   has nothing to separate and only emits a "Width not defined" warning.
# - No fill aesthetic is mapped: the default point shape has no fill, so the
#   mapping (and its legend label) had no visible effect.
# - The plot title spelling is corrected ("Administrators", "state").
data2 %>%
  filter(!is.na(A_MEAN)) %>%
  ggplot(aes(x = reorder(PRIM_STATE, -A_MEAN), y = A_MEAN)) +
  geom_point() +
  coord_flip() +
  theme(legend.position = "none", text = element_text(size = 8)) +
  labs(
    title = "Database Administrators salary by state",
    x = "states",
    y = "salaries"
  )

## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`

# For Database Administrators, the salary-by-state plot shows that NJ and MD are the states with the highest pay for this job title.

# Dot plot of the Database Architects average annual salary (A_MEAN) by state,
# with states ordered by salary.
#
# - Rows without a reported A_MEAN are dropped explicitly, so they neither
#   trigger a "removed rows" warning nor leave an empty state on the axis.
# - No position adjustment is used: there is one point per state, so dodging
#   has nothing to separate and only emits a "Width not defined" warning.
# - No fill aesthetic is mapped: the default point shape has no fill, so the
#   mapping (and its legend label) had no visible effect.
data3 %>%
  filter(!is.na(A_MEAN)) %>%
  ggplot(aes(x = reorder(PRIM_STATE, -A_MEAN), y = A_MEAN)) +
  geom_point() +
  coord_flip() +
  theme(legend.position = "none", text = element_text(size = 8)) +
  labs(
    title = "Database Architects salary by state",
    x = " states",
    y = "salaries"
  )

## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`

## Warning: Removed 1 rows containing missing values (`geom_point()`).

# Database Architects have better salaries in WA, CA, and MD; this is shown very clearly in the “Database Architects salary by state” plot.

# Dot plot of the Management Analysts average annual salary (A_MEAN) by state,
# with states ordered by salary. Axis titles are intentionally left blank.
#
# - Rows without a reported A_MEAN are dropped explicitly, so they neither
#   trigger a "removed rows" warning nor leave an empty state on the axis.
# - No position adjustment is used: there is one point per state, so dodging
#   has nothing to separate and only emits a "Width not defined" warning.
# - No fill aesthetic is mapped: the default point shape has no fill, so the
#   mapping (and its legend label) had no visible effect.
data4 %>%
  filter(!is.na(A_MEAN)) %>%
  ggplot(aes(x = reorder(PRIM_STATE, -A_MEAN), y = A_MEAN)) +
  geom_point() +
  coord_flip() +
  theme(legend.position = "none", text = element_text(size = 8)) +
  labs(
    title = "Management Analysts Average Salaries by State",
    x = "",
    y = ""
  )

## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`

## Warning: Removed 2 rows containing missing values (`geom_point()`).

# Finally, the “Management Analysts Average Salaries by State” plot shows that Management Analysts earn high salaries in NJ and MA.

Conclusion: from today's analysis we can say that in big states such as NJ, NY, MD, CA, and WA there is a need for data jobs. These states have many competitive opportunities because of the high demand from different companies and sectors, which offer high salaries to attract people with good skills to work for them.

Story 4

CN

2024-03-18

I found the data in the Department of Labor's OES 2022 release. I chose to work with only a few job titles: Data Scientists, Database Administrators, Database Architects, and Management Analysts.

According to the “Data Scientists salary by state in USA” plot, the states where data scientists are paid best are CA and WA.