# data and libraries:
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
## Warning: package 'scales' was built under R version 4.3.2
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Load the salary table from CSV and preview its first six rows.
# NOTE(review): the path is an absolute, machine-specific location; the script
# only runs where this file exists. The percentile columns (A_PCT10, ...) print
# with thousands separators, so they are presumably read as character - confirm
# before doing arithmetic on them.
data <- read.csv(file = "C:/Users/Chafiaa/Downloads/salaries.csv")
head(data, n = 6L)
## AREA AREA_TITLE AREA_TYPE PRIM_STATE NAICS NAICS_TITLE I_GROUP
## 1 1 Alabama 2 AL 0 Cross-industry cross-industry
## 2 2 Alaska 2 AK 0 Cross-industry cross-industry
## 3 4 Arizona 2 AZ 0 Cross-industry cross-industry
## 4 5 Arkansas 2 AR 0 Cross-industry cross-industry
## 5 6 California 2 CA 0 Cross-industry cross-industry
## 6 8 Colorado 2 CO 0 Cross-industry cross-industry
## OWN_CODE OCC_CODE OCC_TITLE O_GROUP TOT_EMP EMP_PRSE JOBS_1000
## 1 1235 15-1242 Database Administrators detailed 1740 4.8 0.868
## 2 1235 15-1242 Database Administrators detailed 110 10.7 0.347
## 3 1235 15-1242 Database Administrators detailed 1960 5.4 0.647
## 4 1235 15-1242 Database Administrators detailed 400 9.6 0.321
## 5 1235 15-1242 Database Administrators detailed 7830 3.4 0.444
## 6 1235 15-1242 Database Administrators detailed 2200 12 0.797
## LOC_QUOTIENT PCT_TOTAL PCT_RPT H_MEAN A_MEAN MEAN_PRSE H_PCT10 H_PCT25
## 1 1.59 NA NA 41.87 87090 1.2 25.16 31.94
## 2 0.64 NA NA 44.25 92040 3.9 28.35 33.39
## 3 1.19 NA NA 47.78 99370 1.4 25.37 35.08
## 4 0.59 NA NA 37.29 77560 1.8 21.22 27.78
## 5 0.82 NA NA 54.92 114240 1.7 27.53 36.65
## 6 1.46 NA NA 53.47 111210 3.5 28.12 40.44
## H_MEDIAN H_PCT75 H_PCT90 A_PCT10 A_PCT25 A_MEDIAN A_PCT75
## 1 39.33 50.78 62.10 52,340.00 66,440.00 81,810.00 105,630.00
## 2 40.06 52.57 60.24 58,970.00 69,450.00 83,330.00 109,340.00
## 3 49.84 58.24 66.07 52,770.00 72,970.00 103,670.00 121,140.00
## 4 36.67 44.05 53.55 44,130.00 57,780.00 76,280.00 91,620.00
## 5 52.88 68.62 82.97 57,260.00 76,230.00 109,990.00 142,720.00
## 6 51.82 64.99 75.81 58,500.00 84,100.00 107,780.00 135,180.00
## A_PCT90 ANNUAL HOURLY
## 1 129,160.00 NA NA
## 2 125,300.00 NA NA
## 3 137,430.00 NA NA
## 4 111,380.00 NA NA
## 5 172,580.00 NA NA
## 6 157,680.00 NA NA
I started with the following bar plot:
# Bar plot of the average annual salary (A_MEAN) for each occupation title.
#
# The table has one row per area and occupation, so the rows are first
# collapsed to a single unweighted mean per OCC_TITLE. Passing the raw rows to
# geom_col() stacks every area's value into one bar, which plots a sum of
# salaries (and makes occupations with missing rows look lower paid).
palette_rainbow <- c("red", "blue", "green", "purple")
title_bp <- data %>%
  group_by(OCC_TITLE) %>%
  summarise(A_MEAN = mean(A_MEAN, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = " ", y = A_MEAN, fill = OCC_TITLE)) +
  geom_col() +
  # One panel per occupation; the panel strip carries the occupation name,
  # which is why the x value is a blank placeholder and the legend is hidden.
  facet_grid(. ~ OCC_TITLE) +
  scale_y_continuous(labels = label_comma()) +
  # NOTE(review): the palette has four colours, so this assumes at most four
  # distinct OCC_TITLE values in the data.
  scale_fill_manual(values = palette_rainbow) +
  theme_minimal() +
  theme(
    legend.position = "none",
    text = element_text(size = 12),
    axis.title = element_text(size = 12)
  ) +
  labs(title = "Salaries in USA", x = " ", y = "Salary")
title_bp
## Warning: Removed 4 rows containing missing values (`position_stack()`).
# From the “Salaries in USA” bar plot we can see that Database Architects
have the highest salaries.
# Bar plot of the average annual salary (A_MEAN) per state.
#
# Each state has one row per occupation, so the rows are first collapsed to a
# single unweighted mean per PRIM_STATE. Passing the raw rows to geom_col()
# stacks the occupations and plots their sum, which under-reports any state
# that is missing a value for one of the occupations.
state_bp <- data %>%
  group_by(PRIM_STATE) %>%
  summarise(A_MEAN = mean(A_MEAN, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = PRIM_STATE, y = A_MEAN, fill = PRIM_STATE)) +
  geom_col() +
  # Horizontal bars keep the state codes readable.
  coord_flip() +
  scale_y_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    text = element_text(size = 8),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 8)
  ) +
  labs(title = "Salaries in USA by state", x = "State", y = "Salary")
state_bp
## Warning: Removed 4 rows containing missing values (`position_stack()`).
# From the “Salaries in USA by state” plot we can say that there are
several states with high salaries, such as WA, CA, NY, NJ, and MA, for
the job titles considered (Data Scientists, Database Administrators,
Database Architects, and Management Analysts).
# Data filtering: build one data frame per occupation title so that each
# occupation can be plotted on its own.
data1 <- dplyr::filter(data, OCC_TITLE == "Data Scientists")
data2 <- dplyr::filter(data, OCC_TITLE == "Database Administrators")
data3 <- dplyr::filter(data, OCC_TITLE == "Database Architects")
data4 <- dplyr::filter(data, OCC_TITLE == "Management Analysts")
# Plot each job title by state in USA.
#
# Data Scientists: dot plot of the mean annual salary (A_MEAN) per state, with
# the states ordered by salary.
ggplot(data1, aes(x = reorder(PRIM_STATE, -A_MEAN), y = A_MEAN)) +
  # Plain points: there is nothing to dodge with one point per state, and
  # position = "dodge" only produced a "Width not defined" warning. Rows
  # without a salary are dropped deliberately (na.rm = TRUE).
  geom_point(na.rm = TRUE) +
  coord_flip() +
  # Same comma-formatted salary axis as the bar plots above.
  scale_y_continuous(labels = label_comma()) +
  theme(legend.position = "none", text = element_text(size = 8)) +
  labs(
    title = "Data Scientists salary by state in USA",
    x = "states",
    y = "salaries"
  )
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Removed 1 rows containing missing values (`geom_point()`).
# Database Administrators: dot plot of the mean annual salary (A_MEAN) per
# state, with the states ordered by salary.
ggplot(data2, aes(x = reorder(PRIM_STATE, -A_MEAN), y = A_MEAN)) +
  # Plain points: there is nothing to dodge with one point per state, and
  # position = "dodge" only produced a "Width not defined" warning. Rows
  # without a salary are dropped deliberately (na.rm = TRUE).
  geom_point(na.rm = TRUE) +
  coord_flip() +
  # Comma-formatted salary axis, matching the bar plots earlier in the file.
  scale_y_continuous(labels = label_comma()) +
  theme(legend.position = "none", text = element_text(size = 8)) +
  # Title spelling corrected (was "Administraters ... atate").
  labs(
    title = "Database Administrators salary by state",
    x = "states",
    y = "salaries"
  )
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
# For Database Administrators, the “Database Administrators salary by
state” plot shows that NJ and MD are the states with high pay for this
job title.
# Database Architects: dot plot of the mean annual salary (A_MEAN) per state,
# with the states ordered by salary.
ggplot(data3, aes(x = reorder(PRIM_STATE, -A_MEAN), y = A_MEAN)) +
  # Plain points: there is nothing to dodge with one point per state, and
  # position = "dodge" only produced a "Width not defined" warning. Rows
  # without a salary are dropped deliberately (na.rm = TRUE).
  geom_point(na.rm = TRUE) +
  coord_flip() +
  # Comma-formatted salary axis, matching the bar plots earlier in the file.
  scale_y_continuous(labels = label_comma()) +
  theme(legend.position = "none", text = element_text(size = 8)) +
  # Axis label no longer carries a stray leading space.
  labs(
    title = "Database Architects salary by state",
    x = "states",
    y = "salaries"
  )
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Removed 1 rows containing missing values (`geom_point()`).
# Database Architects have higher salaries in WA, CA, and MD; this is
shown very clearly in the “Database Architects salary by state” plot.
# Management Analysts: dot plot of the mean annual salary (A_MEAN) per state,
# with the states ordered by salary.
ggplot(data4, aes(x = reorder(PRIM_STATE, -A_MEAN), y = A_MEAN)) +
  # Plain points: there is nothing to dodge with one point per state, and
  # position = "dodge" only produced a "Width not defined" warning. Rows
  # without a salary are dropped deliberately (na.rm = TRUE).
  geom_point(na.rm = TRUE) +
  coord_flip() +
  # Comma-formatted salary axis, matching the bar plots earlier in the file.
  scale_y_continuous(labels = label_comma()) +
  theme(legend.position = "none", text = element_text(size = 8)) +
  # Axes are labelled like the other per-occupation plots (they were blank).
  labs(
    title = "Management Analysts Average Salaries by State",
    x = "states",
    y = "salaries"
  )
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Removed 2 rows containing missing values (`geom_point()`).
# Finally, the “Management Analysts Average Salaries by State” plot shows
that Management Analysts earn high salaries in NJ and MA.