# Read the CSV at the URL below into `data_salary`, then print a
# column-by-column preview of the resulting data frame.
data_salary <- read.csv(
  file = "https://raw.githubusercontent.com/Kingtilon1/DATA608/main/data_cleaned_2021.csv"
)
glimpse(data_salary)
## Rows: 742
## Columns: 42
## $ index <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ Job.Title <chr> "Data Scientist", "Healthcare Data Scientist", "Dat…
## $ Salary.Estimate <chr> "$53K-$91K (Glassdoor est.)", "$63K-$112K (Glassdoo…
## $ Job.Description <chr> "Data Scientist\nLocation: Albuquerque, NM\nEducati…
## $ Rating <dbl> 3.8, 3.4, 4.8, 3.8, 2.9, 3.4, 4.1, 3.8, 3.3, 4.6, 3…
## $ Company.Name <chr> "Tecolote Research\n3.8", "University of Maryland M…
## $ Location <chr> "Albuquerque, NM", "Linthicum, MD", "Clearwater, FL…
## $ Headquarters <chr> "Goleta, CA", "Baltimore, MD", "Clearwater, FL", "R…
## $ Size <chr> "501 - 1000 ", "10000+ ", "501 - 1000 ", "1001 - 50…
## $ Founded <int> 1973, 1984, 2010, 1965, 1998, 2000, 2008, 2005, 201…
## $ Type.of.ownership <chr> "Company - Private", "Other Organization", "Company…
## $ Industry <chr> "Aerospace & Defense", "Health Care Services & Hosp…
## $ Sector <chr> "Aerospace & Defense", "Health Care", "Business Ser…
## $ Revenue <chr> "$50 to $100 million (USD)", "$2 to $5 billion (USD…
## $ Competitors <chr> "-1", "-1", "-1", "Oak Ridge National Laboratory, N…
## $ Hourly <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Employer.provided <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Lower.Salary <int> 53, 63, 80, 56, 86, 71, 54, 86, 38, 120, 126, 64, 1…
## $ Upper.Salary <int> 91, 112, 90, 97, 143, 119, 93, 142, 84, 160, 201, 1…
## $ Avg.Salary.K. <dbl> 72.0, 87.5, 85.0, 76.5, 114.5, 95.0, 73.5, 114.0, 6…
## $ company_txt <chr> "Tecolote Research", "University of Maryland Medica…
## $ Job.Location <chr> "NM", "MD", "FL", "WA", "NY", "TX", "MD", "CA", "NY…
## $ Age <int> 48, 37, 11, 56, 23, 21, 13, 16, 7, 12, 10, 53, 59, …
## $ Python <int> 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, …
## $ spark <int> 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, …
## $ aws <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ excel <int> 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ sql <int> 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ sas <int> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ keras <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pytorch <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ scikit <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tensor <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ hadoop <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ tableau <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ bi <int> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ flink <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ mongo <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ google_an <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ job_title_sim <chr> "data scientist", "data scientist", "data scientist…
## $ seniority_by_title <chr> "na", "na", "na", "na", "na", "na", "na", "na", "na…
## $ Degree <chr> "M", "M", "M", "na", "na", "na", "na", "M", "P", "n…
# List the distinct values of the simplified job-title column.
unique(data_salary[["job_title_sim"]])
## [1] "data scientist" "other scientist"
## [3] "analyst" "data engineer"
## [5] "data analitics" "na"
## [7] "data modeler" "Data scientist project manager"
## [9] "machine learning engineer" "director"
# Drop eleven columns by name, then discard every row whose simplified job
# title is the placeholder string "na".
data_salary <- subset(
  data_salary,
  select = -c(
    Competitors, Hourly, Employer.provided, Age, Type.of.ownership,
    Founded, Size, Location, Rating, Job.Description, Headquarters
  )
)
data_salary <- data_salary[!(data_salary$job_title_sim == "na"), ]

# Summary statistics of the average-salary column for the remaining rows.
summary(data_salary[["Avg.Salary.K."]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.5 73.5 97.5 101.6 122.8 254.0
# Histogram of Avg.Salary.K. drawn on a density scale so that the kernel
# density estimate added below shares the same y-axis.
hist(
  data_salary$Avg.Salary.K.,
  col = "green",
  border = "black",
  # Spelled out in full: `prob` only worked through partial argument matching.
  probability = TRUE,
  xlab = "Avg.Salary.K.",
  main = "Distribution of average salaries"
)

# Overlay the kernel density estimate of the same column on the histogram.
lines(
  density(data_salary$Avg.Salary.K.),
  lwd = 2,
  col = "chocolate3"
)
#### The histogram looks slightly right-skewed, but overall
the distribution is approximately normal. Because it is roughly symmetric,
the median and mean are both around 100K, which is confirmed by
the summary statistics in the previous code chunk.
# Mean of Avg.Salary.K. per Job.Location, sorted from the highest mean to
# the lowest.
Means <- data_salary %>%
  group_by(Job.Location) %>%
  # na.rm = TRUE keeps this consistent with the per-state aggregation that
  # builds agg_data, and stops one missing salary from turning a whole
  # state's mean into NA.
  summarise(avg = mean(Avg.Salary.K., na.rm = TRUE)) %>%
  arrange(desc(avg))

# Auto-print the resulting tibble.
Means
## # A tibble: 37 × 2
## Job.Location avg
## <chr> <dbl>
## 1 CA 124.
## 2 IL 117.
## 3 DC 110.
## 4 MA 107.
## 5 NJ 105.
## 6 MI 100.
## 7 RI 100
## 8 NY 98.7
## 9 NC 98.5
## 10 MD 97.7
## # ℹ 27 more rows
# Bar chart of the mean salary (`avg`) for each Job.Location in `Means`.
ps <- ggplot(Means, aes(x = Job.Location, y = avg)) +
  # geom_col() draws bars whose heights are the values already in the data.
  geom_col(fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  labs(
    title = "Average Salary by State",
    x = "Job Location",
    y = "Average Salary"
  ) +
  theme(
    # Rotate x-axis labels for better readability
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Print the plot
print(ps)
# Lookup table pairing lower-case two-letter state abbreviations with
# lower-case full state names. Each pair is written side by side so a
# mismatch between abbreviation and name is easy to spot.
state_lookup <- local({
  full_name_by_abbreviation <- c(
    "al" = "alabama",
    "az" = "arizona",
    "ca" = "california",
    "co" = "colorado",
    "ct" = "connecticut",
    "dc" = "district of columbia",
    "de" = "delaware",
    "fl" = "florida",
    "ga" = "georgia",
    "ia" = "iowa",
    "id" = "idaho",
    "il" = "illinois",
    "in" = "indiana",
    "ks" = "kansas",
    "ky" = "kentucky",
    "la" = "louisiana",
    "ma" = "massachusetts",
    "md" = "maryland",
    "mi" = "michigan",
    "mn" = "minnesota",
    "mo" = "missouri",
    "nc" = "north carolina",
    "ne" = "nebraska",
    "nj" = "new jersey",
    "nm" = "new mexico",
    "ny" = "new york",
    "oh" = "ohio",
    "or" = "oregon",
    "pa" = "pennsylvania",
    "ri" = "rhode island",
    "sc" = "south carolina",
    "tn" = "tennessee",
    "tx" = "texas",
    "ut" = "utah",
    "va" = "virginia",
    "wa" = "washington",
    "wi" = "wisconsin"
  )
  # unname() keeps the default integer row names on the data frame.
  data.frame(
    abbreviation = names(full_name_by_abbreviation),
    full_name = unname(full_name_by_abbreviation)
  )
})
# Polygon outlines for the states, as returned by map_data("state").
states_map <- map_data("state")

# Per-location mean salary, extended with a lower-case join key, the full
# state name from state_lookup, and finally the matching map polygons.
agg_data <- data_salary %>%
  group_by(Job.Location) %>%
  summarise(Avg_Salary_K = mean(Avg.Salary.K., na.rm = TRUE)) %>%
  # state_lookup stores abbreviations in lower case, so lower-case the key.
  mutate(Job_Location = tolower(Job.Location)) %>%
  left_join(state_lookup, by = c("Job_Location" = "abbreviation")) %>%
  left_join(states_map, by = c("full_name" = "region"))
# Plot
# Define a color palette: a light-to-dark red ramp for the fill gradient.
red_palette <- colorRampPalette(
  c("#fee5d9", "#fcae91", "#fb6a4a", "#de2d26", "#a50f15")
)

# Map of the state polygons in agg_data, filled by mean salary.
ggplot() +
  geom_polygon(
    data = agg_data,
    aes(x = long, y = lat, group = group, fill = Avg_Salary_K),
    color = "black",
    # `linewidth` replaces `size` for line widths, which is deprecated since
    # ggplot2 3.4.0 and raised a deprecation warning here.
    linewidth = 0.1
  ) +
  scale_fill_gradientn(colors = red_palette(5)) +
  # Fixed aspect ratio between the y and x units of the map.
  coord_fixed(1.3) +
  labs(
    title = "Data Scientist Salary by State",
    fill = "Average Salary (thousands USD)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
# One box per simplified job title showing the spread of Avg.Salary.K.;
# boxes are filled by job title and the redundant legend is hidden.
box_plot <- ggplot(
  data_salary,
  aes(x = job_title_sim, y = Avg.Salary.K., fill = job_title_sim)
) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  labs(
    title = "Distribution of Average Salary by Job Title",
    x = "Job Title",
    y = "Average Salary"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11),
    # Angle the job-title labels on the x-axis.
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Print the plot
print(box_plot)